In [1]:
cd ..

/Users/jisooryu/Projects/lease-version-reliability


#### Model Training (train.py) -> Get classifier and processed df, df_all

In [4]:
import asyncio
import pickle 
import structlog

from train.common.logging import initialize_logging
from train.common.file_io import upload_models, upload_dataset
from train.config.settings import settings
from train.data.database_io import (
    get_all_data,
    get_labels,
    get_reliable_data,
)

from train.features.features import feature_engineering
from train.model.model import (
    get_column_names,
    get_split_columns,
    train_multioutput_classifiers,
)


In [5]:
initialize_logging(settings.ENV)
logger = structlog.get_logger()

# Two pulls from the database layer:
#   reliable_data - training rows (original note: masters with >3 versions within it)
#   all_data      - all version data needed to export a reliability score
reliable_data = get_reliable_data()
all_data = get_all_data()

# Quick size check of both pulls.
for loaded in (reliable_data, all_data):
    print(len(loaded))

attributes = settings.ATTRIBUTES
col_names_correct, col_names_filled, col_names_label = get_column_names(attributes)

logger.info("Creating Data Labels")
data = get_labels(reliable_data, attributes)
all_data = get_labels(all_data, attributes)

# Both feature-engineering passes receive the same trailing arguments.
shared_feature_args = (
    col_names_label,
    col_names_filled,
    col_names_correct,
    attributes,
)

logger.info("Feature Engineering - Reliable Data")
df = feature_engineering(data, *shared_feature_args)

logger.info("Feature Engineering - All Data")
df_all = feature_engineering(all_data, *shared_feature_args)

logger.info("Model Training")
x_cols, y_cols = get_split_columns(df.columns)
model_dict = train_multioutput_classifiers(df, x_cols, y_cols)

Snowflake Connector for Python Version: 2.7.0, Python Version: 3.9.10, Platform: macOS-12.6.1-arm64-arm-64bit
This connection is in OCSP Fail Open Mode. TLS Certificates would be checked for validity and revocation status. Any other Certificate Revocation related exceptions or OCSP Responder failures would be disregarded in favor of connectivity.
Setting use_openssl_only mode to False
query: [SELECT cv.id, cv.submitter_person_id, lds.logo, cv.comp_data_id AS comp_data_id_...]
query execution done
Snowflake Connector for Python Version: 2.7.0, Python Version: 3.9.10, Platform: macOS-12.6.1-arm64-arm-64bit
This connection is in OCSP Fail Open Mode. TLS Certificates would be checked for validity and revocation status. Any other Certificate Revocation related exceptions or OCSP Responder failures would be disregarded in favor of connectivity.
query: [SELECT cv.id, cv.submitter_person_id, lds.logo, cv.comp_data_id AS comp_data_id_...]
query execution done
closed
No async queries seem to be 

In [17]:
# Persist `df` with IPython's %store so it can be restored later via `%store -r df`.
%store df

Stored 'df' (DataFrame)


In [18]:
# Persist `df_all` with IPython's %store so it can be restored later via `%store -r df_all`.
%store df_all

Stored 'df_all' (DataFrame)


### Upload Model to S3

In [12]:
import os
import shutil
import typing

import boto3
import botocore
import structlog

from train.config.settings import settings

logger = structlog.get_logger()


def get_web_identity_token() -> str:
    """
    Read the web identity token from disk.

    The file path is taken from ``settings.AWS_WEB_IDENTITY_TOKEN_FILE``.

    Returns:
        The file contents with leading and trailing whitespace stripped.
    """
    with open(settings.AWS_WEB_IDENTITY_TOKEN_FILE) as token_file:  # type: ignore
        return token_file.read().strip()


def get_aws_cred() -> typing.Any:
    """
    Exchange the web identity token for AWS credentials.

    Reads the token via ``get_web_identity_token()``, then calls STS
    ``assume_role_with_web_identity`` for ``settings.AWS_ROLE_ARN``.

    Returns:
        The ``"Credentials"`` entry of the STS response.
    """
    # Token is read first, before the STS client is created.
    web_identity_token = get_web_identity_token()

    sts_response = boto3.client("sts").assume_role_with_web_identity(
        RoleArn=settings.AWS_ROLE_ARN,
        RoleSessionName="SalesLinkageSession",
        WebIdentityToken=web_identity_token,
    )
    return sts_response["Credentials"]


def get_s3_resource() -> typing.Any:
    """
    Build a boto3 S3 resource.

    When ``settings.AWS_WEB_IDENTITY_TOKEN_FILE`` is falsy, the resource is
    created with no explicit credentials. Otherwise the credentials returned by
    ``get_aws_cred()`` are passed to ``boto3.resource`` explicitly.

    Returns:
        The object returned by ``boto3.resource("s3", ...)``.
    """
    credential_kwargs = {}

    if settings.AWS_WEB_IDENTITY_TOKEN_FILE:
        temp_cred = get_aws_cred()
        credential_kwargs = {
            "aws_access_key_id": temp_cred["AccessKeyId"],
            "aws_secret_access_key": temp_cred["SecretAccessKey"],
            "aws_session_token": temp_cred["SessionToken"],
        }

    return boto3.resource("s3", **credential_kwargs)

In [21]:
from batch.config.settings import settings 

def upload_models() -> None:
    """
    Archive the model directory and upload the archive to S3.

    Steps:
      1. Pack ``settings.MODEL_DIR`` into a gzipped tarball named
         ``model.tar.gz`` (path built as ``settings.MODEL_DIR + "model.tar.gz"``).
      2. Upload it to bucket ``settings.MODELS_S3_BUCKET`` under the key
         ``{PROJECT_NAME}/models/{ENV}.model.tar.gz``.
      3. Delete the local archive. This happens whether or not the upload
         succeeded, so a failed run does not leave a stale tarball behind.

    Raises:
        botocore.exceptions.ClientError: re-raised unless its error code is "404".
        Any other exception from archiving or uploading propagates unchanged.
    """
    s3 = get_s3_resource()
    object_name = f"{settings.PROJECT_NAME}/models/{settings.ENV}.model.tar.gz"
    # Paths are built by plain concatenation, so MODEL_DIR is presumably expected
    # to end with a path separator - TODO confirm against the settings module.
    file_name = f"{settings.MODEL_DIR}model.tar.gz"

    try:
        shutil.make_archive(
            settings.MODEL_DIR + "model",
            "gztar",
            settings.MODEL_DIR,
        )

        # The key is already fully interpolated; it is passed as-is rather than
        # run through str.format again, which would break on literal braces.
        s3.Bucket(settings.MODELS_S3_BUCKET).upload_file(file_name, object_name)

        logger.debug("Successfully uploaded models")
    except botocore.exceptions.ClientError as e:
        # NOTE(review): the code checked is "404" but the message talks about
        # permissions - confirm which condition was intended.
        if e.response["Error"]["Code"] == "404":
            logger.error("Permission denied when trying to upload file.")
        else:
            raise
    finally:
        # Always clean up the temporary archive, including on failure.
        if os.path.exists(file_name):
            os.remove(file_name)

In [22]:
# Pickle the trained `model_dict` into the model directory, then archive and
# upload that directory with `upload_models()`.
model_path = f"{settings.MODEL_DIR}/{settings.MODEL_NAME}"
with open(model_path, "wb") as model_file:
    pickle.dump(model_dict, model_file, protocol=pickle.HIGHEST_PROTOCOL)

upload_models()

KeyboardInterrupt: 

### BATCH

In [23]:
import pandas as pd 

# Load the processed frames from disk; this rebinds `df` and `df_all`.
# NOTE: pickle files should only be read from trusted locations.
reliable_data_path = f"{settings.DATA_DIR}/processed/reliable_data"
df = pd.read_pickle(reliable_data_path)

all_data_path = f"{settings.DATA_DIR}/processed/all_data"
df_all = pd.read_pickle(all_data_path)