In [1]:
cd ..

/Users/jisooryu/Projects/lease-version-reliability


#### Model Training (train.py) -> Get classifier and processed df, df_all

In [2]:
import asyncio
import pickle 
import structlog

from train.common.logging import initialize_logging
from train.common.file_io import upload_models, upload_dataset
from train.config.settings import settings
from train.data.database_io import (
    get_all_data,
    get_labels,
    get_reliable_data,
)

from train.features.features import feature_engineering
from train.model.model import (
    get_column_names,
    get_split_columns,
    train_multioutput_classifiers,
)


In [3]:
# Training pipeline run inline (the section heading ties this to train.py):
# query the data, label it, engineer features, then fit the classifiers.
# The statements are order-dependent and bind the notebook-level names
# `df`, `df_all`, `x_cols`, `y_cols` and `model_dict`.
initialize_logging(settings.ENV)
logger = structlog.get_logger(
    
)
# training data (masters with >3 versions within it)
reliable_data = get_reliable_data()

# all version data needed to export a reliability score
all_data = get_all_data()

# # submitter name for display purposes when exporting validation data
# submitter_name = get_submitter_info()

# Quick size check of both query results.
print(len(reliable_data))
print(len(all_data))

attributes = settings.ATTRIBUTES

# Three parallel groups of column names derived from the attribute list.
# NOTE(review): exact naming scheme lives in train.model.model.get_column_names.
col_names_correct, col_names_filled, col_names_label = get_column_names(
    attributes,
)

logger.info("Creating Data Labels")
data = get_labels(reliable_data, attributes)
# NOTE: `all_data` is rebound here from the raw query result to its labelled
# form; the raw result is no longer reachable after this line.
all_data = get_labels(all_data, attributes)

logger.info("Feature Engineering - Reliable Data")
df = feature_engineering(
    data,
    col_names_label,
    col_names_filled,
    col_names_correct,
    attributes,
)

# Same feature engineering call, applied to the full version data set.
logger.info("Feature Engineering - All Data")
df_all = feature_engineering(
    all_data, 
    col_names_label,
    col_names_filled,
    col_names_correct,
    attributes,
)

logger.info("Model Training")
# Only the reliable subset (`df`) is used for fitting; `df_all` is not
# passed to the trainer in this cell.
x_cols, y_cols = get_split_columns(df.columns)
model_dict = train_multioutput_classifiers(df, x_cols, y_cols)

Snowflake Connector for Python Version: 2.7.0, Python Version: 3.9.10, Platform: macOS-12.6.1-arm64-arm-64bit
This connection is in OCSP Fail Open Mode. TLS Certificates would be checked for validity and revocation status. Any other Certificate Revocation related exceptions or OCSP Responder failures would be disregarded in favor of connectivity.
Setting use_openssl_only mode to False
query: [SELECT cv.id, cv.submitter_person_id, lds.logo, cv.comp_data_id AS comp_data_id_...]
query execution done
Snowflake Connector for Python Version: 2.7.0, Python Version: 3.9.10, Platform: macOS-12.6.1-arm64-arm-64bit
This connection is in OCSP Fail Open Mode. TLS Certificates would be checked for validity and revocation status. Any other Certificate Revocation related exceptions or OCSP Responder failures would be disregarded in favor of connectivity.
query: [SELECT cv.id, cv.submitter_person_id, lds.logo, cv.comp_data_id AS comp_data_id_...]
query execution done
closed
No async queries seem to be 

In [4]:
# %store df
# %store df_all

In [5]:
# model_dict

### Upload Model to S3

In [6]:
import os
import shutil
import typing

import boto3
import botocore
import structlog

from train.config.settings import settings

logger = structlog.get_logger()


def get_web_identity_token() -> str:
    """
    Read the web identity token from disk.

    Returns the contents of the file named by
    ``settings.AWS_WEB_IDENTITY_TOKEN_FILE`` with surrounding whitespace
    stripped.
    """
    with open(settings.AWS_WEB_IDENTITY_TOKEN_FILE) as token_file:  # type: ignore
        return token_file.read().strip()


def get_aws_cred() -> typing.Any:
    """
    Exchange the web identity token for temporary AWS credentials.

    Reads the token via ``get_web_identity_token`` and calls STS
    ``assume_role_with_web_identity`` for ``settings.AWS_ROLE_ARN``.

    Returns the ``"Credentials"`` entry of the STS response.
    """
    # Read the token before creating the client, as the original did.
    web_identity_token = get_web_identity_token()
    sts = boto3.client("sts")

    request = {
        "RoleArn": settings.AWS_ROLE_ARN,
        "RoleSessionName": "SalesLinkageSession",
        "WebIdentityToken": web_identity_token,
    }
    response = sts.assume_role_with_web_identity(**request)

    return response["Credentials"]


def get_s3_resource() -> typing.Any:
    """
    Build a boto3 S3 resource.

    When ``settings.AWS_WEB_IDENTITY_TOKEN_FILE`` is set, the resource is
    created with the temporary credentials returned by ``get_aws_cred``.
    Otherwise ``boto3.resource("s3")`` is called without explicit
    credentials.
    """
    if settings.AWS_WEB_IDENTITY_TOKEN_FILE:
        credentials = get_aws_cred()
        return boto3.resource(
            "s3",
            aws_access_key_id=credentials["AccessKeyId"],
            aws_secret_access_key=credentials["SecretAccessKey"],
            aws_session_token=credentials["SessionToken"],
        )

    # No token file configured: let boto3 resolve credentials on its own.
    return boto3.resource("s3")

In [7]:
from train.config.settings import settings 

def upload_models() -> None:
    """
    Archive the model directory and upload it to the models S3 bucket.

    The contents of ``settings.MODEL_DIR`` are packed into a gzipped tarball
    and uploaded to ``settings.MODELS_S3_BUCKET`` under the key
    ``{PROJECT_NAME}/models/{ENV}.model.tar.gz``.

    The tarball is staged in a temporary directory instead of inside
    ``settings.MODEL_DIR``. Writing the archive into the directory that is
    being archived makes the half-written ``model.tar.gz`` part of its own
    contents. The staging directory is removed whether or not the upload
    succeeds, so no stray archive is left behind on failure.

    Raises:
        botocore.exceptions.ClientError: any client error whose code is not
            "404" is re-raised; a "404" is logged and swallowed.
    """
    import tempfile

    s3 = get_s3_resource()
    # The f-string is already fully interpolated; no further .format() call
    # is needed (and one would misbehave if a setting contained braces).
    object_name = f"{settings.PROJECT_NAME}/models/{settings.ENV}.model.tar.gz"

    with tempfile.TemporaryDirectory() as staging_dir:
        try:
            # make_archive returns the path of the archive it created.
            file_name = shutil.make_archive(
                os.path.join(staging_dir, "model"),
                "gztar",
                settings.MODEL_DIR,
            )

            s3.Bucket(settings.MODELS_S3_BUCKET).upload_file(
                file_name,
                object_name,
            )

            logger.debug("Successfully uploaded models")
        except botocore.exceptions.ClientError as e:
            # NOTE(review): a 404 normally means "not found", so the message
            # below looks mismatched. boto3's upload_file may also wrap
            # client errors in S3UploadFailedError, which would bypass this
            # handler entirely. Confirm before relying on this branch.
            if e.response["Error"]["Code"] == "404":
                logger.error("Permission denied when trying to upload file.")
            else:
                raise

In [8]:
# with open(f"{settings.MODEL_DIR}/{settings.MODEL_NAME}", "wb") as handle:
#     pickle.dump(model_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

# upload_models()

### Download model from S3 (batch)

In [9]:
from batch.config.settings import settings 

def download_models() -> None:
    """
    Fetch the model archive from S3 and unpack it into the model directory.

    Downloads ``{PROJECT_NAME}/models/{ENV}.model.tar.gz`` from
    ``settings.MODELS_S3_BUCKET`` to ``{MODEL_DIR}/model.tar.gz``, unpacks it
    into ``settings.MODEL_DIR`` and deletes the downloaded archive.

    A client error with code "404" is logged and swallowed; any other client
    error is re-raised.
    """
    s3 = get_s3_resource()
    remote_key = f"{settings.PROJECT_NAME}/models/{settings.ENV}.model.tar.gz"
    local_archive = f"{settings.MODEL_DIR}/model.tar.gz"

    try:
        bucket = s3.Bucket(settings.MODELS_S3_BUCKET)
        bucket.download_file(remote_key, local_archive)
        shutil.unpack_archive(local_archive, settings.MODEL_DIR)
        os.remove(local_archive)
        logger.debug("Successfully downloaded models")
    except botocore.exceptions.ClientError as err:
        # Anything other than a missing object propagates to the caller.
        if err.response["Error"]["Code"] != "404":
            raise
        logger.error("The object does not exist.")

In [10]:
# download_models()

### Upload dataset to S3

In [11]:
def upload_dataset(directory: str) -> None:
    """
    Archive a dataset directory and upload it to S3.

    The contents of ``{DATA_DIR}/{directory}`` are packed into a gzipped
    tarball and uploaded to ``settings.MODELS_S3_BUCKET`` under the key
    ``{PROJECT_NAME}/{DATA_DIR}/{directory}/{ENV}.dataset.tar.gz``.

    The tarball is staged in a temporary directory instead of inside the
    dataset directory. Writing the archive into the directory that is being
    archived makes the half-written ``dataset.tar.gz`` part of its own
    contents. The staging directory is removed whether or not the upload
    succeeds, so no stray archive is left behind on failure.

    Args:
        directory: sub-directory of ``settings.DATA_DIR`` to archive
            (the notebook calls this with ``"processed"``).

    Raises:
        botocore.exceptions.ClientError: any client error whose code is not
            "404" is re-raised; a "404" is logged and swallowed.
    """
    import tempfile

    s3 = get_s3_resource()
    # The f-string is already fully interpolated; no further .format() call
    # is needed (and one would misbehave if a value contained braces).
    object_name = (
        f"{settings.PROJECT_NAME}/{settings.DATA_DIR}/{directory}/{settings.ENV}.dataset.tar.gz"
    )
    source_dir = f"{settings.DATA_DIR}/{directory}"

    with tempfile.TemporaryDirectory() as staging_dir:
        try:
            # make_archive returns the path of the archive it created.
            file_name = shutil.make_archive(
                os.path.join(staging_dir, "dataset"),
                "gztar",
                source_dir,
            )

            s3.Bucket(settings.MODELS_S3_BUCKET).upload_file(
                file_name,
                object_name,
            )

            logger.debug("Successfully uploaded dataset")
        except botocore.exceptions.ClientError as e:
            # NOTE(review): a 404 normally means "not found", so the message
            # below looks mismatched. boto3's upload_file may also wrap
            # client errors in S3UploadFailedError, which would bypass this
            # handler entirely. Confirm before relying on this branch.
            if e.response["Error"]["Code"] == "404":
                logger.error("Permission denied when trying to upload file.")
            else:
                raise

In [12]:
# import pickle 

# with open(f"{settings.DATA_DIR}"+"/processed"+"/reliable_data", "wb") as handle:
#     pickle.dump(df, handle, protocol=pickle.HIGHEST_PROTOCOL)

# with open(f"{settings.DATA_DIR}"+"/processed"+"/all_data", "wb") as handle:
#     pickle.dump(df_all, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [13]:
# upload_dataset("processed") 

In [14]:
def download_dataset(directory: str) -> None:
    """
    Fetch a dataset archive from S3 and unpack it locally.

    Downloads ``{PROJECT_NAME}/{DATA_DIR}/{directory}/{ENV}.dataset.tar.gz``
    from ``settings.MODELS_S3_BUCKET`` to
    ``{DATA_DIR}/{directory}/dataset.tar.gz``, unpacks it into
    ``{DATA_DIR}/{directory}`` and deletes the downloaded archive.

    A client error with code "404" is logged and swallowed; any other client
    error is re-raised.
    """
    s3 = get_s3_resource()
    remote_key = f"{settings.PROJECT_NAME}/{settings.DATA_DIR}/{directory}/{settings.ENV}.dataset.tar.gz"
    local_archive = f"{settings.DATA_DIR}/{directory}/dataset.tar.gz"

    try:
        bucket = s3.Bucket(settings.MODELS_S3_BUCKET)
        bucket.download_file(remote_key, local_archive)
        shutil.unpack_archive(local_archive, f"{settings.DATA_DIR}/{directory}")
        os.remove(local_archive)
        logger.debug("Successfully downloaded dataset")
    except botocore.exceptions.ClientError as err:
        # Anything other than a missing object propagates to the caller.
        if err.response["Error"]["Code"] != "404":
            raise
        logger.error("The object does not exist.")

In [15]:
# download_dataset("processed")

### BATCH

In [16]:
import asyncio

import pandas as pd
import structlog

from batch.common.file_io import download_dataset, download_models
from batch.common.logging import initialize_logging
from batch.config.settings import settings
from batch.data.database_io import (
    get_split_columns,
    get_submitter_info,
    write_submitter_df_snowflake,
    write_version_realiability_df_snowflake,
)
from batch.data.output_data import (
    get_submitter_reliability,
    get_version_reliability,
)

In [17]:
# Load the processed datasets from local pickle files under
# {DATA_DIR}/processed. Nothing in this cell downloads anything: the files
# must already exist on disk (the download_dataset("processed") call earlier
# in the notebook is commented out).
# NOTE: pd.read_pickle can execute arbitrary code while unpickling; only load
# files from a trusted source.

df = pd.read_pickle(f"{settings.DATA_DIR}/processed/reliable_data")
df_all = pd.read_pickle(f"{settings.DATA_DIR}/processed/all_data")

In [18]:
# Load the pickled classifiers from {MODEL_DIR}/{MODEL_NAME} on local disk.
# Nothing is downloaded here; the download_models() call earlier in the
# notebook is commented out, so the file must already be present.
# NOTE: pd.read_pickle can execute arbitrary code while unpickling; only load
# files from a trusted source.

model_dict = pd.read_pickle(f"{settings.MODEL_DIR}/{settings.MODEL_NAME}")

In [19]:
# Query submitter info from Snowflake; the logged SELECT in this cell's
# output returns a person id and a concatenated first/last name.
submitter_name = get_submitter_info()

Snowflake Connector for Python Version: 2.7.0, Python Version: 3.9.10, Platform: macOS-12.6.1-arm64-arm-64bit
This connection is in OCSP Fail Open Mode. TLS Certificates would be checked for validity and revocation status. Any other Certificate Revocation related exceptions or OCSP Responder failures would be disregarded in favor of connectivity.
query: [SELECT  per.id, CONCAT(per.first_name, ' ',  per.last_name) AS submitter_name FR...]
query execution done


In [20]:
# Partition df's columns into two groups. Presumably model inputs (x_cols)
# and targets (y_cols); confirm against batch.data.database_io.
x_cols, y_cols = get_split_columns(df.columns)

In [21]:
# `settings` here is batch.config.settings: the BATCH import cell rebinds
# the name that the training cells imported from train.config.settings.
attributes = settings.ATTRIBUTES

In [22]:
# get_submitter_reliability is awaited, so it must return an awaitable.
# Top-level `await` works because IPython supports it in notebook cells; it
# would be a syntax error in a plain script.
# The second element of the returned pair is discarded.
submitter_df, _ = await get_submitter_reliability(
        df,
        x_cols,
        y_cols,
        model_dict,
        submitter_name,
    )

In [23]:
# Build the per-version frame from the full data set (df_all). Unlike
# get_submitter_reliability above, this call is not awaited.
version_reliability_df = get_version_reliability(
    df_all,
    attributes,
    x_cols,
    y_cols,
    model_dict,
)

In [24]:
# Sanity check: (rows, columns) of both result frames.
print(submitter_df.shape)
print(version_reliability_df.shape)

(3637, 18)
(2756893, 30)


### Creating datetime_now 

In [25]:
from batch.data.database import get_snowflake_connection
import datetime

In [26]:
# List the columns of the version-level frame (swap in the commented line to
# inspect the submitter-level frame instead).
# submitter_df.columns
version_reliability_df.columns

Index(['comp_data_id_version', 'comp_data_id_master', 'tenant_name_version',
       'tenant_name_prob', 'space_type_id_version', 'space_type_id_prob',
       'transaction_size_version', 'transaction_size_prob',
       'starting_rent_version', 'starting_rent_prob', 'execution_date_version',
       'execution_date_prob', 'commencement_date_version',
       'commencement_date_prob', 'lease_term_version', 'lease_term_prob',
       'expiration_date_version', 'expiration_date_prob', 'work_value_version',
       'work_value_prob', 'free_months_version', 'free_months_prob',
       'transaction_type_id_version', 'transaction_type_id_prob',
       'rent_bumps_percent_bumps_version', 'rent_bumps_percent_bumps_prob',
       'rent_bumps_dollar_bumps_version', 'rent_bumps_dollar_bumps_prob',
       'lease_type_id_version', 'lease_type_id_prob'],
      dtype='object')

In [27]:
# values = list(
#     zip(
#         submitter_df["submitter_person_id"].tolist(),
#         submitter_df["tenant_name_reliability"].tolist(),
#         submitter_df["space_type_id_reliability"].tolist(),
#         submitter_df["transaction_size_reliability"].tolist(),
#         [datetime.datetime.now() for _ in range(len(submitter_df))],
#     ),
# )

# values[0:5]

In [28]:
# temp = submitter_df.copy()
# temp = temp[["submitter_person_id","tenant_name_reliability","space_type_id_reliability","transaction_size_reliability"]]
# temp["date_created"] = datetime.datetime.now()
# values = list(temp.itertuples(index=False, name=None))
# values[0:5]

In [29]:
# Submitter id plus the per-attribute and general reliability columns of
# submitter_df, in the order they appear in that frame's output.
# NOTE(review): presumably the columns selected for the Snowflake export;
# the list is not used in any visible cell, so confirm.
submitter_cols = [
    'submitter_person_id', 
    'tenant_name_reliability',
    'space_type_id_reliability',
    'transaction_size_reliability',
    'starting_rent_reliability',
    'execution_date_reliability',
    'commencement_date_reliability',
    'lease_term_reliability',
    'expiration_date_reliability',
    'work_value_reliability',
    'free_months_reliability',
    'transaction_type_id_reliability',
    'rent_bumps_percent_bumps_reliability',
    'rent_bumps_dollar_bumps_reliability',
    'lease_type_id_reliability',
    'general_reliability'
]

# Backward-compatible alias for the original, misspelled name.
sumbitter_cols = submitter_cols

#### Testing out Snowflake import -> import_snowflake.ipynb

In [30]:
from batch.data.database_io import read_file
from batch.data.database import get_snowflake_connection, get_snowflake_ml_pipeline_connection
from batch.data.database_io import write_version_realiability_df_snowflake

import numpy as np 

In [32]:
# # export version result to Snowflake
# logger = structlog.get_logger()
# initialize_logging(settings.ENV)

# logger.info("Exporting <VERSION RELIABILITY> into Snowflake")
# logger.info(f"Total len of {len(version_reliability_df)}")

# for i, chunk in enumerate(np.array_split(version_reliability_df, 10)):
#     logger.info(f"processing batch: {i + 1}/10")
#     write_version_realiability_df_snowflake(
#         chunk,
#         "ML_PIPELINE_DEV_DB.LEASE_VERSION_RELIABILITY",
#         "VERSION",
#     )


In [None]:
# Spot-check the reliability scores of a handful of submitters.
# The list is named `submitter_ids` rather than `id` so the builtin id() is
# not shadowed for the rest of the kernel session.
# NOTE: the rendered output of this cell contains real submitter names;
# clear it before sharing the notebook.
submitter_ids = [89413, 133830, 116766, 128857, 120557]

submitter_df[submitter_df['submitter_person_id'].isin(submitter_ids)]

Unnamed: 0,submitter_name,submitter_person_id,n_support,tenant_name_reliability,space_type_id_reliability,transaction_size_reliability,starting_rent_reliability,execution_date_reliability,commencement_date_reliability,lease_term_reliability,expiration_date_reliability,work_value_reliability,free_months_reliability,transaction_type_id_reliability,rent_bumps_percent_bumps_reliability,rent_bumps_dollar_bumps_reliability,lease_type_id_reliability,general_reliability
3632,Sabrina Miranda,120557,1,0.0,0.02,0.03,0.033623,0.287696,0.354583,0.159048,0.680759,0.575924,0.182618,0.405,0.352857,0.603735,0.046431,0.195714
3633,Matt Fisher,128857,1,0.0,0.02,0.03,0.033623,0.287696,0.354583,0.159048,0.680759,0.575924,0.182618,0.405,0.352857,0.603735,0.046431,0.195714
3634,Brian Caldwell,116766,1,0.0,0.02,0.03,0.033623,0.287696,0.354583,0.159048,0.680759,0.575924,0.182618,0.405,0.352857,0.603735,0.046431,0.195714
3635,Maggie Divirgilio,133830,1,0.0,0.02,0.03,0.033623,0.287696,0.354583,0.159048,0.680759,0.575924,0.182618,0.405,0.352857,0.603735,0.046431,0.195714
3636,Julie Hamm,89413,1,0.0,0.02,0.03,0.033623,0.287696,0.354583,0.159048,0.680759,0.575924,0.182618,0.405,0.352857,0.603735,0.046431,0.195714
