In [2]:
cd ..

/Users/jisooryu/Projects/lease-version-reliability


In [3]:
import pandas as pd 
import typing 
import structlog
from lease_version_reliability.models.train import train_model
from lease_version_reliability.models.inference import run_inference

from lease_version_reliability.config.settings import settings 
from lease_version_reliability.data.database import (
    CompstakServicesMySQL,
    get_snowflake_connection,
)
from lease_version_reliability.data.database import cs_mysql_instance as mysql
from lease_version_reliability.data.database_io import read_file

logger = structlog.get_logger()

In [3]:
await run_inference(download=True)

2023-03-01 11:41:43 [info     ] Connecting to MySQL


INFO:databases:Connected to database mysql://admin:********@localhost:3308/compstak


2023-03-01 11:41:52 [debug    ] Successfully downloaded models
2023-03-01 11:41:52 [info     ] Reading Reliable Data from MySQL
2023-03-01 11:41:52 [info     ] Start processing lease data
2023-03-01 11:41:52 [info     ] Processing 500000/4017690


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


2023-03-01 11:42:59 [info     ] Processing 1000000/4017690
2023-03-01 11:43:35 [info     ] Processing 1500000/4017690
2023-03-01 11:44:14 [info     ] Processing 2000000/4017690
2023-03-01 11:45:16 [info     ] Processing 2500000/4017690
2023-03-01 11:46:10 [info     ] Processing 3000000/4017690
2023-03-01 11:47:10 [info     ] Processing 3500000/4017690
2023-03-01 11:47:58 [info     ] Processing 4000000/4017690
2023-03-01 11:48:41 [info     ] Processing 4500000/4017690
2023-03-01 11:48:45 [info     ] Reading All Data from MySQL
2023-03-01 11:48:45 [info     ] Start processing lease data
2023-03-01 11:48:45 [info     ] Processing 500000/4017690
2023-03-01 11:50:13 [info     ] Processing 1000000/4017690
2023-03-01 11:51:04 [info     ] Processing 1500000/4017690
2023-03-01 11:52:06 [info     ] Processing 2000000/4017690
2023-03-01 11:53:41 [info     ] Processing 2500000/4017690
2023-03-01 11:55:11 [info     ] Processing 3000000/4017690
2023-03-01 11:56:53 [info     ] Processing 3500000/4017

### Importing Submitter_df

In [130]:
from lease_version_reliability.config.attributes import attributes

col = attributes.copy()

In [132]:
col_reliability = [s + '_reliability' for s in col]
col.insert(0,'submitter_person_id')
col.insert(len(col), 'general_reliability')
col_reliability.insert(0,'submitter_person_id')
col_reliability.insert(len(col), 'general_reliability')

In [133]:
temp = submitter_df[col_reliability]

In [116]:
temp = temp.set_axis(col, axis=1)

In [117]:
import datetime as dt 

temp['date_created'] = pd.Timestamp.now()
temp['date_created'] = temp['date_created'].dt.strftime('%Y-%m-%d %X')
temp.columns = map(lambda x: str(x).upper(), temp.columns)

In [1]:
from sqlalchemy import create_engine
from snowflake.connector.pandas_tools import pd_writer

conn = get_snowflake_connection()
engine = create_engine(f"snowflake://{settings.SNOWFLAKE_ACCOUNT}.{settings.SNOWFLAKE_REGION}.snowflakecomputing.com", creator=lambda: conn)

with engine.connect() as con:
    temp.to_sql('submitter', engine, schema = 'LEASE_VERSION_RELIABILITY', index=False, if_exists='append', chunksize=10000, method=pd_writer)

### Version_df

In [139]:
col = attributes.copy()
col_reliability = [s + '_prob' for s in col]
col.insert(0,'comp_data_id_version')
col_reliability.insert(0,'comp_data_id_version')

In [140]:
temp = version_df[col_reliability]

In [141]:
temp = temp.set_axis(col, axis=1)

In [142]:
temp['date_created'] = pd.Timestamp.now()
temp['date_created'] = temp['date_created'].dt.strftime('%Y-%m-%d %X')
temp.columns = map(lambda x: str(x).upper(), temp.columns)

In [3]:
# conn = get_snowflake_connection()
# engine = create_engine(f"snowflake://{settings.SNOWFLAKE_ACCOUNT}.{settings.SNOWFLAKE_REGION}.snowflakecomputing.com", creator=lambda: conn)

# with engine.connect() as con:
#     temp.to_sql('version', engine, schema = 'LEASE_VERSION_RELIABILITY', index=False, if_exists='append', chunksize=10000, method=pd_writer)

### OOM Error - Read ALL_DATA in Batches

In [4]:
from lease_version_reliability.data.database_io import get_logo_df, get_all_data, get_reliable_data

In [16]:
async def get_version_max_id(db: CompstakServicesMySQL) -> typing.Any:
    """
    Retrun max id of comp_version table
    """

    query = read_file(settings.SQL_QUERY, "version_max_id.sql")

    return await db.fetch_val(query)

In [5]:
async def get_all_versions(
    db: CompstakServicesMySQL,
    min: int,
    max: int,
) -> pd.DataFrame:
    """
    Return version data from MySQL
    """
    query = read_file(settings.SQL_QUERY, "all_data.sql").format(min=min, max=max)
    data = [dict(item) for item in await db.fetch_all(query)]

    return pd.DataFrame(data)

In [6]:
async def temp_get_all_data(db:CompstakServicesMySQL) -> pd.DataFrame:
    id = await get_version_max_id(mysql)
    logger.info("Start processing lease data")
    all_df = pd.DataFrame()
    for i in range(0, id, settings.BATCH_CONFIG.BATCH_SIZE):
        logger.info(f"Processing {i + settings.BATCH_CONFIG.BATCH_SIZE}/{id}")
        data = await get_all_versions(mysql, i, i + settings.BATCH_CONFIG.BATCH_SIZE)
        all_df = pd.concat([all_df, data], ignore_index=True)
    
    all_df = await get_logo_df(all_df)

    return all_df

In [7]:
await mysql.connect()

temp_all_df = await temp_get_all_data(mysql)

await mysql.disconnect()

INFO:databases:Connected to database mysql://admin:********@localhost:3308/compstak


2023-03-01 10:30:33 [info     ] Start processing lease data
2023-03-01 10:30:33 [info     ] Processing 500000/4017690


  data = [dict(item) for item in await db.fetch_all(query)]


2023-03-01 10:31:57 [info     ] Processing 1000000/4017690
2023-03-01 10:32:45 [info     ] Processing 1500000/4017690
2023-03-01 10:33:44 [info     ] Processing 2000000/4017690
2023-03-01 10:35:15 [info     ] Processing 2500000/4017690
2023-03-01 10:36:42 [info     ] Processing 3000000/4017690
2023-03-01 10:38:19 [info     ] Processing 3500000/4017690
2023-03-01 10:40:23 [info     ] Processing 4000000/4017690
2023-03-01 10:42:08 [info     ] Processing 4500000/4017690


INFO:databases:Disconnected from database mysql://admin:********@localhost:3308/compstak


In [12]:
temp_all_df.shape

(2722225, 34)

In [10]:
await mysql.connect()
all_df = await get_all_data(mysql)
await mysql.disconnect()

INFO:databases:Disconnected from database mysql://admin:********@localhost:3308/compstak


### Reliable Data - Batch

In [18]:
async def get_reliable_data(
    db: CompstakServicesMySQL,
    min: int,
    max: int,
) -> pd.DataFrame:
    """
    Return reliable data (more than 3 submitted versions) from MySQL
    """
    query = read_file(settings.SQL_QUERY, "reliable_data.sql").format(min=min, max=max)
    data = [dict(item) for item in await db.fetch_all(query)]
    
    return pd.DataFrame(data)

In [19]:
async def temp_get_reliable_data() -> pd.DataFrame:
    id = await get_version_max_id(mysql)
    logger.info("Start processing lease data")
    df = pd.DataFrame()
    for i in range(0, id, settings.BATCH_CONFIG.BATCH_SIZE):
        logger.info(f"Processing {i + settings.BATCH_CONFIG.BATCH_SIZE}/{id}")
        data = await get_reliable_data(mysql, i, i + settings.BATCH_CONFIG.BATCH_SIZE)
        df = pd.concat([df, data], ignore_index=True)
    
    df = await get_logo_df(df)

    return df

In [10]:
await mysql.connect()
reliable_data = await get_reliable_data(mysql)
await mysql.disconnect()

  data = [dict(item) for item in await db.fetch_all(query=query)]
INFO:databases:Disconnected from database mysql://admin:********@localhost:3308/compstak


In [20]:
await mysql.connect()
temp_reliable_data = await temp_get_reliable_data()
await mysql.disconnect()

2023-03-01 11:27:14 [info     ] Start processing lease data
2023-03-01 11:27:14 [info     ] Processing 500000/4017690
2023-03-01 11:28:26 [info     ] Processing 1000000/4017690
2023-03-01 11:29:03 [info     ] Processing 1500000/4017690
2023-03-01 11:29:45 [info     ] Processing 2000000/4017690
2023-03-01 11:30:49 [info     ] Processing 2500000/4017690
2023-03-01 11:31:46 [info     ] Processing 3000000/4017690
2023-03-01 11:32:48 [info     ] Processing 3500000/4017690
2023-03-01 11:33:39 [info     ] Processing 4000000/4017690
2023-03-01 11:34:25 [info     ] Processing 4500000/4017690


INFO:databases:Disconnected from database mysql://admin:********@localhost:3308/compstak


### Label attributes to vectorization 

In [6]:
await mysql.connect()
reliable_data = await get_reliable_data()
await mysql.disconnect()

INFO:databases:Connected to database mysql://admin:********@localhost:3308/compstak


2023-03-01 12:07:54 [info     ] Get reliable data and logorithm data
2023-03-01 12:07:54 [info     ] Processing 500000/4017690
2023-03-01 12:09:02 [info     ] Processing 1000000/4017690
2023-03-01 12:09:39 [info     ] Processing 1500000/4017690
2023-03-01 12:10:19 [info     ] Processing 2000000/4017690
2023-03-01 12:11:21 [info     ] Processing 2500000/4017690
2023-03-01 12:12:16 [info     ] Processing 3000000/4017690
2023-03-01 12:13:15 [info     ] Processing 3500000/4017690
2023-03-01 12:14:04 [info     ] Processing 4000000/4017690
2023-03-01 12:14:48 [info     ] Processing 4500000/4017690


INFO:databases:Disconnected from database mysql://admin:********@localhost:3308/compstak


In [11]:
temp = reliable_data.copy()

In [18]:
import numpy as np
from datetime import timedelta

def label_date(data, att):

    idx_null = np.where((data[att + '_version'].isnull()) | (data[att + '_master'].isnull()))[0]
    idx_execution_date =  np.where((data[att + '_version'] <= data[att + '_master']+timedelta(days=90)) & (data[att + '_version'] >= data[att + '_master']-timedelta(days=90)))[0]

    data[att + '_label'] = 0 
    data.loc[idx_null, att + '_label'] = -1
    data.loc[idx_execution_date, att + '_label'] = 1

    return data 

Unnamed: 0,id,submitter_person_id,comp_data_id_version,comp_data_id_master,tenant_name_version,space_type_id_version,transaction_size_version,starting_rent_version,execution_date_version,commencement_date_version,...,expiration_date_master,work_value_master,free_months_master,transaction_type_id_master,rent_bumps_percent_bumps_master,rent_bumps_dollar_bumps_master,lease_type_id_master,specialk_id,logo,execution_date_label
0,1,1,262141,1,Kenyon & Kenyon LLP,1.0,195651.0,31.0,2010-03-01,2010-03-01,...,2020-08-28,28.0,6.0,3.0,,,7.0,,,1
1,2,139,262142,1,Kenyon & Kenyon,1.0,195651.0,31.0,2010-03-01,2010-03-01,...,2020-08-28,28.0,6.0,3.0,,,7.0,,,1
2,3,1,262143,2,Mirae Asset Global Investments,1.0,7022.0,85.0,2011-02-01,2011-02-01,...,2016-11-28,65.0,9.0,2.0,,,2.0,,,1
3,4,12,262144,2,Mirae Asset Global Investments,1.0,7022.0,85.0,2011-02-01,2011-02-01,...,2016-11-28,65.0,9.0,2.0,,,2.0,,,1
4,5,60,262145,2,Mirae Asset Global Investments,1.0,7022.0,85.0,2011-02-01,2011-02-01,...,2016-11-28,65.0,9.0,2.0,,,2.0,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1546151,4017604,16127,6173803,2486591,,,,,,,...,,0.0,0.0,2.0,,,5.0,,,-1
1546152,4017616,16127,6173815,2486591,Dent & Scratch Pro,2.0,3080.0,16.2,2019-06-01,,...,,0.0,0.0,2.0,,,5.0,,,0
1546153,4017686,41286,6173889,163117,,,,,,,...,,60.0,8.0,2.0,3.0,,1.0,,,-1
1546154,4017687,41286,6173890,163117,,,,,,,...,,60.0,8.0,2.0,3.0,,1.0,,,-1


In [None]:
def label_execution_date(
    subject: str,
    target: str,
) -> float:
    """
    Replace execution_date attribute column with indicator values
    Given date threshold for masters and versions
    """
    if pd.isnull(subject) or pd.isnull(target):
        return -1
    subject = str(subject)
    target = str(target)
    if parser.parse(subject) <= parser.parse(target) + timedelta(
        days=90,
    ) and parser.parse(subject) >= parser.parse(target) - timedelta(days=90):
        return 1
    return 0


def label_commencement_date(
    subject: str,
    target: str,
) -> float:
    """
    Replace commencement_date attribute column with indicator values
    Given date threshold for masters and versions
    """
    if pd.isnull(subject) or pd.isnull(target):
        return -1
    subject = str(subject)
    target = str(target)
    if parser.parse(subject) <= parser.parse(target) + timedelta(
        days=90,
    ) and parser.parse(subject) >= parser.parse(target) - timedelta(days=90):
        return 1
    
    return 0


def label_expiration_date(
    subject: typing.Any,
    target: typing.Any,
) -> typing.Any:
    """
    Replace expiration_date attribute column with indicator values
    Given date threshold for masters and versions
    """
    if pd.isnull(subject) or pd.isnull(target):
        return -1
    subject = str(subject)
    target = str(target)
    if parser.parse(subject) <= parser.parse(target) + timedelta(
        days=90,
    ) and parser.parse(subject) >= parser.parse(target) - timedelta(days=90):
        return 1
    return 0


def label_lease_term(
    subject: float,
    target: float,
) -> float:
    """
    Replace lease_term attribute column with indicator values
    Given term threshold for masters and versions
    """
    if pd.isnull(subject) or pd.isnull(target):
        return -1
    if subject >= target * 0.92 and subject <= target * 1.08:
        return 1
    return 0