In [None]:
import os
import gc
from tqdm.auto import tqdm

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier, LGBMRegressor
import scipy.stats as stats

from zillow.config.feature_engineering import FeaturesDtypeConversionConfig_v1
from zillow.utils.common import read_data, find_shared_cols, find_unshared_cols, get_feat_nature_types, throw_col_not_exist_warning, modify_dataclass
from zillow.config.config import load_config_no_wrap, create_config_from_dict, merge_configs
from zillow.config.paths import PROCESSED_DATA_DIR, RAW_DATA_DIR, INTERIM_DATA_DIR, REPORTS_DIR, ANALYSIS_RESULTS_DIR

# Start from the project's default configuration, then layer notebook-specific
# overrides on top of it.
cfg = load_config_no_wrap('default')
cur_cfg = create_config_from_dict({
    # The guard further down reads `cfg.to_load_all_data`; the override was
    # previously keyed as 'load_all_data', so it never reached that flag.
    'to_load_all_data': False,
    'main_train_path': INTERIM_DATA_DIR / 'cleaned_train_2016_v1.0.parquet',
    'main_test_path': INTERIM_DATA_DIR / 'cleaned_properties_2016_v1.0.parquet',
})
# NOTE(review): presumably values from `cur_cfg` take precedence over the
# defaults on conflict -- confirm against merge_configs.
cfg = merge_configs(cfg, cur_cfg)

# Seed NumPy's global RNG and set plotting defaults once for the whole notebook.
np.random.seed(cfg.RSEED)
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("deep")

zillow_dictionary = pd.read_csv(RAW_DATA_DIR / "zillow_data_dictionary.csv")

# Optional: load every cleaned properties/train frame for both years.
# Skipped unless the config flag is switched on.
if cfg.to_load_all_data:
    properties_2016 = read_data(path=INTERIM_DATA_DIR / "cleaned_properties_2016_v1.0.parquet", dtype='default')
    properties_2017 = read_data(path=INTERIM_DATA_DIR / "cleaned_properties_2017_v1.0.parquet", dtype='default')
    train_2016 = read_data(path=INTERIM_DATA_DIR / "cleaned_train_2016_v1.0.parquet", dtype='default')
    train_2017 = read_data(path=INTERIM_DATA_DIR / "cleaned_train_2017_v1.0.parquet", dtype='default')

# Main training frame used by the rest of the notebook.
train = read_data(cfg.main_train_path, dtype='default')

features_dtype_cfg = FeaturesDtypeConversionConfig_v1()


In [12]:
def split_time_series_2016(df_file_name=MAIN_TRAIN_FILENAME):
    """Split the 2016 training data into train / validation frames by month.

    Rows whose ``trans_month`` is 10, 11 or 12 form the validation frame; every
    other row goes to the training frame. The columns listed under the
    ``'date'`` entry of ``features_dtype_cfg.dtype_break_down_mapping`` are
    dropped from both frames and both indexes are reset. The two frames are
    written to parquet under ``DATA_DIR`` so that later calls can return the
    stored files instead of recomputing the split.

    Parameters
    ----------
    df_file_name : str
        File name, appended to ``DATA_DIR``, of the frame to split. It is not
        consulted when both cached split files already exist, so the cached
        split is returned regardless of this argument.

    Returns
    -------
    tuple
        ``(train, val)``.

    Notes
    -----
    NOTE(review): ``DATA_DIR`` and ``MAIN_TRAIN_FILENAME`` are not defined in
    the visible setup cell; confirm they exist on a fresh kernel (the setup
    cell exposes ``cfg.main_train_path`` and ``INTERIM_DATA_DIR`` instead).
    """
    train_path = f"{DATA_DIR}train_train_2016.parquet"
    val_path = f"{DATA_DIR}val_train_2016.parquet"
    if os.path.exists(train_path) and os.path.exists(val_path):
        return read_data(train_path, dtype='default'), read_data(val_path, dtype='default')

    df = read_data(path=f"{DATA_DIR}{df_file_name}", dtype='default')

    # Partition with one row-wise boolean mask. Taking the complement through
    # index labels (`~df.index.isin(val.index)`) silently drops training rows
    # whenever the index holds duplicate labels.
    is_val = df['trans_month'].isin([10, 11, 12])

    drop_cols = list(features_dtype_cfg.dtype_break_down_mapping['date'].keys())
    val = df[is_val].drop(columns=drop_cols).reset_index(drop=True)
    train = df[~is_val].drop(columns=drop_cols).reset_index(drop=True)

    # Persist both halves so the next call takes the cached branch above.
    val.to_parquet(val_path)
    train.to_parquet(train_path)

    return train, val

#### Baseline

In [None]:
# Load the test-time frame and drop 'assessmentyear' together with the index column.
# NOTE(review): DATA_DIR, MAIN_TEST_FILENAME and Validation are not defined or
# imported in the visible cells -- confirm they are available on a fresh kernel.
test = read_data(path=f"{DATA_DIR}{MAIN_TEST_FILENAME}", dtype='default').drop(
    columns=['assessmentyear', cfg.index_col]
)

# Rebinds `train` (previously loaded in the setup cell) to the training half of the split.
train, val = split_time_series_2016(df_file_name=MAIN_TRAIN_FILENAME)

# Baseline estimator: LightGBM regressor, seeded from the config, all other settings default.
model = LGBMRegressor(random_state=cfg.RSEED)

validation = Validation(train, val, model, cfg.target, metric='neg_mean_absolute_error', cv=10)

In [None]:
def run_validation(validation):
    """Run the three validation schemes and print how far they disagree.

    Calls ``run_split``, ``run_cross_validation`` and ``run_tm_split`` on
    ``validation`` (in that order). The absolute value of the latter two
    results is taken and averaged, and the printed message reports the
    absolute gap between each average and the ``'mae'`` entry of the
    ``run_split`` result.

    Parameters
    ----------
    validation
        Object exposing ``run_split()`` (returning a mapping with an ``'mae'``
        key), ``run_cross_validation()`` and ``run_tm_split()`` (each returning
        something that supports ``abs()`` and ``.mean()``).

    Returns
    -------
    None
        The summary is printed, not returned.
    """
    holdout_scores = validation.run_split()
    cv_scores = abs(validation.run_cross_validation())
    tm_scores = abs(validation.run_tm_split())

    holdout_mae = holdout_scores['mae']
    cv_gap, tm_gap = (
        abs(holdout_mae - scores.mean()) for scores in (cv_scores, tm_scores)
    )
    print(f"Result: {holdout_scores}, with cv difference: {cv_gap}, and tm cv {tm_gap}.")

def run_submission(train, test):
    """Fit the notebook-level ``model`` on ``train`` and save predictions for ``test``.

    The feature matrix is ``train`` without the ``cfg.index_col`` and
    ``cfg.target`` columns; the target is ``train[cfg.target]``. Predictions
    for ``test`` are paired with ``sample_submission['ParcelId']`` in a
    ``Submission`` object and saved as ``lgbmr_baseline_2016.csv`` via
    ``save_submission``.

    Parameters
    ----------
    train
        Training frame containing the ``cfg.index_col`` and ``cfg.target`` columns.
    test
        Frame passed unchanged to ``model.predict``.

    Notes
    -----
    Relies on the notebook globals ``model``, ``cfg``, ``sample_submission``,
    ``Submission`` and ``SUBMISSIONS_PATH``.
    NOTE(review): the last three are not defined in the visible cells -- confirm.
    """
    non_feature_cols = [cfg.index_col, cfg.target]
    features = train.drop(columns=non_feature_cols)
    model.fit(features, train[cfg.target])

    predictions = model.predict(test)

    Submission(predictions, sample_submission['ParcelId']).save_submission(
        "lgbmr_baseline_2016.csv", SUBMISSIONS_PATH
    )