In [1]:
import numpy as np
import pandas as pd
from scipy.stats import spearmanr, boxcox, pearsonr, ks_2samp
import matplotlib.pyplot as plt

from sklearn.linear_model import Ridge, HuberRegressor, Lasso, ElasticNet, ElasticNetCV, LogisticRegression

import json
import os
# Step up to the parent directory, which the relative `data/...` and `features/...`
# paths below (and presumably the `core` package import) are resolved against.
# Guarded on the presence of `core` so that re-running this cell does not climb one
# directory higher on every execution.
if not os.path.isdir("core"):
    os.chdir("..")

from core.Dataset import Dataset, DataUnit
from core.MetaModel import PredictionAggregator
from core.ModelOptimizer import (
    lgbm_optimizer, 
    xgb_optimizer, 
    rf_optimizer, 
    svr_optimizer, 
    ridge_optimizer, 
    huber_optimizer,
    knn_optimizer,
    lasso_optimizer,
    elasticnet_optimizer,
    model_box
)

def metric_train(output, truth):
    """Return the Spearman rank correlation between predictions and targets.

    Args:
        output: Predicted values.
        truth: Ground-truth values, aligned element-wise with ``output``.

    Returns:
        The ``correlation`` field of the ``scipy.stats.spearmanr`` result.
    """
    rank_corr = spearmanr(output, truth)
    return rank_corr.correlation

In [2]:
# Directories holding the processed CSV files, relative to the working directory.
EXOTIC_PATH = "data/processed/exotic"
NON_EXOTIC_PATH = "data/processed/non_exotic"
FULL_PATH = "data/processed/full"

# JSON file with the selected features; looked up later by the keys "de", "fr" and "exotic".
FEATURE_SELECTION_PATH = "features/feature_selection_lasso.json"
# Use a context manager so the file handle is closed as soon as parsing finishes
# (a bare `json.load(open(...))` leaves the handle open until garbage collection).
with open(FEATURE_SELECTION_PATH, "r") as feature_file:
    feature_selection = json.load(feature_file)

In [3]:
def _read_indexed(csv_path):
    """Read ``csv_path`` with pandas and index the resulting frame by its ``ID`` column."""
    return pd.read_csv(csv_path).set_index("ID")


# Training frames: one per non-exotic region plus the exotic one.
de_train = _read_indexed(f"{NON_EXOTIC_PATH}/median_imputed_train_de.csv")
fr_train = _read_indexed(f"{NON_EXOTIC_PATH}/median_imputed_train_fr.csv")
exotic_train = _read_indexed(f"{EXOTIC_PATH}/median_imputed_train.csv")

# Matching test frames.
de_test = _read_indexed(f"{NON_EXOTIC_PATH}/median_imputed_test_de.csv")
fr_test = _read_indexed(f"{NON_EXOTIC_PATH}/median_imputed_test_fr.csv")
exotic_test = _read_indexed(f"{EXOTIC_PATH}/median_imputed_test.csv")

# Training frame read from the FULL_PATH directory.
full_train = _read_indexed(f"{FULL_PATH}/median_imputed_train.csv")

In [4]:
# Largest value found in the RANK column of the full training frame.
# NOTE(review): not referenced by any of the later cells shown here — confirm it is still needed.
rank_max = full_train.RANK.max()

In [5]:
# Training datasets: each region's frame with its selected features, "RANK" as the
# target list, and valid_ratio=0.2 (presumably a 20% validation hold-out — confirm
# in core.Dataset). Built in the order de, fr, exotic.
de_dataset, fr_dataset, exotic_dataset = (
    Dataset(train_frame, feature_selection[feature_key], ["RANK"], name=label, valid_ratio=0.2)
    for train_frame, feature_key, label in (
        (de_train, "de", "de"),
        (fr_train, "fr", "fr"),
        (exotic_train, "exotic", "exo"),
    )
)

# Test datasets: same feature lists, no target (None) and no valid_ratio.
de_testset, fr_testset, exotic_testset = (
    Dataset(test_frame, feature_selection[feature_key], None, name=label)
    for test_frame, feature_key, label in (
        (de_test, "de", "de_test"),
        (fr_test, "fr", "fr_test"),
        (exotic_test, "exotic", "exo_test"),
    )
)

In [6]:
def _get_opt(model_name, opt_param_dict):
    """Build the optimizer registered under ``model_name``.

    The lookup table stores the optimizer callables themselves rather than
    pre-built instances, so only the requested optimizer is instantiated
    (previously all nine were constructed on every call and eight discarded).

    Args:
        model_name: One of 'lgbm', 'xgb', 'rf', 'svr', 'ridge', 'huber', 'knn',
            'lasso' or 'elasticnet'.
        opt_param_dict: Keyword arguments forwarded unchanged to the optimizer.

    Returns:
        The object returned by the selected ``*_optimizer`` callable.

    Raises:
        KeyError: If ``model_name`` is not one of the supported names.
    """
    opt_factories = {
        'lgbm': lgbm_optimizer,
        'xgb': xgb_optimizer,
        'rf': rf_optimizer,
        'svr': svr_optimizer,
        'ridge': ridge_optimizer,
        'huber': huber_optimizer,
        'knn': knn_optimizer,
        'lasso': lasso_optimizer,
        'elasticnet': elasticnet_optimizer,
    }
    try:
        factory = opt_factories[model_name]
    except KeyError:
        # Same exception type as the plain dict lookup, with an actionable message.
        raise KeyError(
            f"Unknown model name {model_name!r}; expected one of {sorted(opt_factories)}"
        ) from None
    return factory(**opt_param_dict)

def optimize_models(
    dataset,
    region,
    strat='voting_models_remake',
    models_list=['lgbm', 'xgb', 'rf', 'svr', 'ridge', 'huber', 'knn', 'lasso', 'elasticnet'],
    cv=0,
    dump=True
):
    """Run the optimizer of every requested model on ``dataset``.

    Args:
        dataset: Dataset forwarded to each optimizer.
        region: Label used in the name passed to ``dump_best_model``.
        strat: Path prefix used in the name passed to ``dump_best_model``.
        models_list: Model names understood by ``_get_opt``; processed in order.
            The default list is only iterated, never mutated.
        cv: Forwarded to each optimizer as ``cv``.
        dump: When true, call ``dump_best_model`` after each run.
    """
    shared_kwargs = {'dataset': dataset, 'cv': cv}
    for model_name in models_list:
        # Progress trace: one line per model.
        print(model_name)
        optimizer = _get_opt(model_name=model_name, opt_param_dict=shared_kwargs)
        optimizer.run()
        if not dump:
            continue
        optimizer.dump_best_model(f"{strat}/{model_name}_{region}.json")

In [9]:
# Path prefix handed to `model_box` in the next cell and used for the prediction CSV
# written at the end of the notebook.
STRATEGY = "models/voting_models_all_feat"
# STRATEGY = "lasso_selected_features_linear_models"

# Disabled optimizer run: uncomment to call `optimize_models` for each region's dataset.
# for ds in [
#     de_dataset, 
#     fr_dataset, 
#     exotic_dataset,
#     ]:
#     optimize_models(
#         ds, 
#         region=ds.name, 
#         cv=5, 
#         models_list=["lasso", "ridge", "huber", "elasticnet"],
#         strat=STRATEGY)

In [10]:
# Build a model_box from STRATEGY; `to_dict()` returns two objects, unpacked here as the
# model candidates (passed to PredictionAggregator below) and the per-model scores.
mbox = model_box(STRATEGY)
model_candidates, model_scores = mbox.to_dict()
# Bare last expression so the notebook displays the scores.
model_scores

{'ridge': {'exo': 0.1270053475935829,
  'fr': 0.10201482845572965,
  'de': 0.36128022590072567},
 'elasticnet': {'exo': 0.2032467532467533,
  'fr': 0.15957210874999814,
  'de': 0.3632048578051145},
 'lasso': {'exo': 0.14613827349121467,
  'fr': 0.11965875232994194,
  'de': 0.36432709760454585},
 'lgbm': {'exo': 0.16572956455309398,
  'fr': 0.15567968185648373,
  'de': 0.37777705526120864},
 'rf': {'exo': 0.18019289533995417,
  'fr': 0.1779263058388038,
  'de': 0.3744084327552349},
 'svr': {'exo': 0.13487585943468297,
  'fr': 0.11185190967864349,
  'de': 0.3245715102195982},
 'huber': {'exo': 0.2036783804430863,
  'fr': 0.15858261885872812,
  'de': 0.3633788852241109},
 'xgb': {'exo': 0.15051948051948055,
  'fr': 0.17299595558005731,
  'de': 0.3703558557141113},
 'knn': {'exo': 0.13647058823529412,
  'fr': 0.16303104383092218,
  'de': 0.3199868381162239}}

In [11]:
def visualise_preds(pred_ys, true_y):
    """Scatter the rank of each column of predictions against the rank of the targets.

    One panel is drawn per column of ``pred_ys``, three panels per row. The number
    of rows is derived from the number of columns, so more than nine columns no
    longer overflow a fixed 3x3 grid, and panels without a column are hidden.
    With exactly nine columns the layout and figure size match the previous
    fixed grid.

    Args:
        pred_ys: Frame with one column of predictions per model.
        true_y: Ground-truth values; must support ``.rank()``.
    """
    n_cols = 3
    n_models = len(pred_ys.columns)
    # Ceiling division; keep at least one row so an empty frame still yields a figure.
    n_rows = max(1, -(-n_models // n_cols))
    # squeeze=False keeps `axs` two-dimensional even when there is a single row.
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(12, 4 * n_rows), squeeze=False)
    panels = axs.ravel()
    # The target ranks do not depend on the model, so compute them once.
    true_rank = true_y.rank()
    for ax, col in zip(panels, pred_ys.columns):
        ax.scatter(pred_ys[col].rank(), true_rank, label=col)
        ax.set_title(col)
        ax.set_xlabel("predicted rank")
        ax.set_ylabel("true rank")
    # Hide the trailing panels that have no model assigned.
    for ax in panels[n_models:]:
        ax.set_visible(False)
    fig.tight_layout()
    plt.show()

In [12]:
# Aggregator for the 'de' region: fit on the training split, predict the validation features.
de_pa = PredictionAggregator(model_candidates, 'de')
# de_pred = de_pa.fit_predict(de_dataset.dtrain, de_dataset.dvalid.X)
# NOTE(review): n_bootstrap / bootstrap_fraction are passed straight to fit_predict; no
# random seed is set in this notebook, so the scores may differ between runs — confirm.
de_pred = de_pa.fit_predict(
    de_dataset.dtrain,
    de_dataset.dvalid.X,
    n_bootstrap=100,
    bootstrap_fraction=0.7,
)
# Score every prediction column against the validation target; `apply` forwards `truth`.
de_pred.apply(metric_train, truth=de_dataset.dvalid.y)

ridge         0.417302
elasticnet    0.360856
lasso         0.391458
lgbm          0.349385
rf            0.305579
svr           0.303019
huber         0.353561
xgb           0.345701
knn           0.333838
dtype: float64

In [13]:
# Aggregator for the 'fr' region: fit on the training split, predict the validation features.
fr_pa = PredictionAggregator(model_candidates, 'fr')
# fr_pred = fr_pa.fit_predict(fr_dataset.dtrain, fr_dataset.dvalid.X)
# NOTE(review): n_bootstrap / bootstrap_fraction are passed straight to fit_predict; no
# random seed is set in this notebook, so the scores may differ between runs — confirm.
fr_pred = fr_pa.fit_predict(
    fr_dataset.dtrain,
    fr_dataset.dvalid.X,
    n_bootstrap=100,
    bootstrap_fraction=0.7,
)
# Score every prediction column against the validation target; `apply` forwards `truth`.
fr_pred.apply(metric_train, truth=fr_dataset.dvalid.y)

ridge         0.174514
elasticnet    0.112822
lasso         0.179760
lgbm          0.225411
rf            0.137196
svr           0.197823
huber         0.114669
xgb           0.142213
knn           0.244511
dtype: float64

In [14]:
# Aggregator for the 'exo' region: fit on the training split, predict the validation features.
exotic_pa = PredictionAggregator(model_candidates, 'exo')
# exotic_pred = exotic_pa.fit_predict(exotic_dataset.dtrain, exotic_dataset.dvalid.X)
# NOTE(review): n_bootstrap / bootstrap_fraction are passed straight to fit_predict; no
# random seed is set in this notebook, so the scores may differ between runs — confirm.
exotic_pred = exotic_pa.fit_predict(
    exotic_dataset.dtrain,
    exotic_dataset.dvalid.X,
    n_bootstrap=100,
    bootstrap_fraction=0.7,
)
# Score every prediction column against the validation target; `apply` forwards `truth`.
exotic_pred.apply(metric_train, truth=exotic_dataset.dvalid.y)

ridge         0.244146
elasticnet    0.291630
lasso         0.254680
lgbm          0.160198
rf            0.185155
svr           0.314966
huber         0.300705
xgb           0.273155
knn           0.156632
dtype: float64

In [41]:
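# Disabled: one-off regeneration of the lasso-based feature selection file.
# NOTE(review): before re-enabling, (1) dump `feature_selection_lasso` — the last line
# dumps `feature_selection`, i.e. it rewrites the file with what was loaded from it —
# and (2) add `.to_list()` to the "fr" and "exotic" entries, since a pandas Index is
# not JSON serializable. Also check whether `coef_ > 0.1` is meant to drop negative
# coefficients.
# feature_selection_lasso = {}
# feature_selection_lasso["de"] = de_dataset.dtrain.X.columns[de_pa.fitted_models["lasso"].coef_ > 0.1].to_list()
# feature_selection_lasso["fr"] = fr_dataset.dtrain.X.columns[fr_pa.fitted_models["lasso"].coef_ > 0.1]
# feature_selection_lasso["exotic"] = exotic_dataset.dtrain.X.columns[exotic_pa.fitted_models["lasso"].coef_ > 0.1]
# json.dump(feature_selection, open("features/feature_selection_lasso.json", "w"))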

In [16]:
# full_test_pred = pd.concat(
#     [
#         de_pa.fit_predict(de_dataset.dfull, de_testset.dfull.X),
#         fr_pa.fit_predict(fr_dataset.dfull, fr_testset.dfull.X),
#         exotic_pa.fit_predict(exotic_dataset.dfull, exotic_testset.dfull.X)
#     ], axis=0)

# Refit each region's aggregator on its full training data, predict that region's test
# features, and stack the three results row-wise in the order de, fr, exotic.
full_test_pred = pd.concat(
    [
        aggregator.fit_predict(
            train_set.dfull,
            test_set.dfull.X,
            n_bootstrap=100,
            bootstrap_fraction=0.7,
        )
        for aggregator, train_set, test_set in (
            (de_pa, de_dataset, de_testset),
            (fr_pa, fr_dataset, fr_testset),
            (exotic_pa, exotic_dataset, exotic_testset),
        )
    ],
    axis=0,
)

# Write the stacked predictions under the STRATEGY path.
full_test_pred.to_csv(f"{STRATEGY}/test_pred_bootstrap.csv")