In [1]:
import numpy as np
import pandas as pd
from scipy.stats import spearmanr, boxcox, pearsonr, ks_2samp
import matplotlib.pyplot as plt

from sklearn.linear_model import Ridge, HuberRegressor, Lasso, ElasticNet, ElasticNetCV, LogisticRegression

import json

from base_dataset import Dataset, DataUnit
from MetaModel import PredictionAggregator
from ModelOptimizer import (
    lgbm_optimizer, 
    xgb_optimizer, 
    rf_optimizer, 
    svr_optimizer, 
    ridge_optimizer, 
    huber_optimizer,
    knn_optimizer,
    lasso_optimizer,
    elasticnet_optimizer,
    model_box
)

def metric_train(output, truth):
    """Return the Spearman rank correlation between ``output`` and ``truth``.

    Both arguments are passed straight to ``scipy.stats.spearmanr``; only the
    correlation coefficient is returned, the p-value is discarded.
    """
    rho, _pvalue = spearmanr(output, truth)
    return rho

In [2]:
# Directories holding the processed CSV files read by the loading cell.
EXOTIC_PATH = "data/processed/exotic"
NON_EXOTIC_PATH = "data/processed/non_exotic"
FULL_PATH = "data/processed/full"

# JSON file with the selected feature names; it is indexed below by the keys
# "de", "fr" and "exotic".
FEATURE_SELECTION_PATH = "features/feature_selection_lasso.json"

# Use a context manager so the file handle is closed as soon as the JSON is
# parsed, instead of staying open until the garbage collector reclaims it.
with open(FEATURE_SELECTION_PATH, "r") as feature_selection_file:
    feature_selection = json.load(feature_selection_file)

In [3]:
def _read_indexed(csv_path):
    """Read one CSV file and index its rows by the ``ID`` column."""
    return pd.read_csv(csv_path).set_index("ID")


# Training frames, one per region file.
de_train = _read_indexed(f"{NON_EXOTIC_PATH}/median_imputed_train_de.csv")
fr_train = _read_indexed(f"{NON_EXOTIC_PATH}/median_imputed_train_fr.csv")
exotic_train = _read_indexed(f"{EXOTIC_PATH}/median_imputed_train.csv")

# Matching test frames.
de_test = _read_indexed(f"{NON_EXOTIC_PATH}/median_imputed_test_de.csv")
fr_test = _read_indexed(f"{NON_EXOTIC_PATH}/median_imputed_test_fr.csv")
exotic_test = _read_indexed(f"{EXOTIC_PATH}/median_imputed_test.csv")

# Training frame read from the "full" directory.
full_train = _read_indexed(f"{FULL_PATH}/median_imputed_train.csv")

In [4]:
# Largest value found in the RANK column of the full training frame.
rank_max = full_train["RANK"].max()

In [5]:
def _build_dataset(frame, feature_key, targets, **options):
    """Wrap ``frame`` in a ``Dataset`` using the feature list stored under ``feature_key``."""
    return Dataset(frame, feature_selection[feature_key], targets, **options)


# Training datasets: RANK is the target and valid_ratio=0.2 is forwarded to Dataset.
de_dataset = _build_dataset(de_train, "de", ["RANK"], name="de", valid_ratio=0.2)
fr_dataset = _build_dataset(fr_train, "fr", ["RANK"], name="fr", valid_ratio=0.2)
exotic_dataset = _build_dataset(exotic_train, "exotic", ["RANK"], name="exo", valid_ratio=0.2)

# Test datasets: no target columns (None) and no validation ratio.
de_testset = _build_dataset(de_test, "de", None, name="de_test")
fr_testset = _build_dataset(fr_test, "fr", None, name="fr_test")
exotic_testset = _build_dataset(exotic_test, "exotic", None, name="exo_test")

In [6]:
def _get_opt(model_name, opt_param_dict):
    """Build the optimizer registered under ``model_name``.

    Parameters
    ----------
    model_name : str
        One of 'lgbm', 'xgb', 'rf', 'svr', 'ridge', 'huber', 'knn', 'lasso',
        'elasticnet'.
    opt_param_dict : dict
        Keyword arguments forwarded unchanged to the optimizer constructor.

    Returns
    -------
    The optimizer instance created by the matching ``*_optimizer`` factory.

    Raises
    ------
    KeyError
        If ``model_name`` is not a registered model name.
    """
    # Map names to factories rather than to instances: only the requested
    # optimizer is constructed, instead of building all nine on every call and
    # throwing eight of them away.
    opt_factories = {
        'lgbm': lgbm_optimizer,
        'xgb': xgb_optimizer,
        'rf': rf_optimizer,
        'svr': svr_optimizer,
        'ridge': ridge_optimizer,
        'huber': huber_optimizer,
        'knn': knn_optimizer,
        'lasso': lasso_optimizer,
        'elasticnet': elasticnet_optimizer,
    }
    try:
        factory = opt_factories[model_name]
    except KeyError:
        raise KeyError(
            f"Unknown model name {model_name!r}; expected one of {sorted(opt_factories)}"
        ) from None
    return factory(**opt_param_dict)

def optimize_models(
    dataset,
    region,
    strat='voting_models_remake',
    models_list=['lgbm', 'xgb', 'rf', 'svr', 'ridge', 'huber', 'knn', 'lasso', 'elasticnet'],
    cv=0,
    dump=True
):
    """Run the optimizer of every model in ``models_list`` on ``dataset``.

    Parameters
    ----------
    dataset
        Passed to each optimizer as its ``dataset`` argument.
    region : str
        Used only to build the name of the dumped file.
    strat : str
        Path prefix under which the best models are dumped.
    models_list : list of str
        Model names understood by ``_get_opt``. The default list is only
        iterated, never modified.
    cv
        Passed to each optimizer as its ``cv`` argument.
    dump : bool
        When true, each optimizer dumps its best model to
        ``"{strat}/{model_name}_{region}.json"`` after running.
    """
    shared_kwargs = dict(dataset=dataset, cv=cv)
    for name in models_list:
        # The printed name marks where each model's output starts.
        print(name)
        optimizer = _get_opt(model_name=name, opt_param_dict=shared_kwargs)
        optimizer.run()
        if not dump:
            continue
        optimizer.dump_best_model(f"{strat}/{name}_{region}.json")

In [7]:
# Tune the four linear models on each training dataset with cv=5; the best
# models are dumped under the "lasso_selected_features_linear_models" prefix.
for ds in (de_dataset, fr_dataset, exotic_dataset):
    optimize_models(
        dataset=ds,
        region=ds.name,
        strat="lasso_selected_features_linear_models",
        models_list=["lasso", "ridge", "huber", "elasticnet"],
        cv=5,
    )

[32m[I 2023-09-17 22:03:48,920][0m A new study created in memory with name: no-name-6639e479-4700-40b3-8b93-2c1d28756583[0m


lasso


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Best trial among 200 trials:
  Value: 0.36432709760454585
ridge
Best trial among 200 trials:
  Value: 0.36128022590072567
huber
Best trial among 200 trials:
  Value: 0.3633788852241109
elasticnet
Best trial among 200 trials:
  Value: 0.3632048578051145
lasso


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Best trial among 200 trials:
  Value: 0.11965875232994194
ridge
Best trial among 200 trials:
  Value: 0.10201482845572965
huber
Best trial among 200 trials:
  Value: 0.15858261885872812
elasticnet
Best trial among 200 trials:
  Value: 0.15957210874999814
lasso


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

Best trial among 200 trials:
  Value: 0.14613827349121467
ridge
Best trial among 200 trials:
  Value: 0.1270053475935829
huber
Best trial among 200 trials:
  Value: 0.2036783804430863
elasticnet
Best trial among 200 trials:
  Value: 0.2032467532467533


In [8]:
# Model names referenced only by the disabled `ignore_models` argument noted below.
IGNORE_MODEL_LIST = ['svr', 'knn', 'xgb', 'lgbm', 'rf']

# Load the model box for this strategy.
# (disabled argument: ignore_models=IGNORE_MODEL_LIST)
mbox = model_box('lasso_selected_features_linear_models')

# to_dicts() yields a pair: the candidate models and their scores.
model_candidates, model_scores = mbox.to_dicts()
model_scores

{'lasso': {'exo': 0.14613827349121467,
  'fr': 0.11965875232994194,
  'de': 0.36432709760454585},
 'ridge': {'exo': 0.1270053475935829,
  'fr': 0.10201482845572965,
  'de': 0.36128022590072567},
 'elasticnet': {'exo': 0.2032467532467533,
  'fr': 0.15957210874999814,
  'de': 0.3632048578051145},
 'huber': {'exo': 0.2036783804430863,
  'fr': 0.15858261885872812,
  'de': 0.3633788852241109}}

In [9]:
def visualise_preds(pred_ys, true_y):
    """Scatter the rank of each prediction column against the rank of the truth.

    One panel is drawn per column of ``pred_ys``, three panels per row. The
    grid always has at least three rows (the original 3x3 layout) and grows
    when there are more than nine columns, so extra columns no longer raise
    ``IndexError``. Panels without a column are hidden.

    Parameters
    ----------
    pred_ys : pandas.DataFrame
        One column of predictions per model.
    true_y : pandas.Series
        Ground-truth values; must support ``.rank()``.
    """
    n_cols = 3
    columns = list(pred_ys.columns)
    # Ceiling division without importing math; never fewer than three rows.
    n_rows = max(3, -(-len(columns) // n_cols))

    fig, axs = plt.subplots(n_rows, n_cols, figsize=(12, 4 * n_rows))
    panels = list(axs.flat)

    # The truth ranks are identical for every panel, so compute them once.
    true_rank = true_y.rank()
    for ax, col in zip(panels, columns):
        ax.scatter(pred_ys[col].rank(), true_rank, label=col)
        ax.set_title(col)
        ax.set_xlabel("predicted rank")
        ax.set_ylabel("true rank")

    # Hide the panels that received no data instead of showing empty frames.
    for ax in panels[len(columns):]:
        ax.set_visible(False)

    fig.tight_layout()
    plt.show()

In [10]:
# Aggregator over the candidate models for region 'de'.
de_pa = PredictionAggregator(model_candidates, 'de')
de_pred = de_pa.fit_predict(
    de_dataset.dtrain,
    de_dataset.dvalid.X,
)
# Spearman correlation of each prediction column with the validation target.
de_pred.apply(metric_train, truth=de_dataset.dvalid.y)

lasso         0.397205
ridge         0.416956
elasticnet    0.359766
huber         0.360605
dtype: float64

In [11]:
# Aggregator over the candidate models for region 'fr'.
fr_pa = PredictionAggregator(model_candidates, 'fr')
fr_pred = fr_pa.fit_predict(
    fr_dataset.dtrain,
    fr_dataset.dvalid.X,
)
# Spearman correlation of each prediction column with the validation target.
fr_pred.apply(metric_train, truth=fr_dataset.dvalid.y)

lasso         0.153025
ridge         0.159713
elasticnet    0.102871
huber         0.114194
dtype: float64

In [13]:
# Aggregator over the candidate models for region 'exo'.
exotic_pa = PredictionAggregator(model_candidates, 'exo')
exotic_pred = exotic_pa.fit_predict(
    exotic_dataset.dtrain,
    exotic_dataset.dvalid.X,
)
# Spearman correlation of each prediction column with the validation target.
exotic_pred.apply(metric_train, truth=exotic_dataset.dvalid.y)

lasso         0.290171
ridge         0.272668
elasticnet    0.289523
huber         0.289361
dtype: float64

In [14]:
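# NOTE(review): disabled snippet, kept for reference. It appears to derive the
# per-region feature lists from the fitted lasso coefficients. Before re-enabling, check that:
#   - the "fr" and "exotic" entries also call .to_list() (a pandas Index is not JSON serialisable);
#   - json.dump is given feature_selection_lasso, not the already-loaded feature_selection;
#   - `coef_ > 0.1` (which also drops negative coefficients) is the intended selection rule.
# feature_selection_lasso = {}
# feature_selection_lasso["de"] = de_dataset.dtrain.X.columns[de_pa.fitted_models["lasso"].coef_ > 0.1].to_list()
# feature_selection_lasso["fr"] = fr_dataset.dtrain.X.columns[fr_pa.fitted_models["lasso"].coef_ > 0.1]
# feature_selection_lasso["exotic"] = exotic_dataset.dtrain.X.columns[exotic_pa.fitted_models["lasso"].coef_ > 0.1]
# json.dump(feature_selection, open("features/feature_selection_lasso.json", "w"))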

In [25]:
# Call fit_predict with each region's full training data and its test features,
# in the order de, fr, exotic.
de_test_pred = de_pa.fit_predict(de_dataset.dfull, de_testset.dfull.X)
fr_test_pred = fr_pa.fit_predict(fr_dataset.dfull, fr_testset.dfull.X)
exotic_test_pred = exotic_pa.fit_predict(exotic_dataset.dfull, exotic_testset.dfull.X)

# Stack the three regional prediction tables row-wise.
full_test_pred = pd.concat([de_test_pred, fr_test_pred, exotic_test_pred], axis=0)

In [26]:
# Keep only the lasso and ridge prediction columns.
full_test_pred = full_test_pred.loc[:, ["lasso", "ridge"]]

In [27]:
# Only the ID column of the template file is read; it becomes the index that
# the predictions are left-joined onto.
sub_dummy = pd.read_csv("y_test_random_final.csv", usecols=["ID"], index_col="ID")

# Row-wise mean over the retained prediction columns, named TARGET for the submission.
ensemble_mean = full_test_pred.mean(axis=1)
test_pred = sub_dummy.join(ensemble_mean.rename("TARGET"))
test_pred.to_csv("sub27_lasso_ridge.csv")

test_pred

Unnamed: 0_level_0,TARGET
ID,Unnamed: 1_level_1
1115,705.202952
1202,791.662606
1194,537.828722
1084,807.251257
1135,566.704834
...,...
879,445.020198
673,717.141736
1641,943.063301
712,1192.457289


In [28]:
# Spearman correlation between an earlier submission and the new one.
# NOTE(review): ref_sub keeps its default index while test_pred is indexed by
# ID, so the two TARGET columns appear to be compared positionally - confirm
# that both files list the IDs in the same order.
ref_sub = pd.read_csv("sub24_ElasticNet.csv")
metric_train(output=ref_sub["TARGET"], truth=test_pred["TARGET"])

0.8560363493353361