In [1]:
import json
import os
# Move the working directory up one level; the relative paths used in the
# cells below ("data/...", "features/...", "submissions/...") are resolved
# against it (presumably the notebook lives one level below the project root —
# TODO confirm).
# NOTE(review): the target is relative, so re-running this cell moves up one
# more directory each time. Run it once per kernel session.
os.chdir("..")

import pandas as pd
import numpy as np
from scipy.stats import spearmanr

from autogluon.tabular import TabularDataset, TabularPredictor
from autogluon.core.metrics import make_scorer

def metric_train(truth, output):
    """Return the Spearman rank correlation between ``truth`` and ``output``.

    Both arguments are forwarded unchanged to ``scipy.stats.spearmanr``; only
    the correlation coefficient is returned, the p-value is discarded.
    """
    rho, _pvalue = spearmanr(truth, output)
    return rho

# Wrap the Spearman helper as an AutoGluon scorer: larger values are better
# and the declared optimum is 1.
ag_scorer = make_scorer(
    score_func=metric_train,
    name='spearmanr',
    greater_is_better=True,
    optimum=1,
)

In [2]:
# Locations of the processed data splits, relative to the working directory.
EXOTIC_PATH = "data/processed/exotic"
NON_EXOTIC_PATH = "data/processed/non_exotic"
FULL_PATH = "data/processed/full"
# either correlation-based or lasso feature selection
FEATURE_SELECTION_PATH = "features/feature_d2_selection.json"
# Open the file in a context manager so the handle is closed as soon as the
# JSON is parsed instead of being left open for the rest of the session.
# The parsed object is indexed below by split name ("de", "fr", "exotic").
with open(FEATURE_SELECTION_PATH, "r") as feature_selection_file:
    feature_selection = json.load(feature_selection_file)

# Prefix shared by the file names of the CSVs loaded below.
STAGE = "median_imputed_d2"

# Training frames for the DE, FR and exotic splits, each indexed by "ID".
de_train, fr_train, exotic_train = (
    pd.read_csv(csv_path).set_index("ID")
    for csv_path in (
        f"{NON_EXOTIC_PATH}/{STAGE}_train_de.csv",
        f"{NON_EXOTIC_PATH}/{STAGE}_train_fr.csv",
        f"{EXOTIC_PATH}/{STAGE}_train.csv",
    )
)

# Matching test frames, read in the same order and indexed the same way.
de_test, fr_test, exotic_test = (
    pd.read_csv(csv_path).set_index("ID")
    for csv_path in (
        f"{NON_EXOTIC_PATH}/{STAGE}_test_de.csv",
        f"{NON_EXOTIC_PATH}/{STAGE}_test_fr.csv",
        f"{EXOTIC_PATH}/{STAGE}_test.csv",
    )
)

In [3]:
label = "RANK"

# One (train, test) pair per split. Training data keeps the label column next
# to the selected features; test data carries the selected features only.
de_train_data, de_test_data = (
    TabularDataset(de_train[feature_selection["de"] + [label]]),
    TabularDataset(de_test[feature_selection["de"]]),
)

fr_train_data, fr_test_data = (
    TabularDataset(fr_train[feature_selection["fr"] + [label]]),
    TabularDataset(fr_test[feature_selection["fr"]]),
)

exotic_train_data, exotic_test_data = (
    TabularDataset(exotic_train[feature_selection["exotic"] + [label]]),
    TabularDataset(exotic_test[feature_selection["exotic"]]),
)

In [4]:
TRIAL = "all-features-d2"


def _fit_predictor(model_dir, train_data):
    """Build a TabularPredictor stored under ``model_dir`` and fit it.

    Uses the notebook-level ``label`` and ``ag_scorer`` and returns whatever
    ``fit`` returns, which is then used as the predictor for the split.
    """
    predictor = TabularPredictor(
        label=label,
        eval_metric=ag_scorer,
        path=model_dir,
        verbosity=0,
    )
    return predictor.fit(train_data=train_data, presets='medium_quality')


# Each split is fitted and then scored on its own test data before moving on
# to the next one (DE, then FR, then exotic).
de_predictor = _fit_predictor(f"AutogluonModels/{TRIAL}/de_model", de_train_data)
de_predictions = de_predictor.predict(de_test_data)

fr_predictor = _fit_predictor(f"AutogluonModels/{TRIAL}/fr_model", fr_train_data)
fr_predictions = fr_predictor.predict(fr_test_data)

exotic_predictor = _fit_predictor(f"AutogluonModels/{TRIAL}/exotic_model", exotic_train_data)
exotic_predictions = exotic_predictor.predict(exotic_test_data)

		`import lightgbm` failed. If you are using Mac OSX, Please try 'brew install libomp'. Detailed info: dlopen(/Users/chantom/opt/anaconda3/lib/python3.9/site-packages/lightgbm/lib_lightgbm.so, 0x0006): Library not loaded: /usr/local/opt/libomp/lib/libomp.dylib
  Referenced from: <D21A7969-4567-3BC7-94ED-6A9E83AE9D78> /Users/chantom/opt/anaconda3/lib/python3.9/site-packages/lightgbm/lib_lightgbm.so
  Reason: tried: '/usr/local/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/usr/local/opt/libomp/lib/libomp.dylib' (no such file), '/usr/local/opt/libomp/lib/libomp.dylib' (no such file), '/usr/local/lib/libomp.dylib' (no such file), '/usr/lib/libomp.dylib' (no such file, not in dyld cache)
		`import lightgbm` failed. If you are using Mac OSX, Please try 'brew install libomp'. Detailed info: dlopen(/Users/chantom/opt/anaconda3/lib/python3.9/site-packages/lightgbm/lib_lightgbm.so, 0x0006): Library not loaded: /usr/local/opt/libomp/lib/libomp.dylib
  Referenc

In [5]:
# Leaderboard of the models fitted for the DE split; as the last expression
# of the cell, the returned table is rendered as the cell output.
de_predictor.leaderboard(silent=True)

Unnamed: 0,model,score_val,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,0.409727,0.023774,2.284358,0.000352,0.211992,2,True,9
1,NeuralNetTorch,0.408313,0.011525,1.061232,0.011525,1.061232,1,True,8
2,CatBoost,0.348362,0.004625,1.145027,0.004625,1.145027,1,True,4
3,RandomForestMSE,0.285549,0.021777,0.733864,0.021777,0.733864,1,True,3
4,ExtraTreesMSE,0.282553,0.020852,0.302494,0.020852,0.302494,1,True,5
5,NeuralNetFastAI,0.197216,0.009794,0.967099,0.009794,0.967099,1,True,6
6,KNeighborsDist,0.154959,0.001185,0.003813,0.001185,0.003813,1,True,2
7,KNeighborsUnif,0.150119,0.00797,6.087446,0.00797,6.087446,1,True,1
8,XGBoost,0.133848,0.002103,0.044035,0.002103,0.044035,1,True,7


In [6]:
# Leaderboard of the models fitted for the FR split; as the last expression
# of the cell, the returned table is rendered as the cell output.
fr_predictor.leaderboard(silent=True)

Unnamed: 0,model,score_val,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,0.328673,0.020201,3.024876,0.000343,0.208669,2,True,9
1,NeuralNetFastAI,0.285046,0.003898,0.401892,0.003898,0.401892,1,True,6
2,CatBoost,0.279459,0.003038,1.071093,0.003038,1.071093,1,True,4
3,NeuralNetTorch,0.256513,0.01118,1.312765,0.01118,1.312765,1,True,8
4,RandomForestMSE,0.244147,0.021014,0.782351,0.021014,0.782351,1,True,3
5,ExtraTreesMSE,0.231102,0.020504,0.321878,0.020504,0.321878,1,True,5
6,KNeighborsDist,0.115477,0.001409,0.004314,0.001409,0.004314,1,True,2
7,KNeighborsUnif,0.105851,0.001573,0.004588,0.001573,0.004588,1,True,1
8,XGBoost,0.077553,0.001742,0.030457,0.001742,0.030457,1,True,7


In [7]:
# Leaderboard of the models fitted for the exotic split; as the last
# expression of the cell, the returned table is rendered as the cell output.
exotic_predictor.leaderboard(silent=True)

Unnamed: 0,model,score_val,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,0.314804,0.018008,1.547647,0.000345,0.199898,2,True,9
1,NeuralNetFastAI,0.274127,0.004462,0.50559,0.004462,0.50559,1,True,6
2,NeuralNetTorch,0.250628,0.010928,0.814771,0.010928,0.814771,1,True,8
3,RandomForestMSE,0.061826,0.016841,0.277115,0.016841,0.277115,1,True,3
4,XGBoost,0.060033,0.002273,0.027388,0.002273,0.027388,1,True,7
5,CatBoost,0.038328,0.001474,0.546519,0.001474,0.546519,1,True,4
6,ExtraTreesMSE,0.013856,0.016667,0.208975,0.016667,0.208975,1,True,5
7,KNeighborsDist,-0.005753,0.00075,0.003242,0.00075,0.003242,1,True,2
8,KNeighborsUnif,-0.031035,0.001481,0.003329,0.001481,0.003329,1,True,1


In [8]:
# Stack the per-split predictions row-wise: DE first, then FR, then exotic.
split_predictions = [de_predictions, fr_predictions, exotic_predictions]
full_test_pred = pd.concat(split_predictions, axis=0)

In [9]:
# Read only the "ID" column of the template submission and use it as the
# index, giving a frame whose sole content is the ID index.
sub_dummy = pd.read_csv(
    "submissions/y_test_random_final.csv",
    usecols=["ID"],
    index_col="ID",
)
# Attach the predictions to the template's IDs and rename the prediction
# column from "RANK" to "TARGET".
joined_pred = sub_dummy.join(full_test_pred)
full_test_pred = joined_pred.rename(columns={"RANK": "TARGET"})

In [11]:
submission_name = "sub33_autogluon_feat_d2"
# The comparison cell further down reads this CSV back from disk, so the file
# has to exist for a fresh "Restart & Run All" to succeed. Write it only when
# it is missing so that an already-saved submission is never overwritten.
submission_path = f"submissions/{submission_name}.csv"
if not os.path.exists(submission_path):
    full_test_pred.to_csv(submission_path)

In [14]:
# Read the saved submission back from disk.
test_pred = pd.read_csv(f"submissions/{submission_name}.csv")

ref_sub_num = "sub31"
# os.listdir returns entries in arbitrary order, so sort the matches to make
# the chosen reference file deterministic, and fail with a clear message when
# no file name contains the tag (instead of a bare IndexError).
ref_candidates = sorted(f for f in os.listdir("submissions") if ref_sub_num in f)
if not ref_candidates:
    raise FileNotFoundError(
        f"no file containing '{ref_sub_num}' found in the submissions directory"
    )
ref_filename = ref_candidates[0]
ref_sub = pd.read_csv(f"submissions/{ref_filename}")

# Align the two TARGET columns on "ID" rather than on row position, so the
# correlation stays meaningful even if the two files list IDs in a different
# order. With identical ID order this yields the same value as before.
new_target = test_pred.set_index("ID")["TARGET"]
ref_target = ref_sub.set_index("ID")["TARGET"].reindex(new_target.index)
metric_train(ref_target, new_target)

0.16636755569145065