In [1]:
import json
import os
# Move to the parent directory so the relative "data/", "features/" and
# "submissions/" paths used below resolve. The starting directory is recorded
# the first time this cell runs, so re-running the cell lands in the same place
# instead of climbing one level higher on every execution.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_START_DIR, ".."))

import pandas as pd
import numpy as np
from scipy.stats import spearmanr

from autogluon.tabular import TabularDataset, TabularPredictor
from autogluon.core.metrics import make_scorer

def metric_train(truth, output):
    """Return the Spearman rank correlation between ``truth`` and ``output``.

    Both arguments are forwarded unchanged to ``scipy.stats.spearmanr``; only
    the correlation coefficient is returned and the p-value is discarded.
    """
    rank_corr, _p_value = spearmanr(truth, output)
    return rank_corr

# Expose the Spearman helper to AutoGluon as a custom metric: larger values are
# better and the best attainable score is 1.
ag_scorer = make_scorer(
    name="spearmanr",
    score_func=metric_train,
    greater_is_better=True,
    optimum=1,
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Locations of the processed splits, relative to the working directory.
EXOTIC_PATH = "data/processed/exotic"
NON_EXOTIC_PATH = "data/processed/non_exotic"
FULL_PATH = "data/processed/full"
# either correlation-based or lasso feature selection
FEATURE_SELECTION_PATH = "features/feature_selection.json"
# FEATURE_SELECTION_PATH = "features/feature_d2_selection.json"
# Read through a context manager so the file handle is closed as soon as the
# JSON has been parsed (the bare open() call leaked it).
with open(FEATURE_SELECTION_PATH, "r") as feature_file:
    feature_selection = json.load(feature_file)

# Prefix of the CSV files to load; switch together with FEATURE_SELECTION_PATH.
STAGE = "median_imputed"
# STAGE = "median_imputed_d2"


def _read_split(csv_path):
    """Read one processed CSV and index the resulting frame by its ``ID`` column."""
    return pd.read_csv(csv_path).set_index("ID")


de_train = _read_split(f"{NON_EXOTIC_PATH}/{STAGE}_train_de.csv")
fr_train = _read_split(f"{NON_EXOTIC_PATH}/{STAGE}_train_fr.csv")
exotic_train = _read_split(f"{EXOTIC_PATH}/{STAGE}_train.csv")

de_test = _read_split(f"{NON_EXOTIC_PATH}/{STAGE}_test_de.csv")
fr_test = _read_split(f"{NON_EXOTIC_PATH}/{STAGE}_test_fr.csv")
exotic_test = _read_split(f"{EXOTIC_PATH}/{STAGE}_test.csv")

In [3]:
# Name of the target column that the predictors are trained on.
label = "RANK"

# Each segment keeps only its selected features; the training frames
# additionally carry the label column, the test frames do not.
de_features = feature_selection["de"]
de_train_data = TabularDataset(de_train[[*de_features, label]])
de_test_data = TabularDataset(de_test[de_features])

fr_features = feature_selection["fr"]
fr_train_data = TabularDataset(fr_train[[*fr_features, label]])
fr_test_data = TabularDataset(fr_test[fr_features])

exotic_features = feature_selection["exotic"]
exotic_train_data = TabularDataset(exotic_train[[*exotic_features, label]])
exotic_test_data = TabularDataset(exotic_test[exotic_features])

In [4]:
# Sub-directory of AutogluonModels/ under which this run's models are stored.
TRIAL = "feat-selected-best"


def _fit_and_predict(segment, train_data, test_data):
    """Fit one AutoGluon predictor for ``segment`` and predict its test set.

    The predictor targets ``label``, is scored with ``ag_scorer``, is trained
    with the ``best_quality`` preset and is saved under
    ``AutogluonModels/{TRIAL}/{segment}_model``.

    Returns a ``(predictor, predictions)`` tuple.
    """
    predictor = TabularPredictor(
        label=label,
        eval_metric=ag_scorer,
        path=f"AutogluonModels/{TRIAL}/{segment}_model",
        verbosity=0,
    ).fit(train_data=train_data, presets="best_quality")
    return predictor, predictor.predict(test_data)


# One model per segment, fitted and applied in the same order as before:
# DE, then FR, then exotic.
de_predictor, de_predictions = _fit_and_predict("de", de_train_data, de_test_data)
fr_predictor, fr_predictions = _fit_and_predict("fr", fr_train_data, fr_test_data)
exotic_predictor, exotic_predictions = _fit_and_predict(
    "exotic", exotic_train_data, exotic_test_data
)

  with pd.option_context("mode.use_inf_as_na", True):  # treat None, NaN, INF, NINF as NA
  with pd.option_context("mode.use_inf_as_na", True):  # treat None, NaN, INF, NINF as NA
  with pd.option_context("mode.use_inf_as_na", True):  # treat None, NaN, INF, NINF as NA
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  with pd.option_context("mode.use_inf_as_na", True):  # treat None, NaN, INF, NINF as NA
  with pd.option_context("mode.use_inf_as_na", True):  # treat None, NaN, INF, NINF as NA
  with pd.option_context("mode.use_inf_as_na", True):  # treat None, NaN, INF, NINF as NA
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  with pd.option_context("mode.use_inf_as_na", True):  # treat None, NaN, INF, NINF as NA
  with pd.option_context("mode.use_inf_as_na", True):  # treat None, NaN, INF, NINF as NA
  with pd.option

In [5]:
# Show the leaderboard of the DE predictor: one row per trained model with its
# validation score and fit/prediction times.
# NOTE(review): silent=True presumably suppresses the printed copy so only the
# returned frame is rendered — confirm against the AutoGluon docs.
de_predictor.leaderboard(silent=True)

Unnamed: 0,model,score_val,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,0.395555,0.061415,53.392481,0.000264,0.205594,2,True,12
1,NeuralNetTorch_BAG_L1,0.367009,0.030226,35.170752,0.030226,35.170752,1,True,10
2,CatBoost_BAG_L1,0.337074,0.00621,2.126142,0.00621,2.126142,1,True,6
3,NeuralNetFastAI_BAG_L1,0.330201,0.016552,4.947192,0.016552,4.947192,1,True,8
4,LightGBMXT_BAG_L1,0.321746,0.004244,4.968811,0.004244,4.968811,1,True,3
5,RandomForestMSE_BAG_L1,0.311331,0.039029,0.351195,0.039029,0.351195,1,True,5
6,LightGBM_BAG_L1,0.310136,0.003536,4.931467,0.003536,4.931467,1,True,4
7,ExtraTreesMSE_BAG_L1,0.301375,0.037804,0.15084,0.037804,0.15084,1,True,7
8,XGBoost_BAG_L1,0.287812,0.006593,3.168665,0.006593,3.168665,1,True,9
9,LightGBMLarge_BAG_L1,0.285841,0.003698,15.744014,0.003698,15.744014,1,True,11


In [6]:
# Show the leaderboard of the FR predictor: one row per trained model with its
# validation score and fit/prediction times.
# NOTE(review): silent=True presumably suppresses the printed copy so only the
# returned frame is rendered — confirm against the AutoGluon docs.
fr_predictor.leaderboard(silent=True)

Unnamed: 0,model,score_val,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,0.244045,0.110877,37.78536,0.000273,0.205415,2,True,12
1,NeuralNetTorch_BAG_L1,0.232726,0.038726,28.261724,0.038726,28.261724,1,True,10
2,NeuralNetFastAI_BAG_L1,0.192426,0.029957,5.314766,0.029957,5.314766,1,True,8
3,RandomForestMSE_BAG_L1,0.183588,0.038653,0.31192,0.038653,0.31192,1,True,5
4,LightGBMXT_BAG_L1,0.168155,0.003268,3.691535,0.003268,3.691535,1,True,3
5,CatBoost_BAG_L1,0.162746,0.005219,1.507541,0.005219,1.507541,1,True,6
6,ExtraTreesMSE_BAG_L1,0.160026,0.037966,0.149444,0.037966,0.149444,1,True,7
7,LightGBM_BAG_L1,0.152764,0.004868,6.318728,0.004868,6.318728,1,True,4
8,LightGBMLarge_BAG_L1,0.149337,0.004759,17.854152,0.004759,17.854152,1,True,11
9,KNeighborsDist_BAG_L1,0.091458,0.013575,0.001299,0.013575,0.001299,1,True,2


In [7]:
# Show the leaderboard of the exotic predictor: one row per trained model with
# its validation score and fit/prediction times.
# NOTE(review): silent=True presumably suppresses the printed copy so only the
# returned frame is rendered — confirm against the AutoGluon docs.
exotic_predictor.leaderboard(silent=True)

Unnamed: 0,model,score_val,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,0.266305,0.058107,23.77567,0.000224,0.164516,2,True,12
1,CatBoost_BAG_L1,0.174652,0.004349,1.537231,0.004349,1.537231,1,True,6
2,NeuralNetTorch_BAG_L1,0.171139,0.021549,8.553987,0.021549,8.553987,1,True,10
3,XGBoost_BAG_L1,0.166185,0.005153,2.324448,0.005153,2.324448,1,True,9
4,LightGBM_BAG_L1,0.146556,0.002942,2.571498,0.002942,2.571498,1,True,4
5,NeuralNetFastAI_BAG_L1,0.134751,0.021224,4.134024,0.021224,4.134024,1,True,8
6,LightGBMLarge_BAG_L1,0.124934,0.002665,4.489966,0.002665,4.489966,1,True,11
7,RandomForestMSE_BAG_L1,0.117964,0.031745,0.163186,0.031745,0.163186,1,True,5
8,LightGBMXT_BAG_L1,0.106082,0.002595,1.690663,0.002595,1.690663,1,True,3
9,ExtraTreesMSE_BAG_L1,0.06406,0.029696,0.139773,0.029696,0.139773,1,True,7


In [8]:
# Stack the per-segment predictions row-wise, in DE, FR, exotic order.
segment_predictions = [de_predictions, fr_predictions, exotic_predictions]
full_test_pred = pd.concat(segment_predictions, axis=0)

In [9]:
# Load only the ID column of an existing submission file and use it as the
# index to join the predictions onto.
sub_dummy = pd.read_csv(
    "submissions/y_test_random_final.csv",
    usecols=["ID"],
    index_col="ID",
)
# Attach the predictions by ID and rename the prediction column to TARGET.
full_test_pred = sub_dummy.join(full_test_pred).rename(columns={"RANK": "TARGET"})

In [10]:
# Base name (without extension) of the submission file for this run.
submission_name = "sub32_autogluon_best_quality"

# Writing is opt-in so that re-running the notebook does not overwrite an
# existing submission by accident. Set to True to write
# submissions/<submission_name>.csv; code that reads that file back needs it to
# have been written at least once.
SAVE_SUBMISSION = False
if SAVE_SUBMISSION:
    full_test_pred.to_csv(f"submissions/{submission_name}.csv")

In [12]:
# Reload this run's submission from disk to compare it with an earlier one.
test_pred = pd.read_csv(f"submissions/{submission_name}.csv")

ref_sub_num = "sub31"
# os.listdir returns entries in arbitrary order, so sort the matches to make
# the chosen reference file deterministic, and fail with an explicit message
# (rather than a bare IndexError) when no file matches.
ref_candidates = sorted(f for f in os.listdir("submissions") if ref_sub_num in f)
if not ref_candidates:
    raise FileNotFoundError(
        f"no file containing {ref_sub_num!r} found in submissions/"
    )
ref_filename = ref_candidates[0]
ref_sub = pd.read_csv(f"submissions/{ref_filename}")

# Align both submissions on ID before correlating so the result does not depend
# on the two files listing their rows in the same order.
# NOTE(review): assumes the reference file has an ID column like this run's
# submission — confirm.
test_target = test_pred.set_index("ID")["TARGET"]
ref_target = ref_sub.set_index("ID")["TARGET"].reindex(test_target.index)
metric_train(ref_target, test_target)

0.3974621487106597