In [1]:
import json
import os
# Step up to the parent directory so the relative "data/..." paths used by the
# later cells resolve. Guarded on the data directory so that re-running this
# cell does not climb one level further on every execution.
if not os.path.isdir("data/processed"):
    os.chdir("..")

import pandas as pd
import numpy as np
from scipy.stats import spearmanr

from autogluon.tabular import TabularDataset, TabularPredictor
from autogluon.core.metrics import make_scorer

def metric_train(truth, output):
    """Spearman rank correlation between true values and predictions.

    Args:
        truth: Array-like of ground-truth target values.
        output: Array-like of predicted values, aligned with ``truth``.

    Returns:
        The Spearman correlation coefficient. When scipy reports the
        coefficient as undefined (NaN, e.g. because one input is constant),
        ``0.0`` is returned instead.
    """
    rho = spearmanr(truth, output).correlation
    # NaN compares False against everything, so a NaN score cannot be ranked
    # against other scores; treat "no rank information" as zero correlation.
    return 0.0 if np.isnan(rho) else rho

# Expose the Spearman helper to AutoGluon as a custom metric: higher values
# are better and the best attainable value is 1.
ag_scorer = make_scorer(
    name='spearmanr',
    score_func=metric_train,
    optimum=1,
    greater_is_better=True,
)

In [2]:
# Directories (relative to the current working directory) holding the
# pre-processed CSV splits.
EXOTIC_PATH = "data/processed/exotic"
NON_EXOTIC_PATH = "data/processed/non_exotic"
FULL_PATH = "data/processed/full"
# either correlation-based or lasso feature selection
FEATURE_SELECTION_PATH = "features/all_features.json"
# Parsed JSON; later cells index it with the keys "de", "fr" and "exotic".
# The context manager closes the file handle as soon as parsing is done.
with open(FEATURE_SELECTION_PATH, "r") as feature_file:
    feature_selection = json.load(feature_file)

def _read_indexed(csv_path):
    """Read the CSV at ``csv_path`` and use its ``ID`` column as the index."""
    frame = pd.read_csv(csv_path)
    return frame.set_index("ID")


# Training splits, one per segment.
de_train = _read_indexed(f"{NON_EXOTIC_PATH}/median_imputed_train_de.csv")
fr_train = _read_indexed(f"{NON_EXOTIC_PATH}/median_imputed_train_fr.csv")
exotic_train = _read_indexed(f"{EXOTIC_PATH}/median_imputed_train.csv")

# Matching test splits.
de_test = _read_indexed(f"{NON_EXOTIC_PATH}/median_imputed_test_de.csv")
fr_test = _read_indexed(f"{NON_EXOTIC_PATH}/median_imputed_test_fr.csv")
exotic_test = _read_indexed(f"{EXOTIC_PATH}/median_imputed_test.csv")

# Training data read from the "full" directory.
full_train = _read_indexed(f"{FULL_PATH}/median_imputed_train.csv")

In [3]:
# Selected feature columns for each segment; the training frames additionally
# carry the "RANK" label column, the test frames do not.
de_columns = feature_selection["de"]
fr_columns = feature_selection["fr"]
exotic_columns = feature_selection["exotic"]

de_train_data = TabularDataset(de_train[de_columns + ["RANK"]])
de_test_data = TabularDataset(de_test[de_columns])

fr_train_data = TabularDataset(fr_train[fr_columns + ["RANK"]])
fr_test_data = TabularDataset(fr_test[fr_columns])

exotic_train_data = TabularDataset(exotic_train[exotic_columns + ["RANK"]])
exotic_test_data = TabularDataset(exotic_test[exotic_columns])

In [4]:
TRIAL = "all-features"


def _fit_and_predict(model_path, train_data, test_data):
    """Fit a predictor on ``train_data`` and predict ``test_data``.

    The predictor uses ``RANK`` as label, ``ag_scorer`` as evaluation metric,
    the ``medium_quality`` preset, and stores its models under ``model_path``.

    Returns:
        A ``(predictor, predictions)`` tuple.
    """
    predictor = TabularPredictor(
        label='RANK',
        eval_metric=ag_scorer,
        path=model_path,
    )
    predictor = predictor.fit(train_data=train_data, presets='medium_quality')
    return predictor, predictor.predict(test_data)


# One independent model per segment, fitted and applied in sequence.
de_predictor, de_predictions = _fit_and_predict(
    f"AutogluonModels/{TRIAL}/de_model", de_train_data, de_test_data
)

fr_predictor, fr_predictions = _fit_and_predict(
    f"AutogluonModels/{TRIAL}/fr_model", fr_train_data, fr_test_data
)

exotic_predictor, exotic_predictions = _fit_and_predict(
    f"AutogluonModels/{TRIAL}/exotic_model", exotic_train_data, exotic_test_data
)

Presets specified: ['medium_quality']
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/all-features/de_model/"
AutoGluon Version:  0.8.2
Python Version:     3.9.12
Operating System:   Darwin
Platform Machine:   x86_64
Platform Version:   Darwin Kernel Version 22.6.0: Wed Jul  5 22:17:35 PDT 2023; root:xnu-8796.141.3~6/RELEASE_ARM64_T8112
Disk Space Avail:   400.22 GB / 494.38 GB (81.0%)
Train Data Rows:    643
Train Data Columns: 32
Label Column: RANK
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (1493.0, 9.0, 760.95257, 472.60538)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipeline

In [5]:
# Leaderboard of the models fitted for the "de" segment: validation score
# (score_val) plus fit/prediction timings per model.
de_predictor.leaderboard()

                 model  score_val  pred_time_val  fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0  WeightedEnsemble_L2   0.452007       0.008540  2.461281                0.001722           0.212031            2       True          9
1       NeuralNetTorch   0.440496       0.004858  1.410232                0.004858           1.410232            1       True          8
2              XGBoost   0.406334       0.001960  0.839018                0.001960           0.839018            1       True          7
3      NeuralNetFastAI   0.370254       0.008697  1.023602                0.008697           1.023602            1       True          6
4      RandomForestMSE   0.329562       0.020838  0.467790                0.020838           0.467790            1       True          3
5             CatBoost   0.325934       0.004383  0.858500                0.004383           0.858500            1       True          4
6        ExtraTreesMSE   0.315032       0

Unnamed: 0,model,score_val,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,0.452007,0.00854,2.461281,0.001722,0.212031,2,True,9
1,NeuralNetTorch,0.440496,0.004858,1.410232,0.004858,1.410232,1,True,8
2,XGBoost,0.406334,0.00196,0.839018,0.00196,0.839018,1,True,7
3,NeuralNetFastAI,0.370254,0.008697,1.023602,0.008697,1.023602,1,True,6
4,RandomForestMSE,0.329562,0.020838,0.46779,0.020838,0.46779,1,True,3
5,CatBoost,0.325934,0.004383,0.8585,0.004383,0.8585,1,True,4
6,ExtraTreesMSE,0.315032,0.020996,0.24486,0.020996,0.24486,1,True,5
7,KNeighborsUnif,0.131203,0.009669,5.878124,0.009669,5.878124,1,True,1
8,KNeighborsDist,0.128064,0.001444,0.004298,0.001444,0.004298,1,True,2


In [6]:
# Leaderboard of the models fitted for the "fr" segment: validation score
# (score_val) plus fit/prediction timings per model.
fr_predictor.leaderboard()

                 model  score_val  pred_time_val  fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0  WeightedEnsemble_L2   0.301303       0.011159  2.246138                0.000355           0.208593            2       True          9
1       NeuralNetTorch   0.261025       0.005466  0.813615                0.005466           0.813615            1       True          8
2      NeuralNetFastAI   0.249668       0.003586  0.446340                0.003586           0.446340            1       True          6
3        ExtraTreesMSE   0.228187       0.020364  0.241639                0.020364           0.241639            1       True          5
4              XGBoost   0.224526       0.001752  0.777590                0.001752           0.777590            1       True          7
5      RandomForestMSE   0.216590       0.020558  0.431229                0.020558           0.431229            1       True          3
6             CatBoost   0.203600       0

Unnamed: 0,model,score_val,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,0.301303,0.011159,2.246138,0.000355,0.208593,2,True,9
1,NeuralNetTorch,0.261025,0.005466,0.813615,0.005466,0.813615,1,True,8
2,NeuralNetFastAI,0.249668,0.003586,0.44634,0.003586,0.44634,1,True,6
3,ExtraTreesMSE,0.228187,0.020364,0.241639,0.020364,0.241639,1,True,5
4,XGBoost,0.224526,0.001752,0.77759,0.001752,0.77759,1,True,7
5,RandomForestMSE,0.21659,0.020558,0.431229,0.020558,0.431229,1,True,3
6,CatBoost,0.2036,0.001259,0.534238,0.001259,0.534238,1,True,4
7,KNeighborsUnif,0.142876,0.003154,0.008117,0.003154,0.008117,1,True,1
8,KNeighborsDist,0.142255,0.001433,0.00649,0.001433,0.00649,1,True,2


In [7]:
# Leaderboard of the models fitted for the "exotic" segment: validation score
# (score_val) plus fit/prediction timings per model.
exotic_predictor.leaderboard()

                 model  score_val  pred_time_val  fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0  WeightedEnsemble_L2   0.231505       0.006360  1.229885                0.000340           0.200850            2       True          9
1       NeuralNetTorch   0.225833       0.004473  0.471790                0.004473           0.471790            1       True          8
2             CatBoost   0.069851       0.002469  0.366386                0.002469           0.366386            1       True          4
3              XGBoost  -0.000567       0.001547  0.557245                0.001547           0.557245            1       True          7
4      NeuralNetFastAI  -0.011263       0.004384  0.472588                0.004384           0.472588            1       True          6
5       KNeighborsDist  -0.083543       0.001774  0.004753                0.001774           0.004753            1       True          2
6       KNeighborsUnif  -0.094725       0

Unnamed: 0,model,score_val,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,0.231505,0.00636,1.229885,0.00034,0.20085,2,True,9
1,NeuralNetTorch,0.225833,0.004473,0.47179,0.004473,0.47179,1,True,8
2,CatBoost,0.069851,0.002469,0.366386,0.002469,0.366386,1,True,4
3,XGBoost,-0.000567,0.001547,0.557245,0.001547,0.557245,1,True,7
4,NeuralNetFastAI,-0.011263,0.004384,0.472588,0.004384,0.472588,1,True,6
5,KNeighborsDist,-0.083543,0.001774,0.004753,0.001774,0.004753,1,True,2
6,KNeighborsUnif,-0.094725,0.001798,0.005645,0.001798,0.005645,1,True,1
7,ExtraTreesMSE,-0.167977,0.016689,0.198145,0.016689,0.198145,1,True,5
8,RandomForestMSE,-0.231181,0.016239,0.229223,0.016239,0.229223,1,True,3


In [8]:
# Stack the three per-segment prediction results row-wise into one object.
segment_predictions = [de_predictions, fr_predictions, exotic_predictions]
full_test_pred = pd.concat(segment_predictions, axis=0)

In [9]:
# Template submission: only its ID index is read, which fixes the set and
# order of rows in the final file.
sub_dummy = pd.read_csv(
    "submissions/y_test_random_final.csv",
    index_col="ID",
    usecols=["ID"],
)
# Align the predictions to the template IDs and name the column "TARGET".
aligned_pred = sub_dummy.join(full_test_pred)
full_test_pred = aligned_pred.rename(columns={"RANK": "TARGET"})

In [31]:
# NOTE(review): the file name says "best_quality", but the training cell in
# this notebook fits with presets='medium_quality' — confirm which run this
# submission actually belongs to.
submission_path = "submissions/sub32_autogluon_best_quality.csv"
full_test_pred.to_csv(submission_path)