In [2]:
"""
       This notebook was used to extract the meta-features of the test datasets into test_features.csv.
       It also contains sanity checks for MyDataset() and for the other functions used for data preparation
       and for extracting the top N configurations from the data.
"""
import pandas as pd
import statistics
from IPython.display import display
import numpy as np
from project_utils import  get_task_metafeatures, load_test_data, meta_feature_names, \
    default_config, test_ids, get_best_config_per_task,dataset_to_task, get_test_metafeatures, get_all_metafeatures,get_dataset_to_task

In [None]:
"""
       Copied from utils
"""
# Names of the hyperparameter columns (one entry per line for easy diffing).
hyperparameters = [
    'num_round',
    'eta',
    'gamma',
    'lambda',
    'alpha',
    'subsample',
    'max_depth',
    'min_child_weight',
    'colsample_bytree',
    'colsample_bylevel',
]

# Names of the dataset meta-feature columns.
meta_features = [
    'MajorityClassSize',
    'MaxNominalAttDistinctValues',
    'MinorityClassSize',
    'NumberOfClasses',
    'NumberOfFeatures',
    'NumberOfInstances',
    'NumberOfInstancesWithMissingValues',
    'NumberOfMissingValues',
    'NumberOfNumericFeatures',
    'NumberOfSymbolicFeatures',
]

In [None]:
# Fetch the single best-performing hyperparameter configuration (HPC) for each
# dataset, based on historical data, and show its identifying columns.
best_configs = get_best_config_per_task()
summary_columns = ['data_id', 'name', 'avg_auc']
display(best_configs[summary_columns])

In [None]:
# Show the meta-feature columns first, then the hyperparameter columns,
# of the best configurations.
display(best_configs[meta_features], best_configs[hyperparameters])

In [None]:
"""
Only two values are missing, and both are in the meta-feature
MaxNominalAttDistinctValues.
"""
# Count the missing entries in every column of the best-configuration table;
# the bare expression on the last line is what the cell displays.
missing_per_column = best_configs.isna().sum()
missing_per_column

In [None]:
"""
The best configs all have a runtime under 30 minutes (1800 seconds).

"""
# Average runtime of each best configuration, listed in ascending order.
runtimes = best_configs['avg_time']
runtimes.sort_values()

In [None]:
# Load the historical run table and reduce it to one row per data_id
# (GroupBy.first), turning data_id back into a regular column afterwards.
runs = pd.read_csv('./data/xgboost_meta_data.csv')
runs_by_dataset = runs.groupby('data_id')
grouped = runs_by_dataset.first().reset_index()
display(grouped)

In [None]:
# get_test_metafeatures comes from project_utils; whether it reads from or
# writes to this CSV path is not visible in this notebook.
test_features_path = './data/test_features.csv'
test_features = get_test_metafeatures(address=test_features_path)
test_features

In [3]:
"""Imputation using the median, moved later to one of the utils functions"""
# Load train and test meta-features with imputation already applied by the
# project helper.
train_data, test_data = get_all_metafeatures(impute=True)

# ---------------------------------------------------------------------------
# Historical notes - none of the code below is executed.
# NOTE(review): `nan_column` is not defined anywhere in this notebook;
# presumably it was 'MaxNominalAttDistinctValues' (the largest number of
# categories among all categorical variables) - confirm before re-enabling.
# ---------------------------------------------------------------------------

#Train data 0-16000 but most values under 100
#train_data[nan_column].plot(kind='kde')
#Test data 0-75
#test_data[nan_column].plot(kind='hist', edgecolor='black')

# #Finding median across both train and test datasets
# train_not_nan = train_data[~train_data[nan_column].isna()][nan_column].tolist()
# test_not_nan = test_data[~test_data[nan_column].isna()][nan_column].tolist()
# nan_column_median = int(statistics.median(train_not_nan + test_not_nan))
#
# #Replacing NaNs with median
# train_data[nan_column] = train_data[nan_column].fillna(nan_column_median)
# test_data[nan_column] = test_data[nan_column].fillna(nan_column_median)

# Old code for imputing using the mean value - switched to median due to the
# long-tailed distribution.
# total_length = (train_data[[nan_column]].shape[0] -train_data[[nan_column]].isna().sum()) + (test_data[[nan_column]].shape[0] - test_data[[nan_column]].isna().sum())
# total_length = total_length.tolist()[0]
# maxnominal_avg = (train_data[[nan_column]].sum(skipna=True) + test_data[[nan_column]].sum(skipna=True))/total_length
# maxnominal_avg = maxnominal_avg.tolist()[0]
# print(maxnominal_avg)
# train_data[nan_column] = train_data[nan_column].fillna(maxnominal_avg)
# test_data[nan_column] = test_data[nan_column].fillna(maxnominal_avg)
# display(train_data, test_data)

# Sanity check: after imputation every column must have zero missing values.
train_missing = train_data.isna().sum()
test_missing = test_data.isna().sum()

# Print the counts first so they stay visible even if an assertion below
# fails; the blank-line separator keeps the two Series from running together.
print(train_missing, test_missing, sep='\n\n')

# Enforce the invariant instead of relying on someone reading the printout.
assert train_missing.sum() == 0, (
    f"train_data still has missing values:\n{train_missing[train_missing > 0]}"
)
assert test_missing.sum() == 0, (
    f"test_data still has missing values:\n{test_missing[test_missing > 0]}"
)

display(test_data, train_data)


data_id                               0
name                                  0
status                                0
MajorityClassSize                     0
MaxNominalAttDistinctValues           0
MinorityClassSize                     0
NumberOfClasses                       0
NumberOfFeatures                      0
NumberOfInstances                     0
NumberOfInstancesWithMissingValues    0
NumberOfMissingValues                 0
NumberOfNumericFeatures               0
NumberOfSymbolicFeatures              0
task_id                               0
dtype: int64 name                                  0
status                                0
MajorityClassSize                     0
MaxNominalAttDistinctValues           0
MinorityClassSize                     0
NumberOfClasses                       0
NumberOfFeatures                      0
NumberOfInstances                     0
NumberOfInstancesWithMissingValues    0
NumberOfMissingValues                 0
NumberOfNumericFeatures    

Unnamed: 0,name,status,MajorityClassSize,MaxNominalAttDistinctValues,MinorityClassSize,NumberOfClasses,NumberOfFeatures,NumberOfInstances,NumberOfInstancesWithMissingValues,NumberOfMissingValues,NumberOfNumericFeatures,NumberOfSymbolicFeatures,data_id,task_id
0,mfeat-karhunen,active,200.0,10.0,200.0,10.0,65.0,2000.0,0.0,0.0,64.0,1.0,16.0,16.0
1,mfeat-zernike,active,200.0,10.0,200.0,10.0,48.0,2000.0,0.0,0.0,47.0,1.0,22.0,22.0
2,credit-g,active,700.0,10.0,300.0,2.0,21.0,1000.0,0.0,0.0,7.0,14.0,31.0,31.0
3,satimage,active,1531.0,6.0,625.0,6.0,37.0,6430.0,0.0,0.0,36.0,1.0,182.0,2074.0
4,eucalyptus,active,214.0,27.0,105.0,5.0,20.0,736.0,95.0,448.0,14.0,6.0,188.0,2079.0
5,monks-problems-2,active,395.0,4.0,206.0,2.0,7.0,601.0,0.0,0.0,0.0,7.0,334.0,3493.0
6,mc1,active,9398.0,2.0,68.0,2.0,39.0,9466.0,0.0,0.0,38.0,1.0,1056.0,3907.0
7,kc2,active,415.0,2.0,107.0,2.0,22.0,522.0,0.0,0.0,21.0,1.0,1063.0,3913.0
8,micro-mass,active,60.0,20.0,11.0,20.0,1301.0,571.0,0.0,0.0,1300.0,1.0,1515.0,9950.0
9,phoneme,active,3818.0,2.0,1586.0,2.0,6.0,5404.0,0.0,0.0,5.0,1.0,1489.0,9952.0


Unnamed: 0,data_id,name,status,MajorityClassSize,MaxNominalAttDistinctValues,MinorityClassSize,NumberOfClasses,NumberOfFeatures,NumberOfInstances,NumberOfInstancesWithMissingValues,NumberOfMissingValues,NumberOfNumericFeatures,NumberOfSymbolicFeatures,task_id
0,3,kr-vs-kp,active,1669,3.0,1527,2,37,3196,0,0,0,37,3
1,6,letter,active,813,26.0,734,26,17,20000,0,0,16,1,6
2,11,balance-scale,active,288,3.0,49,3,5,625,0,0,4,1,11
3,12,mfeat-factors,active,200,10.0,200,10,217,2000,0,0,216,1,12
4,14,mfeat-fourier,active,200,10.0,200,10,77,2000,0,0,76,1,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89,41163,dilbert,active,2049,5.0,1913,5,2001,10000,0,0,2000,1,168770
90,41164,fabert,active,1927,7.0,502,7,801,8237,0,0,800,1,168760
91,41166,volkert,active,12806,10.0,1361,10,181,58310,0,0,180,1,168331
92,41168,jannis,active,38522,4.0,1687,4,55,83733,0,0,54,1,168330


In [1]:
"""
       Testing MyDataset implementation
"""
from python.FFN_regression import MyDataset
import torch
# The check below is disabled. When enabled it builds a MyDataset, reads its
# x_train / y_train attributes, and prints max - min along dim 0 for each
# (torch.max / torch.min with a dim argument return (values, indices), hence
# the [0]).
# NOTE(review): the output saved under this cell does not look like what
# these lines would print - presumably it is left over from an earlier
# version of the cell. Confirm, or clear the output.
# my_data = MyDataset()
#
# x_train = my_data.x_train
# y_train = my_data.y_train
# x_range = torch.sub(torch.max(x_train,0)[0],torch.min(x_train,0)[0])
# y_range = torch.sub(torch.max(y_train,0)[0],torch.min(y_train,0)[0])
# print(x_range,y_range)

x_train: torch.Size([94, 10])  y_train: torch.Size([94, 10])


tensor([[1.5000e+02, 8.0000e+00, 1.0500e+02, 8.0000e+00, 8.2000e+01, 1.0800e+03,
         5.2800e+02, 1.3960e+03, 7.7000e+01, 5.0000e+00],
        [4.2080e+03, 1.2000e+01, 3.9160e+03, 2.0000e+00, 2.3000e+01, 8.1240e+03,
         2.4800e+03, 2.4800e+03, 0.0000e+00, 2.3000e+01],
        [3.1700e+02, 4.0000e+00, 5.5000e+01, 4.0000e+00, 7.1000e+01, 8.4100e+02,
         0.0000e+00, 0.0000e+00, 7.0000e+01, 1.0000e+00],
        [4.5800e+02, 2.0000e+00, 2.4100e+02, 2.0000e+00, 1.0000e+01, 6.9900e+02,
         1.6000e+01, 1.6000e+01, 9.0000e+00, 1.0000e+00],
        [2.2050e+03, 4.0000e+00, 3.2800e+02, 4.0000e+00, 2.5000e+01, 5.4560e+03,
         0.0000e+00, 0.0000e+00, 2.4000e+01, 1.0000e+00],
        [1.0000e+02, 6.0000e+00, 1.0000e+02, 6.0000e+00, 6.1000e+01, 6.0000e+02,
         0.0000e+00, 0.0000e+00, 6.0000e+01, 1.0000e+00],
        [1.6690e+03, 3.0000e+00, 1.5270e+03, 2.0000e+00, 3.7000e+01, 3.1960e+03,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 3.7000e+01],
        [6.2600e+02, 3.0000