In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd
import os

In [3]:
classifier_names = [
    'SVC',
    'SGDClassifier',
    'RidgeClassifierCV',
    'RidgeClassifier',
    'Perceptron',
    'PassiveAggressiveClassifier',
    'LogisticRegressionCV',
    'LogisticRegression',
    'LinearSVC',
    'RandomForestClassifier',
    'HistGradientBoostingClassifier',
    'GradientBoostingClassifier',
    'ExtraTreesClassifier',
    'AdaBoostClassifier',
    'XGBClassifier',
    'LGBMClassifier',
    'CatBoostClassifier',
    'RadiusNeighborsClassifier',
    'KNeighborsClassifier',
    'NearestCentroid',
    'QuadraticDiscriminantAnalysis',
    'LinearDiscriminantAnalysis',
    'GaussianNB',
    'BernoulliNB',
    'MLPClassifier',
    'ExtraTreeClassifier',
    'DecisionTreeClassifier',
    'LabelSpreading',
    'LabelPropagation',
    'DummyClassifier'
]

In [4]:
attributes = pd.read_csv("attributes.csv")

In [5]:
classifier_data = {}

In [6]:
for classifier in classifier_names:
  classifier_data[classifier] = []

In [7]:
for index, row in enumerate(attributes["dataset"]):
  labels_df = pd.read_csv(os.path.join('generated_datasets_attributes', row.replace(".csv", ""), 'results.csv'))
  np.nan_to_num(labels_df['score'], nan=np.mean(labels_df['score']), copy=False)
  np.nan_to_num(labels_df['time'], nan=np.mean(labels_df['time']), copy=False)
  for index, i in enumerate(labels_df['classifier']):
    classifier_data[i].append(labels_df['score'][index]-(labels_df['time'][index]/100000))

In [8]:
for key in classifier_data.keys():
  attributes[key] = classifier_data[key]

In [9]:
attributes.drop("dataset", inplace=True, axis=1)

In [10]:
X = attributes.iloc[:, :-30].values
y = attributes.iloc[:, -30:].values
scaler_X = StandardScaler()
scaler_X.fit(X)
X = scaler_X.transform(X)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [12]:
print(y_train)

[[0.88495914 0.87749549 0.88495224 ... 0.83499691 0.82499668 0.88499999]
 [0.9749999  0.97249994 0.9699998  ... 0.96999929 0.96999955 0.9675    ]
 [0.99999732 0.99999732 0.99999156 ... 0.99999903 0.99999939 0.99999998]
 ...
 [0.9999986  0.9999986  0.99999697 ... 0.99999968 0.99999992 1.        ]
 [0.49999999 0.49999999 0.49999997 ... 0.49999999 0.49999999 0.5       ]
 [0.76666664 0.63333332 0.64999741 ... 0.71666661 0.7166666  0.7       ]]


In [41]:
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error

model = CatBoostRegressor(
    iterations=10000,
    depth=6,
    learning_rate=0.01,
    loss_function="MultiRMSEWithMissingValues",
    verbose=1,
    early_stopping_rounds=50
)

model.fit(X_train, y_train, plot=True, use_best_model=False)
# Predict on the test set
y_pred = model.predict(X_test)
        
# Calculate the Mean Squared Error for each output target
mse = mean_absolute_error(y_test, y_pred, multioutput='raw_values')
            
# Print the Mean Squared Error for each output target
for i, mse_value in enumerate(mse):
    print(f'Mean Squared Error for target {i + 1}: {mse_value}')


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.8582623	total: 10.7ms	remaining: 1m 46s
1:	learn: 0.8554618	total: 24.8ms	remaining: 2m 3s
2:	learn: 0.8528739	total: 30.2ms	remaining: 1m 40s
3:	learn: 0.8501012	total: 43.9ms	remaining: 1m 49s
4:	learn: 0.8474315	total: 71.3ms	remaining: 2m 22s
5:	learn: 0.8443694	total: 87.4ms	remaining: 2m 25s
6:	learn: 0.8417868	total: 95ms	remaining: 2m 15s
7:	learn: 0.8395256	total: 104ms	remaining: 2m 9s
8:	learn: 0.8368232	total: 111ms	remaining: 2m 3s
9:	learn: 0.8343783	total: 118ms	remaining: 1m 58s
10:	learn: 0.8315447	total: 126ms	remaining: 1m 54s
11:	learn: 0.8293693	total: 133ms	remaining: 1m 50s
12:	learn: 0.8264802	total: 142ms	remaining: 1m 48s
13:	learn: 0.8237179	total: 151ms	remaining: 1m 47s
14:	learn: 0.8213042	total: 159ms	remaining: 1m 45s
15:	learn: 0.8188620	total: 167ms	remaining: 1m 43s
16:	learn: 0.8160401	total: 174ms	remaining: 1m 41s
17:	learn: 0.8136241	total: 182ms	remaining: 1m 40s
18:	learn: 0.8114344	total: 189ms	remaining: 1m 39s
19:	learn: 0.8092713

In [39]:
# spearman ranking
from scipy.stats import rankdata, spearmanr

spearman_results = []
for index in range(len(y_pred)):
  rank_predictions = rankdata(y_pred[index])
  rank_y_test = rankdata(y_test[index])
  statistic, pvalue = spearmanr(rank_predictions, rank_y_test)
  if statistic is not np.nan:
    spearman_results.append(statistic)
print(np.median(spearman_results))

0.40493937284727577


In [40]:
# guarentee located in the top X
ranks = 0
for index in range(len(y_pred)):
  top_3_models = np.argsort(y_test[index])[-10:]
  rank_predictions = rankdata(y_pred[index])

  for model in top_3_models:
    if rank_predictions[model] <= 10:
      ranks += 1
      break

print(ranks / len(y_pred))

0.7476635514018691


In [21]:
from lightgbm import LGBMRegressor
from sklearn.multioutput import MultiOutputRegressor

model = MultiOutputRegressor(LGBMRegressor(n_estimators=1000, learning_rate=0.01, max_depth=8, num_leaves=64, objective='regression_l1', boosting_type='gbdt', n_jobs=-1))
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

mse = mean_absolute_error(y_test, y_pred, multioutput='raw_values')

for i, mse_value in enumerate(mse):
    print(f'Mean Squared Error for target {i + 1}: {mse_value}')

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000321 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 776
[LightGBM] [Info] Number of data points in the train set: 247, number of used features: 16
[LightGBM] [Info] Start training from score 0.892683
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000271 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 776
[LightGBM] [Info] Number of data points in the train set: 247, number of used features: 16
[LightGBM] [Info] Start training from score 0.877495
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000282 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 776
[LightGBM] [Info] Number of data points in the train set: 247, 

Exception ignored on calling ctypes callback function: <function _log_callback at 0x15f844ae0>
Traceback (most recent call last):
  File "/Users/ericzhang/Desktop/Code/MTF-Paper/MTF/.venv/lib/python3.11/site-packages/lightgbm/basic.py", line 224, in _log_callback
    def _log_callback(msg: bytes) -> None:
    
KeyboardInterrupt: 


No further splits with positive gain, best gain: -inf
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000302 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 776
[LightGBM] [Info] Number of data points in the train set: 247, number of used features: 16
[LightGBM] [Info] Start training from score 0.888889
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000235 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 776
[LightGBM] [Info] Number of data points in the train set: 247, number of used features: 16
[LightGBM] [Info] Start training from score 0.919354
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000314 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 776
[LightGBM

In [32]:
# spearman ranking
from scipy.stats import rankdata, spearmanr

spearman_results = []
for index in range(len(y_pred)):
  rank_predictions = rankdata(y_pred[index])
  rank_y_test = rankdata(y_test[index])
  statistic, pvalue = spearmanr(rank_predictions, rank_y_test)
  if statistic is not np.nan:
    spearman_results.append(statistic)
print(np.median(spearman_results))

0.30945494994438266


In [33]:
# guarentee located in the top X
ranks = 0
for index in range(len(y_pred)):
  top_3_models = np.argsort(y_test[index])[-5:]
  rank_predictions = rankdata(y_pred[index])

  for model in top_3_models:
    if rank_predictions[model] <= 10:
      ranks += 1
      break

print(ranks / len(y_pred))

0.7102803738317757


In [31]:
from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor

model = MultiOutputRegressor(XGBRegressor(n_estimators=100, learning_rate=0.01, max_depth=8, objective='reg:squarederror', n_jobs=-1))
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

mse = mean_absolute_error(y_test, y_pred, multioutput='raw_values')

for i, mse_value in enumerate(mse):
    print(f'Mean Squared Error for target {i + 1}: {mse_value}')

Mean Squared Error for target 1: 0.0675594888377355
Mean Squared Error for target 2: 0.08052909849168653
Mean Squared Error for target 3: 0.06808970819385832
Mean Squared Error for target 4: 0.07002960402600138
Mean Squared Error for target 5: 0.09338603236335508
Mean Squared Error for target 6: 0.08279348254904338
Mean Squared Error for target 7: 0.06843279758107676
Mean Squared Error for target 8: 0.06996021616838644
Mean Squared Error for target 9: 0.06585223581607959
Mean Squared Error for target 10: 0.0665365920925343
Mean Squared Error for target 11: 0.07345891278198495
Mean Squared Error for target 12: 0.07223117239424713
Mean Squared Error for target 13: 0.07085864025843541
Mean Squared Error for target 14: 0.07559602087436691
Mean Squared Error for target 15: 0.07503656379880086
Mean Squared Error for target 16: 0.06967869409437526
Mean Squared Error for target 17: 0.06481926057277732
Mean Squared Error for target 18: 0.06408730553998385
Mean Squared Error for target 19: 0.068