In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd
import os

In [3]:
classifier_names = [
    'SVC',
    'SGDClassifier',
    'RidgeClassifierCV',
    'RidgeClassifier',
    'Perceptron',
    'PassiveAggressiveClassifier',
    'LogisticRegressionCV',
    'LogisticRegression',
    'LinearSVC',
    'RandomForestClassifier',
    'HistGradientBoostingClassifier',
    'GradientBoostingClassifier',
    'ExtraTreesClassifier',
    'AdaBoostClassifier',
    'XGBClassifier',
    'LGBMClassifier',
    'CatBoostClassifier',
    'RadiusNeighborsClassifier',
    'KNeighborsClassifier',
    'NearestCentroid',
    'QuadraticDiscriminantAnalysis',
    'LinearDiscriminantAnalysis',
    'GaussianNB',
    'BernoulliNB',
    'MLPClassifier',
    'ExtraTreeClassifier',
    'DecisionTreeClassifier',
    'LabelSpreading',
    'LabelPropagation',
    'DummyClassifier'
]

In [4]:
attributes = pd.read_csv("attributes.csv")

In [44]:
# loop through rows
count = 0
for i, row in attributes.iterrows():
    # get the row values
    if row['output_distribution'] == 1:
        # delete the row
        attributes.drop(i, inplace=True)
        count += 1

print(count)

75


In [45]:
classifier_data = {}

In [46]:
for classifier in classifier_names:
  classifier_data[classifier] = []

In [7]:
for index, row in enumerate(attributes["dataset"]):
  labels_df = pd.read_csv(os.path.join('generated_datasets_attributes', row.replace(".csv", ""), 'results.csv'))
  np.nan_to_num(labels_df['score'], nan=np.mean(labels_df['score']), copy=False)
  np.nan_to_num(labels_df['time'], nan=np.mean(labels_df['time']), copy=False)
  for index, i in enumerate(labels_df['classifier']):
    classifier_data[i].append(labels_df['score'][index]-(labels_df['time'][index]/100000))

In [8]:
for key in classifier_data.keys():
  attributes[key] = classifier_data[key]

In [9]:
attributes.drop("dataset", inplace=True, axis=1)

In [10]:
X = attributes.iloc[:, :-30].values
y = attributes.iloc[:, -30:].values
scaler_X = StandardScaler()
scaler_X.fit(X)
X = scaler_X.transform(X)

In [47]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [48]:
print(y_train)

[[0.99999896 0.99999896 0.99999999 ... 0.99999947 0.9999996  0.99999999]
 [0.97999993 0.97999986 0.97999727 ... 0.96999909 0.96999943 0.97999999]
 [0.54249912 0.52499997 0.53249934 ... 0.54749928 0.5374996  0.5325    ]
 ...
 [0.90740739 0.96296295 0.96296204 ... 0.94444443 0.94444443 0.87037037]
 [0.97499998 0.98749999 0.96249629 ... 0.96249976 0.96249994 0.575     ]
 [0.63934425 0.60655737 0.63934425 ... 0.65573768 0.67213108 0.75409836]]


In [68]:
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error

model = CatBoostRegressor(
    iterations=1000,
    depth=6,
    learning_rate=0.01,
    loss_function="MultiRMSEWithMissingValues",
    verbose=1,
    early_stopping_rounds=50
)

model.fit(X_train, y_train, plot=True, use_best_model=True)
# Predict on the test set
y_pred = model.predict(X_test)
        
# Calculate the Mean Squared Error for each output target
mse = mean_absolute_error(y_test, y_pred, multioutput='raw_values')
            
# Print the Mean Squared Error for each output target
for i, mse_value in enumerate(mse):
    print(f'Mean Squared Error for target {i + 1}: {mse_value}')


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

You should provide test set for use best model. use_best_model parameter has been switched to false value.


0:	learn: 0.7717058	total: 16.6ms	remaining: 16.6s
1:	learn: 0.7688477	total: 27.8ms	remaining: 13.9s
2:	learn: 0.7661202	total: 34.6ms	remaining: 11.5s
3:	learn: 0.7637750	total: 40.9ms	remaining: 10.2s
4:	learn: 0.7614852	total: 47.2ms	remaining: 9.4s
5:	learn: 0.7584791	total: 53.6ms	remaining: 8.88s
6:	learn: 0.7557662	total: 60.3ms	remaining: 8.56s
7:	learn: 0.7529754	total: 66.9ms	remaining: 8.3s
8:	learn: 0.7507459	total: 75ms	remaining: 8.26s
9:	learn: 0.7485483	total: 82.5ms	remaining: 8.17s
10:	learn: 0.7461751	total: 88.7ms	remaining: 7.98s
11:	learn: 0.7438636	total: 96.7ms	remaining: 7.96s
12:	learn: 0.7418996	total: 103ms	remaining: 7.8s
13:	learn: 0.7399669	total: 109ms	remaining: 7.7s
14:	learn: 0.7375721	total: 111ms	remaining: 7.32s
15:	learn: 0.7351596	total: 119ms	remaining: 7.29s
16:	learn: 0.7327828	total: 126ms	remaining: 7.27s
17:	learn: 0.7306159	total: 132ms	remaining: 7.18s
18:	learn: 0.7284417	total: 138ms	remaining: 7.11s
19:	learn: 0.7261822	total: 145ms	r

In [69]:
# spearman ranking
from scipy.stats import rankdata, spearmanr

spearman_results = []
for index in range(len(y_pred)):
  rank_predictions = rankdata(y_pred[index])
  rank_y_test = rankdata(y_test[index])
  statistic, pvalue = spearmanr(rank_predictions, rank_y_test)
  if statistic is not np.nan:
    spearman_results.append(statistic)
print(np.median(spearman_results))

0.33125695216907675


In [70]:
# guarentee located in the top X
ranks = 0
for index in range(len(y_pred)):
  top_3_models = np.argsort(y_test[index])[-10:]
  rank_predictions = rankdata(y_pred[index])

  for model in top_3_models:
    if rank_predictions[model] <= 10:
      ranks += 1
      break

print(ranks / len(y_pred))

0.8130841121495327


In [53]:
from lightgbm import LGBMRegressor
from sklearn.multioutput import MultiOutputRegressor

model = MultiOutputRegressor(LGBMRegressor(n_estimators=1000, learning_rate=0.01, max_depth=8, num_leaves=64, objective='regression_l1', boosting_type='gbdt', n_jobs=-1))
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

mse = mean_absolute_error(y_test, y_pred, multioutput='raw_values')

for i, mse_value in enumerate(mse):
    print(f'Mean Squared Error for target {i + 1}: {mse_value}')

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010206 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 785
[LightGBM] [Info] Number of data points in the train set: 247, number of used features: 16
[LightGBM] [Info] Start training from score 0.907407
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001721 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 785
[LightGBM] [Info] Number of data points in the train set: 247, number of used features: 16
[LightGBM] [Info] Start training from score 0.891304
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000234 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you

In [57]:
# spearman ranking
from scipy.stats import rankdata, spearmanr

spearman_results = []
for index in range(len(y_pred)):
  rank_predictions = rankdata(y_pred[index])
  rank_y_test = rankdata(y_test[index])
  statistic, pvalue = spearmanr(rank_predictions, rank_y_test)
  if statistic is not np.nan:
    spearman_results.append(statistic)
print(np.median(spearman_results))

0.29343715239154616


In [58]:
# guarentee located in the top X
ranks = 0
for index in range(len(y_pred)):
  top_3_models = np.argsort(y_test[index])[-5:]
  rank_predictions = rankdata(y_pred[index])

  for model in top_3_models:
    if rank_predictions[model] <= 10:
      ranks += 1
      break

print(ranks / len(y_pred))

0.7383177570093458


In [56]:
from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor

model = MultiOutputRegressor(XGBRegressor(n_estimators=100, learning_rate=0.01, max_depth=8, objective='reg:squarederror', n_jobs=-1))
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

mse = mean_absolute_error(y_test, y_pred, multioutput='raw_values')

for i, mse_value in enumerate(mse):
    print(f'Mean Squared Error for target {i + 1}: {mse_value}')

Mean Squared Error for target 1: 0.0805680860923838
Mean Squared Error for target 2: 0.08681549629259329
Mean Squared Error for target 3: 0.07936780210915464
Mean Squared Error for target 4: 0.07921065347354822
Mean Squared Error for target 5: 0.09527610845801594
Mean Squared Error for target 6: 0.0967835039599304
Mean Squared Error for target 7: 0.0763410035093667
Mean Squared Error for target 8: 0.08083374148692682
Mean Squared Error for target 9: 0.07910643963314261
Mean Squared Error for target 10: 0.08488479539696736
Mean Squared Error for target 11: 0.08777801504447066
Mean Squared Error for target 12: 0.08408746193482974
Mean Squared Error for target 13: 0.08375599330543074
Mean Squared Error for target 14: 0.08475634012360528
Mean Squared Error for target 15: 0.08562574953546383
Mean Squared Error for target 16: 0.07993211492807849
Mean Squared Error for target 17: 0.0847474218127481
Mean Squared Error for target 18: 0.07938026495065305
Mean Squared Error for target 19: 0.08218