In [14]:
import pandas as pd 
import numpy as np

In [15]:
# Root folder; the loaders below read `<BASE_MODEL_FOLDER>/<model>/<model>_*.csv`.
BASE_MODEL_FOLDER = '../results_ensemble'

# Models blended by the ensemble. The order matters: it fixes the column order
# of the stacked prediction matrices and therefore which weight belongs to
# which model.
# Currently disabled: 'AE_SWA', 'NCF_dist_exp_2_embeddings_SWA'
MODEL_LIST = [
    'AE_SWA_ensemble_mean',
    'ALS',
    'NCF_dist_exp_SWA',
    'SVDpp_ensemble_gaussian',
]

In [16]:
def create_val_matrix(split):
    """Stack every model's validation predictions for one fold.

    For each model in ``MODEL_LIST`` the file
    ``<BASE_MODEL_FOLDER>/<model>/<model>_split_<split>_val_results.csv`` is
    read and its ``'Prediction'`` column extracted.

    Args:
        split: Fold identifier interpolated into the file name.

    Returns:
        Array built with ``np.column_stack``: one column per model, in
        ``MODEL_LIST`` order.
    """
    prediction_columns = [
        pd.read_csv(f'{BASE_MODEL_FOLDER}/{m}/{m}_split_{split}_val_results.csv')['Prediction'].to_numpy()
        for m in MODEL_LIST
    ]
    return np.column_stack(prediction_columns)



In [17]:
def create_final_matrix():
    """Stack every model's final (submission) predictions.

    For each model in ``MODEL_LIST`` the file
    ``<BASE_MODEL_FOLDER>/<model>/<model>_final_results.csv`` is read and its
    ``'Prediction'`` column extracted.

    Returns:
        Array built with ``np.column_stack``: one column per model, in
        ``MODEL_LIST`` order.
    """
    prediction_columns = [
        pd.read_csv(f'{BASE_MODEL_FOLDER}/{m}/{m}_final_results.csv')['Prediction'].to_numpy()
        for m in MODEL_LIST
    ]
    return np.column_stack(prediction_columns)



In [18]:
def rmse(x, y):
    """Root-mean-square error between ``x`` and ``y``.

    Both inputs are converted to float64 NumPy arrays before subtracting, so
    plain Python sequences are accepted and unsigned-integer arrays cannot
    wrap around on subtraction. Values are compared positionally.

    Args:
        x: Array-like of predictions.
        y: Array-like of reference values (broadcastable against ``x``).

    Returns:
        ``sqrt(mean((x - y) ** 2))`` as a NumPy float.
    """
    diff = np.asarray(x, dtype=np.float64) - np.asarray(y, dtype=np.float64)
    return np.sqrt(np.mean(diff ** 2))

In [19]:
# One stacked validation-prediction matrix per fold (folds 0 through 4).
val_splits = [create_val_matrix(fold) for fold in range(5)]

In [20]:
# 'Prediction' column of each fold's validation partition file (folds 0
# through 4); the scoring functions below compare blended predictions to it.
val_truth = [
    pd.read_csv(f'../data_val_train_kfold/partition_{i}_val.csv')['Prediction'].to_numpy()
    for i in range(5)
]

In [21]:
def combine_models(yhat, coeff):
    """Blend model predictions with normalised weights.

    Args:
        yhat: Prediction matrix with one column per model.
        coeff: Sequence of weights, one per column of ``yhat``. The weights
            are rescaled to sum to 1 before use.

    Returns:
        The matrix product of ``yhat`` with the normalised weight vector.

    Note:
        If the weights sum to zero the normalisation divides by zero and the
        result contains non-finite values.
    """
    weights = np.array(coeff)
    normalised = weights / weights.sum()
    return yhat @ normalised

In [22]:
def save_predictions(res_path, predictions, sample_path='../data/sampleSubmission.csv'):
    """Write ``predictions`` into a copy of the sample submission file.

    The template CSV is read, its ``'Prediction'`` column is cast to float and
    overwritten with ``predictions``, and the result is written to
    ``res_path`` without the index, with floats formatted as ``%.3f``.

    Args:
        res_path: Destination CSV path.
        predictions: Values for the ``'Prediction'`` column, one per template
            row.
        sample_path: Template CSV to copy the layout from. Defaults to the
            path that was previously hard-coded.
    """
    submission = pd.read_csv(sample_path)
    submission = submission.astype({'Prediction': 'float'})
    # Resolve the column position by name so the column that is written is
    # always the one that was just cast, wherever it sits in the template.
    prediction_col = submission.columns.get_loc('Prediction')
    submission.iloc[:, prediction_col] = predictions
    submission.to_csv(res_path, index=False, float_format='%.3f')

In [23]:
def run_manual(coeffs=None):
    """Score a hand-picked weight vector on the five validation folds.

    Args:
        coeffs: Ensemble weights, one per model column. When omitted, the
            previous hard-coded choice ``[0.0, 0.0, 0.0, 1.0]`` is used,
            which puts all weight on the fourth model column.

    Returns:
        Mean over folds 0 through 4 of the RMSE between the blended
        validation predictions and ``val_truth``.
    """
    if coeffs is None:
        coeffs = [0.0, 0.0, 0.0, 1.0]

    score = 0
    for i in range(0, 5):
        yhat = combine_models(val_splits[i], coeffs)
        score += rmse(yhat, val_truth[i])

    return score / 5
#run_manual()

In [24]:
def run_trial(trial):
    """Objective for the weight search: mean validation RMSE of one blend.

    One weight per entry of ``MODEL_LIST`` is requested from ``trial`` via
    ``suggest_float`` under the names ``c0``, ``c1``, ... in the range
    [0.0, 1.0].

    Args:
        trial: Object exposing ``suggest_float(name, low, high)``.

    Returns:
        Mean over folds 0 through 4 of the RMSE between the blended
        validation predictions and ``val_truth``.
    """
    weights = [
        trial.suggest_float(f'c{i}', 0.0, 1.0)
        for i in range(len(MODEL_LIST))
    ]

    total_rmse = 0
    for fold in range(5):
        blended = combine_models(val_splits[fold], weights)
        total_rmse += rmse(blended, val_truth[fold])

    return total_rmse / 5

In [25]:
import os

from optuna_single_gpu import run_optuna

# Search configuration.
EXPERIMENT_NAME = 'Ensemble'
N_TRIALS = 1000
DIR_RESULTS = '/cluster/scratch/piattigi/CIL/res_optuna/'

# Create the per-experiment results folder if it is missing. DIR_RESULTS ends
# with a slash, so plain concatenation yields the sub-folder path.
os.makedirs(f'{DIR_RESULTS}{EXPERIMENT_NAME}', exist_ok=True)

# NOTE(review): run_optuna is a project helper; presumably it runs N_TRIALS
# trials of run_trial and returns the best parameter dict - confirm.
best_params = run_optuna(run_trial, EXPERIMENT_NAME, N_TRIALS)



[I 2022-07-19 20:58:29,861] A new study created in memory with name: Ensemble
[I 2022-07-19 20:58:29,883] Trial 0 finished with value: 0.9753357485241636 and parameters: {'c0': 0.7557708103649379, 'c1': 0.10596485805917322, 'c2': 0.7134776617062208, 'c3': 0.3009265353427687}. Best is trial 0 with value: 0.9753357485241636.
[I 2022-07-19 20:58:29,898] Trial 1 finished with value: 0.9726434860785466 and parameters: {'c0': 0.28056813234759237, 'c1': 0.9662548444189094, 'c2': 0.3409874454618508, 'c3': 0.49531311159770575}. Best is trial 1 with value: 0.9726434860785466.
[I 2022-07-19 20:58:29,912] Trial 2 finished with value: 0.9741918166483051 and parameters: {'c0': 0.7768162054785338, 'c1': 0.4789150916787124, 'c2': 0.793483221658893, 'c3': 0.32137840399608064}. Best is trial 1 with value: 0.9726434860785466.
[I 2022-07-19 20:58:29,927] Trial 3 finished with value: 0.9727634684821561 and parameters: {'c0': 0.9478425749256326, ' ... [output truncated]

[OPTUNA]  Best score: 0.9712157962033036
[OPTUNA]  Best params: {'c0': 0.0004205308181259561, 'c1': 0.4334330278631244, 'c2': 0.2432778711367058, 'c3': 0.914882171071947}


In [28]:
# Weights for the blend
# AE_SWA_ensemble_mean-ALS-NCF_dist_exp_SWA-SVDpp_ensemble_gaussian-results
# (key c<i> is the weight of the i-th entry of MODEL_LIST).
best_params = {
    'c0': 0.0004205308181259561,
    'c1': 0.4334330278631244,
    'c2': 0.2432778711367058,
    'c3': 0.914882171071947,
}

In [29]:
# Blend the final predictions of all models with the selected weights.
matrix_final = create_final_matrix()

# Look each weight up by its key (c0, c1, ... as named in run_trial) instead
# of relying on the dict's value order, so weight i always pairs with column i
# of the matrix (i.e. MODEL_LIST[i]). A missing key fails loudly with KeyError.
coeffs_final = [best_params[f'c{i}'] for i in range(len(MODEL_LIST))]
res_final = combine_models(matrix_final, coeffs_final)

# Output file name: every model name followed by '-', then 'results.csv'.
name = ''.join(f'{m}-' for m in MODEL_LIST) + 'results.csv'
save_predictions(name, res_final)