In [19]:
import pandas as pd 
import numpy as np

In [20]:
# Root directory of the saved model outputs. create_val_matrix reads
# '<root>/<model>/<model>_split_<split>_val_results.csv' below this folder.
BASE_MODEL_FOLDER = '../results_ensemble'

In [21]:
def rmse(x, y):
    """Return the root-mean-square error between ``x`` and ``y``.

    The difference ``x - y`` is squared element-wise, averaged over all
    elements, and the square root of that average is returned.
    """
    squared_error = np.square(x - y)
    return np.sqrt(np.mean(squared_error))

In [22]:
# Reference values for the validation part of each of the 5 k-fold partitions.
# val_truth[k] is the 'Prediction' column of partition k's validation CSV as a
# NumPy array; it serves as the target for the blending fit and the RMSE below.
val_truth = [
    pd.read_csv(f'../data_val_train_kfold/partition_{i}_val.csv')['Prediction'].to_numpy()
    for i in range(0, 5)
]

In [23]:
def create_val_matrix(split, model_list):
    """Build the matrix of validation predictions for one split.

    For every model name ``m`` in ``model_list`` the file
    ``{BASE_MODEL_FOLDER}/{m}/{m}_split_{split}_val_results.csv`` is read and
    its ``Prediction`` column becomes one column of the returned array, in
    the order given by ``model_list``.
    """
    prediction_columns = [
        pd.read_csv(
            f'{BASE_MODEL_FOLDER}/{m}/{m}_split_{split}_val_results.csv'
        )['Prediction'].to_numpy()
        for m in model_list
    ]
    return np.column_stack(prediction_columns)


def combine_models(yhat, coeff):
    """Blend model predictions with a weight vector.

    ``coeff`` is converted to a NumPy array and ``np.matmul(yhat, coeff)`` is
    returned. For a 2-D ``yhat`` and a 1-D ``coeff`` this is the weighted sum
    of the columns of ``yhat``, one value per row.
    """
    return np.matmul(yhat, np.array(coeff))

In [24]:
# Models in the order in which they are added to the ensemble. Commented-out
# entries are excluded from the experiment.
MODEL_LIST = [
    'AE_SWA_large',
    #'AE_SWA',
    #'AE_SWA_ensemble_mean',
    'PNCF',
    'ALS_old',
    'SVDpp_ensemble_gaussian'
]

# One entry of val_truth per k-fold partition; use its length instead of a
# hard-coded fold count.
n_splits = len(val_truth)

# Read every model's validation predictions once per split. The ensemble of
# the first k models is the first k columns of these matrices, so the CSVs do
# not have to be re-read for every ensemble size.
full_val_matrices = [
    create_val_matrix(split, MODEL_LIST) for split in range(n_splits)
]

models = []  # names of the models contained in the current ensemble
acc = []     # one single-row frame (mean/std of the fold RMSEs) per ensemble size
for n_models, m in enumerate(MODEL_LIST, start=1):
    models.append(m)
    val_splits = [matrix[:, :n_models] for matrix in full_val_matrices]

    # Least-squares blending weights, fitted on each split separately. The fit
    # for split i does not depend on which fold is held out, so it is computed
    # once here and reused below.
    # rcond=None selects NumPy's current default cut-off explicitly, which
    # avoids the FutureWarning raised when the argument is omitted.
    split_coeffs = [
        np.linalg.lstsq(val_splits[i], val_truth[i], rcond=None)[0]
        for i in range(n_splits)
    ]

    fold_rmse = []
    for val_on in range(n_splits):
        # Average the weights of all splits except the held-out one, then
        # score the blended prediction on the held-out split.
        coeffs = np.column_stack(
            [split_coeffs[i] for i in range(n_splits) if i != val_on]
        ).mean(axis=1)
        blended = combine_models(val_splits[val_on], coeffs)
        fold_rmse.append(rmse(blended, val_truth[val_on]))

    fold_rmse = np.array(fold_rmse)
    acc.append(
        pd.DataFrame(
            {'mean': [np.mean(fold_rmse)], 'std': [np.std(fold_rmse)]},
            index=[m],
        )
    )


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.



In [25]:
# Stack the single-row summaries in `acc` into one table: one row per model
# added to the ensemble, with the mean and std of the per-fold RMSE.
df = pd.concat(acc, axis=0)
df

Unnamed: 0,mean,std
AE_SWA_large,0.983403,0.002607
PNCF,0.977979,0.002373
ALS_old,0.974047,0.002329
SVDpp_ensemble_gaussian,0.970985,0.002393


In [28]:
import os

import plotly.express as px
import plotly.io as pio

FIGURE_PATH = "./img/ensemble.pdf"
EXPORT_DPI = 300
# Short axis labels for the model identifiers that form the index of `df`.
TICK_LABELS = {
    'AE_SWA_large': 'AE',
    'PNCF': 'PNCF',
    'ALS_old': 'ALS',
    'SVDpp_ensemble_gaussian': 'SVDpp',
}

# Plot from a copy with the index exposed as a column, so the summary table
# `df` itself is left unmodified.
plot_df = df.assign(models=df.index)
fig = px.line(plot_df, x='models', y=['mean'], title='Ensemble improvement')
fig.update_xaxes(
    ticktext=list(TICK_LABELS.values()),
    tickvals=list(TICK_LABELS.keys()),
    title_text='model added to the ensemble',
)

# Show the 'mean' trace as 'ensemble' in the legend and in the hover text.
newnames = {'mean': 'ensemble'}
fig.for_each_trace(lambda t: t.update(name=newnames[t.name],
                                      legendgroup=newnames[t.name],
                                      hovertemplate=t.hovertemplate.replace(t.name, newnames[t.name])
                                      ))
fig.update_yaxes(title_text='RMSE', nticks=10)
fig.update_layout(legend_title_text='', showlegend=True)
# Horizontal legend placed just above the plot area, right-aligned.
fig.update_layout(legend=dict(
    orientation="h",
    yanchor="bottom",
    y=1.02,
    xanchor="right",
    x=1
))

fig.update_layout(width=500, height=200,
                  font_family="Serif", font_size=12, title_font_size=13,
                  margin_l=5, margin_t=1, margin_b=1, margin_r=5)

# Create the output folder first so the export does not fail when ./img is
# missing.
os.makedirs(os.path.dirname(FIGURE_PATH), exist_ok=True)
# Export size: 1.5 in x 0.75 in at 300 dpi, i.e. 450 x 225 layout pixels.
# NOTE(review): these values are passed explicitly to write_image and so apply
# to the exported file instead of the 500 x 200 layout size set above - confirm
# this is the intended aspect ratio.
pio.write_image(fig, FIGURE_PATH, width=1.5 * EXPORT_DPI, height=0.75 * EXPORT_DPI)
fig.show()

In [27]:
# Displays the same summary as `df` above: one row per model added to the
# ensemble, with the mean and std of the per-fold RMSE.
# NOTE(review): this cell has execution count 27 but is placed after cell 28;
# re-run the notebook top to bottom to confirm the outputs are in sync.
pd.concat(acc)

Unnamed: 0,mean,std
AE_SWA_large,0.983403,0.002607
PNCF,0.977979,0.002373
ALS_old,0.974047,0.002329
SVDpp_ensemble_gaussian,0.970985,0.002393
