In [223]:
import pandas as pd
import numpy as np

from scipy import stats

from sklearn.linear_model import Lasso

In [195]:
PREDICTIONS_DIR = "training_set/predictions"

In [196]:
model_data = pd.read_csv(f"{PREDICTIONS_DIR}/model_data.csv")

In [197]:
model_data.head()

Unnamed: 0,name,validation_spearman_rank,parameters,predictions,notes,is_short_term
0,ResNet152_SVR,0.2408,{'RANDOM_SEED': 1},training_set/predictions/ResNet152_SVR/0.csv,,False
1,C3D_SVR,0.1658,{'RANDOM_SEED': 1},training_set/predictions/C3D_SVR/0.csv,,False
2,VGGish_SVR,0.1044,{'RANDOM_SEED': 1},training_set/predictions/VGGish_SVR/0.csv,,False


In [198]:
def get_predictions(prediction_files):
    preds = []
    actual = None
    is_training = None
    
    for pred_file in prediction_files:
        prediction_data = pd.read_csv(pred_file)
        print(len(prediction_data))
        if actual is None:
            actual = np.array(prediction_data["actual"])
            is_training = np.array(prediction_data["in_training_set"])
        assert np.allclose(actual, np.array(prediction_data["actual"]))
        assert np.equal(is_training, np.array(prediction_data["in_training_set"])).all()
        
        
        preds.append(prediction_data["prediction"])
        
    return preds, actual, is_training

In [199]:
preds, actual, is_training = get_predictions(model_data["predictions"])

590
590
590


In [200]:
preds_training = [pred[is_training] for pred in preds]
actual_training = actual[is_training]
preds_validation = [pred[np.logical_not(is_training)] for pred in preds]
actual_validation = actual[np.logical_not(is_training)]

In [201]:
X_train = np.array(preds_training).T
y_train = np.array(actual_training)
X_test = np.array(preds_validation).T
y_test = np.array(actual_validation)

In [183]:
X_train.shape, y_train.shape

((472, 3), (472,))

In [245]:
ensemble = Lasso(normalize=True, alpha = 0.0001)

In [246]:
ensemble.fit(X_train, y_train)

Lasso(alpha=0.0001, normalize=True)

In [247]:
ensemble.coef_

array([1.0836491 , 0.15428462, 0.        ])

In [248]:
normalize = lambda arr: arr / np.sum(arr)
ensemble.coef_ = normalize(ensemble.coef_)
print(ensemble.coef_)

[0.87536924 0.12463076 0.        ]


In [251]:
ensemble_test_predictions = ensemble.predict(X_test)

In [252]:
spearman_rank, _ = stats.spearmanr(y_test, ensemble_test_predictions)
print(spearman_rank)

0.26192119611860387
