# **MODEL ENSEMBLING - NO CONTEXT**

Mounting Google Drive

In [116]:
# Colab-only: mount the user's Google Drive at /content/drive so the files
# stored there can be read from later cells.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Initial setup

In [117]:
import pickle
import pandas as pd
from itertools import combinations
from sklearn.metrics import ndcg_score, accuracy_score, precision_score
from sklearn.preprocessing import StandardScaler
import numpy as np
import warnings

In [118]:
# Folder on the mounted Drive that is used as the working directory.
route = '/content/drive/MyDrive/IMPACT PROJECT/ALL PREDS'

# Change into that folder so files can be opened by bare filename afterwards.
%cd {route}

/content/drive/MyDrive/IMPACT PROJECT/ALL PREDS


Loading the predictions

In [119]:
def load_pickle(path):
    """
    Load and return the object stored in the pickle file at `path`.

    NOTE: unpickling can execute arbitrary code; only use this on files that
    come from a trusted source.
    """
    with open(path, 'rb') as f:
        return pickle.load(f)


# Ground-truth labels and the per-model predictions, read from the current
# working directory. Each file is loaded exactly once.
labels = load_pickle('group_labels.pkl')
naml = load_pickle('naml.pkl')
npa = load_pickle('npa.pkl')
nrms = load_pickle('nrms.pkl')
lstur = load_pickle('lstur.pkl')

----

Function definitions

In [120]:
def dcg_score(y_true, y_score, k=10):
    """
    Discounted cumulative gain at rank k.

    Items are ranked by descending `y_score`; the labels of the top
    min(k, number of items) entries are turned into gains `2**label - 1`
    and discounted by `log2(rank + 1)` (rank starting at 1).
    """
    cutoff = min(np.shape(y_true)[-1], k)
    # Indices of the `cutoff` highest-scored items, best first.
    ranking = np.argsort(y_score)[::-1][:cutoff]
    relevance = np.take(y_true, ranking)
    positions = np.arange(len(relevance))
    return np.sum((2 ** relevance - 1) / np.log2(positions + 2))

def ndcg_score(y_true, y_score, k=10):
    """
    Normalized discounted cumulative gain at rank k.

    The DCG of the ranking induced by `y_score` is divided by the DCG of the
    ideal ranking (items ordered by their true labels).

    Returns 0.0 when the ideal DCG is zero (e.g. a group without any positive
    label) instead of producing NaN from a 0/0 division, which would otherwise
    turn every average computed over the groups into NaN.

    Note: this definition replaces the `ndcg_score` name imported from
    `sklearn.metrics` at the top of the notebook.
    """
    best = dcg_score(y_true, y_true, k)
    if best == 0:
        # Nothing relevant to rank: define the score as 0 rather than NaN.
        return 0.0
    actual = dcg_score(y_true, y_score, k)
    return actual / best

def calculate_ndcg(y_true, y_pred, k):
    """
    Mean NDCG@k over all groups.

    `y_true` and `y_pred` are iterated in parallel; each pair (labels of one
    group, scores of the same group) is scored with `ndcg_score` and the
    per-group scores are averaged.
    """
    per_group_scores = []
    for group_labels, group_preds in zip(y_true, y_pred):
        per_group_scores.append(ndcg_score(group_labels, group_preds, k))
    return np.mean(per_group_scores)

def normalize_array(arr):
    """
    Standardize a 1-D array with sklearn's StandardScaler.

    The array is viewed as a single feature column, fitted and transformed
    in one step, and returned flattened back to one dimension.
    """
    as_column = arr.reshape(-1, 1)
    standardized = StandardScaler().fit_transform(as_column)
    return standardized.flatten()

def transform_to_binary_with_threshold(arr, threshold):
    """
    Binarize scores: 1 for every value >= threshold, 0 otherwise.

    Returns a plain Python list with one flag per element of `arr`.
    """
    flags = []
    for value in arr:
        if value >= threshold:
            flags.append(1)
        else:
            flags.append(0)
    return flags

def calculate_precision(y_true, y_pred, k=10, threshold=0.9):
    """
    Precision of thresholded scores over the first `k` entries of each group.

    For every group the first `k` stored entries of the scores and of the
    labels are kept (a positional slice -- the entries are NOT re-ranked by
    score). The kept scores of all groups are pooled, standardized with
    `normalize_array`, and turned into positive predictions wherever the
    standardized score is >= `threshold`. The result is sklearn's
    `precision_score` of those predictions against the pooled labels.

    Parameters
    ----------
    y_true : sequence of per-group label sequences.
    y_pred : sequence of per-group score sequences, aligned with `y_true`.
    k : number of leading entries kept per group (default 10, the value the
        original implementation hard-coded).
    threshold : cut-off applied to the standardized scores (default 0.9, the
        value the original implementation hard-coded).
    """
    head_scores = np.concatenate([group[:k] for group in y_pred])
    head_labels = np.concatenate([group[:k] for group in y_true])
    predicted_positive = transform_to_binary_with_threshold(
        normalize_array(head_scores), threshold
    )
    return precision_score(head_labels, predicted_positive)

def calculate_mean(predictions):
    """
    Average the predictions of several models, group by group.

    Parameters
    ----------
    predictions : dict mapping a model name to its list of per-group score
        sequences. Every model must provide the same number of groups, and
        corresponding groups must have the same length across models.

    Returns
    -------
    list with one NumPy array per group holding the element-wise mean of the
    models' scores for that group. An empty dict yields an empty list.

    Raises
    ------
    ValueError if the models do not all provide the same number of groups.

    The average is computed per group on purpose: groups generally differ in
    length, and handing such a ragged nested list to a single `np.mean` call
    depends on implicit object-array creation, which NumPy deprecated and
    later turned into an error.
    """
    per_model = list(predictions.values())
    if not per_model:
        return []

    n_groups = len(per_model[0])
    if any(len(model_preds) != n_groups for model_preds in per_model):
        raise ValueError("All models must provide predictions for the same number of groups")

    return [
        np.mean([np.asarray(scores) for scores in group], axis=0)
        for group in zip(*per_model)
    ]

In [121]:
# Correcting the format for the npa predictions array:
# wrap every per-group entry in a NumPy array.
npa2 = [np.array(group_preds) for group_preds in npa]

## Create and Score Ensemblers

In [122]:
# Per-model predictions keyed by model name; the insertion order fixes the
# order in which model combinations are enumerated later on.
predictions = dict(naml=naml, nrms=nrms, lstur=lstur, npa=npa2)

In [123]:
# Suppress NumPy's VisibleDeprecationWarning (the category NumPy uses when an
# array is built from ragged nested sequences). The class is resolved
# defensively: depending on the NumPy release it is exposed as
# `np.VisibleDeprecationWarning` and/or `np.exceptions.VisibleDeprecationWarning`.
_visible_deprecation = getattr(np, "VisibleDeprecationWarning", None)
if _visible_deprecation is None:
    _visible_deprecation = np.exceptions.VisibleDeprecationWarning
warnings.filterwarnings("ignore", category=_visible_deprecation)

# Calculating the best combination of models:
# every non-empty subset of models is averaged and scored, and the best
# subset is tracked independently for each of the three metrics.

best_ndcg_10 = 0
best_ndcg_5 = 0
best_precision = 0

best_combination_10 = ()
best_combination_5 = ()
best_combination_p = ()

for r in range(1, len(predictions) + 1):
    for subset in combinations(predictions.keys(), r):
        # Ensemble = plain average of the selected models' predictions.
        subset_predictions = {model: predictions[model] for model in subset}
        y_pred = calculate_mean(subset_predictions)

        ndcg_10 = calculate_ndcg(labels, y_pred, k=10)
        ndcg_5 = calculate_ndcg(labels, y_pred, k=5)
        precision = calculate_precision(labels, y_pred)

        print(f"Model combination {subset} yields NDCG@5: {ndcg_5} and NDCG@10: {ndcg_10} -- precision: {precision}")

        # Strict '>' keeps the first combination found in case of a tie.
        if ndcg_10 > best_ndcg_10:
            best_ndcg_10 = ndcg_10
            best_combination_10 = subset

        if ndcg_5 > best_ndcg_5:
            best_ndcg_5 = ndcg_5
            best_combination_5 = subset

        if precision > best_precision:
            best_precision = precision
            best_combination_p = subset

print(f"\n-- Best  NDCG@5 = {best_ndcg_5} by model combination {best_combination_5} --")
print(f"-- Best  NDCG@10 = {best_ndcg_10} by model combination {best_combination_10} --")
print(f"-- Best  precision = {best_precision} by model combination {best_combination_p} --")

Model combination ('naml',) yields NDCG@5: 0.34075560019302104 and NDCG@10: 0.40379306879295496 -- precision: 0.13262965574079774
Model combination ('nrms',) yields NDCG@5: 0.3450982258934691 and NDCG@10: 0.40991177890021435 -- precision: 0.143447566506148
Model combination ('lstur',) yields NDCG@5: 0.3473849017773787 and NDCG@10: 0.40904972548163937 -- precision: 0.13947941899942948
Model combination ('npa',) yields NDCG@5: 0.32882768195760836 and NDCG@10: 0.3926453560892696 -- precision: 0.1314121481177346
Model combination ('naml', 'nrms') yields NDCG@5: 0.3483196929637528 and NDCG@10: 0.41220158260129014 -- precision: 0.13958637026239068
Model combination ('naml', 'lstur') yields NDCG@5: 0.3495521503302876 and NDCG@10: 0.4115712492075966 -- precision: 0.1425198079700246
Model combination ('naml', 'npa') yields NDCG@5: 0.34140139823065535 and NDCG@10: 0.4041923658376537 -- precision: 0.13311802288843644
Model combination ('nrms', 'lstur') yields NDCG@5: 0.35051582205903126 and NDCG@