In [1]:
import pandas as pd
import scipy
from scipy import optimize
import numpy as np
import sklearn.metrics as metrics

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
%cd gdrive/MyDrive/colab/vilio/ernie-vil/data/hm/

/content/gdrive/MyDrive/colab/vilio/ernie-vil/data/hm


In [108]:
val_el36 = pd.read_csv('dev_seenEL36.csv').set_index('id')

In [5]:
val_el50 = pd.read_csv('dev_seenELV50.csv').set_index('id')

In [6]:
val_el72 = pd.read_csv('dev_seenEL72.csv').set_index('id')

In [7]:
val_el36_vcr = pd.read_csv('dev_seenELVCR36.csv').set_index('id')

In [8]:
val_el72_vcr = pd.read_csv('dev_seenELVCR72.csv').set_index('id')

In [9]:
true_labels = pd.read_json('dev_seenlong.jsonl',lines=True).set_index('id')

In [10]:
joined_scores = val_el36.rename(columns={'proba':'el36_proba'})[['el36_proba']] \
    .join(val_el50.rename(columns={'proba': 'el50_proba'})[['el50_proba']], how='inner') \
    .join(val_el72.rename(columns={'proba': 'el72_proba'})[['el72_proba']], how='inner') \
    .join(val_el36_vcr.rename(columns={'proba': 'el36_vcr_proba'})[['el36_vcr_proba']], how='inner') \
    .join(val_el72_vcr.rename(columns={'proba': 'el72_vcr_proba'})[['el72_vcr_proba']], how='inner') \
    .join(true_labels.rename(columns={'label': 'true_label'})[['true_label']], how='inner') \
    .drop_duplicates()

In [95]:
def optimize_weights(w, joined_scores):
  """Function to minimize the negative AUROC score on the validation set for the ensemble"""
  w1,w2,w3,w4,w5=w
  combined_scores = (w1*joined_scores['el36_proba'] + w2*joined_scores['el50_proba'] + 
                     w3*joined_scores['el72_proba'] + w4*joined_scores['el36_vcr_proba'] + w5*joined_scores['el72_vcr_proba'])
  return -metrics.roc_auc_score(joined_scores['true_label'], combined_scores)
  

In [96]:
# This is the baseline 
optimize_weights(np.ones(5) / 5, joined_scores)

-0.804211806500136

In [97]:
def sum_to_one(w):
  return 1 - w.sum()

cons = [{'type':'eq', 'fun': sum_to_one}]

In [98]:
bounds = np.zeros((5,2))
bounds[:,1] = 1

In [99]:
optimized_weights = optimize.dual_annealing(optimize_weights, bounds=bounds, args=(joined_scores,), maxiter=5000, initial_temp=10460)

In [100]:
# Performance only increases 0.005 on validation with tuned ensemble weights...
optimized_weights

     fun: -0.8094445600166423
 message: ['Maximum number of iteration reached']
    nfev: 53707
    nhev: 0
     nit: 5000
    njev: 0
  status: 0
 success: True
       x: array([0.12966678, 0.1167347 , 0.72238828, 0.42744235, 0.81104285])

In [103]:
normed_weights = optimized_weights.x / sum(optimized_weights.x)

In [104]:
normed_weights

array([0.05874519, 0.05288634, 0.32727607, 0.19365161, 0.36744079])

In [105]:
new_preds = normed_weights[0]*joined_scores['el36_proba'] + normed_weights[1] * joined_scores['el50_proba'] + \
            normed_weights[2]*joined_scores['el72_proba'] + normed_weights[3]*joined_scores['el36_vcr_proba'] + \
            normed_weights[4]*joined_scores['el72_vcr_proba']

In [106]:
metrics.roc_auc_score(joined_scores['true_label'], new_preds)

0.8094445600166423

In [111]:
test_el36 = pd.read_csv('test_seenEL36.csv').set_index('id')
test_el50 = pd.read_csv('test_seenELV50.csv').set_index('id')
test_el72 = pd.read_csv('test_seenEL72.csv').set_index('id')
test_el36_vcr = pd.read_csv('test_seenELVCR36.csv').set_index('id')
test_el72_vcr = pd.read_csv('test_seenELVCR72.csv').set_index('id')

In [113]:
joined_test_scores = test_el36.rename(columns={'proba':'el36_proba'})[['el36_proba']] \
    .join(test_el50.rename(columns={'proba': 'el50_proba'})[['el50_proba']], how='inner') \
    .join(test_el72.rename(columns={'proba': 'el72_proba'})[['el72_proba']], how='inner') \
    .join(test_el36_vcr.rename(columns={'proba': 'el36_vcr_proba'})[['el36_vcr_proba']], how='inner') \
    .join(test_el72_vcr.rename(columns={'proba': 'el72_vcr_proba'})[['el72_vcr_proba']], how='inner') \
    .drop_duplicates()

In [116]:
test_preds = normed_weights[0]*joined_test_scores['el36_proba'] + normed_weights[1] * joined_test_scores['el50_proba'] + \
             normed_weights[2]*joined_test_scores['el72_proba'] + normed_weights[3]*joined_test_scores['el36_vcr_proba'] + \
             normed_weights[4]*joined_test_scores['el72_vcr_proba']

In [131]:
test_pred_df = pd.DataFrame(test_preds, columns=['proba']).reset_index()

In [139]:
test_pred_df['label'] = np.where(test_pred_df['proba'] > test_pred_df['proba'].median(), 1, 0)

In [142]:
test_pred_df.to_csv('EL365072_new_ensemble_weights_test_seen.csv', index=False)