In [19]:
import pandas as pd

In [20]:
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
from google.colab import drive
from tqdm import tqdm
# Mount Google Drive at /content/drive (Colab only); the Drive folder assigned to PATH further down lives under this mount point
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### 1. Set Parameters

To run the ensemble, set the correct parameters in the next cell and run all cells.
The final prediction is written to a file named `submission_name` in the directory given by `PATH`.

**IMPORTANT**: If the files needed to run the program are missing, check the README file for how to access them.

In [21]:
# Directory that holds the model submission files to be ensembled.
# Keep the trailing slash so that file names can be appended to it directly.
# To run from a local checkout instead of Colab, use this value instead:
# PATH = '../data/predictions/'
PATH = '/content/drive/My Drive/Colab Notebooks/CSCI-567/csci567_kaggle_demo/Submissions/'

# File name of the ensembled submission (written into PATH)
submission_name = 'ensemble_submission.csv'

# The ENSEMBLE_MODEL constant selects one of the predefined model/weight sets.
# Set it to the index listed next to the score of the ensemble you want to run:
#
# 0.02367 -> 0
# 0.0240  -> 1 (level_2 ensemble)
# 0.02238 -> 2 (Cosine similarity + age cluster custom models)
# 0.02399 -> 3 (level_2 ensemble)
# 0.02355 -> 4 (level_2 ensemble)

ENSEMBLE_MODEL = 0

In [22]:
# Pick the submission files to blend and the weight of each one.
# model_weights[i] is the weight applied to model_list[i], so both lists must have the same length.
if ENSEMBLE_MODEL == 0:
  # Weights for the 0.02367 score
  model_list = ['SVD_ReRanking_0.0225.csv', 'trending_products_weekly_quotient_mixture_0.0226.csv',
                'Exponential_Decay_0.0217.csv', 'time_is_our_best_friend_submission_0.0220.csv',
                'EDA_Clustering_submission_0.0224.csv', 'Rule_Base_Age_submission_0.0227.csv',
                'lstm_sequential_fix_submission_full_0.0223.csv', 'trending_submission_full_0.0231.csv']
  model_weights = [1.68312712, 1.29621121, 0.99490976, 0.84867336, 1.73965897, 1.50294432, 1.34040495, 1.41992656]

elif ENSEMBLE_MODEL == 1:
  # Weights for the 0.0240 score
  model_list = ['SVD_ReRanking_0.0225.csv', 'trending_products_weekly_quotient_mixture_0.0226.csv',
                'time_is_our_best_friend_submission_0.0220.csv', 'EDA_Clustering_submission_0.0224.csv',
                'Rule_Base_Age_submission_0.0227.csv', 'lstm_sequential_fix_submission_full_0.0223.csv',
                'trending_submission_full_0.0231.csv', 'multiblend-emsemble.csv', 'ensemble_all_submission_0.01188.csv']
  model_weights = [0.168312712, 0.129621121, 0.084867336, 0.173965897, 0.150294432, 0.134040495, 0.141992656, 23.393965897, 0.193965897]

elif ENSEMBLE_MODEL == 2:
  # Simple age model + cosine similarity model
  model_list = ['age_simple_combined.csv', 'cosineSim_Final.csv']
  model_weights = [4.55399529, 2.23933693]

elif ENSEMBLE_MODEL == 3:
  # "Ensemble all" submission + multiblend ensemble
  model_list = ['ensemble_all_submission_0.01188.csv', 'multiblend-emsemble.csv']
  model_weights = [7.97431126, 8.45905625]

elif ENSEMBLE_MODEL == 4:
  # Weights for the level-2 ensemble with score 0.02355
  model_list = ['SVD_ReRanking_0.0225.csv', 'trending_products_weekly_quotient_mixture_0.0226.csv',
                'time_is_our_best_friend_submission_0.0220.csv', 'EDA_Clustering_submission_0.0224.csv',
                'Rule_Base_Age_submission_0.0227.csv', 'lstm_sequential_fix_submission_full_0.0223.csv',
                'trending_submission_full_0.0231.csv', 'ensemble_all_submission_0.01188.csv']
  model_weights = [1.64372119, 1.34195493, 1.10293642, 1.68786759, 1.43575417, 0.86288483, 1.4416905, 1.37209144]

else:
  # Fail loudly: otherwise model_list/model_weights would be undefined, or worse,
  # silently left over from a previous run of this cell in the same kernel.
  raise ValueError('Unknown ENSEMBLE_MODEL: ' + repr(ENSEMBLE_MODEL) + ' (expected 0, 1, 2, 3 or 4)')

# Catch a file/weight list that was edited on one side only
if len(model_list) != len(model_weights):
  raise ValueError('model_list has ' + str(len(model_list)) + ' entries but model_weights has ' + str(len(model_weights)))

In [23]:
# Positional weights: the article at 1-based rank r of a prediction list gets weight 1/r (12 ranks)
position_weights = [1 / rank for rank in range(1, 13)]

### 2. Define Functions

In [24]:
def read_model(model_name, path='./Submissions/'):
  """Read one model's submission CSV, ordered by customer.

  Args:
    model_name: File name of the submission CSV.
    path: Directory that contains the file; a trailing separator is optional.

  Returns:
    DataFrame with the file's rows sorted by 'customer_id' and a fresh 0..n-1 index.
  """
  import os  # local import keeps this cell self-contained

  # os.path.join adds the separator only when `path` lacks one, so
  # '../data/predictions' and '../data/predictions/' resolve to the same file.
  csv_path = os.path.join(path, model_name)
  return pd.read_csv(csv_path).sort_values('customer_id').reset_index(drop=True)

def read_pickle_model(model_name, path='./Pickles/'):
  """Read a model stored as a pickle file.

  Args:
    model_name: File name of the pickle.
    path: Directory that contains the file; a trailing separator is optional.

  Returns:
    The object stored in the pickle, as returned by pd.read_pickle.
  """
  import os  # local import keeps this cell self-contained

  # NOTE: unpickling can execute arbitrary code; only load pickle files you trust.
  # os.path.join adds the separator only when `path` lacks one.
  return pd.read_pickle(os.path.join(path, model_name))

def read_models(model_list, path='./Submissions/'):
  """Create a df with the predictions of all the given models side by side.

  Args:
    model_list: Non-empty list of submission file names. The first file must have
      exactly two columns (customer id, prediction); every other file must have
      'customer_id' and 'prediction' columns.
    path: Directory that contains the files (forwarded to read_model).

  Returns:
    DataFrame with 'customer_id' plus one 'prediction_<i>' column per model,
    where <i> is the position of the model in model_list.

  Raises:
    ValueError: If model_list is empty, or if a file does not list exactly the
      same customers as the first file.
  """
  if len(model_list) == 0:
    raise ValueError('model_list should not be empty')

  predictions_df = read_model(model_list[0], path)
  predictions_df.columns = ['customer_id', 'prediction_0']
  for i, model_name in enumerate(model_list[1:], start=1):
    curr_prediction_df = read_model(model_name, path)
    # Predictions are combined by row position, so every file must contain the
    # same customers in the same (sorted) order; otherwise one customer's
    # predictions would silently end up on another customer's row.
    if not curr_prediction_df['customer_id'].equals(predictions_df['customer_id']):
      raise ValueError('customer_id values in ' + str(model_name) + ' do not match those in ' + str(model_list[0]))
    predictions_df['prediction_' + str(i)] = curr_prediction_df['prediction']
  return predictions_df

def get_final_prediction(prediction_row, position_weights, model_weights):
  """Blend one customer's per-model predictions into a single top-12 prediction.

  Each article scores model_weights[m] * position_weights[pos] for every model m
  that lists it at 0-based position pos (only the first 12 articles of each
  model are considered); the scores are summed over all models.

  Args:
    prediction_row: Series with entries 'prediction_0' ... 'prediction_<n-1>', each a
      space-separated string of article ids. Other entries (e.g. 'customer_id') are ignored.
    position_weights: Weight per 0-based list position; needs one entry for every
      position that is actually used (at most 12).
    model_weights: Weight per model, indexed like the prediction columns.

  Returns:
    Space-separated string with up to 12 article ids, highest score first.
  """
  # Count the prediction columns by name instead of assuming the row holds exactly
  # one non-prediction entry, so additional columns do not break the lookup below.
  num_models = sum(1 for label in prediction_row.index if str(label).startswith('prediction_'))

  scores = {}
  # Loop over prediction columns
  for m in range(num_models):
    raw_prediction = prediction_row['prediction_' + str(m)]
    # An empty prediction cell is read from CSV as NaN; such a model simply does not vote
    if pd.isna(raw_prediction):
      continue
    # Loop over the first 12 articles in the prediction
    for pos, article_id in enumerate(raw_prediction.split()[:12]):
      scores[article_id] = scores.get(article_id, 0) + model_weights[m] * position_weights[pos]

  # Highest score first; sorted() is stable, so ties keep first-seen order
  ranked_articles = sorted(scores, key=lambda article_id: -scores[article_id])
  return ' '.join(ranked_articles[:12])

def ensemble_models(predictions_df, position_weights, model_weights):
  """Ensemble all the model predictions into one final prediction per customer.

  Args:
    predictions_df: DataFrame with 'customer_id' and the per-model prediction columns.
    position_weights: Weight per position in a prediction list (passed to get_final_prediction).
    model_weights: Weight per model (passed to get_final_prediction).

  Returns:
    New DataFrame with the columns 'customer_id' and 'prediction'.
  """
  final_predictions = predictions_df.apply(
      get_final_prediction,
      position_weights=position_weights,
      model_weights=model_weights,
      axis=1)
  # Build a new frame and leave the caller's frame untouched, so the function can be
  # called again on the same input (e.g. to try different weights).
  return predictions_df[['customer_id']].assign(prediction=final_predictions)

### 3. Load Models

In [25]:
# Build a df with the predictions of every model in model_list, read from PATH
predictions_df = read_models(model_list, path=PATH)
predictions_df.head()

Unnamed: 0,customer_id,prediction_0,prediction_1,prediction_2,prediction_3,prediction_4,prediction_5,prediction_6,prediction_7
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0568601043 0924243001 0924243002 0918522001 07...,0568601043 0568601006 0448509014 0573085028 07...,0568601043 0924243001 0924243002 0918522001 07...,0568601043 0751471001 0909370001 0915526001 09...,0568601043 0568601006 0745232001 0751471001 04...,0568601043 0568601006 0745232001 0751471001 04...,0568601043 0568601006 0656719005 0745232001 09...,0568601043 0568601006 0568601006 0568597006 04...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0924243001 0924243002 0918522001 0751471001 04...,0826211002 0800436010 0739590027 0448509014 05...,0924243001 0924243002 0918522001 0751471001 04...,0924243001 0924243002 0923758001 0918522001 09...,0826211002 0739590027 0811835004 0764280001 07...,0826211002 0739590027 0811835004 0764280001 07...,0826211002 0800436010 0924243001 0739590027 07...,0826211002 0800436010 0739590027 0811835004 07...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0794321007 0924243001 0924243002 0918522001 07...,0794321007 0852643001 0852643003 0858883002 07...,0794321007 0924243001 0924243002 0918522001 07...,0794321007 0924243001 0924243002 0923758001 09...,0794321007 0852643001 0852643003 0727808007 08...,0794321007 0858883002 0852643003 0727808007 08...,0794321007 0852643001 0852643003 0858883002 09...,0794321007 0852643001 0852643003 0858883002 07...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0924243001 0924243002 0918522001 0751471001 04...,0448509014 0573085028 0751471001 0706016001 06...,0924243001 0924243002 0918522001 0751471001 04...,0924243001 0924243002 0923758001 0918522001 09...,0751471001 0678942001 0673677002 0579541001 05...,0751471001 0579541001 0573085028 0673677002 06...,0448509014 0573085028 0924243001 0751471001 07...,0448509014 0573085028 0751471001 0706016001 06...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0924243001 0924243002 0918522001 0751471001 04...,0730683050 0791587015 0896152002 0818320001 09...,0924243001 0924243002 0918522001 0751471001 04...,0924243001 0924243002 0923758001 0918522001 09...,0730683050 0791587015 0896152002 0927530004 05...,0730683050 0791587015 0896152002 0927530004 08...,0730683050 0791587015 0924243001 0896152002 08...,0730683050 0791587015 0896152002 0927530004 08...


### 4. Ensemble Models

In [None]:
# Get ensembled prediction
# NOTE: this rebinds predictions_df to the ensembled result (customer_id + prediction only),
# so re-run the "Load Models" cell before running this cell a second time.
predictions_df = ensemble_models(predictions_df, position_weights, model_weights)
predictions_df.head()

### 5. Save Submission

In [28]:
def write_submission(submission_df, fname, path='./Submissions/'):
  """Write a submission DataFrame to a CSV file, without the index column.

  Args:
    submission_df: DataFrame to save.
    fname: Name of the CSV file to create.
    path: Directory to write into; a trailing separator is optional.
  """
  import os  # local import keeps this cell self-contained

  # os.path.join adds the separator only when `path` lacks one, so the file
  # always lands inside the directory rather than next to it.
  submission_df.to_csv(os.path.join(path, fname), index=False)

In [29]:
# Save the ensembled predictions as the file submission_name in the PATH directory
write_submission(predictions_df, submission_name, path=PATH)