In [1]:
import random
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict

%matplotlib inline
%config InlineBackend.figure_format = 'retina'  # Higher resolution figures

### Data, Evaluation

In [2]:
def readX(fp):
    """Parse the CSV at ``fp`` and return its contents as a NumPy array.

    The file is read with ``pandas.read_csv`` defaults, so the first line is
    consumed as the header row and does not appear in the returned values.
    """
    table = pd.read_csv(fp)
    return table.values

In [3]:
# Load the training and validation label files and flatten each to a 1-D array.
# NOTE(review): readX relies on pandas.read_csv defaults, so the first line of
# each file is treated as a header row — confirm the files actually have one.
train_y = readX('data/train_y.txt').flatten()
val_y = readX('data/val_y.txt').flatten()

In [4]:
def accuracy(pred, y):
    """Return the fraction of predictions that equal their labels.

    Parameters
    ----------
    pred, y : array-like
        Predicted and true labels, compared elementwise. Both are passed
        through ``np.asarray`` first: comparing two plain Python lists with
        ``==`` yields a single bool, which would silently produce 0.0 or 1.0.

    Returns
    -------
    float
        Mean of the elementwise equality mask.
    """
    pred = np.asarray(pred)
    y = np.asarray(y)
    return np.mean(pred == y)

def precision(pred, y):
    """Return tp / (tp + fp) for binary labels encoded as 0/1.

    Parameters
    ----------
    pred, y : array-like
        Predicted and true labels. Both are converted with ``np.asarray`` so
        plain Python sequences are compared elementwise rather than as a
        whole. Only entries where ``pred == 1`` contribute.

    Returns
    -------
    float
        The precision, or ``nan`` when no entry was predicted positive
        (tp + fp == 0), in which case precision is undefined.
    """
    pred = np.asarray(pred)
    y = np.asarray(y)

    predicted_pos = pred == 1
    tp = np.sum(predicted_pos & (y == 1))
    fp = np.sum(predicted_pos & (y == 0))

    denom = tp + fp
    if denom == 0:
        # Same value NumPy's 0/0 would give, but explicit and without the
        # RuntimeWarning.
        return np.nan
    return tp / denom

def recall(pred, y):
    """Return tp / (tp + fn) for binary labels encoded as 0/1.

    Parameters
    ----------
    pred, y : array-like
        Predicted and true labels. Both are converted with ``np.asarray`` so
        plain Python sequences are compared elementwise rather than as a
        whole. Only entries where ``y == 1`` contribute; a false negative is
        counted only where ``pred == 0``.

    Returns
    -------
    float
        The recall, or ``nan`` when tp + fn == 0, in which case recall is
        undefined.
    """
    pred = np.asarray(pred)
    y = np.asarray(y)

    actual_pos = y == 1
    tp = np.sum((pred == 1) & actual_pos)
    fn = np.sum((pred == 0) & actual_pos)

    denom = tp + fn
    if denom == 0:
        # Same value NumPy's 0/0 would give, but explicit and without the
        # RuntimeWarning.
        return np.nan
    return tp / denom

### Popularity

In [5]:
from sklearn.linear_model import LogisticRegression

In [6]:
# Train / validation feature matrices for the popularity model.
# NOTE(review): the column contents are not visible here — presumably
# popularity-derived features; confirm against whatever wrote these files.
train_X_pop = readX('data/train_X_pop.txt')
val_X_pop = readX('data/val_X_pop.txt')

In [7]:
# Fit a logistic regression on the popularity features, then keep its class
# predictions for the validation set; pop_pred is combined with the other
# models' predictions further down.
lr_pop = LogisticRegression(C=1.0, fit_intercept=True)
lr_pop.fit(train_X_pop, train_y)
pop_pred = lr_pop.predict(val_X_pop)

In [8]:
# Mean accuracy of the popularity model on the validation set.
lr_pop.score(val_X_pop, val_y)

0.818608513181117

### Bayesian Personalized Ranking 

In [9]:
# Train / validation feature matrices for the BPR model.
# NOTE(review): the column contents are not visible here — presumably scores
# from a Bayesian Personalized Ranking model; confirm.
train_X_bpr = readX('data/train_X_bpr.txt')
val_X_bpr = readX('data/val_X_bpr.txt')

In [10]:
# Fit a logistic regression on the BPR features, then keep its class
# predictions for the validation set; bpr_pred is combined with the other
# models' predictions further down.
lr_bpr = LogisticRegression(C=1.0, fit_intercept=True)
lr_bpr.fit(train_X_bpr, train_y)
bpr_pred = lr_bpr.predict(val_X_bpr)

In [11]:
# Mean accuracy of the BPR model on the validation set.
lr_bpr.score(val_X_bpr, val_y)

0.8356666336958218

### Timestamp

In [13]:
# Train / validation feature matrices for the timestamp model.
# NOTE(review): the column contents are not visible here — presumably
# date/time-derived features; confirm.
train_X_dt = readX('data/train_X_dt.txt')
val_X_dt = readX('data/val_X_dt.txt')

In [14]:
# Fit a logistic regression on the timestamp features, then keep its class
# predictions for the validation set; dt_pred enters the combination step
# further down under the name 'timestamp'.
lr_dt = LogisticRegression(C=1.0, fit_intercept=True)
lr_dt.fit(train_X_dt, train_y)
dt_pred = lr_dt.predict(val_X_dt)

### FPMC

In [15]:
# Precomputed FPMC predictions read from disk and flattened to 1-D.
# NOTE(review): they are scored directly against val_y below, so they are
# presumably aligned row-for-row with the validation set — confirm.
fpmc_pred = readX('data/FPMCPredictions.txt').flatten()

In [16]:
# Fraction of FPMC predictions that match the validation labels.
accuracy(fpmc_pred, val_y)

0.8368903697612121

### Latent Factor

In [17]:
# Precomputed latent-factor-model predictions read from disk and flattened to 1-D.
# NOTE(review): they are scored directly against val_y below, so they are
# presumably aligned row-for-row with the validation set — confirm.
lfm_pred = readX('data/LFMPredictions.txt').flatten()

In [18]:
# Fraction of latent-factor-model predictions that match the validation labels.
accuracy(lfm_pred, val_y)

0.6664575334156787

### Model Testing

In [19]:
from itertools import combinations

In [20]:
class Composite:
    """Combine base predictions by logical OR over every subset of them.

    ``fit`` takes named prediction arrays and stores each one plus the OR of
    every combination of two or more of them; ``evaluate`` scores every
    stored prediction with ``accuracy``, ``precision`` and ``recall``.
    """

    def __init__(self):
        # name -> base prediction array; None until fit() has been called.
        self.bases = None
        # name -> prediction array for every base and every combination.
        self.predictions = {}
        # name -> {'accuracy': ..., 'precision': ..., 'recall': ...}
        self.metrics = defaultdict(dict)

    def __repr__(self):
        if self.bases is None:
            return 'Composite()'
        return 'Composite({})'.format(', '.join(self.bases.keys()))

    def fit(self, pipe):
        """Store the base predictions and build every OR-combination of them.

        Parameters
        ----------
        pipe : iterable of (name, prediction_array) pairs
            Materialized once up front, so a one-shot iterable (generator,
            ``zip``) works as well as a list.

        Returns
        -------
        Composite
            ``self``, to allow chaining.
        """
        pipe = list(pipe)
        self.bases = {name: pred for name, pred in pipe}
        # The base predictions are themselves candidates.
        self.predictions = dict(self.bases)

        # Every combination of 2..len(bases) models; a combined prediction is
        # nonzero wherever any member's prediction is nonzero.
        for size in range(2, len(self.bases) + 1):
            for combo in combinations(self.bases, size):
                merged = np.logical_or.reduce(
                    tuple(self.bases[name] for name in combo)
                ).astype(int)
                self.predictions[' + '.join(combo)] = merged

        return self

    def evaluate(self, y=None):
        """Compute accuracy, precision and recall for every stored prediction.

        Parameters
        ----------
        y : array-like, optional
            True labels to score against. Defaults to the notebook-level
            ``val_y``, which is what this method has always used.

        Raises
        ------
        RuntimeError
            If called before ``fit``.

        Results are stored in ``self.metrics``; nothing is returned.
        """
        if self.bases is None:
            raise RuntimeError('Composite.evaluate() called before fit()')
        if y is None:
            y = val_y

        self.metrics = defaultdict(dict)
        for name, p in tqdm(self.predictions.items()):
            d = self.metrics[name]
            d['accuracy'] = accuracy(p, y)
            d['precision'] = precision(p, y)
            d['recall'] = recall(p, y)

In [21]:
# Named base predictions to combine: each entry is a (label, validation-set
# prediction array) pair.
pipe = [
    ('pop', pop_pred),
    ('bpr', bpr_pred),
    ('fpmc', fpmc_pred),
    ('timestamp', dt_pred),
    ('lfm', lfm_pred)
]

# Build the OR of every combination of the base predictions, then score each
# base and each combination (evaluate() scores against the global val_y).
comp = Composite()
comp.fit(pipe)
comp.evaluate()

  0%|          | 0/31 [00:00<?, ?it/s]

In [23]:
# Accuracy / precision / recall for each base model and each combination,
# keyed by the combination's name (e.g. 'pop + bpr').
comp.metrics

defaultdict(dict,
            {'pop': {'accuracy': 0.818608513181117,
              'precision': 0.8816670347011649,
              'recall': 0.7359991760013668},
             'bpr': {'accuracy': 0.8356666336958218,
              'precision': 0.8807392956422244,
              'recall': 0.7764756564808555},
             'fpmc': {'accuracy': 0.8368903697612121,
              'precision': 0.8368903697612121,
              'recall': 0.8368903697612121},
             'timestamp': {'accuracy': 0.7635972036363844,
              'precision': 0.7989837902491151,
              'recall': 0.7044191014000619},
             'lfm': {'accuracy': 0.6664575334156787,
              'precision': 0.6664575334156787,
              'recall': 0.6664575334156787},
             'pop + bpr': {'accuracy': 0.8432017660383404,
              'precision': 0.8272161475778127,
              'recall': 0.8676284616321549},
             'pop + fpmc': {'accuracy': 0.8280840887335353,
              'precision': 0.81631768412

In [38]:
# Majority vote over the popularity, BPR and FPMC predictions: 1 where the
# three arrays sum to at least 2.
# Assumes each array holds 0/1 labels, so the sum counts positive votes — TODO confirm.
at_least_2 = ((pop_pred + bpr_pred + fpmc_pred) >= 2).astype(int)

In [43]:
# Accuracy, precision and recall (in that order) of the majority-vote predictions.
[f(at_least_2, val_y) for f in [accuracy, precision, recall]]

[0.8570376676914877, 0.898710523445049, 0.80477812937596]