In [1]:
import gzip
import csv
import json
import math
import random
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict

%matplotlib inline
%config InlineBackend.figure_format = 'retina'  # Higher resolution figures

### Data, Evaluation

In [2]:
def readData(fp, split):
    """
    Train/val data reader for files whose positive and negative entries use
    different column layouts.

    The first `split` data rows are mapped onto the header columns and every
    column except 'stream' and 'streamer' is converted to int. The remaining
    rows do not follow the header: only their first field ('user'), second
    field ('stream') and last field ('watched') are kept.

    Blank lines (for example a trailing newline at the end of the file) are
    skipped and do not count towards `split`.

    Parameters:
        fp(str): Path of a comma-separated file whose first line is the header.
        split(int): number of positive/negative items (should be total // 2)

    Returns:
        List[dict]: One dictionary per non-blank data row, in file order.
    """
    non_ints = {'stream', 'streamer'}
    result = []
    with open(fp, 'r') as f:
        header = next(f).strip().split(',')
        i = 0
        for line in f:
            line = line.strip()
            if not line:
                # an empty row would otherwise fail in int('')
                continue

            # parse data
            content = line.split(',')
            if i < split:
                d = dict(zip(header, content))
                for k in d:
                    if k not in non_ints:
                        d[k] = int(d[k])
            else:
                d = {
                    'user': int(content[0]),
                    'stream': content[1],
                    'watched': int(content[-1])
                }
            i += 1
            result.append(d)

    return result

def readCSV(fp):
    """
    Load a plain comma-separated file into a list of row dictionaries.

    The first line supplies the keys; every later line becomes one dictionary
    whose values are the raw (string) fields of that line.
    """
    with open(fp, 'r') as handle:
        columns = next(handle).strip().split(',')
        return [dict(zip(columns, row.strip().split(','))) for row in handle]

def readX(fp):
    """
    Load a processed feature file as a list of rows.

    The header line is printed rather than returned. Each remaining line is
    split on commas, so every row is a list of string fields.
    """
    with open(fp, 'r') as handle:
        first_line = next(handle)
        print("Header: " + first_line.strip())
        rows = [row.strip().split(',') for row in handle]
    return rows

def readJSON(fp):
    """
    Parse the JSON document stored at `fp` and return the resulting object.
    """
    with open(fp, 'r') as handle:
        parsed = json.load(handle)
    return parsed

In [3]:
# read from file
# `split` is the number of leading rows in each file that follow the header
# layout (see readData); the remaining rows are parsed as user/stream/watched.
train = readData('data/train.txt', split=12350800)
val = readData('data/val.txt', split=2446606)
# stream lookup tables; indexed later as <table>[stream]['streamer']
train_streams = readJSON('data/train_streams.json')
val_streams = readJSON('data/val_streams.json')

# get labels
# the 'watched' field of every row, in the same order as train / val
train_y = [d['watched'] for d in train]
val_y = [d['watched'] for d in val]

In [4]:
def accuracy(pred, y):
    """
    Fraction of positions at which the prediction equals the label.
    """
    predicted = np.array(pred)
    actual = np.array(y)
    matches = predicted == actual
    return np.mean(matches)

def precision(pred, y):
    """
    Precision of binary predictions: tp / (tp + fp).

    Parameters:
        pred: Sequence of predicted labels (1 = positive, 0 = negative).
        y: Sequence of true labels, aligned with `pred`.

    Returns:
        The share of predicted positives that are true positives, or 0.0 when
        nothing was predicted positive (instead of a NaN from 0 / 0).
    """
    pred = np.array(pred)
    y = np.array(y)

    # the masks are already boolean, so they can be summed directly
    tp = np.sum((pred == 1) & (y == 1))
    fp = np.sum((pred == 1) & (y == 0))

    predicted_pos = tp + fp
    if predicted_pos == 0:
        # no positive predictions: the ratio is undefined, report 0.0
        return 0.0
    return tp / predicted_pos

def recall(pred, y):
    """
    Recall of binary predictions: tp / (tp + fn).

    Parameters:
        pred: Sequence of predicted labels (1 = positive, 0 = negative).
        y: Sequence of true labels, aligned with `pred`.

    Returns:
        The share of actual positives that were predicted positive, or 0.0
        when there are no actual positives (instead of a NaN from 0 / 0).
    """
    pred = np.array(pred)
    y = np.array(y)

    # the masks are already boolean, so they can be summed directly
    tp = np.sum((pred == 1) & (y == 1))
    fn = np.sum((pred == 0) & (y == 1))

    actual_pos = tp + fn
    if actual_pos == 0:
        # no positive labels: the ratio is undefined, report 0.0
        return 0.0
    return tp / actual_pos

### Popularity Baseline

In [53]:
class PopBase:
    """
    Popularity baseline: an interaction is predicted as 1 when its streamer
    belongs to the set of most frequent streamers found during fitting, and
    as 0 otherwise.
    """

    def __init__(self, frac=.5):
        # share of training interactions the popular streamers must exceed
        self.frac = frac

    def __str__(self):
        return f'PopBase(frac={self.frac})'

    def __repr__(self):
        return self.__str__()

    def fit(self, X, streams=None):
        """
        Find the most frequent streamers that together account for more than
        `frac` of the interactions in X.

        Parameters:
            X(List[dict]): Training interactions; each needs a 'stream' key.
            streams(dict): Lookup such that streams[stream]['streamer'] is the
                streamer of a stream. Defaults to the module-level
                train_streams table.

        Returns:
            PopBase: self, with the selected streamers stored in `popular_`.
        """
        if streams is None:
            streams = train_streams

        # count interactions per streamer
        streamer_count = defaultdict(int)
        total = 0

        for d in tqdm(X, 'collecting training data'):
            streamer_count[streams[d['stream']]['streamer']] += 1
            total += 1

        # rank by most popular (ties are ordered by streamer, descending)
        by_popular = sorted([(streamer_count[s], s) for s in streamer_count], reverse=True)

        # take streamers from the top until they cover more than `frac`
        most_popular = set()
        accounted = 0
        for count, streamer in by_popular:
            accounted += count
            most_popular.add(streamer)

            # check if accounted
            if accounted > total * self.frac:
                break

        self.popular_ = most_popular
        return self

    def predict(self, X, streams=None):
        """
        Label each interaction in X with 1 if its streamer is in `popular_`,
        else 0.

        Parameters:
            X(List[dict]): Interactions to label; each needs a 'stream' key.
            streams(dict): Lookup such that streams[stream]['streamer'] is the
                streamer of a stream. Defaults to the module-level
                val_streams table.

        Returns:
            List[int]: One 0/1 prediction per element of X, in order.
        """
        if streams is None:
            streams = val_streams

        return [1 if streams[d['stream']]['streamer'] in self.popular_ else 0
                for d in tqdm(X, 'predicting...')]

In [54]:
# fit model
# frac=0.55 overrides the class default of 0.5: streamers are taken from the
# top of the ranking until they cover more than 55% of training interactions
pop = PopBase(frac=0.55)
pop.fit(train)

# evaluate
# one 0/1 prediction per validation row
pred_pop = pop.predict(val)

collecting training data:   0%|          | 0/24701600 [00:00<?, ?it/s]

predicting...:   0%|          | 0/4893212 [00:00<?, ?it/s]

In [52]:
# share of validation rows whose popularity-baseline prediction matches the label
accuracy(pred_pop, val_y)

0.770222912884216

### Latent Factor Model

In [14]:
import tensorflow as tf

In [22]:
# raw user / streamer identifier -> consecutive integer index
userIDs = {}
itemIDs = {}
# user -> {streamer: number of rows seen for that pair}
interactionsPerUser = defaultdict(dict)

for d in tqdm(train[:12350800]):
    u = d['user']
    i = train_streams[d['stream']]['streamer']
    # a new identifier receives the next free index (the current table size)
    userIDs.setdefault(u, len(userIDs))
    itemIDs.setdefault(i, len(itemIDs))
    user_d = interactionsPerUser[u]
    user_d[i] = user_d.get(i, 0) + 1

  0%|          | 0/12350800 [00:00<?, ?it/s]

In [34]:
# flatten the per-user counts into (user, streamer, count) triples
interactions = []
total_num = 0
for u in tqdm(interactionsPerUser):
    for streamer, num in interactionsPerUser[u].items():
        interactions.append((u, streamer, num))
        total_num += num
# mean count per (user, streamer) pair; passed to LatentFactorModel as mu
mu = total_num / len(interactions)

  0%|          | 0/491582 [00:00<?, ?it/s]

In [29]:
# Shuffling is left disabled; trainingStep draws its samples from this list
# with random.choice.
# random.shuffle(interactions)
len(interactions)

6435872

In [36]:
# Adam with learning rate 0.1; trainingStep reads this module-level name
optimizer = tf.keras.optimizers.Adam(0.1)

In [37]:
class LatentFactorModel(tf.keras.Model):
    """
    Latent factor model over (user, item) index pairs.

    A prediction is alpha + betaU[u] + betaI[i] + dot(gammaU[u], gammaI[i]).
    The user and item tables are sized from the module-level userIDs and
    itemIDs dictionaries, so those must be populated before construction.
    """
    def __init__(self, mu, K, lamb):
        """
        Parameters:
            mu: Initial value of the global offset alpha.
            K: Number of columns in each gamma table (latent dimensions).
            lamb: Multiplier applied to the squared-parameter penalty in reg().
        """
        super(LatentFactorModel, self).__init__()
        # Global offset, initialised to the supplied mu
        self.alpha = tf.Variable(mu)
        # Biases (one per user / item) and latent factors (K per user / item),
        # initialised to small random values (normal, stddev 0.001)
        self.betaU = tf.Variable(tf.random.normal([len(userIDs)],stddev=0.001))
        self.betaI = tf.Variable(tf.random.normal([len(itemIDs)],stddev=0.001))
        self.gammaU = tf.Variable(tf.random.normal([len(userIDs),K],stddev=0.001))
        self.gammaI = tf.Variable(tf.random.normal([len(itemIDs),K],stddev=0.001))
        self.lamb = lamb
    
    def find_param_val(self, *args):
        """
        Predict for raw identifiers instead of table indices.

        Expects exactly two positional arguments, (user, item). Each one is
        looked up in the module-level userIDs / itemIDs dictionaries; an
        identifier that is missing is replaced by -1, which predict() treats
        as unknown.
        """
        user, item = args
        if user in userIDs:
            user_id = userIDs[user]
        else:
            user_id = -1

        if item in itemIDs:
            item_id = itemIDs[item]
        else:
            item_id = -1
    
        return self.predict(user_id, item_id)

    # Prediction for a single instance (useful for evaluation)
    # NOTE(review): this definition replaces the predict method inherited from
    # tf.keras.Model and takes different arguments.
    def predict(self, u, i):
        """
        Predict for one (user index, item index) pair.

        A negative index marks an unknown user or item; the terms that depend
        on that index are left out of the sum.
        """
        # neither side known: only the global offset remains
        if u < 0 and i < 0:
            return self.alpha
        
        # unknown user: offset plus item bias
        if u < 0:
            return self.alpha + self.betaI[i]
        
        # unknown item: offset plus user bias
        if i < 0:
            return self.alpha + self.betaU[u]
        
        return self.alpha + self.betaU[u] + self.betaI[i] +\
            tf.tensordot(self.gammaU[u], self.gammaI[i], 1)

    # Regularizer
    def reg(self):
        """
        lamb times the sum of squared entries of betaU, betaI, gammaU and
        gammaI. alpha is not included.
        """
        return self.lamb * (tf.reduce_sum(self.betaU**2) +\
                            tf.reduce_sum(self.betaI**2) +\
                            tf.reduce_sum(self.gammaU**2) +\
                            tf.reduce_sum(self.gammaI**2))
    
    # Prediction for a sample of instances
    def predictSample(self, sampleU, sampleI):
        """
        Predict for aligned sequences of user indices and item indices.

        The indices are converted to int32 tensors and used to look up rows of
        the parameter tables; the result holds one prediction per pair.
        """
        u = tf.convert_to_tensor(sampleU, dtype=tf.int32)
        i = tf.convert_to_tensor(sampleI, dtype=tf.int32)
        beta_u = tf.nn.embedding_lookup(self.betaU, u)
        beta_i = tf.nn.embedding_lookup(self.betaI, i)
        gamma_u = tf.nn.embedding_lookup(self.gammaU, u)
        gamma_i = tf.nn.embedding_lookup(self.gammaI, i)
        # row-wise dot product of the two latent factor matrices
        pred = self.alpha + beta_u + beta_i +\
               tf.reduce_sum(tf.multiply(gamma_u, gamma_i), 1)
        return pred
    
    # Loss
    def call(self, sampleU, sampleI, sampleR):
        """
        Loss of the sample: tf.nn.l2_loss of (prediction - target) divided by
        the number of targets. sampleR must support len().
        """
        pred = self.predictSample(sampleU, sampleI)
        r = tf.convert_to_tensor(sampleR, dtype=tf.float32)
        # tf.nn.l2_loss is sum(x**2) / 2, so this is half the mean squared error
        return tf.nn.l2_loss(pred - r) / len(sampleR)

In [38]:
# alpha starts at mu; K=5 latent dimensions; regularization multiplier lamb=1e-5
mdl = LatentFactorModel(mu, 5, 0.00001)

In [39]:
def trainingStep(model, interactions, Nsamples=200000):
    """
    Run one optimization step on a random sample of interactions.

    Parameters:
        model: Model that returns the sample loss when called with
            (sampleU, sampleI, sampleR) and exposes reg() and
            trainable_variables.
        interactions: Non-empty sequence of (user, item, value) triples whose
            user / item entries are keys of the module-level userIDs / itemIDs.
        Nsamples(int): Number of triples drawn (with replacement) for the step.

    Returns:
        The objective of this step (sample loss plus regularizer) as returned
        by `.numpy()`.

    Uses the module-level `optimizer` to apply the gradients.
    """
    # Sampling is plain Python with nothing to differentiate, so it is done
    # before the tape is opened.
    sampleU, sampleI, sampleR = [], [], []
    for _ in range(Nsamples):
        u, i, r = random.choice(interactions)
        sampleU.append(userIDs[u])
        sampleI.append(itemIDs[i])
        sampleR.append(r)

    with tf.GradientTape() as tape:
        loss = model(sampleU, sampleI, sampleR)
        loss += model.reg()

    gradients = tape.gradient(loss, model.trainable_variables)
    # variables that received no gradient are skipped
    grads_and_vars = [(grad, var)
                      for grad, var in zip(gradients, model.trainable_variables)
                      if grad is not None]
    optimizer.apply_gradients(grads_and_vars)
    return loss.numpy()

In [40]:
# Run training steps until the step budget is used up or the objective has
# failed to decrease for 5 steps in a row. Each objective is computed on a
# fresh random sample (see trainingStep).
# NOTE(review): `i <= 100` with i starting at 0 permits up to 101 steps --
# confirm whether 100 was intended.
i = 0
prev_obj = float('inf')  # the first step therefore always counts as a decrease
inc = 0  # consecutive steps whose objective did not decrease
while i <= 100:
    obj = trainingStep(mdl, interactions)
    print("iteration " + str(i+1) + ", objective = " + str(obj))
    if obj >= prev_obj: 
        inc += 1
    else:
        inc = 0
    if inc >= 5: break
    prev_obj = obj
    i += 1

iteration 1, objective = 1.7868031
iteration 2, objective = 1.786833
iteration 3, objective = 1.7647964
iteration 4, objective = 1.7992533
iteration 5, objective = 1.7218503
iteration 6, objective = 1.7184359
iteration 7, objective = 1.7160406
iteration 8, objective = 1.7567174
iteration 9, objective = 1.7451345
iteration 10, objective = 1.7645899
iteration 11, objective = 1.750503
iteration 12, objective = 1.7365141
iteration 13, objective = 1.6913228
iteration 14, objective = 1.694685
iteration 15, objective = 1.6790042
iteration 16, objective = 1.7072066
iteration 17, objective = 1.6943059
iteration 18, objective = 1.7444688
iteration 19, objective = 1.7171757
iteration 20, objective = 1.7237492
iteration 21, objective = 1.7013156
iteration 22, objective = 1.6901853
iteration 23, objective = 1.7480377
iteration 24, objective = 1.7540728
iteration 25, objective = 1.7145406
iteration 26, objective = 1.7233647
iteration 27, objective = 1.7233428
iteration 28, objective = 1.731783
itera

In [44]:
# score every validation row with the latent factor model; users or streamers
# that are not in userIDs / itemIDs get the reduced predictions of
# LatentFactorModel.predict
val_X_ltf = [mdl.find_param_val(d['user'], val_streams[d['stream']]['streamer']) for d in tqdm(val)]

  0%|          | 0/4893212 [00:00<?, ?it/s]

In [45]:
# user -> list of (score, row index into val)
ltf_score_per_user = defaultdict(list)

for i in tqdm(range(len(val_X_ltf))):
    score = val_X_ltf[i]
    # anything that is not a plain int is unwrapped with .numpy()
    if not isinstance(score, int):
        score = score.numpy()
    user = val[i]['user']
    ltf_score_per_user[user].append((score, i))

  0%|          | 0/4893212 [00:00<?, ?it/s]

In [47]:
# row index into val -> 0/1 prediction
ltf_pred_dict = dict()

for u in ltf_score_per_user:
    lst = ltf_score_per_user[u]
    # highest score first (sorted in place)
    lst.sort(reverse=True)
    full_size = len(lst)
    # the top half (rounded down) of each user's rows is labelled 1
    pos_size = full_size // 2
    for i in range(full_size):
        ltf_pred_dict[lst[i][-1]] = 1 if i < pos_size else 0

In [48]:
# one prediction per validation row, in row order; sized from val instead of a
# hard-coded row count, so a row index missing from ltf_pred_dict raises KeyError
ltf_pred = [ltf_pred_dict[i] for i in range(len(val))]

In [49]:
# share of validation rows whose latent-factor prediction matches the label
accuracy(ltf_pred, val_y)

0.6664575334156787

In [50]:
# write a 'prediction' header followed by one prediction per line, in the
# order of ltf_pred
with open('data/LFMPredictions.txt', 'w') as f:
    f.write('prediction\n')
    for pred in ltf_pred:
        f.write(str(pred) + '\n')