In [1]:
import gzip
import csv
import json
import math
import random
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict

%matplotlib inline
%config InlineBackend.figure_format = 'retina'  # Higher resolution figures

### Read Data

In [2]:
def add_watch_time(d, col):
    """Split the timestamp at ``d[col]`` into a day index and an offset within that day.

    Two keys are written into ``d`` in place:
    ``col + '_in_day'`` (remainder modulo 144) and ``col + '_at_day'`` (quotient by 144).
    NOTE(review): 144 presumably means 10-minute slots per day -- confirm against the dataset docs.
    """
    day_index, slot_in_day = divmod(d[col], 144)
    d[col + '_in_day'] = slot_in_day
    d[col + '_at_day'] = day_index

In [3]:
def _widen_stream_span(streams, sid, d):
    """Register stream ``sid`` in ``streams`` or widen its recorded start/stop span with row ``d``."""
    span = streams.get(sid)
    if span is None:
        streams[sid] = {
            'streamer': d['streamer'],
            'time_start': d['time_start'],
            'time_stop': d['time_stop']
        }
    else:
        span['time_start'] = min(span['time_start'], d['time_start'])
        span['time_stop'] = max(span['time_stop'], d['time_stop'])


def readCSV(path):
    """Parse the gzipped interaction CSV at ``path`` into train/validation structures.

    Each row is ``user,stream,streamer,time_start,time_stop``. Rows whose
    ``time_stop_at_day`` is below 36 go to the training split; all others go to validation.
    Reading stops at the first row whose user id exceeds 500_000.

    Returns:
        (train, val, train_streams, val_streams, streams_per_user, users_per_stream)
        - train / val: lists of row dicts, each with ``watched`` set to 1
        - train_streams / val_streams: stream id -> {'streamer', 'time_start', 'time_stop'},
          where the times are the min start / max stop seen for that stream in that split
        - streams_per_user / users_per_stream: training-split interaction sets only
    """
    header = ['user', 'stream', 'streamer', 'time_start', 'time_stop']

    train, val = [], []
    train_streams, val_streams = dict(), dict()
    streams_per_user = defaultdict(set)
    users_per_stream = defaultdict(set)

    print('Parsing Data...')
    # The context manager guarantees the gzip handle is closed, even if parsing raises.
    with gzip.open(path, 'rt') as f:
        for line in f:
            line = line.strip()
            if not line:
                # tolerate blank lines (e.g. a trailing newline) instead of crashing on int('')
                continue

            d = dict(zip(header, line.split(',')))
            d['user'], d['time_start'], d['time_stop'] = int(d['user']), int(d['time_start']), int(d['time_stop'])
            uid, sid = d['user'], d['stream']

            # NOTE(review): stopping here assumes rows are sorted by user id -- confirm.
            if uid > 500_000:
                break

            add_watch_time(d, 'time_start')
            add_watch_time(d, 'time_stop')
            d['watched'] = 1

            if d['time_stop_at_day'] < 36:
                train.append(d)
                _widen_stream_span(train_streams, sid, d)
                streams_per_user[uid].add(sid)
                users_per_stream[sid].add(uid)
            else:
                val.append(d)
                _widen_stream_span(val_streams, sid, d)

    return train, val, train_streams, val_streams, streams_per_user, users_per_stream

In [4]:
def sample_negatives(lst, streams):
    """Append one unwatched (user, stream) pair to ``lst`` for every interaction in it.

    Args:
        lst: list of interaction dicts with at least ``user`` and ``stream``; extended in place.
        streams: dict keyed by stream id; its keys are the candidate pool for negatives.

    The set of streams each user watched is derived from ``lst`` itself, not from a
    module-level lookup. A negative is therefore checked against the positives of the split
    being augmented; a train-only lookup would let validation "negatives" be real
    validation positives.
    """
    ids = list(streams.keys())

    # positives of this split, per user (rows already labelled watched == 0 are ignored)
    watched_by_user = defaultdict(set)
    for d in lst:
        if d.get('watched', 1):
            watched_by_user[d['user']].add(d['stream'])

    negatives = []

    # go through each interaction
    for d in tqdm(lst, 'Getting Negative Samples'):
        seen = watched_by_user[d['user']]

        # A user who watched every candidate has no valid negative; skip rather than spin forever.
        if len(seen) >= len(ids) and seen.issuperset(ids):
            continue

        # rejection-sample a stream this user did not watch
        while True:
            neg = random.choice(ids)
            if neg not in seen:
                negatives.append({
                    'user': d['user'],
                    'stream': neg,
                    'watched': 0
                })
                break

    # augment train/val set
    lst += negatives

In [5]:
# build training and validation sets
train, val, train_streams, val_streams, streams_per_user, users_per_stream = readCSV('data/full_a.csv.gz')

# Negative sampling uses the `random` module; seed it so a fresh run reproduces the same negatives.
random.seed(42)
sample_negatives(train, train_streams)
sample_negatives(val, val_streams)
len(train)

Parsing Data...


Getting Negative Samples:   0%|          | 0/12350800 [00:00<?, ?it/s]

Getting Negative Samples:   0%|          | 0/2446606 [00:00<?, ?it/s]

24701600

In [6]:
# Binary targets (1 = observed interaction, 0 = sampled negative), in the same row order as train / val.
train_y, val_y = ([row['watched'] for row in rows] for rows in (train, val))

### Evaluation Metric

In [7]:
def accuracy(pred, y):
    """Return the fraction of positions at which ``pred`` equals ``y``."""
    matches = np.array(pred) == np.array(y)
    return np.mean(matches)

In [8]:
def precision(pred, y):
    """Return TP / (TP + FP) for binary 0/1 predictions ``pred`` against labels ``y``.

    Returns 0.0 when nothing was predicted positive, instead of dividing zero by zero
    (which yields NaN plus a RuntimeWarning).
    """
    pred = np.asarray(pred)
    y = np.asarray(y)

    tp = np.count_nonzero((pred == 1) & (y == 1))
    fp = np.count_nonzero((pred == 1) & (y == 0))

    predicted_positive = tp + fp
    if predicted_positive == 0:
        return 0.0
    return tp / predicted_positive

In [9]:
def recall(pred, y):
    """Return TP / (TP + FN) for binary 0/1 predictions ``pred`` against labels ``y``.

    Returns 0.0 when ``y`` contains no positives, instead of dividing zero by zero
    (which yields NaN plus a RuntimeWarning).
    """
    pred = np.asarray(pred)
    y = np.asarray(y)

    tp = np.count_nonzero((pred == 1) & (y == 1))
    fn = np.count_nonzero((pred == 0) & (y == 1))

    actual_positive = tp + fn
    if actual_positive == 0:
        return 0.0
    return tp / actual_positive

### Feature 1: Streamer Popularity Score ($\rightarrow$ Popularity Feature)

In [10]:
from sklearn.linear_model import LogisticRegression

#### Popular Streamer Accounting for Half of History (Source: Paper)

#### Popular Stream

In [11]:
def find_popularity(X, on='streamer', streams=None):
    """Map each item to a cumulative popularity percentile computed from watched rows.

    Args:
        X: list of interaction dicts. Only rows with ``watched == 1`` are counted
           (rows without a ``watched`` key count as watched).
        on: ``'stream'`` to count by ``d['stream']``; any other value is read from the
            stream's metadata, i.e. ``streams[d['stream']][on]``.
        streams: stream id -> metadata dict. Defaults to the module-level ``train_streams``.

    Returns:
        dict item -> percentile in (0, 1]. An item's percentile is the share of all counted
        interactions that belong to items with an interaction count no larger than its own.
        Returns an empty dict when there are no watched rows.
    """
    if streams is None and on != 'stream':
        streams = train_streams

    # Count interactions per item. Positives are selected by label, not by a fixed row
    # count, so this stays correct when the dataset size changes.
    item_count = defaultdict(int)
    total_watched = 0
    for d in X:
        if d.get('watched', 1) != 1:
            continue
        key = d[on] if on == 'stream' else streams[d['stream']][on]
        item_count[key] += 1
        total_watched += 1

    if not item_count:
        return {}

    popularity = pd.DataFrame(list(item_count.items()), columns=[on, 'count'])

    # For each distinct count: number of items with that count ...
    percentile = (
        popularity
        .groupby('count').count()
        .reset_index()
    )

    # ... times the count gives interactions at that level; cumulate and normalise.
    percentile = (
        percentile
        .assign(percentile=(percentile['count'] * percentile[on]).cumsum() / total_watched)
        .drop(columns=[on])
    )

    # join with original table, convert to dictionary
    return (
        popularity
        .merge(percentile, how='left', left_on='count', right_on='count')
        .drop(columns=['count'])
        .set_index(on)['percentile']
        .to_dict()
    )

In [12]:
# Streamer -> cumulative popularity percentile, computed from the training interactions.
popularity_streamer = find_popularity(train, 'streamer')

In [13]:
def dict_lookup(val, d):
    """Return ``d[val]``, or 0 when ``val`` is not a key of ``d``."""
    return d.get(val, 0)

In [14]:
# One-column feature matrices: popularity percentile of each row's streamer (0 if unseen in training).
train_X_pop = [
    [popularity_streamer.get(train_streams[row['stream']]['streamer'], 0)]
    for row in tqdm(train)
]
val_X_pop = [
    [popularity_streamer.get(val_streams[row['stream']]['streamer'], 0)]
    for row in tqdm(val)
]

  0%|          | 0/24701600 [00:00<?, ?it/s]

  0%|          | 0/4893212 [00:00<?, ?it/s]

In [15]:
# Logistic regression on the popularity feature alone; report accuracy on both splits.
lr_pop = LogisticRegression(C=1.0, fit_intercept=True).fit(train_X_pop, train_y)
print(
    'Training Accuracy: ' + str(lr_pop.score(train_X_pop, train_y)),
    'Validation Accuracy: ' + str(lr_pop.score(val_X_pop, val_y)),
    sep='\n',
)

Training Accuracy: 0.8226428652394987
Validation Accuracy: 0.818608513181117


### Feature 2: Jaccard
Per Huy

### Feature 3: Direct Times

In [47]:
# Add the day-index / in-day-offset keys to every stream's aggregated start and stop time.
for stream_table in (train_streams, val_streams):
    for d in stream_table.values():
        add_watch_time(d, 'time_start')
        add_watch_time(d, 'time_stop')

In [49]:
# Four features per row, taken from the row's stream: start day, start offset, stop day, stop offset.
train_X_dt = [
    [train_streams[row['stream']][key]
     for key in ('time_start_at_day', 'time_start_in_day', 'time_stop_at_day', 'time_stop_in_day')]
    for row in train
]
val_X_dt = [
    [val_streams[row['stream']][key]
     for key in ('time_start_at_day', 'time_start_in_day', 'time_stop_at_day', 'time_stop_in_day')]
    for row in val
]

In [50]:
# Logistic regression on the day/offset time features; report accuracy on both splits.
lr_dt = LogisticRegression(C=1.0, fit_intercept=True).fit(train_X_dt, train_y)
print(
    'Training Accuracy: ' + str(lr_dt.score(train_X_dt, train_y)),
    'Validation Accuracy: ' + str(lr_dt.score(val_X_dt, val_y)),
    sep='\n',
)

Training Accuracy: 0.763157244874826
Validation Accuracy: 0.7635972036363844


In [51]:
# Peek at the first feature row: [start day, start offset in day, stop day, stop offset in day].
train_X_dt[0]

[0, 0, 0, 31]

In [52]:
# Two features per row: the raw start and stop time of the row's stream (no day/offset split).
train_X_t = [
    [train_streams[row['stream']][key] for key in ('time_start', 'time_stop')]
    for row in tqdm(train)
]
val_X_t = [
    [val_streams[row['stream']][key] for key in ('time_start', 'time_stop')]
    for row in tqdm(val)
]

  0%|          | 0/24701600 [00:00<?, ?it/s]

  0%|          | 0/4893212 [00:00<?, ?it/s]

In [57]:
# Logistic regression on the raw start/stop times; report accuracy on both splits.
lr_t = LogisticRegression(C=1.0, fit_intercept=True).fit(train_X_t, train_y)
print(
    'Training Accuracy: ' + str(lr_t.score(train_X_t, train_y)),
    'Validation Accuracy: ' + str(lr_t.score(val_X_t, val_y)),
    sep='\n',
)

Training Accuracy: 0.7627117676587751
Validation Accuracy: 0.7646607177453174


### Interaction 1: Bayesian Personalized Ranking

In [61]:
import scipy
from implicit import bpr
import tensorflow as tf

In [62]:
# Contiguous integer ids for users / streamers plus the (user, streamer, watched) triples
# that feed the BPR interaction matrix.
user_ids, streamer_ids = {}, {}
interactions = []
users_per_streamer = defaultdict(set)
streamers_per_user = defaultdict(set)

# Only observed (positive) interactions are used. They are selected by label rather than by
# a hard-coded row count, so this cell stays correct if the dataset size changes.
for d in train:
    if d['watched'] != 1:
        continue
    u, i = d['user'], train_streams[d['stream']]['streamer']
    r = d['watched']
    users_per_streamer[i].add(u)
    streamers_per_user[u].add(i)
    # assign the next free index on first sight
    user_ids.setdefault(u, len(user_ids))
    streamer_ids.setdefault(i, len(streamer_ids))
    interactions.append((u, i, r))

n_users, n_streamers = len(user_ids), len(streamer_ids)

In [63]:
# Streamer-by-user matrix; each cell accumulates the `watched` value of every interaction for that pair.
Xiu = scipy.sparse.lil_matrix((n_streamers, n_users))
for user, streamer, weight in interactions:
    Xiu[streamer_ids[streamer], user_ids[user]] += weight

# User-by-streamer view of the same data, in CSR format.
Xui = scipy.sparse.csr_matrix(Xiu.T)

In [64]:
# BPR model with 5 latent factors; fixed random_state for repeatable training.
bpr_mdl = bpr.BayesianPersonalizedRanking(
    factors=5,
    verify_negative_samples=True,
    random_state=42,
)

In [65]:
# Fit BPR on the streamer-by-user matrix built above.
# NOTE(review): the matrix orientation expected by `fit` (item-user vs user-item) is believed
# to differ between versions of `implicit` -- confirm Xiu matches the installed version.
bpr_mdl.fit(Xiu)

  0%|          | 0/100 [00:00<?, ?it/s]

In [66]:
def bpr_dict_lookup(val, d):
    """Return ``d[val]``, or the sentinel -1 when ``val`` is not a key of ``d``."""
    return d.get(val, -1)

In [67]:
# Per-item / per-user scalar terms that bpr_predict adds to the factor dot product.
# NOTE(review): in `implicit`, `item_norms` / `user_norms` appear to be L2 norms of the factor
# rows rather than learned bias terms -- confirm this is the intended "beta".
beta_i = bpr_mdl.item_norms
beta_u = bpr_mdl.user_norms

# Latent factor matrices, indexed with the integer ids from streamer_ids / user_ids.
gamma_i = bpr_mdl.item_factors
gamma_u = bpr_mdl.user_factors

def _bpr_score(user, streamer):
    """Score a (user, streamer) pair with the fitted BPR parameters.

    Fallbacks for ids unseen when the id maps were built:
      - both unknown  -> constant 100
      - user unknown  -> the streamer's scalar term only
      - item unknown  -> the user's scalar term only
    """
    u = bpr_dict_lookup(user, user_ids)
    i = bpr_dict_lookup(streamer, streamer_ids)

    if u == -1 and i == -1:
        return 100

    if u == -1:
        return beta_i[i]

    if i == -1:
        return beta_u[u]

    # A plain NumPy dot product avoids launching a TensorFlow op for every row.
    return beta_i[i] + beta_u[u] + np.dot(gamma_u[u], gamma_i[i])


def bpr_predict(user, stream):
    """BPR score for ``user`` and the streamer of training stream ``stream``."""
    return _bpr_score(user, train_streams[stream]['streamer'])


def bpr_predict_val(user, stream):
    """BPR score for ``user`` and the streamer of validation stream ``stream``."""
    return _bpr_score(user, val_streams[stream]['streamer'])

In [68]:
# BPR score for every training row (observed interactions and sampled negatives alike).
train_X_bpr = [
    bpr_predict(row['user'], row['stream'])
    for row in tqdm(train)
]

  0%|          | 0/24701600 [00:00<?, ?it/s]

In [69]:
# BPR score for every validation row, resolving streamers through val_streams.
val_X_bpr = [
    bpr_predict_val(row['user'], row['stream'])
    for row in tqdm(val)
]

  0%|          | 0/4893212 [00:00<?, ?it/s]

In [70]:
# Logistic regression on the single BPR score; each score is wrapped as a one-feature row.
lr_bpr = LogisticRegression(C=1, fit_intercept=True)
lr_bpr.fit([[score] for score in train_X_bpr], train_y)

# training accuracy, then validation accuracy
print(lr_bpr.score([[score] for score in train_X_bpr], train_y))
print(lr_bpr.score([[score] for score in val_X_bpr], val_y))

0.8613636363636363
0.8356666336958218


In [71]:
# Preview the first ten BPR scores of the training rows.
train_X_bpr[:10]

[6.638426,
 6.8270526,
 5.2530336,
 6.437763,
 6.638426,
 7.374193,
 6.437763,
 6.452118,
 5.2530336,
 7.374193]

### Save Results

In [17]:
# Render the first training row as one CSV line. Joining the values directly avoids depending
# on `header`, which is only assigned in a later cell (NameError on a fresh Run All).
','.join(str(value) for value in train[0].values())

'1,33827518864,lirik,25,26,25,0,26,0,1'

In [24]:
# Column names for export, in the key order of the first training row.
header = [*train[0]]
header

['user',
 'stream',
 'streamer',
 'time_start',
 'time_stop',
 'time_start_in_day',
 'time_start_at_day',
 'time_stop_in_day',
 'time_stop_at_day',
 'watched']

In [25]:
# Preview the CSV header line built from the column names in `header`.
','.join(header)+ '\n'

'user,stream,streamer,time_start,time_stop,time_start_in_day,time_start_at_day,time_stop_in_day,time_stop_at_day,watched\n'

In [58]:
# Preview how one raw-time feature row (start, stop) renders as a CSV line.
'{},{}\n'.format(*train_X_t[0])

'0,31\n'

In [73]:
# Export the validation BPR scores: a header line, then one score per line.
with open('data/val_X_bpr.txt', 'w') as f:
    f.write('bpr_score\n')
    f.writelines('{}\n'.format(score) for score in tqdm(val_X_bpr))

  0%|          | 0/4893212 [00:00<?, ?it/s]

In [75]:
# Export the validation stream metadata as indented JSON, keeping key insertion order.
with open('data/val_streams.json', 'w') as f:
    f.write(json.dumps(val_streams, indent=4, sort_keys=False))