# Experimental setup

In [None]:
%matplotlib inline
import sys
import time
import math
import copy
import itertools
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pyspark.sql.functions as sf

from lightfm import LightFM
import lightfm.evaluation
import scipy.sparse as sparse
from scipy.special import expit
from pyspark.sql.types import *
from skopt import forest_minimize

plt.style.use('bmh')
pd.options.display.max_columns = 500

from sklearn.feature_extraction import DictVectorizer

## Data

In [None]:
# Number of daily event partitions to load. The last day (`day == days`) is
# the one held out as test data in the train-test split further down.
days = 22

# One parquet partition per day, 1..days. The range is derived from `days`
# so the loaded data and the held-out day cannot drift apart.
paths = ['gs://dataproc-jupyter-eileen.npo-data.nl/data/events/march/_day=' + str(i)
         for i in range(1, days + 1)]
interactions = spark.read.parquet(*paths)
print(interactions.count())
interactions.printSchema()

# Pull the events into pandas and normalise the id column names.
df_int = interactions.toPandas()
df_int = df_int.rename(columns={"seriesRef":"mid", "npoprofileid":"uid"})

# Catalogue of series ids from the content stream.
df_cont = spark.read.parquet("gs://dataproc-jupyter-eileen.npo-data.nl/data/poms_stream/").select('seriesRef').dropDuplicates().toPandas()
df_cont = df_cont.rename(columns={"seriesRef": "mid"})

# Union of catalogue ids and interaction ids, duplicates removed, so that
# every interacted item is guaranteed to have a column index.
df_cont = pd.DataFrame(pd.concat([df_cont['mid'], df_int['mid']])).drop_duplicates()

n_users = df_int.uid.unique().shape[0]
n_items = df_cont.mid.unique().shape[0]
# Percentage of the user x item grid covered by interaction rows.
sparsity = float(df_int.shape[0]) / float(n_users*n_items) * 100
print('Threshold - Starting interactions info')
print('Number of users: {}'.format(n_users))
print('Number of models: {}'.format(n_items))
print('Sparsity: {:4.3f}%'.format(sparsity))

# No interaction threshold is applied: the "limited" frame is the full frame.
df_lim = df_int

# Create mappings between raw ids and contiguous matrix indices.
item_ids = df_cont.mid.unique().tolist()
mid_to_idx = {mid: idx for idx, mid in enumerate(item_ids)}
idx_to_mid = dict(enumerate(item_ids))

user_ids = df_lim.uid.unique().tolist()
uid_to_idx = {uid: idx for idx, uid in enumerate(user_ids)}
idx_to_uid = dict(enumerate(user_ids))
    
def map_ids(row, mapper):
    """Look up the index stored for a single raw id.

    ``row`` is one id value (the element handed over by ``Series.apply``)
    and ``mapper`` is the id -> index mapping.  With a plain dict an unknown
    id raises ``KeyError``.
    """
    index = mapper[row]
    return index

# Translate raw user / item ids into row / column indices.
I = df_lim.uid.apply(map_ids, args=[uid_to_idx]).values
J = df_lim.mid.apply(map_ids, args=[mid_to_idx]).values
V = np.ones(I.shape[0])

# Pass the shape explicitly. Left to itself, coo_matrix sizes the matrix from
# the largest index it sees, so catalogue items without any interaction at the
# end of the mapping would be cut off and the matrix would no longer line up
# with the id mappings (and with item-feature matrices that have one row per
# mapped item).
interactions = sparse.coo_matrix((V, (I, J)),
                                 shape=(len(uid_to_idx), len(mid_to_idx)),
                                 dtype=np.float64)
# COO -> CSR sums duplicate (user, item) entries, so a cell holds the number
# of interaction rows for that pair.
interactions = interactions.tocsr()

interactions

## Train-test split

In [None]:
# For all watched stuff
train_day = df_int[df_int['day']!=days]
train_day = train_day[train_day['uid'].isin(df_lim.uid.tolist())]
test_day = df_int[df_int['day']==days]
test_day = test_day[test_day['uid'].isin(df_lim.uid.tolist())]

intersect = np.intersect1d(train_day.uid.unique(), test_day.uid.unique())
trainday = train_day
testday = test_day[test_day['uid'].isin(intersect)].copy()

# map users and items to idx
user_list = []
item_list = []
for idx, row in testday.iterrows():
    idx_user = uid_to_idx.get(row.uid)
    idx_item = mid_to_idx.get(row.mid)
    user_list.append(idx_user)
    item_list.append(idx_item)
testday['user'] = user_list
testday['item'] = item_list

def train_test_split(ratings, heldout=None):
    """Split an interaction matrix into disjoint train and test matrices.

    Parameters
    ----------
    ratings : scipy sparse matrix, users x items
        Full interaction matrix.
    heldout : DataFrame with integer ``user`` and ``item`` columns, optional
        The held-out interactions.  Defaults to the module-level ``testday``
        frame, which is what the original single-argument call used.

    Returns
    -------
    (train, test, user_index)
        ``test`` holds the values of ``ratings`` at the held-out
        (user, item) cells; ``train`` is ``ratings`` with exactly those cells
        removed, so the two never overlap.  Both are CSR.  ``user_index`` is
        the ``user`` column of ``heldout`` as a list (one entry per held-out
        row, duplicates included).

    Notes
    -----
    The held-out cells used to stay in ``train``, so the model was trained on
    the very interactions it was evaluated on.  Because ``ratings`` is
    aggregated per (user, item) cell, a held-out cell is removed from
    ``train`` as a whole.
    """
    if heldout is None:
        heldout = testday
    ratings = ratings.tocsr()
    print(ratings.shape)
    user_index = heldout.user.tolist()

    # Each (user, item) cell once: duplicates would be summed by the sparse
    # constructor and scale the mask.
    pairs = heldout[['user', 'item']].drop_duplicates()
    rows = pairs['user'].values.astype(np.int64)
    cols = pairs['item'].values.astype(np.int64)
    mask = sparse.csr_matrix((np.ones(len(rows), dtype=ratings.dtype), (rows, cols)),
                             shape=ratings.shape)

    # Element-wise product keeps the rating values at the held-out cells only.
    test = ratings.multiply(mask).tocsr()
    test.eliminate_zeros()

    # Subtracting removes exactly those cells from the training data.
    train = (ratings - test).tocsr()
    train.eliminate_zeros()
    return train, test, user_index

train, test, user_index = train_test_split(interactions)

# Users without any held-out interaction are not part of the evaluation.
all_users = set(range(train.shape[0]))
non_eval_users = list(all_users.difference(user_index))

# Copy of the training matrix in which the rows of those users are blanked.
eval_train = train.copy().tolil()
for u in non_eval_users:
    eval_train[u, :] = 0.0
eval_train = eval_train.tocsr()

In [None]:
# For recommended and watched stuff
recwatched_df = spark.read.parquet('gs://dataproc-jupyter-eileen.npo-data.nl/data/events/recwatched').toPandas()
recwatched_df = recwatched_df.rename(columns={"seriesRef":"mid", "npoprofileid":"uid"})
recwatched_df

train_day2 = train_day
test_day2 = recwatched_df[recwatched_df['uid'].isin(df_lim.uid.tolist())]

intersect2 = np.intersect1d(train_day2.uid.unique(), test_day2.uid.unique())
trainday2 = train_day2
testday2 = test_day2[test_day2['uid'].isin(intersect2)].copy()

# map users and items to idx
user_list2 = []
item_list2 = []
for idx, row in testday2.iterrows():
    idx_user = uid_to_idx.get(row.uid)
    idx_item = mid_to_idx.get(row.mid)
    user_list2.append(idx_user)
    item_list2.append(idx_item)
testday2['user'] = user_list2
testday2['item'] = item_list2

def train_test_split2(ratings, heldout=None):
    """Split an interaction matrix using the recommended-and-watched test set.

    Parameters
    ----------
    ratings : scipy sparse matrix, users x items
        Full interaction matrix.
    heldout : DataFrame with integer ``user`` and ``item`` columns, optional
        The held-out interactions.  Defaults to the module-level ``testday2``
        frame, which is what the original single-argument call used.

    Returns
    -------
    (train2, test2, user_index2)
        ``test2`` holds the values of ``ratings`` at the held-out
        (user, item) cells; ``train2`` is ``ratings`` with exactly those
        cells removed, so the two never overlap.  Both are CSR.
        ``user_index2`` is the ``user`` column of ``heldout`` as a list (one
        entry per held-out row, duplicates included).

    Notes
    -----
    The held-out cells used to stay in ``train2``, so the training matrix
    still contained the interactions used for evaluation.
    """
    if heldout is None:
        heldout = testday2
    ratings = ratings.tocsr()
    print(ratings.shape)
    user_index2 = heldout.user.tolist()

    # Each (user, item) cell once: duplicates would be summed by the sparse
    # constructor and scale the mask.
    pairs2 = heldout[['user', 'item']].drop_duplicates()
    rows2 = pairs2['user'].values.astype(np.int64)
    cols2 = pairs2['item'].values.astype(np.int64)
    mask2 = sparse.csr_matrix((np.ones(len(rows2), dtype=ratings.dtype), (rows2, cols2)),
                              shape=ratings.shape)

    # Element-wise product keeps the rating values at the held-out cells only.
    test2 = ratings.multiply(mask2).tocsr()
    test2.eliminate_zeros()

    # Subtracting removes exactly those cells from the training data.
    train2 = (ratings - test2).tocsr()
    train2.eliminate_zeros()
    return train2, test2, user_index2

train2, test2, user_index2 = train_test_split2(interactions)

# Users without any held-out interaction are not part of the evaluation.
all_users2 = set(range(train2.shape[0]))
non_eval_users2 = list(all_users2.difference(user_index2))

# Copy of the training matrix in which the rows of those users are blanked.
eval_train2 = train2.copy().tolil()
for u in non_eval_users2:
    eval_train2[u, :] = 0.0
eval_train2 = eval_train2.tocsr()

In [None]:
print("Train (All watched) " + str(len(trainday)))
print("All watched " + str(len(testday)))
print("All rec and watched " + str(len(testday2)))

# Interactions that take part in the experiment: the training days plus the
# evaluated part of the test day.
df_sparsity = pd.concat([trainday, testday])
n_users = df_sparsity.uid.unique().shape[0]
n_items = df_cont.mid.unique().shape[0]
# Count the rows of the same frame the user count comes from; the numerator
# used to be taken from df_int, which also contains test-day rows of users
# that are not in df_sparsity.
sparsity = float(df_sparsity.shape[0]) / float(n_users*n_items) * 100
print('Threshold - Starting interactions info')
print('Number of users: {}'.format(n_users))
print('Number of models: {}'.format(n_items))
print('Sparsity: {:4.3f}%'.format(sparsity))

## The execution of the model

In [None]:
# Names of the content-feature sets that can be combined as side information.
features = ['broadcaster', 'credits', 'description_tfidf', 'genres', 'title_tfidf', 'sub_tfidf']

def combinations(features):
    """Return every non-empty subset of ``features`` as a list of tuples.

    Subsets are ordered by size first (singles, pairs, ...) and, within one
    size, in the order ``itertools.combinations`` yields them.
    """
    by_size = (itertools.combinations(features, size)
               for size in range(1, len(features) + 1))
    return list(itertools.chain.from_iterable(by_size))

# Index 0 is the baseline without any side information.
combi = [None] + combinations(features)
print(len(combi))

def get_itemfeatures(sideinfo):
    """Build the item-feature matrix from a long-format side-info table.

    ``sideinfo`` needs a ``mid`` and a ``value`` column, one (item, value)
    pair per row.  Every value is lower-cased via ``str(value).lower()`` and
    becomes a 0/1 indicator for the item it belongs to; rows whose ``mid``
    is not in ``mid_to_idx`` are skipped.

    Returns a float32 CSR matrix with one row per entry of ``idx_to_mid``:
    an identity block (one column per item) followed by the indicator
    columns produced by ``DictVectorizer``.
    """
    n_items = len(idx_to_mid)
    feat_dlist = [{} for _ in range(n_items)]
    for mid, value in zip(sideinfo['mid'], sideinfo['value']):
        item_idx = mid_to_idx.get(mid)
        if item_idx is None:
            continue
        feat_dlist[item_idx][str(value).lower()] = 1

    indicators = DictVectorizer().fit_transform(feat_dlist)

    # Identity block in front gives every item its own feature column.
    identity = sparse.eye(indicators.shape[0], indicators.shape[0]).tocsr()
    stacked = sparse.hstack((identity, indicators))
    return stacked.tocsr().astype(np.float32)

def learning_curve(model, train, test, eval_train, train2, test2, eval_train2,
                        iterarray, user_features=None,
                        item_features=None, k=5,
                        **fit_params):
    """Train ``model`` incrementally and score it at every epoch checkpoint.

    ``iterarray`` lists cumulative epoch counts in ascending order.  For each
    checkpoint the model is trained with ``fit_partial`` for only the epochs
    that are still missing, then precision@k and reciprocal rank are computed
    on ``test`` and ``test2``.

    Parameters
    ----------
    model : LightFM model, already initialised.
    train : training interactions passed to ``fit_partial``.
    test, test2 : the two test interaction matrices.
    eval_train, train2, eval_train2 : accepted so existing calls keep working;
        not used.
    iterarray : iterable of cumulative epoch counts.
    user_features, item_features : feature matrices used for both training
        and evaluation.
    k : cut-off for precision@k.
    **fit_params : forwarded to ``fit_partial``; ``num_threads`` (if given) is
        also used for the evaluation calls.

    Returns
    -------
    tuple
        ``(model, test_patk, test_patk_sd, test2_patk, test2_patk_sd,
        test_rr, test_rr_sd, test2_rr, test2_rr_sd)`` where every list has one
        entry (mean or standard deviation over users) per checkpoint.

    Side effects
    ------------
    Appends one row ``[epoch, <the eight statistics>]`` per checkpoint to the
    module-level list ``results``, which has to exist before the call.
    """
    # Evaluate with the same features the model was trained with.
    eval_kwargs = dict(user_features=user_features,
                       item_features=item_features,
                       train_interactions=None,
                       num_threads=fit_params.get('num_threads', 1))
    old_epoch = 0
    test_patk = []
    test_patk_sd = []
    test2_patk = []
    test2_patk_sd = []
    test_rr = []
    test_rr_sd = []
    test2_rr = []
    test2_rr_sd = []
    for epoch in iterarray:
        print(epoch)
        more = epoch - old_epoch
        model.fit_partial(train, user_features=user_features,
                          item_features=item_features,
                          epochs=more, **fit_params)
        # Remember how far training has come. Without this every checkpoint
        # trained `epoch` further epochs, so the model behind the "100" label
        # had in fact been trained for 10 + 20 + ... + 100 epochs.
        old_epoch = epoch

        this_test_pk = lightfm.evaluation.precision_at_k(model, test, k=k, **eval_kwargs)
        this_test_pk2 = lightfm.evaluation.precision_at_k(model, test2, k=k, **eval_kwargs)
        this_test_rr = lightfm.evaluation.reciprocal_rank(model, test, **eval_kwargs)
        this_test_rr2 = lightfm.evaluation.reciprocal_rank(model, test2, **eval_kwargs)

        test_patk.append(np.mean(this_test_pk))
        test_patk_sd.append(np.std(this_test_pk))
        test2_patk.append(np.mean(this_test_pk2))
        test2_patk_sd.append(np.std(this_test_pk2))
        test_rr.append(np.mean(this_test_rr))
        test_rr_sd.append(np.std(this_test_rr))
        test2_rr.append(np.mean(this_test_rr2))
        test2_rr_sd.append(np.std(this_test_rr2))
        row = [epoch, test_patk[-1], test_patk_sd[-1], test2_patk[-1], test2_patk_sd[-1], test_rr[-1], test_rr_sd[-1], test2_rr[-1], test2_rr_sd[-1]]
        results.append(row)
    return model, test_patk, test_patk_sd, test2_patk, test2_patk_sd, test_rr, test_rr_sd, test2_rr, test2_rr_sd

In [None]:
# One row per (feature combination, epoch checkpoint); learning_curve()
# appends to this list.
results = []
for j, feature in enumerate(combi):
    print(time.asctime())
    print(feature, j)

    if feature is None:
        # Baseline: no side information.
        item_features_concat = None
    else:
        paths = ['gs://dataproc-jupyter-eileen.npo-data.nl/data/content_features/' + name
                 for name in feature]
        sideinfo = spark.read.parquet(*paths)

        sideinfo = sideinfo.toPandas()
        sideinfo['value'] = sideinfo.value.str.encode('utf-8')
        # Keep side information of items that occur in the interactions only.
        sideinfo = sideinfo[sideinfo['mid'].isin(df_lim.mid.unique())]
        # Remove content features that occur only once.
        sideinfo = sideinfo[sideinfo.groupby('value').value.transform(len) > 1]
        item_features_concat = get_itemfeatures(sideinfo)

    model = LightFM(loss='warp', random_state=2016)
    # Zero-epoch fit: sets the model up without training it yet.
    model.fit(train, item_features=item_features_concat, epochs=0)

    iterarray = range(10, 110, 10)
    model, test_patk, test_patk_sd, test2_patk, test2_patk_sd, test_rr, test_rr_sd, test2_rr, test2_rr_sd = learning_curve(
        model, train, test, eval_train, train2, test2, eval_train2, iterarray, item_features=item_features_concat,k=5, **{'num_threads': 4})

    # The empty `result` list that used to be appended here only produced
    # all-NaN rows in the results table, so it is gone.
    print(time.asctime())

In [None]:
df = pd.DataFrame(results)
df.columns = ['epoch', 'test_p5', 'p5_std', 'test_p52', 'p5_std2', 'test_r-rank', 'rank_std', 'test_r-rank2', 'rank_std2']
# Drop rows that carry no measurements.
df = df.dropna()

# Label every row with its feature combination: 10 consecutive rows (one per
# epoch checkpoint) belong to each combination.
# NOTE(review): `features` rebinds the feature-name list defined earlier to a
# list of labels.
indeces = [i for i in range(0, len(combi)) for _ in range(0, 10)]
features = [str(combi[i]) for i in indeces]
df['combi'] = indeces
df['features'] = features
df

In [None]:
# Persist the results table: convert the pandas frame to a Spark DataFrame and
# write it as Parquet to the results location on GCS.
spark.createDataFrame(df).write.parquet("gs://dataproc-jupyter-eileen.npo-data.nl/data/results/marchv3results58-64(2)-")

## Optimizing model


In [None]:
# Feature combination chosen for the optimisation below.
feature = combi[48] # ('broadcaster', 'description_tfidf', 'genres', 'title_tfidf')

# One parquet location per feature set in the combination.
paths = ['gs://dataproc-jupyter-eileen.npo-data.nl/data/content_features/' + name
         for name in feature]
sideinfo = spark.read.parquet(*paths).toPandas()

sideinfo['value'] = sideinfo.value.str.encode('utf-8')
# Keep side information of items that occur in the interactions only.
interacted_mids = df_lim.mid.tolist()
sideinfo = sideinfo[sideinfo['mid'].isin(interacted_mids)]
item_features_concat = get_itemfeatures(sideinfo)

In [None]:
def objective_wsideinfo(params):
    """Objective for the hyperparameter search: negative mean precision@5.

    ``params`` is the 5-sequence ``(epochs, learning_rate, no_components,
    item_alpha, scale)``; the user regularisation is ``item_alpha * scale``.
    A WARP model is fit on the module-level ``train`` matrix with
    ``item_features_concat`` as side information and scored on ``test``.

    Returns the negated mean precision (lower is better for the minimiser),
    or ``0.0`` when the precision is implausibly close to or above 1.
    """
    epochs, learning_rate, no_components, item_alpha, scale = params

    model = LightFM(loss='warp',
                    random_state=2016,
                    learning_rate=learning_rate,
                    no_components=no_components,
                    user_alpha=item_alpha * scale,
                    item_alpha=item_alpha)
    model.fit(train, epochs=epochs,
              item_features=item_features_concat,
              num_threads=4, verbose=True)

    precisions = lightfm.evaluation.precision_at_k(model, test,
                                                   item_features=item_features_concat,
                                                   train_interactions=None,
                                                   k=5, num_threads=3)

    # Negated because the optimiser minimises the objective.
    out = -np.mean(precisions)

    # A (near-)perfect precision is treated as a degenerate run and scored as
    # the worst possible value instead.
    degenerate = np.abs(out + 1) < 0.01 or out < -1.0
    return 0.0 if degenerate else out

In [None]:
# Grid search
space = [(1, 120), # epochs
         (10**-3, 1.0, 'log-uniform'), # learning_rate
         (20, 200), # no_components
         (10**-5, 10**-3, 'log-uniform'), # item_alpha
         (0.001, 1., 'log-uniform') # user_scaling
        ]
item_features = item_features_concat.astype(np.float32)
res_fm_itemfeat_precision = forest_minimize(objective_wsideinfo, space, n_calls=50,
                                  random_state=0,
                                  verbose=True)

In [None]:
# The objective is the negated precision, so flip the sign back for reporting.
print('Maximum p@k found: {:6.5f}'.format(-res_fm_itemfeat_precision.fun))
print('Optimal parameters:')
params = ['epochs', 'learning_rate', 'no_components', 'item_alpha', 'scaling']
for (p, x_) in zip(params, res_fm_itemfeat_precision.x):
    print('{}: {}'.format(p, x_))
# Spread of the precision over all evaluated parameter settings.
print('Standard deviation: ' + str(np.std(-res_fm_itemfeat_precision.func_vals)))

### Train using optimal parameters

In [None]:
# optimized hyperparameters
epochs = 89
learning_rate = 0.0146853682318
no_components = 168
item_alpha = 0.00086386659459
scale = 0.325637504106

user_alpha = item_alpha * scale
model = LightFM(loss='warp',
                random_state=2016,
                learning_rate=learning_rate,
                no_components=no_components,
                user_alpha=user_alpha,
                item_alpha=item_alpha)
model.fit(interactions, epochs=epochs,
          item_features=item_features_concat,
          num_threads=4)

In [None]:
# Score the final model on both test sets with the item features it was
# trained with.
eval_kwargs = dict(item_features=item_features_concat, train_interactions=None)
precision_test = lightfm.evaluation.precision_at_k(model, test, k=5, **eval_kwargs)
precision_test2 = lightfm.evaluation.precision_at_k(model, test2, k=5, **eval_kwargs)
rank_test = lightfm.evaluation.reciprocal_rank(model, test, **eval_kwargs)
rank_test2 = lightfm.evaluation.reciprocal_rank(model, test2, **eval_kwargs)

# Mean and standard deviation over users for every metric.
metric_report = (("Precision", precision_test),
                 ("Precision2", precision_test2),
                 ("Rank", rank_test),
                 ("Rank2", rank_test2))
for label, values in metric_report:
    print(label + " mean: " + str(np.mean(values)))
    print(label + " std: " + str(np.std(values)))

## Recommendations

In [None]:
# Candidate items: every distinct item watched on the test day.
candidate_items = np.unique(np.asarray(item_list, dtype=np.int32))

# user_list2 has one entry per test row; score every user once, in order of
# first appearance.
top_frames = []
for user in pd.Series(user_list2).unique():
    # Pass the item features the model was trained with; without them the
    # content features are left out of the scores.
    scores = model.predict(np.int32(user), candidate_items,
                           item_features=item_features_concat)
    score_df = pd.DataFrame({'itemlist': candidate_items, 'scores': scores},
                            columns=['itemlist', 'scores'])
    score_df = score_df.sort_values('scores', ascending=False).head(5)
    top_frames.append(score_df.assign(userlist=user))

# Concatenate once instead of growing the frame inside the loop.
if top_frames:
    predictions_df = pd.concat(top_frames)
else:
    predictions_df = pd.DataFrame(columns=['itemlist', 'scores', 'userlist'])

# Translate item indices back to series references.
series_ref = [idx_to_mid.get(x) for x in predictions_df['itemlist'].tolist()]
predictions_df['seriesRef'] = series_ref
predictions_df

#### Individual example

In [None]:
# show recommendations for a random user
rando = user_list2[random.randint(0, len(user_list2))]
print(rando)
user = rando
user_id = idx_to_uid.get(user)
print(user_id)
scores = model.predict(np.int32(user, dtype=np.int32), item_list) #31
score_df = pd.DataFrame({'itemlist' : item_list, 'scores' : scores})
score_df = score_df.drop_duplicates()
score_df = score_df.sort_values('scores', ascending=False)
series_ref = []
for x in score_df['itemlist'].tolist():
    series_ref.append(idx_to_mid.get(x))
score_df['seriesRef'] = series_ref
score_df