In [1]:
%matplotlib inline
import sys
import time
import math
import copy
import itertools
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pyspark.sql.functions as sf

from lightfm import LightFM
import lightfm.evaluation
import scipy.sparse as sparse
from scipy.special import expit
from pyspark.sql.types import *
from skopt import forest_minimize

plt.style.use('bmh')
pd.options.display.max_columns = 500

from sklearn.feature_extraction import DictVectorizer

  from .murmurhash import murmurhash3_32
  from ._logistic_sigmoid import _log_logistic_sigmoid
  from .sparsefuncs_fast import csr_row_norms
  from .expected_mutual_info_fast import expected_mutual_information
  from .pairwise_fast import _chi2_kernel_fast, _sparse_manhattan
  from ._criterion import Criterion
  from .ball_tree import BallTree
  from .kd_tree import KDTree
  from ._random import sample_without_replacement
  from numpy.core.umath_tests import inner1d
  from ._gradient_boosting import predict_stages
  from . import _hashing


## Data

In [2]:
days = 22

paths = []
for i in range(1, 23):
    paths.append('gs://dataproc-jupyter-eileen.npo-data.nl/data/events/march/_day=' + str(i))
interactions = spark.read.parquet(*paths)
print(interactions.count())
interactions.printSchema()

df_int = interactions.toPandas()
df_int = df_int.rename(columns={"seriesRef":"mid", "npoprofileid":"uid"})

df_cont = spark.read.parquet("gs://dataproc-jupyter-eileen.npo-data.nl/data/poms_stream/").select('seriesRef').dropDuplicates().toPandas()
df_cont = df_cont.rename(columns={"seriesRef": "mid"})

# add together df_int['mid'] and df_cont['mid'] remove duplicates
df_cont = pd.DataFrame(pd.concat([df_cont['mid'], df_int['mid']])).drop_duplicates()

n_users = df_int.uid.unique().shape[0]
n_items = df_cont.mid.unique().shape[0]
sparsity = float(df_int.shape[0]) / float(n_users*n_items) * 100
print('Threshold - Starting interactions info')
print('Number of users: {}'.format(n_users))
print('Number of models: {}'.format(n_items))
print('Sparsity: {:4.3f}%'.format(sparsity))

# # Thresholding interactions based on user
# def threshold_interactions(df, df_cont, mid_min):
#     n_users = df.uid.unique().shape[0]
#     n_items = df_cont.mid.unique().shape[0]
#     sparsity = float(df_int.shape[0]) / float(n_users*n_items) * 100
#     print('Threshold - Starting interactions info')
#     print('Number of users: {}'.format(n_users))
#     print('Number of models: {}'.format(n_items))
#     print('Sparsity: {:4.3f}%'.format(sparsity))
    
#     mid_counts = df.groupby('uid').mid.count()
#     df = df[~df.uid.isin(mid_counts[mid_counts < mid_min].index.tolist())]
    
#     n_users = df.uid.unique().shape[0]
#     n_items = df_cont.mid.unique().shape[0]
#     sparsity = float(df_int.shape[0]) / float(n_users*n_items) * 100
#     print('Threshold - Ending interactions info')
#     print('Number of users: {}'.format(n_users))
#     print('Number of models: {}'.format(n_items))
#     print('Sparsity: {:4.3f}%'.format(sparsity))
#     return df

#df_lim = threshold_interactions(df_int, df_cont, 5) 
df_lim = df_int

# Create mappings
mid_to_idx = {}
idx_to_mid = {}
for (idx, mid) in enumerate(df_cont.mid.unique().tolist()):
    mid_to_idx[mid] = idx
    idx_to_mid[idx] = mid
    
uid_to_idx = {}
idx_to_uid = {}
for (idx, uid) in enumerate(df_lim.uid.unique().tolist()):
    uid_to_idx[uid] = idx
    idx_to_uid[idx] = uid
    
def map_ids(row, mapper):
    return mapper[row]

I = df_lim.uid.apply(map_ids, args=[uid_to_idx]).values
J = df_lim.mid.apply(map_ids, args=[mid_to_idx]).values
V = np.ones(I.shape[0])
interactions = sparse.coo_matrix((V, (I, J)), dtype=np.float64)
interactions = interactions.tocsr()

interactions

1235728
root
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- npoprofileid: string (nullable = true)
 |-- seriesRef: string (nullable = true)

Threshold - Starting interactions info
Number of users: 181357
Number of models: 3169
Sparsity: 0.215%


<181357x3169 sparse matrix of type '<type 'numpy.float64'>'
	with 674689 stored elements in Compressed Sparse Row format>

## Train-test split

In [3]:
# For all watched stuff
train_day = df_int[df_int['day']!=days]
train_day = train_day[train_day['uid'].isin(df_lim.uid.tolist())]
test_day = df_int[df_int['day']==days]
test_day = test_day[test_day['uid'].isin(df_lim.uid.tolist())]

intersect = np.intersect1d(train_day.uid.unique(), test_day.uid.unique())
trainday = train_day[train_day['uid'].isin(intersect)].copy()
testday = test_day[test_day['uid'].isin(intersect)].copy()

# map users and items to idx
user_list = []
item_list = []
for idx, row in testday.iterrows():
    idx_user = uid_to_idx.get(row.uid)
    idx_item = mid_to_idx.get(row.mid)
    user_list.append(idx_user)
    item_list.append(idx_item)
testday['user'] = user_list
testday['item'] = item_list

def train_test_split(ratings):
    train = ratings.copy().tocoo()
    print(train.shape)
    test = sparse.lil_matrix(train.shape) 
    user_index = testday.user.tolist()        
    train = train.tolil()

    for user in user_index:
        test_ratings = testday[testday['user'] == user].item.tolist()
        test[user, test_ratings] = ratings[user, test_ratings]
    return train.tocsr(), test.tocsr(), user_index

train, test, user_index = train_test_split(interactions)

eval_train = train.copy()
non_eval_users = list(set(range(train.shape[0])) - set(user_index))

eval_train = eval_train.tolil()
for u in non_eval_users:
    eval_train[u, :] = 0.0
eval_train = eval_train.tocsr()

(181357, 3169)


In [4]:
# For recommended and watched stuff
recwatched_df = spark.read.parquet('gs://dataproc-jupyter-eileen.npo-data.nl/data/events/recwatched').toPandas()
recwatched_df = recwatched_df.rename(columns={"seriesRef":"mid", "npoprofileid":"uid"})
recwatched_df

train_day2 = train_day
test_day2 = recwatched_df[recwatched_df['uid'].isin(df_lim.uid.tolist())]

intersect2 = np.intersect1d(train_day2.uid.unique(), test_day2.uid.unique())
trainday2 = train_day2[train_day2['uid'].isin(intersect2)].copy()
testday2 = test_day2[test_day2['uid'].isin(intersect2)].copy()

# map users and items to idx
user_list2 = []
item_list2 = []
for idx, row in testday2.iterrows():
    idx_user = uid_to_idx.get(row.uid)
    idx_item = mid_to_idx.get(row.mid)
    user_list2.append(idx_user)
    item_list2.append(idx_item)
testday2['user'] = user_list2
testday2['item'] = item_list2

def train_test_split2(ratings):
    train2 = ratings.copy().tocoo()
    print(train2.shape)
    test2 = sparse.lil_matrix(train2.shape) 
    user_index2 = testday2.user.tolist()        
    train2 = train2.tolil()

    for user in user_index2:
        test_ratings2 = testday2[testday2['user'] == user].item.tolist()
        test2[user, test_ratings2] = ratings[user, test_ratings2]
    return train2.tocsr(), test2.tocsr(), user_index2

train2, test2, user_index2 = train_test_split2(interactions)

eval_train2 = train2.copy()
non_eval_users2 = list(set(range(train2.shape[0])) - set(user_index2))

eval_train2 = eval_train2.tolil()
for u in non_eval_users2:
    eval_train2[u, :] = 0.0
eval_train2 = eval_train2.tocsr()

(181357, 3169)


In [5]:
print("All watched " + str(len(testday)))
print("All rec and watched " + str(len(testday2)))

All watched 41358
All rec and watched 149


## The execution of the model

In [6]:
features = ['broadcaster', 'credits', 'description_tfidf', 'genres', 'title_tfidf', 'sub_tfidf']

def combinations(features):
    combi = []
    for i in range(1, len(features)+1):
        combi = combi + list(itertools.combinations(features,i))
    return combi

combi = combinations(features)
combi.insert(0, None)
print(len(combi))

def get_itemfeatures(sideinfo):
    feat_dlist = [{} for _ in idx_to_mid]
    for idx, row in sideinfo.iterrows():
        feat_key = '{}'.format(str(row.value).lower())
        idx = mid_to_idx.get(row.mid)
        if idx is not None:
            feat_dlist[idx][feat_key] = 1
    
    #print(feat_dlist[0])
    #print(len(feat_dlist))
    
    item_features = DictVectorizer().fit_transform(feat_dlist)
    #print(item_features.check_format)
    
    # Need to hstack item_features
    eye = sparse.eye(item_features.shape[0], item_features.shape[0]).tocsr()
    item_features_concat = sparse.hstack((eye, item_features))
    item_features_concat = item_features_concat.tocsr().astype(np.float32)
    
    return item_features_concat

def learning_curve(model, train, test, eval_train, train2, test2, eval_train2,
                        iterarray, user_features=None,
                        item_features=None, k=5,
                        **fit_params):
    old_epoch = 0
    test_patk = []
    test_patk_sd = []
    test2_patk = []
    test2_patk_sd = []
    test_rr = []
    test_rr_sd = []
    test2_rr = []
    test2_rr_sd = []
    for epoch in iterarray:
        print(epoch)
        more = epoch - old_epoch
        model.fit_partial(train, user_features=user_features,
                          item_features=item_features,
                          epochs=more, **fit_params)
        this_test_pk = lightfm.evaluation.precision_at_k(model, test, item_features=item_features, train_interactions=None, k=k)
        this_test_pk2 = lightfm.evaluation.precision_at_k(model, test2, item_features=item_features, train_interactions=None, k=k)
        this_test_rr = lightfm.evaluation.reciprocal_rank(model, test, item_features=item_features, train_interactions=None)
        this_test_rr2 = lightfm.evaluation.reciprocal_rank(model, test2, item_features=item_features, train_interactions=None)
        
        test_patk.append(np.mean(this_test_pk))
        test_patk_sd.append(np.std(this_test_pk))
        test2_patk.append(np.mean(this_test_pk2))
        test2_patk_sd.append(np.std(this_test_pk2))
        test_rr.append(np.mean(this_test_rr))
        test_rr_sd.append(np.std(this_test_rr))
        test2_rr.append(np.mean(this_test_rr2))
        test2_rr_sd.append(np.std(this_test_rr2))
        row = [epoch, test_patk[-1], test_patk_sd[-1], test2_patk[-1], test2_patk_sd[-1], test_rr[-1], test_rr_sd[-1], test2_rr[-1], test2_rr_sd[-1]]
        results.append(row)
        #precision.append([float(i) for i in list(this_test)])
        #rank.append([float(i) for i in list(this_test2)])
    return model, test_patk, test_patk_sd, test2_patk, test2_patk_sd, test_rr, test_rr_sd, test2_rr, test2_rr_sd

64


In [18]:
# don't touch plz
results = []
#precisions = []
#ranks = []

In [23]:
for j in range (50, 64):
    print(time.asctime()) 
    feature = combi[j]
    print(feature, j)

    if feature == None: 
        item_features_concat = None
    else: 
        paths = []
        for i in range(0, len(feature)):
            paths.append('gs://dataproc-jupyter-eileen.npo-data.nl/data/content_features/' + feature[i])
        sideinfo = spark.read.parquet(*paths)

        sideinfo = sideinfo.toPandas()
        sideinfo['value'] = sideinfo.value.str.encode('utf-8')
        sideinfo = sideinfo[sideinfo['mid'].isin(df_lim.mid.tolist())]
        sideinfo = sideinfo[sideinfo.groupby('value').value.transform(len) > 1] #remove unique content featuers
        item_features_concat = get_itemfeatures(sideinfo)
    
    result = []
#     precision = []
#     rank = []
    # Initialize model
    model = LightFM(loss='warp', random_state=2016)
    model.fit(train, item_features=item_features_concat, epochs=0)

    iterarray = range(10, 110, 10)
    model, test_patk, test_patk_sd, test2_patk, test2_patk_sd, test_rr, test_rr_sd, test2_rr, test2_rr_sd = learning_curve(
        model, train, test, eval_train, train2, test2, eval_train2, iterarray, item_features=item_features_concat,k=5, **{'num_threads': 4}) #num_threads 1

    
    results.append(result)
#     precisions.append(precision)
#     ranks.append(rank)
    print(time.asctime())

Mon Jun 10 14:40:55 2019
(('broadcaster', 'description_tfidf', 'title_tfidf', 'sub_tfidf'), 50)
10
20
30
40
50
60
70
80
90
100
Mon Jun 10 14:54:01 2019
Mon Jun 10 14:54:01 2019
(('broadcaster', 'genres', 'title_tfidf', 'sub_tfidf'), 51)
10
20
30
40
50
60
70
80
90
100
Mon Jun 10 15:05:02 2019
Mon Jun 10 15:05:02 2019
(('credits', 'description_tfidf', 'genres', 'title_tfidf'), 52)
10
20
30
40
50
60
70
80
90
100
Mon Jun 10 15:15:13 2019
Mon Jun 10 15:15:13 2019
(('credits', 'description_tfidf', 'genres', 'sub_tfidf'), 53)
10
20
30
40
50
60
70
80
90
100
Mon Jun 10 15:30:03 2019
Mon Jun 10 15:30:03 2019
(('credits', 'description_tfidf', 'title_tfidf', 'sub_tfidf'), 54)
10
20
30
40
50
60
70
80
90
100
Mon Jun 10 15:45:35 2019
Mon Jun 10 15:45:35 2019
(('credits', 'genres', 'title_tfidf', 'sub_tfidf'), 55)
10
20
30
40
50
60
70
80
90
100
Mon Jun 10 15:58:56 2019
Mon Jun 10 15:58:56 2019
(('description_tfidf', 'genres', 'title_tfidf', 'sub_tfidf'), 56)
10
20
30
40
50
60
70
80
90
100
Mon Jun 10 1

In [24]:
pd.DataFrame(results).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263
0,10.0,20.0,30.0,40.0,50.0,60.0,70.0,80.0,90.0,100.0,,10.0,20.0,30.0,40.0,50.0,60.0,70.0,80.0,90.0,100.0,,10.0,20.0,30.0,40.0,50.0,60.0,70.0,80.0,90.0,100.0,,10.0,20.0,30.0,40.0,50.0,60.0,70.0,80.0,90.0,100.0,,10.0,20.0,30.0,40.0,50.0,60.0,70.0,80.0,90.0,100.0,,10.0,20.0,30.0,40.0,50.0,60.0,70.0,80.0,90.0,100.0,,10.0,20.0,30.0,40.0,50.0,60.0,70.0,80.0,90.0,100.0,,10.0,20.0,30.0,40.0,50.0,60.0,70.0,80.0,90.0,100.0,,10.0,20.0,30.0,40.0,50.0,60.0,70.0,80.0,90.0,100.0,,10.0,20.0,30.0,40.0,50.0,60.0,70.0,80.0,90.0,100.0,,10.0,20.0,30.0,40.0,50.0,60.0,70.0,80.0,90.0,100.0,,10.0,20.0,30.0,40.0,50.0,60.0,70.0,80.0,90.0,100.0,,10.0,20.0,30.0,40.0,50.0,60.0,70.0,80.0,90.0,100.0,,10.0,20.0,30.0,40.0,50.0,60.0,70.0,80.0,90.0,100.0,,10.0,20.0,30.0,40.0,50.0,60.0,70.0,80.0,90.0,100.0,,10.0,20.0,30.0,40.0,50.0,60.0,70.0,80.0,90.0,100.0,,10.0,20.0,30.0,40.0,50.0,60.0,70.0,80.0,90.0,100.0,,10.0,20.0,30.0,40.0,50.0,60.0,70.0,80.0,90.0,100.0,,10.0,20.0,30.0,40.0,50.0,60.0,70.0,80.0,90.0,100.0,,10.0,20.0,30.0,40.0,50.0,60.0,70.0,80.0,90.0,100.0,,10.0,20.0,30.0,40.0,50.0,60.0,70.0,80.0,90.0,100.0,,10.0,20.0,30.0,40.0,50.0,60.0,70.0,80.0,90.0,100.0,,10.0,20.0,30.0,40.0,50.0,60.0,70.0,80.0,90.0,100.0,,10.0,20.0,30.0,40.0,50.0,60.0,70.0,80.0,90.0,100.0,
1,0.076455,0.093186,0.10231,0.106454,0.109416,0.112468,0.115642,0.117609,0.11944,0.120892,,0.08322,0.097703,0.101937,0.107457,0.111427,0.113477,0.117429,0.119196,0.120969,0.121978,,0.067896,0.080939,0.090995,0.097144,0.101577,0.104083,0.105214,0.107174,0.108819,0.109635,,0.072554,0.08579,0.094201,0.098493,0.102348,0.104186,0.107257,0.108902,0.110836,0.113535,,0.0513,0.052482,0.054866,0.05937,0.063475,0.0646,0.069046,0.070145,0.072452,0.075035,,0.075683,0.088091,0.094567,0.098654,0.103132,0.105009,0.107431,0.109153,0.110753,0.112616,,0.060738,0.07355,0.08403,0.090288,0.096656,0.09875,0.102297,0.103781,0.105914,0.107996,,0.068545,0.080117,0.088778,0.095718,0.099245,0.102201,0.103794,0.106936,0.107913,0.110586,,0.088457,0.098885,0.105837,0.1127,0.115591,0.117943,0.120493,0.122646,0.123892,0.125229,,0.080406,0.093045,0.101449,0.104784,0.107611,0.111479,0.113946,0.115835,0.117165,0.120172,,0.07479,0.090378,0.098583,0.103325,0.105394,0.108819,0.110573,0.113156,0.115077,0.117037,,0.082057,0.094137,0.101629,0.106583,0.11029,0.112854,0.116092,0.118161,0.120583,0.121952,,0.069509,0.082571,0.089941,0.093147,0.098924,0.100736,0.104533,0.10567,0.10763,0.108825,,0.061214,0.066463,0.074167,0.083548,0.089793,0.095763,0.097992,0.100941,0.103107,0.104276,,0.054654,0.059562,0.062415,0.068339,0.074977,0.077881,0.082693,0.086125,0.088496,0.091316,,0.06255,0.067106,0.072207,0.08067,0.083824,0.089016,0.090969,0.094477,0.096771,0.098937,,0.077997,0.090873,0.095891,0.101481,0.104494,0.108414,0.109905,0.112937,0.114749,0.116837,,0.068796,0.085739,0.095178,0.097799,0.103878,0.106833,0.109198,0.110207,0.112231,0.114389,,0.051062,0.055482,0.056388,0.060333,0.064728,0.06832,0.072561,0.074129,0.0788,0.081421,,0.0607,0.068416,0.073981,0.081755,0.084152,0.089806,0.092678,0.095718,0.09848,0.10017,,0.069573,0.080849,0.090147,0.095049,0.097568,0.100466,0.102516,0.103839,0.105285,0.107926,,0.076712,0.089614,0.096052,0.099772,0.101989,0.106473,0.108343,0.109198,0.111954,0.113169,,0.061304,0.07222,0.080766,0.087493,0.09318,0.097581,0.10019,0.102381,0.104989,0.107174,,0.055996,0.058843,0.061329,0.064626,0.069412,0.074559,0.08049,0.086671,0.090674,0.092222,
2,0.106766,0.112687,0.115659,0.116741,0.117243,0.117517,0.118109,0.118365,0.118838,0.119207,,0.108345,0.113946,0.115209,0.116571,0.116987,0.116978,0.117881,0.118173,0.118404,0.118568,,0.101695,0.108596,0.112213,0.113922,0.114656,0.115253,0.115464,0.116147,0.1161,0.116367,,0.103708,0.109641,0.112607,0.113553,0.114698,0.114959,0.115776,0.116193,0.116372,0.116993,,0.091395,0.092364,0.093894,0.096717,0.099406,0.099735,0.102753,0.103314,0.104459,0.106068,,0.105112,0.110835,0.112717,0.113634,0.114937,0.115183,0.115521,0.115664,0.116081,0.116546,,0.09687,0.104103,0.109246,0.111856,0.114011,0.114435,0.115203,0.115487,0.115664,0.11627,,0.102851,0.108909,0.111496,0.113607,0.114866,0.115472,0.115286,0.116327,0.116243,0.116803,,0.110549,0.113217,0.11539,0.116404,0.117023,0.117311,0.117943,0.11816,0.118306,0.118484,,0.108773,0.11302,0.115673,0.116601,0.116616,0.117004,0.117537,0.117909,0.118093,0.118997,,0.10528,0.111772,0.114501,0.115378,0.115612,0.116255,0.116407,0.117201,0.117112,0.1176,,0.10767,0.111848,0.114003,0.114959,0.116234,0.116707,0.117218,0.117344,0.117633,0.117964,,0.102741,0.109176,0.111618,0.112548,0.114426,0.114339,0.115259,0.115676,0.115741,0.116022,,0.097931,0.100977,0.105288,0.109644,0.111363,0.113325,0.113658,0.114213,0.114107,0.114351,,0.093518,0.096104,0.098032,0.101909,0.105507,0.106962,0.109078,0.110443,0.111178,0.112262,,0.098724,0.100755,0.10328,0.107561,0.109404,0.111196,0.111695,0.113213,0.11398,0.114617,,0.106589,0.112592,0.113284,0.114747,0.115372,0.116075,0.116565,0.117017,0.117242,0.117595,,0.100974,0.108409,0.112047,0.112735,0.114421,0.114833,0.11566,0.11582,0.116034,0.116627,,0.091703,0.094949,0.09553,0.098116,0.10064,0.103057,0.105064,0.105559,0.107352,0.108265,,0.096801,0.101605,0.104051,0.108084,0.108485,0.110948,0.111866,0.112584,0.113507,0.113359,,0.102811,0.10858,0.112039,0.113591,0.114113,0.114475,0.115019,0.115016,0.115338,0.115788,,0.106678,0.111611,0.113313,0.113642,0.113963,0.115077,0.11567,0.115916,0.116781,0.116749,,0.096699,0.103793,0.107015,0.109837,0.112047,0.113695,0.114464,0.114226,0.114816,0.115259,,0.094116,0.095866,0.097213,0.099279,0.102123,0.104391,0.107205,0.109459,0.111198,0.111421,
3,0.066129,0.077419,0.080645,0.082258,0.087097,0.090323,0.090323,0.093548,0.098387,0.1,,0.070968,0.082258,0.079032,0.082258,0.091935,0.090323,0.091935,0.096774,0.096774,0.1,,0.066129,0.082258,0.075806,0.074194,0.079032,0.077419,0.080645,0.082258,0.085484,0.090323,,0.056452,0.064516,0.066129,0.066129,0.069355,0.066129,0.074194,0.077419,0.074194,0.083871,,0.048387,0.051613,0.051613,0.059677,0.06129,0.062903,0.066129,0.067742,0.069355,0.067742,,0.059677,0.067742,0.072581,0.079032,0.082258,0.082258,0.090323,0.091935,0.095161,0.08871,,0.056452,0.067742,0.070968,0.067742,0.074194,0.072581,0.077419,0.077419,0.085484,0.08871,,0.06129,0.064516,0.072581,0.077419,0.083871,0.080645,0.082258,0.087097,0.08871,0.090323,,0.079032,0.075806,0.077419,0.08871,0.085484,0.08871,0.083871,0.098387,0.096774,0.098387,,0.072581,0.080645,0.083871,0.083871,0.091935,0.08871,0.091935,0.096774,0.093548,0.098387,,0.069355,0.082258,0.087097,0.083871,0.08871,0.083871,0.087097,0.08871,0.082258,0.083871,,0.069355,0.074194,0.079032,0.082258,0.083871,0.08871,0.096774,0.098387,0.103226,0.08871,,0.064516,0.072581,0.079032,0.080645,0.080645,0.085484,0.08871,0.091935,0.090323,0.083871,,0.056452,0.06129,0.064516,0.067742,0.067742,0.067742,0.069355,0.067742,0.077419,0.069355,,0.040323,0.046774,0.048387,0.054839,0.058065,0.058065,0.059677,0.062903,0.067742,0.074194,,0.066129,0.067742,0.062903,0.074194,0.075806,0.077419,0.080645,0.075806,0.079032,0.077419,,0.064516,0.072581,0.082258,0.083871,0.080645,0.074194,0.079032,0.083871,0.08871,0.087097,,0.05,0.06129,0.056452,0.062903,0.064516,0.070968,0.070968,0.077419,0.079032,0.082258,,0.041935,0.05,0.046774,0.05,0.054839,0.062903,0.066129,0.06129,0.069355,0.075806,,0.058065,0.059677,0.067742,0.069355,0.072581,0.077419,0.077419,0.080645,0.083871,0.080645,,0.064516,0.070968,0.082258,0.080645,0.082258,0.080645,0.085484,0.087097,0.085484,0.08871,,0.069355,0.074194,0.079032,0.074194,0.082258,0.079032,0.083871,0.082258,0.082258,0.085484,,0.058065,0.069355,0.06129,0.06129,0.067742,0.067742,0.064516,0.074194,0.075806,0.077419,,0.046774,0.054839,0.059677,0.059677,0.066129,0.064516,0.064516,0.066129,0.067742,0.070968,
4,0.094089,0.100674,0.101344,0.101639,0.102365,0.102721,0.102721,0.102973,0.103163,0.103175,,0.095693,0.098414,0.097777,0.101639,0.10286,0.102721,0.10286,0.099948,0.099948,0.103175,,0.097457,0.101639,0.100299,0.099896,0.101022,0.100674,0.104478,0.104764,0.10526,0.105814,,0.09002,0.093493,0.094089,0.094089,0.095189,0.094089,0.096613,0.097417,0.096613,0.101907,,0.089338,0.091125,0.087514,0.09497,0.095638,0.096276,0.097457,0.098003,0.098519,0.094654,,0.09151,0.094654,0.096167,0.097777,0.101639,0.098414,0.102721,0.10286,0.103062,0.102556,,0.09002,0.094654,0.099007,0.098003,0.099896,0.099465,0.100674,0.100674,0.102149,0.102556,,0.092204,0.093493,0.096167,0.097417,0.098691,0.098109,0.098414,0.099164,0.099361,0.099531,,0.097777,0.097029,0.097417,0.102556,0.102149,0.102556,0.098691,0.099987,0.103125,0.099987,,0.099465,0.101344,0.101907,0.101907,0.10286,0.102556,0.105949,0.103125,0.102973,0.106244,,0.095189,0.098414,0.099164,0.098691,0.099361,0.098691,0.099164,0.099361,0.098414,0.098691,,0.095189,0.096613,0.097777,0.098414,0.098691,0.099361,0.099948,0.103163,0.099948,0.102556,,0.093493,0.096167,0.097777,0.098109,0.098109,0.098941,0.099361,0.099674,0.099531,0.098691,,0.09002,0.092204,0.093493,0.094654,0.094654,0.094654,0.095189,0.094654,0.097417,0.095189,,0.080241,0.084658,0.085651,0.089221,0.090782,0.090782,0.09151,0.092865,0.094654,0.096613,,0.094089,0.094654,0.092865,0.096613,0.100299,0.097417,0.101344,0.100299,0.101022,0.100674,,0.093493,0.096167,0.101639,0.101907,0.101344,0.099896,0.101022,0.101907,0.105654,0.10547,,0.086603,0.092204,0.09002,0.092865,0.093493,0.095693,0.095693,0.100674,0.097777,0.101639,,0.081416,0.086603,0.084658,0.086603,0.089221,0.096276,0.097457,0.092204,0.095189,0.100299,,0.090782,0.09151,0.094654,0.095189,0.096167,0.097417,0.097417,0.098109,0.101907,0.098109,,0.093493,0.095693,0.101639,0.098109,0.098414,0.098109,0.098941,0.099164,0.102149,0.102556,,0.095189,0.096613,0.097777,0.096613,0.098414,0.097777,0.098691,0.098414,0.098414,0.098941,,0.090782,0.095189,0.092204,0.092204,0.094654,0.094654,0.093493,0.096613,0.097029,0.097417,,0.084658,0.089221,0.09151,0.09151,0.094089,0.093493,0.093493,0.094089,0.094654,0.095693,
5,0.201703,0.244959,0.277051,0.284485,0.298015,0.308772,0.32609,0.331393,0.339381,0.344666,,0.217787,0.262495,0.276003,0.297321,0.319855,0.321875,0.3373,0.342216,0.349946,0.351651,,0.181354,0.206867,0.229843,0.243877,0.259289,0.267647,0.277398,0.283258,0.291829,0.300243,,0.20061,0.225328,0.242182,0.268237,0.278546,0.288043,0.299025,0.305142,0.309162,0.318096,,0.141641,0.150275,0.157648,0.165553,0.174134,0.180253,0.186017,0.192889,0.196949,0.201259,,0.20697,0.23519,0.254485,0.265897,0.282946,0.289095,0.297066,0.305235,0.309408,0.315718,,0.170072,0.200653,0.219258,0.232891,0.251475,0.261257,0.270472,0.275444,0.282689,0.290337,,0.185027,0.213907,0.229226,0.250442,0.260133,0.276598,0.280936,0.292818,0.298654,0.310407,,0.235705,0.264919,0.295199,0.317048,0.326632,0.338677,0.347835,0.353845,0.360096,0.364464,,0.208296,0.243052,0.271499,0.288487,0.298174,0.310592,0.322677,0.330789,0.334232,0.338242,,0.201057,0.241667,0.262688,0.278068,0.284057,0.303005,0.306771,0.315095,0.323427,0.330297,,0.21481,0.257835,0.277969,0.299709,0.310913,0.321246,0.335589,0.337564,0.344844,0.354679,,0.1917,0.21816,0.239091,0.244319,0.262701,0.26966,0.278749,0.285547,0.291693,0.303297,,0.172101,0.185037,0.201384,0.218088,0.226795,0.238131,0.245385,0.258273,0.266519,0.271164,,0.159925,0.170024,0.176109,0.190179,0.202446,0.207102,0.216693,0.223786,0.230451,0.232824,,0.168131,0.180563,0.192404,0.210107,0.21839,0.229196,0.236436,0.247365,0.253254,0.261271,,0.209179,0.242724,0.260604,0.276204,0.285822,0.302302,0.31065,0.317365,0.326879,0.334865,,0.191132,0.217004,0.245556,0.258953,0.274911,0.292552,0.298397,0.304992,0.313411,0.319212,,0.149085,0.156425,0.159166,0.168444,0.176499,0.184532,0.19096,0.19457,0.2014,0.206787,,0.173071,0.188964,0.199139,0.209606,0.221718,0.234667,0.24675,0.254473,0.262208,0.266612,,0.185941,0.209831,0.228527,0.24434,0.247909,0.266152,0.274916,0.280487,0.285218,0.298562,,0.207412,0.234227,0.248271,0.265022,0.275174,0.296816,0.303165,0.30164,0.313366,0.319886,,0.172133,0.195729,0.210385,0.221408,0.240777,0.248343,0.258965,0.269921,0.280256,0.28349,,0.162932,0.168286,0.177404,0.186921,0.19387,0.200684,0.209254,0.219899,0.227406,0.234598,
6,0.237882,0.267656,0.293213,0.291666,0.30243,0.309379,0.32367,0.325645,0.330603,0.333061,,0.245713,0.280575,0.289232,0.304516,0.32342,0.321143,0.331819,0.333034,0.337168,0.337108,,0.227263,0.234694,0.247841,0.254729,0.267212,0.272695,0.284178,0.286678,0.294853,0.303501,,0.242796,0.252648,0.258619,0.286273,0.290712,0.29933,0.306961,0.310271,0.312161,0.31791,,0.2018,0.209393,0.215583,0.219346,0.222719,0.226764,0.228099,0.232411,0.234022,0.235663,,0.245094,0.260659,0.274474,0.281418,0.296344,0.300117,0.30432,0.310925,0.312784,0.316019,,0.223907,0.238596,0.248159,0.252261,0.267719,0.276234,0.280355,0.284103,0.288972,0.295238,,0.229761,0.246441,0.252866,0.268414,0.271582,0.288517,0.289816,0.299625,0.304983,0.314329,,0.263346,0.28074,0.305619,0.317766,0.32278,0.331033,0.336188,0.338113,0.341044,0.343068,,0.239043,0.263541,0.285847,0.30104,0.305366,0.312607,0.321972,0.32716,0.327683,0.327204,,0.236133,0.26684,0.278063,0.288153,0.291309,0.307875,0.308112,0.314113,0.319679,0.323988,,0.245215,0.279999,0.291279,0.309143,0.314142,0.321105,0.330535,0.328733,0.331953,0.339062,,0.23616,0.247439,0.263927,0.262921,0.278155,0.282355,0.286667,0.293781,0.297449,0.308517,,0.227956,0.230662,0.233814,0.24375,0.245617,0.24904,0.254198,0.265798,0.271723,0.275011,,0.222066,0.225382,0.22512,0.231751,0.238374,0.238539,0.245251,0.248426,0.252232,0.250531,,0.22343,0.226334,0.230478,0.239208,0.244302,0.250145,0.256254,0.266515,0.269675,0.277786,,0.241955,0.266897,0.279389,0.288862,0.294831,0.307787,0.314996,0.317418,0.324027,0.328734,,0.233707,0.238755,0.261515,0.273195,0.282884,0.29937,0.301985,0.307885,0.312474,0.316508,,0.215764,0.216768,0.215838,0.218196,0.21974,0.223077,0.225655,0.22666,0.22888,0.232375,,0.228837,0.235451,0.23518,0.237558,0.247345,0.255941,0.266655,0.271153,0.276648,0.278013,,0.229216,0.239913,0.248269,0.260857,0.256892,0.277619,0.285998,0.289886,0.292264,0.304637,,0.24438,0.255487,0.260952,0.276361,0.285649,0.305546,0.308518,0.304008,0.314082,0.31875,,0.227354,0.233773,0.235569,0.2391,0.257364,0.258347,0.268673,0.276736,0.288153,0.287291,,0.222744,0.224163,0.227645,0.230945,0.231039,0.230734,0.234982,0.239378,0.242975,0.250542,
7,0.188365,0.223869,0.228058,0.226005,0.227251,0.239181,0.241379,0.258666,0.261339,0.26358,,0.199037,0.225332,0.209899,0.219636,0.238025,0.224061,0.239283,0.233052,0.249823,0.245293,,0.17463,0.199361,0.209424,0.209485,0.214149,0.212977,0.214898,0.224213,0.223408,0.233516,,0.177714,0.177818,0.183263,0.208543,0.220188,0.209068,0.22587,0.22347,0.212182,0.216229,,0.145016,0.14882,0.155348,0.167543,0.172282,0.1798,0.181613,0.190437,0.192628,0.190933,,0.176075,0.182562,0.196367,0.209531,0.221897,0.230392,0.235334,0.249953,0.246249,0.261483,,0.146724,0.173463,0.195325,0.196684,0.193936,0.202468,0.196849,0.198505,0.210425,0.218576,,0.1838,0.191112,0.208145,0.216977,0.222837,0.254676,0.240428,0.257959,0.261221,0.258195,,0.206941,0.207479,0.231263,0.24177,0.243313,0.253301,0.25543,0.263728,0.272462,0.271715,,0.217781,0.222318,0.232479,0.256645,0.280133,0.269715,0.263936,0.269912,0.263726,0.271818,,0.194117,0.219657,0.222994,0.219471,0.220597,0.236764,0.239495,0.23764,0.238184,0.236767,,0.209546,0.230813,0.221415,0.238218,0.227408,0.242553,0.242517,0.248028,0.24348,0.246339,,0.191099,0.208319,0.216826,0.229114,0.251284,0.243538,0.243215,0.243442,0.255002,0.260573,,0.165455,0.170095,0.184119,0.183708,0.175889,0.18546,0.188806,0.192409,0.198589,0.19252,,0.134139,0.150224,0.146852,0.165619,0.172254,0.175412,0.175002,0.184811,0.193356,0.192637,,0.171183,0.179571,0.178993,0.193086,0.207654,0.212551,0.214595,0.213943,0.224567,0.219416,,0.198311,0.19947,0.215942,0.21303,0.218874,0.234509,0.236069,0.235818,0.246955,0.256317,,0.159911,0.171076,0.173361,0.18515,0.196909,0.219839,0.220858,0.215592,0.219875,0.224996,,0.133807,0.144376,0.151213,0.149622,0.158301,0.17317,0.175211,0.166037,0.175474,0.184769,,0.185453,0.182582,0.199988,0.195736,0.206897,0.207925,0.197009,0.209579,0.21397,0.207588,,0.189643,0.216518,0.212781,0.223525,0.222322,0.219149,0.233536,0.230429,0.221713,0.231226,,0.217938,0.212155,0.210048,0.229239,0.217105,0.235108,0.242978,0.232869,0.246238,0.258696,,0.177239,0.194554,0.19713,0.187293,0.206391,0.207917,0.214046,0.220271,0.226313,0.233128,,0.143521,0.151436,0.167217,0.163032,0.181327,0.179043,0.17572,0.184978,0.194134,0.201346,
8,0.22723,0.255054,0.269957,0.25701,0.241421,0.249217,0.247332,0.272705,0.266787,0.259976,,0.241792,0.259643,0.228708,0.239563,0.255786,0.233934,0.253347,0.235562,0.254314,0.246238,,0.220851,0.238374,0.247809,0.246088,0.246449,0.241798,0.239878,0.256095,0.246616,0.263025,,0.222596,0.215182,0.215739,0.260612,0.270742,0.251239,0.269117,0.26422,0.244434,0.237128,,0.222747,0.22412,0.227389,0.232758,0.232899,0.236515,0.236059,0.240951,0.240279,0.237125,,0.216464,0.2059,0.21707,0.232573,0.245496,0.261884,0.255457,0.276646,0.26446,0.288012,,0.194065,0.212643,0.237687,0.234124,0.23643,0.238892,0.21668,0.217931,0.228018,0.237614,,0.240789,0.227542,0.242824,0.243282,0.248821,0.295495,0.271315,0.293318,0.293501,0.285977,,0.2353,0.235258,0.264126,0.255867,0.257344,0.266474,0.265894,0.271137,0.291882,0.290592,,0.26849,0.255099,0.25919,0.288597,0.307493,0.294462,0.29041,0.290668,0.283649,0.290712,,0.238947,0.247571,0.255897,0.258736,0.246915,0.272218,0.264582,0.256274,0.26239,0.249885,,0.264682,0.27044,0.248979,0.277117,0.251989,0.276441,0.266192,0.265264,0.256877,0.260873,,0.24112,0.248713,0.257658,0.266309,0.29096,0.274046,0.270622,0.269764,0.284327,0.297174,,0.226026,0.221555,0.227407,0.219049,0.208799,0.219957,0.220032,0.220991,0.224066,0.21478,,0.20312,0.209357,0.203919,0.227137,0.228692,0.227357,0.215216,0.225926,0.231466,0.221781,,0.223089,0.225701,0.217431,0.222567,0.242321,0.251657,0.243593,0.248244,0.258985,0.247938,,0.257882,0.241925,0.245524,0.241195,0.242644,0.269307,0.261588,0.25083,0.267295,0.26847,,0.213347,0.215026,0.220053,0.232465,0.234391,0.26448,0.256366,0.24595,0.2477,0.247613,,0.203993,0.204987,0.209698,0.202944,0.203809,0.21673,0.2125,0.194592,0.200348,0.206008,,0.244068,0.237381,0.248859,0.232714,0.247634,0.237691,0.219938,0.241242,0.247304,0.237363,,0.236858,0.25754,0.239325,0.249725,0.238303,0.240024,0.258556,0.256372,0.240248,0.255746,,0.272346,0.245783,0.243734,0.281032,0.259704,0.278819,0.275207,0.260162,0.276064,0.279793,,0.238811,0.247945,0.255385,0.238249,0.256807,0.254645,0.262228,0.266114,0.268421,0.268536,,0.206834,0.202686,0.214706,0.197718,0.224568,0.217261,0.21161,0.216914,0.234902,0.237117,


In [26]:
len(results)/11

24

In [27]:
spark.createDataFrame(pd.DataFrame(results)).write.parquet("gs://dataproc-jupyter-eileen.npo-data.nl/data/results/marchv2results40-64")
# spark.createDataFrame(pd.DataFrame(precisions)).write.parquet("gs://dataproc-jupyter-eileen.npo-data.nl/data/results/marchv2precisions22-30")
# spark.createDataFrame(pd.DataFrame(ranks)).write.parquet("gs://dataproc-jupyter-eileen.npo-data.nl/data/results/marchv2ranks22-30")

In [202]:
pd.DataFrame(combi).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63
0,,"(broadcaster,)","(credits,)","(description_tfidf,)","(genres,)","(title_tfidf,)","(sub_tfidf,)","(broadcaster, credits)","(broadcaster, description_tfidf)","(broadcaster, genres)","(broadcaster, title_tfidf)","(broadcaster, sub_tfidf)","(credits, description_tfidf)","(credits, genres)","(credits, title_tfidf)","(credits, sub_tfidf)","(description_tfidf, genres)","(description_tfidf, title_tfidf)","(description_tfidf, sub_tfidf)","(genres, title_tfidf)","(genres, sub_tfidf)","(title_tfidf, sub_tfidf)","(broadcaster, credits, description_tfidf)","(broadcaster, credits, genres)","(broadcaster, credits, title_tfidf)","(broadcaster, credits, sub_tfidf)","(broadcaster, description_tfidf, genres)","(broadcaster, description_tfidf, title_tfidf)","(broadcaster, description_tfidf, sub_tfidf)","(broadcaster, genres, title_tfidf)","(broadcaster, genres, sub_tfidf)","(broadcaster, title_tfidf, sub_tfidf)","(credits, description_tfidf, genres)","(credits, description_tfidf, title_tfidf)","(credits, description_tfidf, sub_tfidf)","(credits, genres, title_tfidf)","(credits, genres, sub_tfidf)","(credits, title_tfidf, sub_tfidf)","(description_tfidf, genres, title_tfidf)","(description_tfidf, genres, sub_tfidf)","(description_tfidf, title_tfidf, sub_tfidf)","(genres, title_tfidf, sub_tfidf)","(broadcaster, credits, description_tfidf, genres)","(broadcaster, credits, description_tfidf, titl...","(broadcaster, credits, description_tfidf, sub_...","(broadcaster, credits, genres, title_tfidf)","(broadcaster, credits, genres, sub_tfidf)","(broadcaster, credits, title_tfidf, sub_tfidf)","(broadcaster, description_tfidf, genres, title...","(broadcaster, description_tfidf, genres, sub_t...","(broadcaster, description_tfidf, title_tfidf, ...","(broadcaster, genres, title_tfidf, sub_tfidf)","(credits, description_tfidf, genres, title_tfidf)","(credits, description_tfidf, genres, sub_tfidf)","(credits, description_tfidf, title_tfidf, sub_...","(credits, genres, title_tfidf, sub_tfidf)","(description_tfidf, genres, title_tfidf, sub_t...","(broadcaster, credits, description_tfidf, genr...","(broadcaster, credits, description_tfidf, genr...","(broadcaster, credits, description_tfidf, titl...","(broadcaster, credits, genres, title_tfidf, su...","(broadcaster, description_tfidf, genres, title...","(credits, description_tfidf, genres, title_tfi...","(broadcaster, credits, description_tfidf, genr..."


## Optimizing model
https://github.com/lyst/lightfm/issues/286 Different evaluation results on two runs with random state set due to multiple threads

In [24]:
#feature = combi[16] # ('description_tfidf', 'genres')
feature = combi[50] # ('broadcaster', 'description_tfidf', 'title_tfidf', 'sub_tfidf')

paths = []
for i in range(0, len(feature)):
    paths.append('gs://dataproc-jupyter-eileen.npo-data.nl/data/content_features/' + feature[i])
sideinfo = spark.read.parquet(*paths)

sideinfo = sideinfo.toPandas()
sideinfo['value'] = sideinfo.value.str.encode('utf-8')
sideinfo = sideinfo[sideinfo['mid'].isin(df_lim.mid.tolist())]
item_features_concat = get_itemfeatures(sideinfo)

In [16]:
def objective_wsideinfo(params):
    epochs, learning_rate,\
    no_components, item_alpha,\
    scale = params
    
    user_alpha = item_alpha * scale
    model = LightFM(loss='warp',
                    random_state=2016,
                    learning_rate=learning_rate,
                    no_components=no_components,
                    user_alpha=user_alpha,
                    item_alpha=item_alpha)
    model.fit(train, epochs=epochs,
              item_features=item_features_concat,
              num_threads=4, verbose=True)
    
    patks = lightfm.evaluation.precision_at_k(model, test,
                                              item_features=item_features_concat,
                                              train_interactions=None,
                                              k=5, num_threads=3)
    mapatk = np.mean(patks)
    # Make negative because we want to _minimize_ objective
    out = -mapatk
    # Weird shit going on
    if np.abs(out + 1) < 0.01 or out < -1.0:
        return 0.0
    else:
        return out

In [18]:
# Grid search
space = [(1, 120), # epochs
         (10**-3, 1.0, 'log-uniform'), # learning_rate
         (20, 200), # no_components
         (10**-5, 10**-3, 'log-uniform'), # item_alpha
         (0.001, 1., 'log-uniform') # user_scaling
        ]
item_features = item_features_concat.astype(np.float32)
res_fm_itemfeat_precision = forest_minimize(objective_wsideinfo, space, n_calls=50,
                                  random_state=0,
                                  verbose=True)

Iteration No: 1 started. Evaluating function at random point.
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29
Epoch 30
Epoch 31
Epoch 32
Epoch 33
Epoch 34
Epoch 35
Epoch 36
Epoch 37
Epoch 38
Epoch 39
Epoch 40
Epoch 41
Epoch 42
Epoch 43
Epoch 44
Epoch 45
Epoch 46
Epoch 47
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 49.3514
Function value obtained: -0.0432
Current minimum: -0.0432
Iteration No: 2 started. Evaluating function at random point.
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29
Epoch 30
Epoch 31
Epoch 32
Epoch 33
Epoch 34
Epoch 35
Epoch 

Epoch 55
Epoch 56
Epoch 57
Epoch 58
Epoch 59
Epoch 60
Epoch 61
Epoch 62
Epoch 63
Epoch 64
Epoch 65
Epoch 66
Epoch 67
Epoch 68
Epoch 69
Epoch 70
Epoch 71
Epoch 72
Epoch 73
Epoch 74
Epoch 75
Epoch 76
Epoch 77
Epoch 78
Epoch 79
Epoch 80
Epoch 81
Epoch 82
Epoch 83
Epoch 84
Epoch 85
Iteration No: 12 ended. Search finished for the next optimal point.
Time taken: 59.5264
Function value obtained: -0.1397
Current minimum: -0.1806
Iteration No: 13 started. Searching for the next optimal point.
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29
Epoch 30
Epoch 31
Epoch 32
Epoch 33
Epoch 34
Epoch 35
Epoch 36
Epoch 37
Epoch 38
Epoch 39
Epoch 40
Epoch 41
Epoch 42
Epoch 43
Epoch 44
Epoch 45
Epoch 46
Epoch 47
Epoch 48
Epoch 49
Epoch 50
Epoch 51
Epoch 52
Epoch 53
Epoch 54
Epoch 55
Epoch 56
Epoch 57

Epoch 85
Epoch 86
Epoch 87
Epoch 88
Epoch 89
Epoch 90
Epoch 91
Epoch 92
Epoch 93
Epoch 94
Epoch 95
Epoch 96
Epoch 97
Epoch 98
Epoch 99
Epoch 100
Epoch 101
Epoch 102
Epoch 103
Epoch 104
Epoch 105
Epoch 106
Epoch 107
Iteration No: 19 ended. Search finished for the next optimal point.
Time taken: 76.5149
Function value obtained: -0.1018
Current minimum: -0.1847
Iteration No: 20 started. Searching for the next optimal point.
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29
Epoch 30
Epoch 31
Epoch 32
Epoch 33
Epoch 34
Epoch 35
Epoch 36
Epoch 37
Epoch 38
Epoch 39
Epoch 40
Epoch 41
Epoch 42
Epoch 43
Epoch 44
Epoch 45
Epoch 46
Epoch 47
Epoch 48
Epoch 49
Epoch 50
Epoch 51
Epoch 52
Epoch 53
Epoch 54
Epoch 55
Epoch 56
Epoch 57
Epoch 58
Epoch 59
Epoch 60
Epoch 61
Epoch 62
Epoch 63
Epoch 64


Epoch 65
Epoch 66
Epoch 67
Epoch 68
Epoch 69
Epoch 70
Epoch 71
Epoch 72
Epoch 73
Epoch 74
Epoch 75
Epoch 76
Epoch 77
Epoch 78
Epoch 79
Epoch 80
Epoch 81
Epoch 82
Epoch 83
Epoch 84
Epoch 85
Epoch 86
Epoch 87
Epoch 88
Epoch 89
Epoch 90
Epoch 91
Epoch 92
Epoch 93
Epoch 94
Epoch 95
Iteration No: 26 ended. Search finished for the next optimal point.
Time taken: 77.8553
Function value obtained: -0.1859
Current minimum: -0.1901
Iteration No: 27 started. Searching for the next optimal point.
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29
Epoch 30
Epoch 31
Epoch 32
Epoch 33
Epoch 34
Epoch 35
Epoch 36
Epoch 37
Epoch 38
Epoch 39
Epoch 40
Epoch 41
Epoch 42
Epoch 43
Epoch 44
Epoch 45
Epoch 46
Epoch 47
Epoch 48
Epoch 49
Epoch 50
Epoch 51
Epoch 52
Epoch 53
Epoch 54
Epoch 55
Epoch 56
Epoch 57

Epoch 36
Epoch 37
Epoch 38
Epoch 39
Epoch 40
Epoch 41
Epoch 42
Epoch 43
Epoch 44
Epoch 45
Epoch 46
Epoch 47
Epoch 48
Epoch 49
Epoch 50
Epoch 51
Epoch 52
Epoch 53
Epoch 54
Epoch 55
Epoch 56
Epoch 57
Epoch 58
Epoch 59
Epoch 60
Epoch 61
Epoch 62
Epoch 63
Epoch 64
Iteration No: 34 ended. Search finished for the next optimal point.
Time taken: 54.4938
Function value obtained: -0.1843
Current minimum: -0.1901
Iteration No: 35 started. Searching for the next optimal point.
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29
Epoch 30
Epoch 31
Epoch 32
Epoch 33
Epoch 34
Epoch 35
Epoch 36
Epoch 37
Epoch 38
Epoch 39
Epoch 40
Epoch 41
Epoch 42
Epoch 43
Epoch 44
Epoch 45
Epoch 46
Epoch 47
Epoch 48
Epoch 49
Epoch 50
Epoch 51
Epoch 52
Epoch 53
Epoch 54
Epoch 55
Epoch 56
Epoch 57
Epoch 58
Epoch 59

Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29
Epoch 30
Epoch 31
Epoch 32
Epoch 33
Epoch 34
Epoch 35
Epoch 36
Epoch 37
Epoch 38
Epoch 39
Epoch 40
Epoch 41
Epoch 42
Epoch 43
Epoch 44
Epoch 45
Epoch 46
Epoch 47
Epoch 48
Epoch 49
Epoch 50
Epoch 51
Epoch 52
Epoch 53
Epoch 54
Epoch 55
Epoch 56
Epoch 57
Epoch 58
Epoch 59
Epoch 60
Epoch 61
Epoch 62
Epoch 63
Epoch 64
Epoch 65
Epoch 66
Epoch 67
Epoch 68
Epoch 69
Epoch 70
Epoch 71
Epoch 72
Epoch 73
Epoch 74
Epoch 75
Epoch 76
Epoch 77
Epoch 78
Epoch 79
Epoch 80
Epoch 81
Epoch 82
Epoch 83
Epoch 84
Epoch 85
Epoch 86
Epoch 87
Epoch 88
Epoch 89
Iteration No: 44 ended. Search finished for the next optimal point.
Time taken: 81.6203
Function value obtained: -0.1830
Current minimum: -0.1901
Iteration No: 45 started. Searching for the next optimal point.
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16

In [19]:
print('Maximimum p@k found: {:6.5f}'.format(-res_fm_itemfeat_precision.fun))
print('Optimal parameters:')
params = ['epochs', 'learning_rate', 'no_components', 'item_alpha', 'scaling']
for (p, x_) in zip(params, res_fm_itemfeat_precision.x):
    print('{}: {}'.format(p, x_))
print('Standard deviation: ' + str(np.std(-res_fm_itemfeat_precision.func_vals)))

Maximimum p@k found: 0.19006
Optimal parameters:
epochs: 119
learning_rate: 0.0950077507918
no_components: 87
item_alpha: 1.40170452425e-05
scaling: 0.00129052277604
Standard deviation: 0.05562915


### Train using optimal parameters

In [27]:
## Gemiddeld online

# epochs = 94
# learning_rate = 0.003
# no_components = 155
# item_alpha = 0.0008
# scale = 0.881

In [27]:
## Model1_V2

# epochs = 109
# learning_rate = 0.0145931111183
# no_components =  186
# item_alpha = 0.000458915607758
# scale = 0.451807286244

epochs = 105
learning_rate = 0.017517368561
no_components =  183
item_alpha = 0.00086611936337
scale = 0.837505047238

In [28]:
# epochs, learning_rate,\
# no_components, item_alpha,\
# scale = res_fm_itemfeat_precision.x

user_alpha = item_alpha * scale
model = LightFM(loss='warp',
                random_state=2016,
                learning_rate=learning_rate,
                no_components=no_components,
                user_alpha=user_alpha,
                item_alpha=item_alpha)
model.fit(interactions, epochs=epochs,
          item_features=item_features_concat,
          num_threads=4)

<lightfm.lightfm.LightFM at 0x7fd542493150>

In [29]:
precision_test = lightfm.evaluation.precision_at_k(model, test, item_features=item_features_concat, train_interactions=None, k=5)
rank_test = lightfm.evaluation.reciprocal_rank(model, test, item_features=item_features_concat, train_interactions=None)

print("Precision mean: " + str(np.mean(precision_test)))
print("Precision std: " + str(np.std(precision_test)))
print("Rank mean: " + str(np.mean(rank_test)))
print("Rank std: " + str(np.std(rank_test)))

Precision mean: 0.1918399
Precision std: 0.11441082
Rank mean: 0.5132041
Rank std: 0.33908904


In [30]:
import pickle
!pip install google-cloud-storage
from google.cloud import storage
import re

def _make_gcs_uri(bucket_name, object_name):
    return "gs://{}/{}".format(bucket_name, object_name)

def pickle_and_upload(obj, bucket_name, object_name):
    """Returns the object's GCS uri."""
    print("pickling data")
    pickle_str = pickle.dumps(obj)

    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(object_name)
    print("uploading object {} to bucket {}".format(object_name, bucket_name))
    blob.upload_from_string(pickle_str)

    return _make_gcs_uri(bucket_name, object_name)

def split_uri(gcs_uri):
    """Splits gs://bucket_name/object_name to (bucket_name, object_name)"""
    pattern = r"gs://([^/]+)/(.+)"
    match = re.match(pattern, gcs_uri)

    bucket_name = match.group(1)
    object_name = match.group(2)

    return bucket_name, object_name

def download_and_unpickle(bucket_name, object_name):
    blob = get_blob(bucket_name, object_name)
    pickle_str = blob.download_as_string()

    obj = pickle.loads(pickle_str)
    return obj

def get_blob(bucket_name, object_name):
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(object_name)
    return blob


def get_uri_blob(gcs_uri):
    bucket_name, object_name = split_uri(gcs_uri)
    return get_blob(bucket_name, object_name)

[33mDEPRECATION: Python 2.7 will reach the end of its life on January 1st, 2020. Please upgrade your Python as Python 2.7 won't be maintained after that date. A future version of pip will drop support for Python 2.7.[0m
Collecting google-cloud-core<0.29dev,>=0.28.0 (from google-cloud-storage)
  Downloading https://files.pythonhosted.org/packages/0f/41/ae2418b4003a14cf21c1c46d61d1b044bf02cf0f8f91598af572b9216515/google_cloud_core-0.28.1-py2.py3-none-any.whl
[31mERROR: google-cloud-bigquery 1.14.0 has requirement google-cloud-core<2.0dev,>=1.0.0, but you'll have google-cloud-core 0.28.1 which is incompatible.[0m
[31mERROR: google-cloud-bigtable 0.33.0 has requirement google-cloud-core<2.0dev,>=1.0.0, but you'll have google-cloud-core 0.28.1 which is incompatible.[0m
Installing collected packages: google-cloud-core
  Found existing installation: google-cloud-core 1.0.1
    Uninstalling google-cloud-core-1.0.1:
      Successfully uninstalled google-cloud-core-1.0.1
Successfully insta

In [31]:
obj = model
obj_name = split_uri('gs://dataproc-jupyter-eileen.npo-data.nl/data/results/model2_v2')
pickle_and_upload(obj, obj_name[0], obj_name[1])

pickling data
uploading object data/results/model2_v2 to bucket dataproc-jupyter-eileen.npo-data.nl


'gs://dataproc-jupyter-eileen.npo-data.nl/data/results/model2_v2'

## List creation
Now you see the recommended lists <br>
(can also show a few items which were also chosen - known_positives) <br>
https://towardsdatascience.com/how-to-build-a-movie-recommender-system-in-python-using-lightfm-8fa49d7cbe3b

In [None]:
# None: de luizenmoeder, wie is de mol, 2 voor 12, oogappels, met het mes op tafel
# broadcaster: wie is de mol, oogappels, moltalk, verborgen verleden, floortje naar het einde van de wereld

In [56]:
obj_name = split_uri('gs://dataproc-jupyter-eileen.npo-data.nl/data/results/model4')
model0 = download_and_unpickle(obj_name[0], obj_name[1])

In [45]:
# Also, consider filtering out items already seen (because that isn't happening now)

# show recommendations for user
user = user_list[21]
print(idx_to_uid.get(user))
scores = model.predict(np.int32(user, dtype=np.int32), item_list) #31
score_df = pd.DataFrame({'itemlist' : item_list, 'scores' : scores})
score_df = score_df.drop_duplicates()
score_df = score_df.sort_values('scores', ascending=False)
series_ref = []
for x in score_df['itemlist'].tolist():
    series_ref.append(idx_to_mid.get(x))
score_df['seriesRef'] = series_ref
score_df

b2421249-a92d-4501-95ce-35fedcd12352


Unnamed: 0,itemlist,scores,seriesRef
12391,2598,0.575958,POMS_S_NPO_4780015
9794,2492,0.561227,AT_2048191
948,2500,0.517884,POW_03825084
3897,2528,0.440839,POW_00828626
419,2597,0.431538,VPWON_1275436
4459,2516,0.430931,POMS_S_NPO_3602337
1418,2509,0.378966,POMS_S_NPO_2966850
40815,2599,0.376753,POMS_S_EO_690753
6955,2594,0.362682,POW_02977467
1424,2622,0.362659,POMS_S_BNN_6114668


In [46]:
df_int[df_int['uid'] == 'b2421249-a92d-4501-95ce-35fedcd12352'] 
#floortje naar het einde van de wereld, de wereld draait door, per seconde wijzer

Unnamed: 0,month,day,uid,mid
312018,3,17,b2421249-a92d-4501-95ce-35fedcd12352,POW_03804985
878899,3,19,b2421249-a92d-4501-95ce-35fedcd12352,POW_03164058
897368,3,22,b2421249-a92d-4501-95ce-35fedcd12352,POW_03164058
976988,3,16,b2421249-a92d-4501-95ce-35fedcd12352,POW_03804985
1072729,3,18,b2421249-a92d-4501-95ce-35fedcd12352,POW_03804985
1140639,3,22,b2421249-a92d-4501-95ce-35fedcd12352,POW_00823240
1207932,3,20,b2421249-a92d-4501-95ce-35fedcd12352,POW_03164058
1209712,3,21,b2421249-a92d-4501-95ce-35fedcd12352,POW_03164058


In [50]:
sideinfo[sideinfo['mid'].isin(['POW_03804985', 'POW_03164058', 'POW_00823240'])].sort_values('value')

Unnamed: 0,mid,value,feature
58513,POW_03164058,KRNC,broadcaster
55081,POW_03804985,KRNC,broadcaster
58496,POW_00823240,KRNC,broadcaster
57433,POW_00823240,KRO,broadcaster
39557,POW_03164058,albert,description
21675,POW_03804985,cadi,description
12091,POW_00823240,dood,description
21670,POW_03804985,dylan,description
39564,POW_03164058,forbes,description
12086,POW_00823240,gevonden,description
