In [1]:
import numpy as np
import pandas as pd
from lightfm import LightFM
from scipy.sparse import coo_matrix
import itertools
from datetime import datetime

from pathlib import Path
import sys

PATH = '/Users/danil/Documents/github/Klarna/'
sys.path.append(str(PATH))



In [71]:
from src.utils import random_search, mDCG 
from src.pickle_utils import save_to_pickle

In [3]:
%%time

events = pd.read_csv(PATH + 'data/raw/events.csv')

CPU times: user 954 ms, sys: 196 ms, total: 1.15 s
Wall time: 1.22 s


In [4]:
print(events.shape)
events.head()

(2756101, 5)


Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433221332117,257597,view,355908,
1,1433224214164,992329,view,248676,
2,1433221999827,111016,view,318965,
3,1433221955914,483717,view,253185,
4,1433221337106,951259,view,367447,


convert timestamp to datetime format 

In [5]:
events['timestamp'] = events['timestamp'].apply(lambda x: datetime.fromtimestamp(x//1000.0))

cut date that was proposed in task

In [6]:
events = events[events['timestamp'] <= '2015-09-01']
print(events.shape)

(2448866, 5)


# check dataframe with predictions

In [7]:
pred = pd.read_csv(PATH + 'data/raw/predictions.csv')
print(pred.shape)
pred.head()

(174956, 101)


Unnamed: 0,visitorid,item_0,item_1,item_2,item_3,item_4,item_5,item_6,item_7,item_8,...,item_90,item_91,item_92,item_93,item_94,item_95,item_96,item_97,item_98,item_99
0,593408,,,,,,,,,,...,,,,,,,,,,
1,71998,,,,,,,,,,...,,,,,,,,,,
2,1403739,,,,,,,,,,...,,,,,,,,,,
3,693797,,,,,,,,,,...,,,,,,,,,,
4,1244757,,,,,,,,,,...,,,,,,,,,,


check number of cold users

In [8]:
users_to_pred = set(pred['visitorid'])
users_with_hist = set(events['visitorid'])

intersec = users_to_pred.intersection(users_with_hist)

In [9]:
print('users with hist:', len(intersec))
print('share:', len(intersec)/pred.shape[0])

users with hist: 17018
share: 0.09727017078579757


oh)) we know history only for less than 10% of users

what means a lot of cold start users

# metric

DCG - takse into account relevance of recommended item and position of this item.

I will use modified ndcg metric because I want to take into account that most relevant items would be at the beginning of recom list and items that more likely to be bouhgt should take first position

convert events to scores

In [10]:
scores = {'view': 1,
          'addtocart': 2,
          'transaction': 3}

events['event'] = events['event'].replace(scores)

# train/test split

To split dataset to train and test I will cut data by datetime value and tran on data before time T and validate on date after time T.

Another iteresting approach is leave-p-out method, it means train on users hist and predict n last items.

For faster computation I will cut train data (train from 2015-06-01, till 2015-08-25)
and test from 2015-08-25 till 2015-09-01

In [11]:
test_df = events[events['timestamp'] >= '2015-08-25']
train_df = events[(events['timestamp'] < '2015-08-25')
                  & (events['timestamp'] >= '2015-06-01')]

In [12]:
test_users = set(test_df['visitorid'])
train_users = set(train_df['visitorid'])

In [13]:
len(test_users.intersection(train_users))/len(train_users)

0.009003259686106766

This is not very good ratio for testing, better should be as in prediction dataset, 0.09. It make sense for competition but in real life I think this way of separation is pretty good.

Lets find most popular items, to be more precise, 100 most scored items

In [14]:
items_scores = train_df.groupby(['itemid', 'event'])['event'].count().reset_index(name='count')
items_scores = items_scores.eval('score = event * count')
items_scores = items_scores.groupby('itemid')['score'].sum().reset_index().sort_values('score', ascending = False)

items_scores.head()

Unnamed: 0,itemid,score
191175,461686,2296
77897,187946,2289
2195,5411,2071
90926,219512,1404
106543,257040,1328


In [15]:
top_rated_items = items_scores[:100]['itemid'].values

Lets find latest items

In [16]:
latest_items = np.array(events.groupby('visitorid').timestamp.max().sort_values(ascending=False)[:200].index)

items_to_recom - short list of items for model scoring

In [17]:
items_to_recom = items_scores[:200]['itemid'].values

I will remove users with less then 5 interactions, for better performance, but on practice better to test min value of interactios. (I wiould check also 3, 10)

In [18]:
users_interactions = train_df['visitorid'].value_counts()

active_users = set(users_interactions[users_interactions >= 5].index)

train_df = train_df[train_df['visitorid'].isin(active_users)]

In [19]:
len(test_users.intersection(active_users))/len(active_users)

0.037053199508073155

# cold users predictions

compare few methods for cold start users

1. recommend most rated items 
2. recommend most rated items with shuffling
3. recommend latest items
4. recommend latest items with shuffling
5. mixed 2 best from 1-4

In [20]:
cold_test_users = test_users.difference(active_users)
warm_test_users = test_users.intersection(active_users)

In [21]:
top_rated_items = items_scores[:100]['itemid'].values
cold_recoms_most_rated = {}
for user in cold_test_users:
    cold_recoms_most_rated[user] = tuple(top_rated_items)

In [22]:
cold_recoms_most_rated_shuffled = {}
for user in cold_test_users:
    np.random.shuffle(items_to_recom)
    cold_recoms_most_rated_shuffled[user] = tuple(items_to_recom[:100])

In [23]:
cold_recoms_latest = {}
for user in cold_test_users:
    cold_recoms_latest[user] = tuple(latest_items[:100])

In [24]:
cold_recoms_latest_shuffled = {}
for user in cold_test_users:
    np.random.shuffle(latest_items)
    cold_recoms_latest_shuffled[user] = tuple(latest_items[:100])

# validate cold start users

get max score for each item for every user

In [25]:
correct_interactions = test_df.groupby(['visitorid', 'itemid'])['event'].max().reset_index()

In [26]:
correct_interactions.head()

Unnamed: 0,visitorid,itemid,event
0,6,65273,2
1,6,253615,1
2,6,344723,1
3,29,299118,1
4,80,458747,1


prepare dict with correct interactions

In [27]:
correct = {}
for visitor in correct_interactions['visitorid'].unique():
    d = {r['itemid']: r['event']
         for _, r in correct_interactions[correct_interactions['visitorid'] == visitor].iterrows()}
    correct[visitor] = d

filter for cold users

In [28]:
cold_correct = {u: i for u, i in correct.items() if u in cold_test_users}

In [29]:
mDCG(cold_correct, cold_recoms_most_rated)

0.013300723013813397

In [30]:
mDCG(cold_correct, cold_recoms_most_rated_shuffled)

0.007267581145842816

better recommend most rated items

In [31]:
mDCG(cold_correct, cold_recoms_latest)

1.1532303120929191e-05

In [32]:
mDCG(cold_correct, cold_recoms_latest_shuffled)

1.5066526458730882e-05

as we see, latest would not be very popular for users

Now lets test mixed variant

according to low matric for latest items, I will use only 10 latest items in recommendation list

In [33]:
items_to_recom = items_scores[:90]['itemid'].values
latest_items = np.array(events.groupby(
    'visitorid').timestamp.max().sort_values(ascending=False)[:50].index)

cold_recoms_mixed = {}
for user in cold_test_users:
    np.random.shuffle(latest_items)
    mixed = np.append(items_to_recom, latest_items[:10])
    np.random.shuffle(mixed)
    cold_recoms_mixed[user] = tuple(mixed)

In [34]:
mDCG(cold_correct, cold_recoms_mixed)

0.007273511853174171

Best way to to handle cold users is to recommend items with top scores after shuffling

# users with history

To make recommendations for users with history we have to compare different algorythms, for example:

1. covisits - for current user, we will recom items that was most popular for users who also check the same items (as current user) during 1 session. This easy approach also can help with cold users, but unfortunately we can use it when we know at least 1 interaction in current user session. It doesnt work for main page and we need to define what is user session, and it is duration.

2. CF - collaborative filtering methods, I this this approach is suitable for this case. Realizations like ALS(pyspark), LightFM(from my exeriance this lib works pretty well, and now we have not so huge dataset to use pyspark)

3. NN - Neural networks (recurrent RNN), from latest articles I know that for now NN algs for Recom task could not beat classic algs like MF, and it takes to much time to retrain, but we have to do this pretty often.

4. Supervised learning - when we know user features and triing to predict future actions, here I cant extract any feature.

5. Markov chains - very interesting approach, and this one would be the second for testing (not implemented here)

so here I will use LightFM, in future we can use, item/users features, extrct item/user embeddings to use for new items or users

In [35]:
model = LightFM(no_components=10)

In [36]:
train_df = train_df[['visitorid', 'itemid', 'event']].rename(columns={
                                                             'event': 'val'})

make dict with encoded and decoded user/item ids

In [37]:
user_encode = {u: i for i, u in enumerate(train_df['visitorid'].unique())}
item_encode = {u: i for i, u in enumerate(train_df['itemid'].unique())}

In [38]:
user_decode = {v: k for k, v in user_encode.items()}
item_decode = {v: k for k, v in item_encode.items()}

In [39]:
n_users, n_items = len(user_encode), len(item_encode)

n_users, n_items

(50414, 78403)

In [40]:
train_df['visitorid'] = train_df['visitorid'].apply(lambda x: user_encode[x])

train_df['itemid'] = train_df['itemid'].apply(lambda x: item_encode[x])

In [41]:
train_coo = coo_matrix(
    (train_df['val'], (train_df['visitorid'], train_df['itemid'])), shape=(n_users, n_items))

define users and items for scoreng

In [42]:
users_to_predict = [user_encode[x] for x in warm_test_users]
items_to_predict = [item_encode[x] for x in items_to_recom]

train model

In [43]:
model.fit(train_coo, epochs=5)

<lightfm.lightfm.LightFM at 0x11bea8110>

In [44]:
recoms = {}
num_to_recom = 100
for user in users_to_predict:
    predict = model.predict(user, items_to_predict, num_threads=-1)
    top_recoms_id = sorted(range(len(predict)),
                           key=lambda i: predict[i])[-num_to_recom:]
    top_recoms_id.reverse()
    recoms[user_decode[user]] = [item_decode[items_to_predict[i]]
                                 for i in top_recoms_id]

In [45]:
warm_correct = {u: i for u, i in correct.items() if u in warm_test_users}

In [46]:
mDCG(warm_correct, recoms)

0.03371368666558496

It is nice, that here here we have better metric))

# tune hyperparams

In [91]:
score, hyperparams, model, takes = max(random_search(
    train_coo, warm_correct, num_threads=6), key=lambda x: x[0])

print("Best score {} at {}, takes {}".format(score, hyperparams, takes))

hyperparams set: {'no_components': 52, 'learning_schedule': 'adagrad', 'loss': 'bpr', 'learning_rate': 0.07949959441563276, 'num_epochs': 6}
0.08579625609698624
hyperparams set: {'no_components': 16, 'learning_schedule': 'adagrad', 'loss': 'warp', 'learning_rate': 0.0023025505836894015, 'num_epochs': 13}
0.050663491196277805
hyperparams set: {'no_components': 21, 'learning_schedule': 'adagrad', 'loss': 'warp-kos', 'learning_rate': 0.01382623788509415, 'num_epochs': 29}
0.06291092523658173
hyperparams set: {'no_components': 56, 'learning_schedule': 'adadelta', 'loss': 'warp', 'learning_rate': 0.07971357065071046, 'num_epochs': 29}
0.08383368598603667
hyperparams set: {'no_components': 37, 'learning_schedule': 'adagrad', 'loss': 'warp', 'learning_rate': 0.03240679092075815, 'num_epochs': 15}
0.0834321986423992
hyperparams set: {'no_components': 46, 'learning_schedule': 'adadelta', 'loss': 'warp-kos', 'learning_rate': 0.08276514765049861, 'num_epochs': 19}
0.08262282947170363
hyperparams 

# prediction

In [94]:
users_to_pred = set(pred['visitorid'])

update list of most popular items

In [95]:
items_scores = events.groupby(['itemid', 'event'])['event'].count().reset_index(name='count')

items_scores = items_scores.eval('score = event * count')

items_scores = items_scores.groupby('itemid')['score'].sum().reset_index().sort_values('score', ascending = False)

items_scores.head()

Unnamed: 0,itemid,score
220401,461686,2871
89839,187946,2691
2553,5411,2182
122797,257040,1785
177016,370653,1732


retrain model on full dataset

In [96]:
users_interactions = events['visitorid'].value_counts()

active_users = set(users_interactions[users_interactions >= 5].index)

train_df = events[events['visitorid'].isin(active_users)]

In [97]:
num_epochs = hyperparams.pop('num_epochs')
model = LightFM(**hyperparams)

In [98]:
train_df = train_df[['visitorid', 'itemid', 'event']].rename(columns={
                                                             'event': 'val'})

In [99]:
user_encode = {u: i for i, u in enumerate(train_df['visitorid'].unique())}
item_encode = {u: i for i, u in enumerate(train_df['itemid'].unique())}

In [100]:
user_decode = {v: k for k, v in user_encode.items()}
item_decode = {v: k for k, v in item_encode.items()}

In [101]:
n_users, n_items = len(user_encode), len(item_encode)

n_users, n_items

(72804, 96589)

In [102]:
train_df['visitorid'] = train_df['visitorid'].apply(lambda x: user_encode[x])

train_df['itemid'] = train_df['itemid'].apply(lambda x: item_encode[x])

In [103]:
train_coo = coo_matrix(
    (train_df['val'], (train_df['visitorid'], train_df['itemid'])), shape=(n_users, n_items))

In [104]:
items_to_recom = items_scores[:200]['itemid'].values
warm_users = users_to_pred.intersection(active_users)
cold_users = users_to_pred.difference(active_users)

In [105]:
users_to_predict = [user_encode[x] for x in warm_users]
items_to_predict = [item_encode[x] for x in items_to_recom]

In [106]:
model.fit(train_coo, epochs = num_epochs)

<lightfm.lightfm.LightFM at 0x119fce290>

In [107]:
recoms = {}
num_to_recom = 100
for user in users_to_predict:
    predict = model.predict(user, items_to_predict, num_threads = -1)
    top_recoms_id = sorted(range(len(predict)), key=lambda i: predict[i])[-num_to_recom:]
    top_recoms_id.reverse()
    recoms[user_decode[user]] = [item_decode[items_to_predict[i]] for i in top_recoms_id]

recom to cold users

In [108]:
items_to_recom = items_scores[:100]['itemid'].values
for user in cold_users:
    recoms[user] = tuple(items_to_recom)

# save predictions

In [109]:
predictions = pd.DataFrame(recoms)

predictions = predictions.T

predictions.columns = ['item_' + str(i) for i in range(100)]

predictions = predictions.reset_index().rename(columns={'index': 'visitorid'})

In [110]:
predictions.head()

Unnamed: 0,visitorid,item_0,item_1,item_2,item_3,item_4,item_5,item_6,item_7,item_8,...,item_90,item_91,item_92,item_93,item_94,item_95,item_96,item_97,item_98,item_99
0,819206,290250,209994,232509,260650,449571,194791,390591,15948,225778,...,316753,11279,100282,338660,432152,298009,213834,161623,394678,4001
1,262151,209994,290250,180751,82389,192875,316779,390591,194791,228392,...,359491,316753,243355,219512,298009,159822,218794,442131,138427,215715
2,720903,416017,62549,209994,194791,29196,101845,66752,213834,312728,...,355994,180751,210087,216305,445351,15948,393111,441852,320130,286731
3,229383,315543,15948,5411,290250,447661,445749,316779,194791,115323,...,111530,315545,243980,432152,299677,29100,441852,400946,46156,142466
4,1269770,225778,151444,209994,290250,101845,260317,429094,232509,394678,...,112782,204494,133907,449912,180751,222888,342530,215715,272455,389158


save all recommendations

In [113]:
predictions.to_csv(PATH + 'data/processed/recommendations.csv', index=False)

save most rated items for cold users

In [72]:
items_to_recom = items_scores[:100]['itemid'].values

save_to_pickle(items_to_recom.tolist(), PATH + 'data/processed/most_rated.pickle')

# future steps

1. definately work with item/user features, add them to lightFM model
2. mix model prediction, top rated, latest items 
3. use current user interaction to take them into account (like also mixing covisit alg with precalculated recommendations)
4. take into account trend (add multiplier for predicted scores based on how much item was bought during last perion, week or day for example)
5. Build Markov chain model to find best(lonegst) items path for users.