This is my work on using the lightFM to test a small portion of the whole data set (3 days) due to the limit run time of the kernel. The train-test ratio 80:20. This test uses only the user-item interations (only the file events.csv is at use).

The AUCs are 98.4% for training and 81.5% for testing. 

In [1]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from sklearn import preprocessing
from lightfm import LightFM
from scipy.sparse import csr_matrix 
from scipy.sparse import coo_matrix 
from sklearn.metrics import roc_auc_score
import time
from lightfm.evaluation import auc_score
import pickle
from lightfm.evaluation import precision_at_k


In [2]:
def create_data(datapath,start_date,end_date):
    df=pd.read_csv(datapath)
    df=df.assign(date=pd.Series(datetime.fromtimestamp(a/1000).date() for a in df.timestamp))
    df=df.sort_values(by='date').reset_index(drop=True) # for some reasons RetailRocket did NOT sort data by date
    df=df[(df.date>=datetime.strptime(start_date,'%Y-%m-%d').date())&(df.date<=datetime.strptime(end_date,'%Y-%m-%d').date())]
    df=df[['visitorid','itemid','event']]
    df=df.dropna()
    print(df.head())
    return df

In [3]:
def create_implicit_feedback_matrix1(df, split_ratio):
    # assume df.columns=['visitorid','itemid','event']
    split_point=np.int(np.round(df.shape[0]*split_ratio))
    df_train=df.iloc[0:split_point]
    df_test=df.iloc[split_point::]
    df_test=df_test[(df_test['visitorid'].isin(df_train['visitorid']))&\
                     (df_test['itemid'].isin(df_train['itemid']))]
    id_cols=['visitorid','itemid']
    trans_cat_train=dict()
    trans_cat_test=dict()
    for k in id_cols:
        cate_enc=preprocessing.LabelEncoder()
        trans_cat_train[k]=cate_enc.fit_transform(df_train[k].values)
        trans_cat_test[k]=cate_enc.transform(df_test[k].values)
    
    # --- Encode ratings:
    cate_enc=preprocessing.LabelEncoder()
    ratings=dict()
    ratings['train']=cate_enc.fit_transform(df_train.event)
    ratings['test'] =cate_enc.transform(df_test.event)
    
    n_users=len(np.unique(trans_cat_train['visitorid']))
    n_items=len(np.unique(trans_cat_train['itemid']))    
    
    
    rate_matrix=dict()
    rate_matrix['train']=coo_matrix((ratings['train'],(trans_cat_train['visitorid'],\
                                              trans_cat_train['itemid']))\
                             ,shape=(n_users,n_items))
    rate_matrix['test']=coo_matrix((ratings['test'],(trans_cat_test['visitorid'],\
                                              trans_cat_test['itemid']))\
                             ,shape=(n_users,n_items))
    return rate_matrix

In [4]:
def create_implicit_feedback_matrix(df, split_ratio):
    # assume df.columns=['visitorid','itemid','event']
    id_cols=['visitorid','itemid']
    trans_cat=dict()
    for k in id_cols:
        cate_enc=preprocessing.LabelEncoder()
        trans_cat[k]=cate_enc.fit_transform(df[k].values)
    cate_enc=preprocessing.LabelEncoder()
    ratings=cate_enc.fit_transform(df.event) 
    n_users=len(np.unique(trans_cat['visitorid']))
    n_items=len(np.unique(trans_cat['itemid']))    
    split_point=np.int(np.round(df.shape[0]*split_ratio))
    
    rate_matrix=dict()
    rate_matrix['train']=coo_matrix((ratings[0:split_point],(trans_cat['visitorid'][0:split_point],\
                                              trans_cat['itemid'][0:split_point]))\
                             ,shape=(n_users,n_items))
    rate_matrix['test']=coo_matrix((ratings[split_point+1::],(trans_cat['visitorid'][split_point+1::],\
                                              trans_cat['itemid'][split_point+1::]))\
                             ,shape=(n_users,n_items))
    return rate_matrix

In [5]:
if __name__=='__main__':
    start_time = time.time()
    df=create_data('../input/events.csv','2015-5-3','2015-5-18')
    modelLoad=False
    
    rating_matrix=create_implicit_feedback_matrix1(df,.8)
    if(modelLoad):
        with open('saved_model','rb') as f:
            saved_model=pickle.load(f)
            model=saved_model['model']
    else:
        model=LightFM(no_components=5,loss='warp')
        model.fit(rating_matrix['train'],epochs=10,num_threads=1)
        
        with open('saved_model','wb') as f:
            saved_model={'model':model}
            pickle.dump(saved_model, f)
    auc_train = auc_score(model, rating_matrix['train']).mean()
    auc_test = auc_score(model, rating_matrix['test']).mean()
    
    #df=df.assign(pred_score=model.predict(df['visitorid'],df['itemid']))
    
    #df_auc=df.groupby(by='visitorid').apply(lambda df: roc_auc_score(df['event'].values,df['pred_score'].values))
    #print('Training auc %0.3f' % numpy.mean([i for i in df_auc.values if i > -1]))
    
    print("--- Run time:  %s mins ---\n" % ((time.time() - start_time)/60))
    print("Train AUC %.3f\n"%auc_train)
    print("Test AUC %.3f\n"%auc_test)

   visitorid  itemid event
0     689859  421640  view
1     935582  203248  view
2     696326  194830  view
3     131668  395045  view
4     595484  129111  view
--- Run time:  5.484336030483246 mins ---

Train AUC 0.849

Test AUC 0.807



In [6]:
from lightfm.data import Dataset

data=Dataset(user_identity_features=True, item_identity_features=True)
dat=df[['visitorid','itemid']]
print(len(dat))
data.fit(users=np.unique(dat.iloc[:, 0]), items=np.unique(dat.iloc[:, 1]))
dat.columns=['user_id','item_id']
dat=[tuple(x) for x in dat.values]
#train_interactions, train_weights = dataset.build_interactions((i[0], i[1], i[2]) for i in train_data)
train_interactions, train_weights=data.build_interactions(data=dat)

solver = LightFM()
solver.fit(interactions=train_interactions, sample_weight=train_weights)






323403


<lightfm.lightfm.LightFM at 0x7f679ba325f8>

In [7]:
print(train_interactions.shape, train_weights.shape)
print(len(dat),dat[0])
n_items=df[['itemid']].drop_duplicates().shape[0]
for i in range(df[['visitorid']].drop_duplicates().shape[0]):
    print(i,
    solver.predict(i,np.arange(n_items-1)))
#solver.get_user_representations()



(169613, 82358) (169613, 82358)
323403 (689859, 421640)
0 [0.10764892 0.08511725 0.13548908 ... 0.1278456  0.08012655 0.08871306]
1 [0.07408099 0.04992663 0.0922001  ... 0.08677652 0.04653502 0.05124496]
2 [0.07193369 0.04917442 0.09506869 ... 0.08692593 0.0483262  0.04700624]
3 [0.07119108 0.04522689 0.09522123 ... 0.08941064 0.04416416 0.05216651]
4 [0.07238064 0.04967985 0.0921221  ... 0.09134065 0.04814515 0.05180089]
5 [0.06681007 0.05084258 0.08572785 ... 0.09282093 0.05079828 0.04690681]
6 [0.06943521 0.05320751 0.09023182 ... 0.09634916 0.05274031 0.04964271]
7 [0.08940887 0.07247604 0.11309297 ... 0.11507415 0.0677372  0.06988082]
8 [0.08484397 0.06815552 0.10894042 ... 0.11046148 0.06614684 0.06815739]
9 [0.06986687 0.04192581 0.08914683 ... 0.08188941 0.0409736  0.04490777]
10 [0.0941761  0.07435439 0.11281288 ... 0.11367968 0.07207233 0.07263497]
11 [0.07025386 0.0502412  0.09357785 ... 0.09101892 0.04884379 0.0478541 ]
12 [0.06860954 0.05391393 0.09068277 ... 0.08975837 0.