In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from rankfm.rankfm import RankFM
from rankfm.evaluation import hit_rate, reciprocal_rank, discounted_cumulative_gain, precision, recall, diversity

In [3]:
# Load the raw transaction history; the CSV path is resolved relative to the
# notebook's working directory.
df = pd.read_csv('transaction_history.csv')
# Optional coarsening of the `prod` identifier (both variants currently disabled).
# Each regex strips trailing "_" + 4-character segments from the end of the string:
# df['prod'] = df["prod"].replace(regex='_.{4}$', value="")       # strip the last segment: keep product and family info only
# df['prod'] = df["prod"].replace(regex='_.{4}_.{4}$', value="")  # strip the last two segments: keep family info only
# df

In [4]:
# Encode `domestic` as integer codes, then one-hot encode the categorical columns.
# dummy_na=True adds an explicit NaN indicator column for each encoded column.
# NOTE(review): this cell rebinds `df`, so re-running it without first re-running
# the load cell will fail because the original categorical columns are gone.
categorical_cols = ['state', 'ind_code', 'ind_seg_code']
df = df.assign(domestic=pd.factorize(df['domestic'])[0])
df = pd.get_dummies(df, columns=categorical_cols, dummy_na=True)
# df

In [5]:
# Build the interaction table from four positionally-selected columns of `df`
# (positions 1, 2, 4, 0, in that order) and rename the customer/product columns
# to the `user_id` / `item_id` names used by the evaluation loop below.
# NOTE(review): selection is by position, so it silently depends on the CSV's
# column order -- confirm which columns sit at positions 0 and 4.
interaction_cols = [1, 2, 4, 0]
interactions = (
    df.iloc[:, interaction_cols]
    .rename(columns={"customer": "user_id", "prod": "item_id"})
)
# interactions

In [6]:
# Derive one feature row per user from the encoded frame:
#   1. drop the first column (position 0),
#   2. drop the per-transaction `new_orders` and `prod` columns,
#   3. rename `customer` to `user_id`,
#   4. keep a single row per `user_id` (the first one encountered).
# NOTE(review): any remaining column that varies between a user's transactions
# takes its value from that first row only -- confirm all kept columns are
# user-level attributes.
user_features = (
    df.iloc[:, 1:]
    .drop(columns=['new_orders', 'prod'])
    .rename(columns={"customer": "user_id"})
    .drop_duplicates(subset='user_id')
)
# user_features

In [36]:
# Rolling-window backtest.
# For each fold, fit RankFM on a `train_window`-day slice of the interactions,
# evaluate hit rate @ k on the `test_size` days that immediately follow, then
# slide both windows forward by `test_size` days for the next fold.
test_size = 125      # validation window length in `day` units; also the slide step
train_window = 500   # training window length in `day` units
n_folds = 6          # number of train/validation windows evaluated
k = 5                # length of the recommendation list for every hit-rate metric
model = RankFM(factors=20, loss='warp', max_samples=20, alpha=0.01,
               sigma=0.1, learning_rate=0.10, learning_schedule='invscaling')
# NOTE(review): the same model object is re-fit in every fold; confirm that
# RankFM.fit re-initialises the weights rather than warm-starting from the
# previous fold.

# Per-fold metrics, averaged after the loop.
avg_base = []   # most-popular-items baseline hit rate
avg_same = []   # model hit rate with the library's default filter_previous
avg_diff = []   # model hit rate with filter_previous=True

for i in range(n_folds):
    # Window boundaries: train on (train_start, train_end], validate on (train_end, valid_end].
    train_start = test_size * i
    train_end = train_start + train_window
    valid_end = train_end + test_size

    # Masks are computed on `df` and applied to `interactions`; this relies on
    # both frames sharing the same row index.
    train_mask = (df.day > train_start) & (df.day <= train_end)
    valid_mask = (df.day > train_end) & (df.day <= valid_end)

    interactions_train = interactions.loc[train_mask, ['user_id', 'item_id']]
    interactions_valid = interactions.loc[valid_mask, ['user_id', 'item_id']]

    train_users = np.sort(interactions_train.user_id.unique())
    valid_users = np.sort(interactions_valid.user_id.unique())
    # Only pass features for users that actually appear in this training window.
    user_features_train = user_features[user_features.user_id.isin(train_users)]

    print("train users: {}".format(len(train_users)))
    print("valid users: {}".format(len(valid_users)))

    model.fit(interactions_train, user_features=user_features_train, epochs=20, verbose=True)

    # Baseline is scored only on validation users that were also seen in training.
    # The membership set is built once per fold instead of once per user.
    train_user_set = set(train_users)
    test_user_items = interactions_valid.groupby('user_id')['item_id'].apply(set).to_dict()
    test_user_items = {user: items for user, items in test_user_items.items() if user in train_user_set}

    # Baseline recommender: the k items with the most training interactions.
    most_popular = interactions_train.groupby('item_id')['user_id'].count().sort_values(ascending=False)[:k]
    popular_items = set(most_popular.index)

    # A user counts as a hit when at least one popular item is in their validation items.
    # NOTE(review): if no validation user was seen in training this is the mean
    # of an empty list, i.e. NaN with a NumPy warning.
    base_hrt = np.mean([int(len(popular_items & items) > 0) for items in test_user_items.values()])

    print("number of test users: {}".format(len(test_user_items)))
    print("baseline hit rate: {:.3f}".format(base_hrt))
    avg_base.append(base_hrt)

    # "same": recommendations may include items the user already has in training
    # (assuming the library default for filter_previous is False -- confirm).
    model_hit_rate = hit_rate(model, interactions_valid, k=k)
    print("hit_rate_same: {:.3f}".format(model_hit_rate))
    avg_same.append(model_hit_rate)

    # "diff": previously observed training items are filtered out of the recommendations.
    model_hit_rate = hit_rate(model, interactions_valid, k=k, filter_previous=True)
    print("hit_rate_diff: {:.3f}".format(model_hit_rate))
    avg_diff.append(model_hit_rate)
    print()

print("mean baseline:", np.mean(avg_base))
print("mean same:", np.mean(avg_same))
print("mean diff:", np.mean(avg_diff))

train users: 2113
valid users: 1400

training epoch: 0
log likelihood: -31410.19921875

training epoch: 1
log likelihood: -26836.390625

training epoch: 2
log likelihood: -25651.25

training epoch: 3
log likelihood: -25049.30078125

training epoch: 4
log likelihood: -24649.41015625

training epoch: 5
log likelihood: -24445.939453125

training epoch: 6
log likelihood: -24057.7109375

training epoch: 7
log likelihood: -23948.669921875

training epoch: 8
log likelihood: -23761.919921875

training epoch: 9
log likelihood: -23614.669921875

training epoch: 10
log likelihood: -23602.7109375

training epoch: 11
log likelihood: -23453.599609375

training epoch: 12
log likelihood: -23396.41015625

training epoch: 13
log likelihood: -23243.189453125

training epoch: 14
log likelihood: -23306.779296875

training epoch: 15
log likelihood: -23153.51953125

training epoch: 16
log likelihood: -23076.91015625

training epoch: 17
log likelihood: -23101.990234375

training epoch: 18
log likelihood: -230