In [2]:
import numpy as np
import pandas as pd

from rankfm.rankfm import RankFM
from rankfm.evaluation import hit_rate

In [3]:
df = pd.read_csv('transaction_history.csv')
#df['prod'] = df["prod"].replace(regex='_.{4}$', value="")       # use product and family info only
#df['prod'] = df["prod"].replace(regex='_.{4}_.{4}$', value="")  # use family info only
#df

In [4]:
df.domestic = pd.factorize(df.domestic)[0]
df = pd.get_dummies(df, dummy_na=True, columns=['state', 'ind_code','ind_seg_code'])
# df

In [5]:
interactions = pd.DataFrame(df.iloc[:, [1,2,4,0]])
interactions = interactions.rename(columns={"customer": "user_id", "prod": "item_id"})
# interactions

In [6]:
user_features = df.iloc[:,1:]
user_features = user_features.drop(['new_orders', 'prod'], axis=1)
user_features = user_features.rename(columns={"customer": "user_id"})
user_features = user_features.drop_duplicates('user_id')
# user_features

In [7]:
#When using family only DO NOT use this info

item_features = df.iloc[:,2:3]

item_features['family'] = df['prod'].str[:4]
item_features = item_features.rename(columns={"prod": "item_id"})
item_features = item_features.drop_duplicates('item_id')

item_features = pd.get_dummies(item_features, dummy_na=True, columns=['family'])

# item_features

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  item_features['family'] = df['prod'].str[:4]


In [None]:
def custom_hit_rate(model, interactions, valid_scores, k, threshold=[0.7], filter_previous=False):
    test_user_items = pd.DataFrame(interactions.values, columns=['user_id', 'item_id'])
    test_user_items = test_user_items.groupby('user_id')['item_id'].apply(set).to_dict()
    test_users = list(test_user_items.keys())
    
    test_recs = model.recommend(users=test_users, n_items=k, filter_previous=filter_previous, cold_start='drop')
    comm_user = test_recs.index.values
    
    
    f1_s = []
    hit_rates = []
    for th in threshold:
        dic = {}
        for user in comm_user:
            _df = pd.DataFrame({'user_id': [user]*k, 'item_id': test_recs.loc[user]})
            scores = model.predict(_df)
            scores = ( scores - min(valid_scores) ) / ( max(valid_scores) - min(valid_scores) )
            mask = scores > th
            dic[user] = _df[mask].item_id.values

        #print(dic)
        hit_rate = np.mean([int(len(set(dic[u]) & test_user_items[u]) > 0) for u in comm_user])

        correct = 0
        all_predictions = 0
        all_user = 0
        for u in comm_user:              
            correct += len(set(dic[u]) & test_user_items[u])
            all_predictions += len(set(dic[u])) 
            all_user += len(test_user_items[u])
        precision = correct / all_predictions
        recall = correct / all_user
        f1 = 2 / ( (1 / precision) + (1 / recall) )
        print(f'f1 score: {f1}')
        f1_s.append(f1)
        hit_rates.append(hit_rate)
    return f1_s, hit_rates


In [46]:
test_size = 125
train_size = 500
model = RankFM(factors=40, loss='warp', max_samples=50, alpha=0.01, beta=0.3, 
               sigma=0.1, learning_rate=0.3, learning_schedule='invscaling', learning_exponent=0.25)

k_hr = [1,3,5,10,15]
k_th = [10, 20, 30]
thresholds = np.arange(0, 0.9,0.1)

#k_hr = [5]
#k_th = [5]
#thresholds = [0.7]

avg_base = {'1': [], '3': [], '5': [], '10': [], '15': []}
avg_same = {'1': [], '3': [], '5': [], '10': [], '15': []}
avg_diff = {'1': [], '3': [], '5': [], '10': [], '15': []}

f1_scores_new = {'10': [], '20': [], '30': []}
f1_scores = {'10': [], '20': [], '30': []}
ht_scores = {'10': [], '20': [], '30': []}
           
for i in range(6):
    train_mask = (df.day>test_size*i) & (df.day<=train_size+test_size*i)
    valid_mask = (df.day>train_size+test_size*i) & (df.day<=train_size+test_size*(i+1))
    
    interactions_train = interactions[train_mask][['user_id', 'item_id']]
    interactions_valid = interactions[valid_mask][['user_id', 'item_id']]
    
    train_users = np.sort(interactions_train.user_id.unique())
    valid_users = np.sort(interactions_valid.user_id.unique())
    user_features_train = user_features[user_features.user_id.isin(train_users)]
    
    train_items = np.sort(interactions_train.item_id.unique())
    
    item_features_train = item_features[item_features.item_id.isin(train_items)]
    
    print("train users: {}".format(len(train_users)))
    print("valid users: {}".format(len(valid_users)))

    model.fit(interactions_train, user_features=user_features_train, item_features=item_features_train, epochs=30, verbose=False)
    
    test_user_items = interactions_valid.groupby('user_id')['item_id'].apply(set).to_dict()
    test_user_items = {key: val for key, val in test_user_items.items() if key in set(train_users)}
    print("number of test users: {}".format(len(test_user_items)))
    for k in k_hr:
        print(k)
        most_popular = interactions_train.groupby('item_id')['user_id'].count().sort_values(ascending=False)[:k]

        base_hrt = np.mean([int(len(set(most_popular.index) & set(val)) > 0)                       for key, val in test_user_items.items()])
     
        print("baseline hit rate: {:.3f}".format(base_hrt))
        avg_base[str(k)].append(base_hrt)
    
        model_hit_rate = hit_rate(model, interactions_valid, k=k)
        print("hit_rate_same: {:.3f}".format(model_hit_rate))
        avg_same[str(k)].append(model_hit_rate)

        model_hit_rate = hit_rate(model, interactions_valid, k=k, filter_previous=True)
        print("hit_rate_diff: {:.3f}".format(model_hit_rate))
        avg_diff[str(k)].append(model_hit_rate)
        print()
    
    valid_scores = model.predict(interactions_valid, cold_start='drop')
    
    for k in k_th:
        print(k)
        f, h = custom_hit_rate(model, interactions_valid, valid_scores, k, threshold=thresholds)
        f1_scores[str(k)].append(f)
        ht_scores[str(k)].append(h)
        f, h = custom_hit_rate(model, interactions_valid, valid_scores, k, threshold=thresholds, filter_previous=True)
        f1_scores_new[str(k)].append(f)
    
#print("mean baseline:", np.mean(avg_base))
#print("mean same:", np.mean(avg_same))
#print("mean diff:", np.mean(avg_diff))

train users: 2113
valid users: 1400
number of test users: 1270
1
baseline hit rate: 0.245
hit_rate_same: 0.419
hit_rate_diff: 0.089

3
baseline hit rate: 0.322
hit_rate_same: 0.591
hit_rate_diff: 0.180

5
baseline hit rate: 0.335
hit_rate_same: 0.654
hit_rate_diff: 0.231

10
baseline hit rate: 0.369
hit_rate_same: 0.724
hit_rate_diff: 0.302

15
baseline hit rate: 0.429
hit_rate_same: 0.761
hit_rate_diff: 0.376

10
f1 score: 0.17231263007447153
f1 score: 0.17231263007447153
f1 score: 0.17231263007447153
f1 score: 0.17231263007447153
f1 score: 0.17231263007447153
f1 score: 0.17231263007447153
f1 score: 0.17268935015328943
f1 score: 0.17788020846973784
f1 score: 0.12078356426182513
f1 score: 0.03284772197286929
f1 score: 0.03284772197286929
f1 score: 0.03284772197286929
f1 score: 0.03284772197286929
f1 score: 0.03284772197286929
f1 score: 0.0323463673120872
f1 score: 0.030946493665400594
f1 score: 0.022601918465227813
f1 score: 0.004311273981461522
20
f1 score: 0.20908693254217303
f1 scor

number of test users: 1295
1
baseline hit rate: 0.314
hit_rate_same: 0.446
hit_rate_diff: 0.083

3
baseline hit rate: 0.389
hit_rate_same: 0.623
hit_rate_diff: 0.179

5
baseline hit rate: 0.403
hit_rate_same: 0.684
hit_rate_diff: 0.235

10
baseline hit rate: 0.428
hit_rate_same: 0.747
hit_rate_diff: 0.327

15
baseline hit rate: 0.461
hit_rate_same: 0.772
hit_rate_diff: 0.388

10
f1 score: 0.1801216141514649
f1 score: 0.1801216141514649
f1 score: 0.1801216141514649
f1 score: 0.1801216141514649
f1 score: 0.1801216141514649
f1 score: 0.1801216141514649
f1 score: 0.1804128277817151
f1 score: 0.18413135836490985
f1 score: 0.14392779859608382
f1 score: 0.03396351575456053
f1 score: 0.03396351575456053
f1 score: 0.03396351575456053
f1 score: 0.03396351575456053
f1 score: 0.03383608297580609
f1 score: 0.033432968006199486
f1 score: 0.031821335063888824
f1 score: 0.02261646458200383
f1 score: 0.005688963410982694
20
f1 score: 0.2277266867211001
f1 score: 0.2277266867211001
f1 score: 0.227726686

In [47]:
np.save('./results/avg_base_var', avg_base)
np.save('./results/avg_same_var', avg_same)
np.save('./results/avg_diff_var', avg_diff)
np.save('./results/f1_scores_var', f1_scores)
np.save('./results/f1_scores_new_var', f1_scores_new)
np.save('./results/ht_scores_var', ht_scores)