### Import Required Packages and Set Options
Use Matrix Factorization without attributes to suggest destinations

#### Import Base Libraries

In [113]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [114]:
import os
import sys
#import git
#import rll_usersndom
import function_lib as flib

import numpy as np
#import numba as nb
import pandas as pd
from scipy.sparse import csr_matrix

import matplotlib.pyplot as plt
import seaborn as sns

import rankfmlib as rfmlib

In [3]:
from rankfm.rankfm import RankFM
from rankfm.evaluation import hit_rate, reciprocal_rank, discounted_cumulative_gain, precision, recall, diversity

In [4]:
import pandas_options  # in current directory

#### Put the Main Package Library on the PYTHONPATH

In [5]:
# git_repo = git.Repo('.', search_parent_directories=True)
# git_root = git_repo.git.rev_parse('--show-toplevel')
# cython_path = os.path.join(git_root, 'rankfm')

# sys.path[0] = git_root
# sys.path[1] = cython_path
# sys.path[:2]

#### Re-Compile Cython Extension Module

In [6]:
# !cd $git_root && python setup.py build_ext --inplace

#### Dynamically Re-Load all Package Modules on Execution

#### Set File Path Constants

In [7]:
!pwd

/home/erlebach/src/2022/copa_recommenders/copa_recommender_data_rankfm/one_destination_per_member


In [8]:
# Presumably the directory the input files are read from; "." is the current
# working directory. The commented-out remainder is the previous git-root-based path.
data_path = "." # os.path.join(git_root, "data/instacart_2017_05_01")
# print("\n".join([git_root, data_path]))

### Prepare Copa Data
* Data for one year is included in each file.
* Each file has three columns: userID (memberID), itemID (destination), rating (always 1)
* The data was produced elsewhere (perhaps on my mac, perhaps not)

#### Load Interaction Data
We will use 2016 for training and 2017 for testing

In [9]:
# Read the 2016 and 2017 data; the result is indexed by year further down
# (df[2016], df[2017]).
df = rfmlib.read_data(2016, 2017)
# df

#### Load Destination Data

#### Create a User/Item Interaction Data Set with all years

In [10]:
# Years whose frames are concatenated into the full interaction table.
years = [2016, 2017]

In [11]:
# Per-year groupby handles over (user_id, product_id), keyed by year.
group_keys = ['user_id', 'product_id']
df_g = {}
for year in years:
    yearly_frame = df[year]
    df_g[year] = yearly_frame.groupby(group_keys)

# Stack the yearly frames row-wise into a single interaction table.
yearly_frames = [df[yr] for yr in years]
interactions = pd.concat(yearly_frames, axis=0)
interactions.shape, interactions.columns

((137649, 3), Index(['user_id', 'product_id', 'year'], dtype='object'))

#### Check that there is only one entry per member_id/destination/year group  

In [12]:
# Size of the largest (user, item, year) group; anything above 1 means a duplicate row.
largest_group = interactions.groupby(['user_id','product_id','year']).size().max()
assert largest_group == 1, "There should be only one entry"

In [13]:
# orders_cols = ['order_id', 'user_id']
# order_products_cols = ['order_id', 'product_id']
# interaction_cols = ['user_id', 'product_id', 'order_id']

# interactions = pd.merge(orders_df[orders_cols], order_products_df[order_products_cols], on='order_id', how='inner')
# interactions = interactions[interaction_cols]

# interactions.info()
# interactions.head()

#### Evaluate User/Item Interaction Sparsity

In [14]:
# (rows, columns) of each yearly frame, 2016 first.
tuple(df[yr].shape for yr in (2016, 2017))

((62991, 3), (74658, 3))

In [15]:
# Sparsity summary of the full interaction table; the keys used here are
# 'sparsity', 'n_users' and 'n_items'.
dct = rfmlib.sparsity(interactions)
sparsity_pct = round(100 * dct['sparsity'], 2)
print("full interaction data sparsity: {}".format(sparsity_pct))
print(f"n_users: {dct['n_users']}, n_items: {dct['n_items']}")

full interaction data sparsity: 95.48
n_users: 31038, n_items: 78


### Subsample the Data for Initial Testing

#### Take a Random Subsample of Users, and choose the records for both years with these users

In [16]:
# Distinct user ids across both years.
all_users = interactions['user_id'].unique()
print("all_users: ", len(all_users), all_users)

all_users:  31038 [100031203 100033594 100034364 ... 260110132 265129887 265295566]


In [17]:
# Fix the seed so the same user subsample is drawn on every run.
np.random.seed(1492)
nb_users = len(all_users)
keep_nb_users = 10000
# keep_nb_users = nb_users   # Keep all the users
# Never request more users than exist: np.random.choice raises ValueError when
# size exceeds the population and replace=False (e.g. on a smaller data set).
sample_size = min(keep_nb_users, nb_users)
# Draw without replacement so every sampled user id is distinct.
s_users = np.random.choice(all_users, size=sample_size, replace=False)

In [18]:
# Sample size next to its distinct count; equal values mean no user was drawn twice.
n_sampled = len(s_users)
n_distinct = len(set(s_users))
n_sampled, n_distinct

(10000, 10000)

#### Get All Interactions for Those Users
Only keep the users present in `s_users` (the user sample set).

In [19]:
# Boolean mask of rows belonging to a sampled user; copy so the subsample is
# independent of the full table.
in_sample = interactions.user_id.isin(s_users)
s_interactions = interactions[in_sample].copy()
s_interactions.shape, interactions.shape

((43940, 3), (137649, 3))

In [20]:
# Distinct destinations reached by the sampled users, and how many there are.
s_items = s_interactions['product_id'].unique()
len(s_items)

78

In [21]:
# Distinct index labels vs. row count: concatenating the yearly frames kept
# each frame's own index, so labels repeat.
len(set(interactions.index)), len(interactions)

(74658, 137649)

In [22]:
# Same distinct-label vs. row-count comparison, on the subsample.
len(set(s_interactions.index)), len(s_interactions)

(37679, 43940)

In [23]:
# Largest number of rows for a single (user, destination) pair, ignoring the year.
interactions.groupby(['user_id', 'product_id']).size().max()

2

I expected the maximum number of times a given member flew to a particular destination to be 1. 
Is it the case that when a member travels to a destination twice, it is in different years? Yes it is. 

In [41]:
# Ignoring the year, a member/destination pair appears at most twice (previous cell).
# Grouping by year as well gives a maximum of 1, so repeat visits always fall in different years.
interactions.groupby(['user_id', 'product_id', 'year']).size().max()

1

#### Re-Evaluate Cardinality/Sparsity on the Sample

In [25]:
# Cardinalities of the subsample, reused by the sparsity computation.
n_s_users, n_s_items = len(s_users), len(s_items)

for label, value in (
    ("sample users:", n_s_users),
    ("sample items:", n_s_items),
    ("sample interactions:", s_interactions.shape),
):
    print(label, value)

sample users: 10000
sample items: 78
sample interactions: (43940, 3)


In [26]:
# Sparsity = 1 - (observed distinct user/item pairs) / (all possible pairs).
observed_pairs = len(s_interactions[['user_id', 'product_id']].drop_duplicates())
possible_pairs = n_s_users * n_s_items
s_sparsity = 1 - (observed_pairs / possible_pairs)
print("sample interaction data sparsity: {}".format(round(100 * s_sparsity, 2)))

sample interaction data sparsity: 95.52


### Split the Data into Training/Validation Sets

#### Randomly Shuffle the Overall Interaction Data

#### Define training and validation data across two years
* The training data is the 2016 interactions; the validation data is the 2017 interactions.

In [27]:
# Year used for training and year used for validation.
train_year, valid_year = 2016, 2017

In [66]:
# Shuffle the sampled interactions, then split them by year into the
# train/validation dictionary used by the rest of the notebook.
# The keys read below are 'train', 'valid', 'sample_weight_train' (see the printed keys).
shuffled_interactions, shuffle_index = rfmlib.shuffle_interaction_data(s_interactions)
interactions_dct = rfmlib.train_validation(shuffled_interactions, train_year, valid_year, shuffle_index)
interactions_dct.keys()

Index(['user_id', 'product_id', 'orders'], dtype='object')
20111 23829 43940 43940 43940 20111 23829


dict_keys(['train', 'valid', 'total', 'sample_weight_train', 'sample_weight_valid'])

In [67]:
# Print the split summary (user/item counts, cold-start users and items).
rfmlib.print_stats(interactions_dct)

7547 train_users, 9004 valid_users
nb cold start users:  2453
nb cold start users:  2453
nb cold start items:  2
total shape: (43940, 2)
train shape: (20111, 2)
valid shape: (23829, 2)

train weights shape: (20111,)
valid weights shape: (23829,)

nb train users: 7547
nb valid users: 9004
nb cold-start users: 2453

train items: 76
valid items: 75
number of cold-start items: 2
cold start items:  {'DEN', 'MDZ'}


### Test Out Core Package Functionality

#### Initialize the Model with Chosen Hyperparameters

In [68]:
%%time 
# Open question: max_samples=500 causes a problem for 'warp' but not for 'bpr' (or vice versa).
# What is the difference, and why?
# max_samples: number of negative samples
model = RankFM(factors=50, loss='warp', max_samples=50, alpha=0.01, learning_rate=0.1, learning_schedule='invscaling')

CPU times: user 8 µs, sys: 10 µs, total: 18 µs
Wall time: 19.6 µs


#### Fit the Model on the Training Data and Profile Computational Performance

In [70]:
%%time
# Pull the training interactions and their per-row weights out of the split
# dictionary, then fit the model for 30 epochs.
interactions_train = interactions_dct["train"]
sample_weight_train = interactions_dct["sample_weight_train"]
model.fit(interactions_train, sample_weight=sample_weight_train, epochs=30, verbose=False)

CPU times: user 476 ms, sys: 36 ms, total: 512 ms
Wall time: 498 ms


#### Generate Model Scores

In [71]:
%%time
# Score every validation (user, item) row; one score per row (see the shape check below).
# NOTE(review): with cold_start='nan', rows for users/items unseen in training
# presumably get NaN rather than being dropped — confirm against the RankFM docs.
interactions_valid = interactions_dct["valid"]
scores = model.predict(interactions_valid, cold_start='nan') # 'nan' or 'drop'

CPU times: user 28.2 ms, sys: 260 µs, total: 28.5 ms
Wall time: 28.1 ms


In [72]:
# Sanity check: the score vector has one entry per validation row.
scores.shape, scores[2], len(interactions_train), len(interactions_valid)

((23829,), 1.1346077, 20111, 23829)

In [74]:
# The trailing semicolon suppresses the cell output.
scores[0:100], len(scores), len(interactions_valid);

#### Generate TopN Recommendations

In [95]:
# Distinct users on each side of the split.
train_users = pd.Series(interactions_train.user_id.unique())
valid_users = pd.Series(interactions_valid.user_id.unique())

# Users seen in both years, and validation users never seen in training.
train_user_set, valid_user_set = set(train_users), set(valid_users)
both_users = train_user_set.intersection(valid_user_set)
cold_start_users = valid_user_set.difference(train_user_set)

print("nb both_users: ", len(both_users))
print("nb cold_start_users: ", len(cold_start_users))
len(train_users), len(valid_users), len(both_users), len(cold_start_users)

nb both_users:  6551
nb cold_start_users:  2453


(7547, 9004, 6551, 2453)

In [96]:
%%time
# Top-10 recommendations for every validation user, without removing
# destinations from the training data. This method does not change the model.
valid_recs = model.recommend(valid_users, n_items=10, filter_previous=False, cold_start='nan')

CPU times: user 89 ms, sys: 262 µs, total: 89.2 ms
Wall time: 88.7 ms


In [97]:
%%time
# Same request with filter_previous=True. This method does not change the model.
# NOTE(review): presumably drops items each user already has in the training
# data — the two printed tables are consistent with that; confirm in the RankFM docs.
valid_recs_filter_prev = model.recommend(valid_users, n_items=10, filter_previous=True, cold_start='nan')

CPU times: user 301 ms, sys: 0 ns, total: 301 ms
Wall time: 301 ms


In [98]:
# One row per validation user, one column per recommendation rank.
print(valid_recs.shape)
valid_recs.head()

(9004, 10)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
100034364,CUN,BOG,MIA,MDE,HAV,SCL,PTY,SJO,SDQ,CTG
100040465,BSB,LIM,MEX,SJO,BOG,GUA,SDQ,SJU,EZE,MVD
100043637,BOG,LAS,ORD,GIG,GYE,MDE,GUA,MIA,MGA,PTY
100057033,SFO,BOG,MDE,MIA,SJO,GIG,ORD,PTY,FLL,IAD
100057707,PTY,SDQ,GUA,MIA,SCL,LIM,HAV,MGA,MEX,SAP


In [99]:
# Same layout as valid_recs, for the filter_previous=True recommendations.
print(valid_recs_filter_prev.shape)
valid_recs_filter_prev.head()

(9004, 10)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
100034364,MIA,MDE,HAV,SCL,PTY,SJO,SDQ,CTG,GYE,CLO
100040465,BOG,GUA,SDQ,SJU,EZE,MVD,POS,PTY,UIO,SAP
100043637,MIA,MGA,PTY,MEX,MCO,CUN,MVD,SJO,VVI,HAV
100057033,MDE,MIA,SJO,GIG,ORD,PTY,FLL,IAD,CUN,AUA
100057707,SDQ,GUA,MIA,SCL,LIM,HAV,MGA,MEX,SAP,MCO


### Evaluate Model Performance on the Validation Data

#### Specify Number of Recommended Items

In [100]:
# Number of recommended items used by the baselines and the evaluation metrics.
k = 3
# Alias of k. NOTE(review): the diversity plot cell later rebinds topN to a DataFrame.
topN = k

#### Generate Pure-Popularity Baselines

In [101]:
# Training rows per destination, then the k destinations with the most rows.
visits_per_item = interactions_train.groupby('product_id')['user_id'].count()
most_popular = visits_per_item.sort_values(ascending=False).iloc[:k]
most_popular

product_id
BOG    1374
PTY    1319
MIA    1254
Name: user_id, dtype: int64

In [102]:
%%time 
# Build a Series: member => set of validation destinations.
# Using a set means repeated (user_id, product_id) rows collapse to one entry.
# Conversion to a dict happens in the next cell, after filtering the users.
test_user_items = interactions_valid.groupby('user_id')['product_id'].apply(set)   # .to_dict()

CPU times: user 133 ms, sys: 0 ns, total: 133 ms
Wall time: 133 ms


In [103]:
%%time 
# Only keep users that also appear in the training set (both_users), then
# convert to a dict: member => set of validation destinations.
# Label-based selection replaces the slow Python-loop filter kept (commented out) below.
test_user_items = test_user_items.loc[list(both_users)].to_dict()

CPU times: user 2.35 ms, sys: 130 µs, total: 2.48 ms
Wall time: 2.29 ms


In [104]:
%%time  
# Only keep users contained in the training set
# Create dictionary: member => list of items
# Take 54 sec
#test_user_items = {key: val for key, val in test_user_items.items() if key in set(train_users)}

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 3.81 µs


In [87]:
%%time  
# Only keep users contained in the training set
# Create dictionary: member => list of items
# Take 54 sec because of python loops
# test_user_items1 = {key: val for key, val in test_user_items.items() if key in set(train_users)}

CPU times: user 1 µs, sys: 1 µs, total: 2 µs
Wall time: 3.58 µs


In [88]:
%%time
# Popularity baseline: recommend the same top-k destinations to every test user.
popular_items = set(most_popular.index)
# Number of baseline recommendations each user actually flew to, in dict order.
overlap_sizes = [len(popular_items & set(items)) for items in test_user_items.values()]

# Hit rate: share of users with at least one correct recommendation.
base_hrt = np.mean([int(n_hits > 0) for n_hits in overlap_sizes])
# Precision: correct recommendations over the number of recommendations.
base_pre = np.mean([n_hits / len(popular_items) for n_hits in overlap_sizes])
# Recall: correct recommendations over the user's number of validation destinations.
base_rec = np.mean([n_hits / len(set(items)) for n_hits, items in zip(overlap_sizes, test_user_items.values())])

CPU times: user 45.1 ms, sys: 763 µs, total: 45.8 ms
Wall time: 45.6 ms


In [89]:
# Report the popularity baseline, rounded to three decimals.
print("number of test users: {}".format(len(test_user_items)))
print("baseline hit rate: {:.3f}".format(base_hrt))
print("baseline precision: {:.3f}".format(base_pre))
print("baseline recall: {:.3f}".format(base_rec))

number of test users: 6551
baseline hit rate: 0.488
baseline precision: 0.189
baseline recall: 0.223


#### Generate Model Performance Validation Metrics

In [90]:
%%time
# RankFM's own evaluation metrics at the same cutoff k as the baseline.
# NOTE(review): the baseline above is restricted to users present in training
# (both_users); confirm how these functions treat cold-start users before comparing.
model_hrt = hit_rate(model, interactions_valid, k=k)
model_rnk = reciprocal_rank(model, interactions_valid, k=k)
model_pre = precision(model, interactions_valid, k=k)
model_rec = recall(model, interactions_valid, k=k)

CPU times: user 3.04 s, sys: 0 ns, total: 3.04 s
Wall time: 3.04 s


In [91]:
# Report the model metrics, rounded to three decimals.
print("model hit rate: {}".format(round(model_hrt, 3)))
print("model reciprocal rank: {}".format(round(model_rnk, 3)))
print("model precision: {}".format(round(model_pre, 3)))
print("model recall: {}".format(round(model_rec, 3)))

model hit rate: 0.779
model reciprocal rank: 0.635
model precision: 0.365
model recall: 0.467


In [None]:
# Series: member => set of training destinations (the training-side
# counterpart of the per-user validation sets built earlier).
# interactions_valid.groupby('user_id')['product_id'].apply(set)   # .to_dict()
interactions_train_Dlist = interactions_train.groupby('user_id')['product_id'].apply(set)

In [None]:
# import function_lib as flib

In [None]:
# Local variant of the hit rate from function_lib.
# NOTE(review): k is hard-coded to 5 here while the other metric calls use `k`,
# and the result overwrites the rankfm `model_hrt` computed earlier — confirm both are intended.
model_hrt = flib.hit_rate(model, interactions_valid, k=5, filter_previous=False, max_kept=1, train_interactions=interactions_train_Dlist)
# same function as in rankfm

In [None]:
%%time
# Hit rate: fraction of users for which at least one recommendation is correct.
# Figures noted from an earlier run:
# 77% hit rate! (at least one hit correct)
# 26.6% hit rate (at least two hits correct)
# 5.6% (three hits correct)
#
# Compare the hit rate with and without filtering previously seen items.
model_hrt = flib.hit_rate(model, interactions_valid, k=k, filter_previous=False)  # same function as in rankfm
print("model_hrt(filter_previous=False): ", model_hrt)
model_hrt = flib.hit_rate(model, interactions_valid, k=k, filter_previous=True)  # same function as in rankfm
print("model_hrt(filter_previous=True): ", model_hrt)

In [None]:
%%time
# Reciprocal rank, precision and recall, with and without filtering
# previously seen items.
for filt in [True, False]:
    model_rnk = reciprocal_rank(model, interactions_valid, k=k, filter_previous=filt)
    model_pre = precision(model, interactions_valid, k=k, filter_previous=filt)
    model_rec = recall(model, interactions_valid, k=k, filter_previous=filt)
    print(f"model reciprocal rank(filter_previous={str(filt)}): {model_rnk:.3f}")
    print(f"model precision(filter_previous={str(filt)}): {model_pre:.3f}")
    print(f"model recall(filter_previous={str(filt)}): {model_rec:.3f}")

## Single function

In [148]:
# Rebuild and refit the model with the same hyperparameters as above, then run
# the whole recommendation/evaluation pipeline through one library call.
# NOTE(review): relies on interactions_train, sample_weight_train and base_rec
# from earlier cells; the role of the base_rec argument is not visible here — confirm.
model = RankFM(factors=50, loss='warp', max_samples=50, alpha=0.01, learning_rate=0.1, learning_schedule='invscaling')
model.fit(interactions_train, sample_weight=sample_weight_train, epochs=30, verbose=False)

rfmlib.topn_recommendations(model, interactions_dct, base_rec)


Test User Items
user_id
100034364                        {SDQ}
100040465         {MEX, BOG, BSB, GRU}
100043637    {BOG, GUA, SAL, GIG, GYE}
100057033                   {SAL, BOG}
100057707                        {PTY}
                       ...            
251949263                        {TGU}
258117707                        {CUN}
259841332                   {TGU, MDE}
260865174                        {SJO}
267031776                        {EZE}
Name: product_id, Length: 9004, dtype: object

Popularity Metrics
number of test users: 9004
baseline hit rate: 0.447
baseline precision: 0.170
baseline recall: 0.223

Model Metrics
model hit rate (filter previous): 0.263
model hit rate (not filter previous): 0.775


# Add item attributes

In [123]:
# FIXME: in the recorded run this call raised
# KeyError: 'the users in [user_features] do not match the users in [interactions]',
# so the assignment did not happen and interactions_dct kept its previous value.
# On success it would overwrite the interactions_dct used by the cells above.
interactions_dct = rfmlib.calculate_dct_with_attributes(nb_samples=10000, with_attrib=True)   # includes attributes

df:  dict_keys([2016, 2017])
col:  Index(['MEMBER_ID', 'D'], dtype='object')
full interaction data sparsity: 95.52
n_users: 31444, n_items: 78
keep_nb_users:  10000
dict_keys(['train', 'valid', 'total', 'sample_weight_train', 'sample_weight_valid', 'attrib_train', 'attrib_valid'])
7587 train_users, 8993 valid_users
nb cold start users:  2413
nb cold start users:  2413
nb cold start items:  2
train shape: (20135, 2)
valid shape: (23720, 2)

train weights shape: (20135,)
valid weights shape: (20135,)

nb train users: 7587
nb valid users: 8993
nb cold-start users: 2413

train items: 76
valid items: 75
number of cold-start items: 2
cold start items:  {'MDZ', 'DEN'}
user/train features:  7587 7587
<rankfm.rankfm.RankFM object at 0x7f73fb5ce700>
agordon


KeyError: 'the users in [user_features] do not match the users in [interactions]'

In [39]:
# Single function.
# Input: dataframe with attributes for two years
train_year, test_year = 2016, 2017
df_dctyr = df  # Dictionary by year
#rfmlib.prepare_attributes(df_dctyr, train_year, test_year)
# Read the member/destination pairs and the one-hot item attributes for the training year.
member_dest_df, df_item_onehot = rfmlib.read_data_attributes(train_year)

Index(['MEMBER_ID', 'TRUE_OD', 'D', 'FLIGHT_DATE', 'BOOKING_DATE',
       'TICKET_SALES_DATE', 'TICKET_NUMBER', 'TRUE_ORIGIN_COUNTRY',
       'ADDR_COUNTRY', 'PNR', 'PARTY_SZ', 'size', 'booking_date',
       'booking_dowk', 'booking_mo', 'flight_date', 'flight_dowk', 'flight_mo',
       'TIER_LEVEL', 'GENDER', 'BIRTH_DATE', 'age_at_flight', 'year'],
      dtype='object')
Index(['user_id', 'product_id'], dtype='object')
df_item_one_hot:  (23841, 144)


In [156]:
# Fit again on the existing `model` object (no new RankFM is created here,
# unlike the other experiment cells), then evaluate with the one-hot item attributes.
# NOTE(review): the fit call itself passes no attributes; whether they are used
# inside topn_recommendations_with_attributes is not visible here — confirm.
model.fit(interactions_train, sample_weight=sample_weight_train, epochs=30, verbose=False)
rfmlib.topn_recommendations_with_attributes(model, interactions_dct, df_item_onehot, base_rec)


Test User Items
user_id
100034364                        {SDQ}
100040465         {MEX, BOG, BSB, GRU}
100043637    {BOG, GUA, SAL, GIG, GYE}
100057033                   {SAL, BOG}
100057707                        {PTY}
                       ...            
251949263                        {TGU}
258117707                        {CUN}
259841332                   {TGU, MDE}
260865174                        {SJO}
267031776                        {EZE}
Name: product_id, Length: 9004, dtype: object

Popularity Metrics
number of test users: 9004
baseline hit rate: 0.447
baseline precision: 0.170
baseline recall: 0.223

Model Metrics
model hit rate (filter previous): 0.267
model hit rate (not filter previous): 0.775


# Do experiments monthly. 

In [120]:
# Same model construction, fit and evaluation as the "Single function" cell;
# kept as the starting point for the monthly experiments.
model = RankFM(factors=50, loss='warp', max_samples=50, alpha=0.01, learning_rate=0.1, learning_schedule='invscaling')
model.fit(interactions_train, sample_weight=sample_weight_train, epochs=30, verbose=False)
rfmlib.topn_recommendations(model, interactions_dct, base_rec)


Test User Items
user_id
100034364                        {SDQ}
100040465         {MEX, BOG, BSB, GRU}
100043637    {BOG, GUA, SAL, GIG, GYE}
100057033                   {SAL, BOG}
100057707                        {PTY}
                       ...            
251949263                        {TGU}
258117707                        {CUN}
259841332                   {TGU, MDE}
260865174                        {SJO}
267031776                        {EZE}
Name: product_id, Length: 9004, dtype: object

Popularity Metrics
number of test users: 9004
baseline hit rate: 0.447
baseline precision: 0.170
baseline recall: 0.223

Model Metrics
model hit rate (filter previous): 0.266
model hit rate (not filter previous): 0.777


# Additional Experiments

#### Assess the Diversity of Recommendations

In [None]:
# Per-item recommendation counts at cutoff k, renamed to this notebook's column naming.
model_diversity = diversity(model, interactions_valid, k=k).rename({'item_id': 'product_id'}, axis=1)
# FIXME(review): products_df is not defined anywhere in the visible notebook, and the
# product_name / aisle_id / department_id columns look like leftovers from a different
# data set — this cell will raise NameError unless products_df is loaded first.
model_diversity = pd.merge(model_diversity, products_df, on='product_id', how='inner')
model_diversity = model_diversity[['cnt_users', 'pct_users', 'product_id', 'product_name', 'aisle_id', 'department_id']]
model_diversity.head(20)

In [None]:
# Coverage: share of items recommended to at least one user.
recommended_at_least_once = model_diversity['cnt_users'] > 0
coverage = np.mean(recommended_at_least_once)
round(coverage, 3)

In [None]:
# Entropy (base 2) of pct_users over the items recommended at least once;
# zero-count items are excluded before taking the logarithm.
nonzero = model_diversity[model_diversity.cnt_users > 0]
user_share = nonzero['pct_users']
entropy = -np.sum(user_share * np.log2(user_share))
round(entropy, 2)

In [None]:
# Bar chart of the share of users each of the first N diversity rows is recommended to.
fig, axes = plt.subplots(1, 1, figsize=[16, 4])
N = 50

topN = model_diversity.iloc[:N, :]
item_ranks = topN.index.values + 1  # 1-based rank taken from the row index
axes.bar(item_ranks, topN.pct_users, width=1, edgecolor='black', alpha=0.75)
axes.set_xlabel('Item Rank')
axes.set_ylabel('Percentage of Users')
axes.set_title('Percentage of Users Recommended by Item Rank')
plt.show()

#### Get Similar Items for a Few Items

In [None]:
# Pick one validation user at random. This draws from NumPy's global random
# state, so the result depends on what ran before this cell.
random_user = np.random.choice(valid_users)
print("random user: {}".format(random_user))

In [None]:
# Recommendations for the chosen user (one row of valid_recs).
random_user_recs = valid_recs.loc[random_user]
# FIXME(review): products_df is not defined anywhere in the visible notebook;
# this lookup will raise NameError unless it is loaded first.
random_user_recs = products_df[products_df.product_id.isin(random_user_recs)].set_index('product_id').loc[random_user_recs]
random_user_recs

#### Look at Similar Products for a Few of the Random User's Recommended Products

In [None]:
# FIXME(review): 48370 is a hard-coded numeric item id, whereas the items shown in this
# notebook's outputs are destination codes such as 'BOG' — it is probably not a valid
# item for this model. products_df is also not defined in the visible notebook.
most_similar_items = model.similar_items(48370)
most_similar_items = products_df.set_index('product_id').loc[most_similar_items]
most_similar_items

### Assess Model Performance on Novel Item Recommendations

#### Save an Array of the Most Popular Items in the Training Data

In [None]:
# All destinations ranked by number of training rows, most frequent first.
item_counts = interactions_train.groupby('product_id')['user_id'].count()
most_popular = item_counts.sort_values(ascending=False)
print(len(most_popular))
most_popular.head(10)

In [None]:
# Keep only the ranked destination codes, as a Series with a positional index.
# NOTE: this rebinds most_popular from itself, so the cell is not idempotent — running it
# twice replaces the codes with 0..n-1. It also invalidates the earlier baseline cell,
# which reads most_popular.index. Re-run the previous cell before re-running this one.
most_popular = pd.Series(most_popular.index.values)
most_popular[:10]

#### Save the TopK Most Popular Items Not Yet Previously Purchased for Each Customer

In [None]:
# Training destinations per user: member => set of destinations.
train_items_by_user = interactions_train.groupby('user_id')['product_id'].apply(set).to_dict()
# Restrict to the evaluated users. The source must be the TRAINING sets: building this
# from test_user_items would remove each user's validation destinations from the baseline,
# which is the opposite of "not previously visited" and forces the hit rate to zero.
train_user_items = {user: train_items_by_user[user] for user in test_user_items if user in train_items_by_user}
# For each user, the k most popular destinations absent from their training history.
most_popular_new = {user: most_popular[~most_popular.isin(seen)][:k] for user, seen in train_user_items.items()}
len(most_popular_new)

In [None]:
# First ten user ids that have a novel-item baseline.
list(most_popular_new.keys())[:10]

In [None]:
# NOTE(review): hard-coded user id; raises KeyError if this user is not in
# most_popular_new (for example after changing the user subsample).
most_popular_new[100232578]

#### Calculate the Pure-Popularity Baseline Metrics on Previously Unpurchased Items

In [None]:
# Per user: number of novel-popularity recommendations found in the validation set.
hits_by_user = {
    user: len(set(most_popular_new[user]) & set(test_user_items[user]))
    for user in test_user_items.keys()
}

# Hit rate: share of users with at least one correct recommendation.
base_new_hrt = np.mean([int(n_hits > 0) for n_hits in hits_by_user.values()])
# Precision: correct recommendations over the number of recommendations made to the user.
base_new_pre = np.mean([n_hits / len(set(most_popular_new[user])) for user, n_hits in hits_by_user.items()])
# Recall: correct recommendations over the user's number of validation destinations.
base_new_rec = np.mean([n_hits / len(set(test_user_items[user])) for user, n_hits in hits_by_user.items()])

In [None]:
# Report the novel-item popularity baseline, rounded to three decimals.
print("number of test users: {}".format(len(test_user_items)))
print("baseline new hit rate: {:.3f}".format(base_new_hrt))
print("baseline new precision: {:.3f}".format(base_new_pre))
print("baseline new recall: {:.3f}".format(base_new_rec))

#### Calculate Model Performance Excluding Training Items

In [None]:
%%time
# Model metrics at cutoff k with filter_previous=True, for comparison with the
# novel-item popularity baseline.
model_hrt_new = hit_rate(model, interactions_valid, k=k, filter_previous=True)
model_pre_new = precision(model, interactions_valid, k=k, filter_previous=True)
model_rec_new = recall(model, interactions_valid, k=k, filter_previous=True)

In [None]:
# Report the filter_previous=True model metrics, rounded to three decimals.
print("model new hit rate: {}".format(round(model_hrt_new, 3)))
print("model new precision: {}".format(round(model_pre_new, 3)))
print("model new recall: {}".format(round(model_rec_new, 3)))