### Import Required Packages and Set Options
Use Matrix Factorization without attributes to suggest destinations
* Requires base environment
   * conda activate base

#### Import Base Libraries

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
#import git
#import rll_usersndom
import function_lib as flib

import numpy as np
#import numba as nb
import pandas as pd
from scipy.sparse import csr_matrix

import matplotlib.pyplot as plt
import seaborn as sns

import rankfmlib as rfmlib

In [5]:
from rankfm.rankfm import RankFM
from rankfm.evaluation import hit_rate, reciprocal_rank, discounted_cumulative_gain, precision, recall, diversity

In [6]:
import pandas_options  # in current directory

#### Put the Main Package Library on the PYTHONPATH

In [7]:
# git_repo = git.Repo('.', search_parent_directories=True)
# git_root = git_repo.git.rev_parse('--show-toplevel')
# cython_path = os.path.join(git_root, 'rankfm')

# sys.path[0] = git_root
# sys.path[1] = cython_path
# sys.path[:2]

#### Re-Compile Cython Extension Module

In [8]:
# !cd $git_root && python setup.py build_ext --inplace

#### Dynamically Re-Load all Package Modules on Execution

#### Set File Path Constants

In [9]:
!pwd

/home/erlebach/src/2022/copa_recommenders/copa_recommender_data_rankfm/one_destination_per_member


In [10]:
data_path = "." # os.path.join(git_root, "data/instacart_2017_05_01")
# print("\n".join([git_root, data_path]))

### Prepare Copa Data
* Data for one year is included in each file.
* Each file has three columns: userID (memberID), itemID (destination), rating (always 1)
* The data was produced elsewhere (perhaps on my mac, perhaps not)

#### Load Departments Data
We will use 2016 for training and 2017 for testing

In [15]:
df = rfmlib.read_data(2016, 2017)
# df

#### Load Destination Data

#### Create a User/Item Interaction Data Set with all years

In [12]:
years = [2016, 2017]

In [18]:
df[2016].head()

Unnamed: 0,MEMBER_ID,D,year
0,100031203,LIR,2016
1,100033594,CLO,2016
2,100034364,BOG,2016
3,100034364,CUN,2016
4,100035145,SJO,2016


In [13]:
df_g = {}
for year in years:
    df_g[year] = df[year].groupby(['user_id', 'product_id'])

interactions = pd.concat([df[year] for year in years], axis=0)
interactions.shape, interactions.columns

KeyError: 'user_id'

#### Check that there is only one entry per member_id/destination/year group  

In [19]:
assert interactions.groupby(['user_id','product_id','year']).size().max() == 1, "There should be only one entry"

NameError: name 'interactions' is not defined

In [20]:
# orders_cols = ['order_id', 'user_id']
# order_products_cols = ['order_id', 'product_id']
# interaction_cols = ['user_id', 'product_id', 'order_id']

# interactions = pd.merge(orders_df[orders_cols], order_products_df[order_products_cols], on='order_id', how='inner')
# interactions = interactions[interaction_cols]

# interactions.info()
# interactions.head()

#### Evaluate User/Item Interaction Sparsity

In [21]:
df[2016].shape, df[2017].shape

((62991, 3), (74658, 3))

In [22]:
dct = rfmlib.sparsity(interactions)
print("full interaction data sparsity: {}".format(round(100 * dct['sparsity'], 2)))
print(f"n_users: {dct['n_users']}, n_items: {dct['n_items']}")

NameError: name 'interactions' is not defined

### Subsample the Data for Initial Testing

#### Take a Random Subsample of Users, and choose the records for both years with these users

In [None]:
all_users = interactions.user_id.unique()
print("all_users: ", len(all_users), all_users)

In [None]:
np.random.seed(1492)
nb_users = len(all_users)
keep_nb_users = 10000
# keep_nb_users = nb_users   # Keep all the users
# shuffle the users
s_users = np.random.choice(all_users, size=keep_nb_users, replace=False)

In [None]:
len(s_users), len(set(s_users))

#### Get All Interactions for Those Users
Only keep the users present in `s_users' (user sample set)

In [27]:
s_interactions = interactions[interactions.user_id.isin(s_users)].copy()
s_interactions.shape, interactions.shape

NameError: name 'interactions' is not defined

In [None]:
# Number of destinations travelled by users kept
s_items = s_interactions.product_id.unique()
len(s_items)

In [None]:
len(list(set(interactions.index))), len(interactions)

In [None]:
len(list(set(s_interactions.index))), len(s_interactions)

In [None]:
interactions.groupby(['user_id', 'product_id']).size().max()

I expected the maximum number of times a given member flew to a particular destination to be 1. 
Is it the case that when a member travels to a destination twice, it is in different years? Yes it is. 

In [None]:
# Maximum number of times a given member flew to a particular destination is 2
interactions.groupby(['user_id', 'product_id', 'year']).size().max()

#### Re-Evaluate Cardinality/Sparsity on the Sample

In [None]:
n_s_users = len(s_users)
n_s_items = len(s_items)

print("sample users:", n_s_users)
print("sample items:", n_s_items)
print("sample interactions:", s_interactions.shape)

In [None]:
s_sparsity = 1 - (s_interactions[['user_id', 'product_id']].drop_duplicates().shape[0] / (n_s_users * n_s_items))
print("sample interaction data sparsity: {}".format(round(100 * s_sparsity, 2)))

### Split the Data into Training/Validation Sets

#### Randomly Shuffle the Overall Interaction Data

#### Define training and validation data across two years
* The training data 

In [28]:
train_year, valid_year = 2016, 2017

In [29]:
shuffled_interactions, shuffle_index = rfmlib.shuffle_interaction_data(s_interactions)
interactions_dct = rfmlib.train_validation(shuffled_interactions, train_year, valid_year, shuffle_index)
interactions_dct.keys()

NameError: name 's_interactions' is not defined

In [None]:
rfmlib.print_stats(interactions_dct)

### Test Out Core Package Functionality

#### Initialize the Model with Chosen Hyperparameters

In [None]:
%%time 
# max_samples=500 creates problem for 'warp', but not for 'bpr'. Or vce-versa. What is the difference? And Why?
# max_samples: nb negative samples
model = RankFM(factors=50, loss='warp', max_samples=50, alpha=0.01, learning_rate=0.1, learning_schedule='invscaling')

#### Fit the Model on the Training Data and Profile Computational Performance

In [25]:
%%time
interactions_train = interactions_dct["train"]
sample_weight_train = interactions_dct["sample_weight_train"]
model.fit(interactions_train, sample_weight=sample_weight_train, epochs=30, verbose=False)

NameError: name 'interactions_dct' is not defined

#### Generate Model Scores

In [None]:
%%time
interactions_valid = interactions_dct["valid"]
scores = model.predict(interactions_valid, cold_start='nan') # 'nan' or 'drop'

In [None]:
scores.shape, scores[2], len(interactions_train), len(interactions_valid)

In [None]:
scores[0:100], len(scores), len(interactions_valid);

#### Generate TopN Recommendations

In [None]:
train_users = pd.Series(interactions_train.user_id.unique())
valid_users = pd.Series(interactions_valid.user_id.unique())
both_users = set(train_users) & set(valid_users)
cold_start_users = set(valid_users) - set(train_users)

print("nb both_users: ", len(both_users))
print("nb cold_start_users: ", len(cold_start_users))
len(train_users), len(valid_users), len(both_users), len(cold_start_users), 

In [None]:
%%time
# This method does not change model
valid_recs = model.recommend(valid_users, n_items=10, filter_previous=False, cold_start='nan')

In [None]:
%%time
# This method does not change model
valid_recs_filter_prev = model.recommend(valid_users, n_items=10, filter_previous=True, cold_start='nan')

In [None]:
print(valid_recs.shape)
valid_recs.head()

In [None]:
print(valid_recs_filter_prev.shape)
valid_recs_filter_prev.head()

### Evaluate Model Performance on the Validation Data

#### Specify Number of Recommended Items

In [None]:
k = 3
topN = k

#### Generate Pure-Popularity Baselines

In [None]:
most_popular = interactions_train.groupby('product_id')['user_id'].count().sort_values(ascending=False)[:k]
most_popular

In [None]:
%%time 
# Create dictionary: member => list of destinations 
# Works even when there are there are multiple entries for the same (user_id, product_id)
# 1/3 sec
test_user_items = interactions_valid.groupby('user_id')['product_id'].apply(set)   # .to_dict()

In [None]:
%%time 
# Only keep users in the training set
# 5.29 ms
test_user_items = test_user_items.loc[list(both_users)].to_dict()

In [None]:
%%time  
# Only keep users contained in the training set
# Create dictionary: member => list of items
# Take 54 sec
#test_user_items = {key: val for key, val in test_user_items.items() if key in set(train_users)}

In [None]:
%%time  
# Only keep users contained in the training set
# Create dictionary: member => list of items
# Take 54 sec because of python loops
# test_user_items1 = {key: val for key, val in test_user_items.items() if key in set(train_users)}

In [None]:
%%time
base_hrt = np.mean([int(len(set(most_popular.index) & set(val)) > 0)                       for key, val in test_user_items.items()])
base_pre = np.mean([len(set(most_popular.index) & set(val)) / len(set(most_popular.index)) for key, val in test_user_items.items()])
base_rec = np.mean([len(set(most_popular.index) & set(val)) / len(set(val))                for key, val in test_user_items.items()])

In [None]:
print("number of test users: {}".format(len(test_user_items)))
print("baseline hit rate: {:.3f}".format(base_hrt))
print("baseline precision: {:.3f}".format(base_pre))
print("baseline recall: {:.3f}".format(base_rec))

#### Generate Model Performance Validation Metrics

In [None]:
%%time
model_hrt = hit_rate(model, interactions_valid, k=k)
model_rnk = reciprocal_rank(model, interactions_valid, k=k)
model_pre = precision(model, interactions_valid, k=k)
model_rec = recall(model, interactions_valid, k=k)

In [None]:
print("model hit rate: {}".format(round(model_hrt, 3)))
print("model reciprocal rank: {}".format(round(model_rnk, 3)))
print("model precision: {}".format(round(model_pre, 3)))
print("model recall: {}".format(round(model_rec, 3)))

In [None]:
# interactions_valid.groupby('user_id')['product_id'].apply(set)   # .to_dict()
interactions_train_Dlist = interactions_train.groupby('user_id')['product_id'].apply(set)

In [None]:
# import function_lib as flib

In [None]:
model_hrt = flib.hit_rate(model, interactions_valid, k=5, filter_previous=False, max_kept=1, train_interactions=interactions_train_Dlist)
# same function as in rankfm

In [None]:
%%time
# average number of users for which at least one recommendation is correct
# 77% hit rate! (at least one hit correct)
# 26.6% hit rate (at least two hits correct)
# 5.6% (three hits correct)
# 
model_hrt = flib.hit_rate(model, interactions_valid, k=k, filter_previous=False)  # same function as in rankfm
print("model_hrt(filter_previous=False): ", model_hrt)
model_hrt = flib.hit_rate(model, interactions_valid, k=k, filter_previous=True)  # same function as in rankfm
print("model_hrt(filter_previous=True): ", model_hrt)

In [None]:
%%time
for filt in [True, False]:
    model_rnk = reciprocal_rank(model, interactions_valid, k=k, filter_previous=filt)
    model_pre = precision(model, interactions_valid, k=k, filter_previous=filt)
    model_rec = recall(model, interactions_valid, k=k, filter_previous=filt)
    print(f"model reciprocal rank(filter_previous={str(filt)}): {model_rnk:.3f}")
    print(f"model precision(filter_previous={str(filt)}): {model_pre:.3f}")
    print(f"model recall(filter_previous={str(filt)}): {model_rec:.3f}")

## Single function

In [23]:
model = RankFM(factors=50, loss='warp', max_samples=50, alpha=0.01, learning_rate=0.1, learning_schedule='invscaling')
model.fit(interactions_train, sample_weight=sample_weight_train, epochs=30, verbose=False)

# rfmlib.topn_recommendations(model, interactions_dct, base_rec)
rfmlib.topn_recommendations(model, interactions_dct)


NameError: name 'interactions_train' is not defined

# Add item attributes

In [None]:
# This function calls RankFM.fit
# TODO: produce loss plots
interactions_dct, model = rfmlib.calculate_dct_with_attributes(nb_samples=1000000, with_attrib=False, verbose=True, nb_epochs=120)   # includes attributes

In [None]:
# Hit rate went down with more epochs (30 -> 200)
member_attrib_train = interactions_dct['attrib_train']
rfmlib.topn_recommendations_with_attributes(model, interactions_dct, member_attrib_train)

In [1]:
# This function calls RankFM.fit
# Takes substantially longer than without attributes
interactions_dct, model = rfmlib.calculate_dct_with_attributes(nb_samples=1000000, nb_epochs=20, with_attrib=True, verbose=True)   # includes attributes

NameError: name 'rfmlib' is not defined

In [None]:
# With 200 epochs, model hit rate is lower than without attributes
# With 30 epochs, model hit rate is higher by 2% than without attributes
# TODO: plot the loss function and hit-rate curves
member_attrib_train = interactions_dct['attrib_train']
rfmlib.topn_recommendations_with_attributes(model, interactions_dct, member_attrib_train)

## Conclusions from using country of origin, address country and age
I got an increase of 2% on the hit rate using yearly data. 

In [None]:
member_attrib_train.columns

In [None]:
# Single function. 
# Input: dataframe with attributes for two years
train_year = 2016
test_year = 2017
# df_dctyr = df  # Dictionary by year
#rfmlib.prepare_attributes(df_dctyyr, train_year, test_year)
member_dest_df, df_item_onehot, df_item_attr = rfmlib.read_data_attributes(2016, 2016, 2017)

In [None]:
print(interactions_dct.keys())
interactions_train = interactions_dct['train']
sample_weight_train = interactions_dct['sample_weight_train']

In [None]:
model.fit(interactions_train, sample_weight=sample_weight_train, epochs=30, verbose=False)
rfmlib.topn_recommendations_with_attributes(model, interactions_dct, df_item_onehot)

In [None]:
interactions_dct, model = rfmlib.calculate_dct_with_attributes(with_attrib=False, verbose=False)

In [None]:
rfmlib.topn_recommendations_with_attributes(model, interactions_dct, df_item_onehot)

# Do experiments monthly. 

In [24]:
model = RankFM(factors=50, loss='warp', max_samples=50, alpha=0.01, learning_rate=0.1, learning_schedule='invscaling')
model.fit(interactions_train, sample_weight=sample_weight_train, epochs=30, verbose=False)
rfmlib.topn_recommendations(model, interactions_dct, base_rec)

NameError: name 'interactions_train' is not defined

# Additional Experiments

#### Assess the Diversity of Recommendations

In [None]:
model_diversity = diversity(model, interactions_valid, k=k).rename({'item_id': 'product_id'}, axis=1)
model_diversity = pd.merge(model_diversity, products_df, on='product_id', how='inner')
model_diversity = model_diversity[['cnt_users', 'pct_users', 'product_id', 'product_name', 'aisle_id', 'department_id']]
model_diversity.head(20)

In [None]:
coverage = np.mean(model_diversity['cnt_users'] > 0)
round(coverage, 3)

In [None]:
nonzero = model_diversity[model_diversity.cnt_users > 0]
entropy = -np.sum(nonzero['pct_users'] * np.log2(nonzero['pct_users']))
round(entropy, 2)

In [None]:
fig, axes = plt.subplots(1, 1, figsize=[16, 4])
N = 50

topN = model_diversity.iloc[:N, :]
axes.bar(topN.index.values + 1, topN.pct_users, width=1, edgecolor='black', alpha=0.75)
axes.set(xlabel='Item Rank', ylabel='Percentage of Users', title='Percentage of Users Recommended by Item Rank')
plt.show()

#### Get Similar Items for a Few Items

In [None]:
random_user = np.random.choice(valid_users)
print("random user: {}".format(random_user))

In [None]:
random_user_recs = valid_recs.loc[random_user]
random_user_recs = products_df[products_df.product_id.isin(random_user_recs)].set_index('product_id').loc[random_user_recs]
random_user_recs

#### Look at Similar Products for a Few of the Random User's Recommended Products

In [None]:
most_similar_items = model.similar_items(48370)
most_similar_items = products_df.set_index('product_id').loc[most_similar_items]
most_similar_items

### Asess Model Performance on Novel Item Recommendations

#### Save an Array of the Most Popular Items in the Training Data

In [None]:
most_popular = interactions_train.groupby('product_id')['user_id'].count().sort_values(ascending=False)
print(len(most_popular))
most_popular.head(10)

In [None]:
most_popular = pd.Series(most_popular.index.values)
most_popular[:10]

#### Save the TopK Most Popular Items Not Yet Previously Purchased for Each Customer

In [None]:
train_user_items = interactions_train.groupby('user_id')['product_id'].apply(set).to_dict()
train_user_items = {key: val for key, val in test_user_items.items() if key in set(valid_users)}
most_popular_new = {user: most_popular[~most_popular.isin(train_user_items[user])][:k] for user in train_user_items.keys()}
len(most_popular_new)

In [None]:
list(most_popular_new.keys())[:10]

In [None]:
most_popular_new[100232578]

#### Calculate the Pure-Popularity Baseline Metrics on Previously Unpurchased Items

In [None]:
base_new_hrt = np.mean([int(len(set(most_popular_new[user]) & set(test_user_items[user])) > 0)                           for user in test_user_items.keys()])
base_new_pre = np.mean([len(set(most_popular_new[user]) & set(test_user_items[user])) / len(set(most_popular_new[user])) for user in test_user_items.keys()])
base_new_rec = np.mean([len(set(most_popular_new[user]) & set(test_user_items[user])) / len(set(test_user_items[user]))  for user in test_user_items.keys()])

In [None]:
print("number of test users: {}".format(len(test_user_items)))
print("baseline new hit rate: {:.3f}".format(base_new_hrt))
print("baseline new precision: {:.3f}".format(base_new_pre))
print("baseline new recall: {:.3f}".format(base_new_rec))

#### Calculate Model Performance Excluding Training Items

In [None]:
%%time
model_hrt_new = hit_rate(model, interactions_valid, k=k, filter_previous=True)
model_pre_new = precision(model, interactions_valid, k=k, filter_previous=True)
model_rec_new = recall(model, interactions_valid, k=k, filter_previous=True)

In [None]:
print("model new hit rate: {}".format(round(model_hrt_new, 3)))
print("model new precision: {}".format(round(model_pre_new, 3)))
print("model new recall: {}".format(round(model_rec_new, 3)))