### Import Required Packages and Set Options

#### Import Base Libraries

In [1]:
# Load IPython's autoreload extension and reload all modules before each cell
# executes, so edits to imported modules are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
from rankfm.rankfm import RankFM
import hit_rates as hr
import numpy as np

# Things to do
* Take auxiliary data into account, such as 
    * age, gender (probably class imbalance) (20-40, 40-60, 60-80)
    * booking date (month/day of week (dow)) (1-12/1-7)
    * flight date (month/dow) (1-12/1-7)
    * family size (1-7)
    * trip length (nb days): 1-9, 10+. (should plot data. Avoid class imbalance)
* Number of categories: 3 + 12 + 7 + 12 + 7 + 7 + 10 = 58
    * The number of combinations: 3 * 12 * 12 * 7 * 7 * 7 * 10 = 3 * 144 * 343 * 10 = 1,481,760
    * We have about 1,000,000 records. So there are more combinations than we have data. 
        Is that good or bad? 
        
## Regularization parameters
Currently, there are two regularization parameters: $\alpha$ for the weights corresponding to the user-item matrix, 
and $\beta$ for the item and user attributes. Ideally, $\beta$ should be split into $\beta_i$ and $\beta_u$ for item and user attributes, respectively. In Field-aware Factorization Machines (FFM), each field (collection of features) has its own regularization parameters (as far as I understand it). 

In [3]:
# Candidate values for the hyper-parameter sweeps.

# max_samples candidates: 5, 15, 25, 35, 45 (optimum: 25)
samples = np.arange(5, 51, 10, dtype='int32')

# factors candidates (optimum: 20)
nb_factors = [5, 10, 20, 30]

# alpha candidates (optimum: 0.01 or 0.05)
alphas = [0.001, 0.01, 0.05, 0.1, 0.5, 1.0]

# beta candidates
betas = [1000.0, 0.01, 0.05, 0.1]

# Dummy iterable of length 5: looping over it repeats one configuration five times.
nothing = [1] * 5

In [4]:
# Best settings found with keep_nb_users=2000 (hit rate: 0.27).
keep_nb_users = 2000
max_samples = 5
factors = 20

# Regularization strengths.
# When filter_previous==True, the results are much more sensitive to alpha;
# it would be great if alpha could be learned during optimization.
# hit rate: 0.76 (baseline based on most popular: 0.45) (averaged over 5 runs)
alpha = 0.01
beta = 0.01

In [5]:
# NOTE: max_samples=500 creates a problem for 'warp', but not for 'bpr' (or vice versa).
# Open question: what is the difference, and why?
#
# To sweep a single hyper-parameter, swap the loop header for one of:
#     for max_samples in samples:
#     for factors in nb_factors:
#     for alpha in alphas:
#     for beta in betas:
# As written, the loop repeats the same configuration once per element of `nothing`.
for n in nothing:
    # A fresh model is built on every pass.
    model = RankFM(
        factors=factors,
        loss='warp',
        max_samples=int(max_samples),
        alpha=alpha,
        beta=beta,
        learning_rate=0.1,
        learning_schedule='invscaling',
    )

    results = hr.evaluate_hit_rate(
        model, '2016', '2017',
        filter_previous=True,
        keep_nb_users=keep_nb_users,
        verbose=False,
        nb_epochs=30,
    )
    # Tag the metrics with the setting used, then report the model hit rate.
    results['max_samples'] = max_samples
    print(max_samples, results['model_hrt'])

Evaluate_hit_rate: years: 2016, 2017
             0    1    2
100130857  BOG  MIA  PTY
100147716  BOG  SDQ  GUA
100158043  BOG  MIA  SDQ
100159218  MIA  BOG  PTY
101476887  BOG  MIA  SJO
5 {0: 0.2631184407796102, 1: 0.02848575712143928}
Evaluate_hit_rate: years: 2016, 2017
             0    1    2
100043637  GDL  MIA  SJO
100056182  PTY  SJO  GUA
100057707  MIA  BOG  CCS
100136144  MIA  MEX  MDE
100155943  SJO  GUA  MIA
5 {0: 0.261136712749616, 1: 0.030721966205837174}
Evaluate_hit_rate: years: 2016, 2017
             0    1    2
100056182  MIA  PTY  SJO
100058083  MIA  BOG  MGA
100139806  MIA  SJO  SDQ
100159303  MEX  LIM  POS
100235732  SDQ  GUA  SJO
5 {0: 0.2790346907993967, 1: 0.033936651583710405}
Evaluate_hit_rate: years: 2016, 2017
             0    1    2
100136144  PTY  MIA  GUA
100240444  PTY  SJO  MDE
101635857  GUA  SJO  MEX
101656205  BOG  PTY  MDE
230001743  MCO  SFO  BOG
5 {0: 0.25987841945288753, 1: 0.026595744680851064}
Evaluate_hit_rate: years: 2016, 2017
            

In [11]:
# Single run with keep_nb_users=20000.
# NOTE: max_samples=500 creates a problem for 'warp', but not for 'bpr' (or vice versa).
# Open question: what is the difference, and why?
# Possible sweeps: `for max_samples in samples:` / `for factors in nb_factors:`
# beta is not passed here, so RankFM uses its own default for it.
model = RankFM(
    factors=factors,
    loss='warp',
    max_samples=int(max_samples),
    alpha=0.01,
    learning_rate=0.1,
    learning_schedule='invscaling',
)

results = hr.evaluate_hit_rate(
    model, '2016', '2017',
    filter_previous=True,
    keep_nb_users=20000,
    verbose=False,
    nb_epochs=30,
)
# Tag the metrics with the setting used, then report the model hit rate.
results['max_samples'] = max_samples
print(max_samples, results['model_hrt'])

Evaluate_hit_rate: years: 2016, 2017
             0    1    2
100031203  MIA  BOG  SDQ
100034364  MIA  SJO  GUA
100041338  MIA  PTY  EZE
100041994  JFK  LAX  SJO
100043637  MEX  MIA  TPA
5 {0: 0.2943624668936814, 1: 0.043057132046916385}


In [13]:
# Evaluate the existing `model` on the 2016/2017 pair with keep_nb_users=20000.
# nb_epochs is not passed, so evaluate_hit_rate uses its default.
results = hr.evaluate_hit_rate(
    model, '2016', '2017',
    filter_previous=True,
    keep_nb_users=20000,
    verbose=False,
)

Evaluate_hit_rate: years: 2016, 2017
             0    1    2
100031203  PTY  MIA  BOG
100034364  MIA  MEX  SJO
100035145  BOG  GUA  PTY
100040465  GUA  SCL  BOG
100041338  MIA  JFK  PTY


In [14]:
# Display the metrics dict returned by the most recent evaluate_hit_rate call.
# NOTE(review): in the captured output, 'nb_epochs' holds the year2 string ('2017')
# rather than an epoch count — looks like a bug in hr.evaluate_hit_rate; confirm.
results

{'keep_nb_users': 20000,
 'filter_previous': True,
 'year1': '2016',
 'year2': '2017',
 'nb_epochs': '2017',
 'seed': None,
 'sparsity_all': 0.9548386510497471,
 'sparsity': 0.9550525641025641,
 'topN=k': 3,
 'most_popular': product_id
 PTY    2780
 BOG    2742
 MIA    2492
 Name: user_id, dtype: int64,
 'base_hrt': 0.4532919873747162,
 'base_pre': 0.172822415416136,
 'base_rec': 0.2304546488761708,
 'model_rnk': 0.20300409649522075,
 'model_pre': 0.11060537096040055,
 'model_rec': 0.1076525858074561,
 'model_hrt': {0: 0.29206493703535125, 1: 0.03770292823547262}}

In [19]:
# Verbose run on the 2016/2017 pair with keep_nb_users=None and 100 epochs requested.
# The returned metrics dict is the cell's displayed value.
hr.evaluate_hit_rate(
    model, '2016', '2017',
    filter_previous=True,
    keep_nb_users=None,
    verbose=True,
    nb_epochs=100,
)

Evaluate_hit_rate: years: 2016, 2017
sample users: 31028
sample items: 78
sample interactions: (137614, 3)
sample interaction data sparsity: 95.48
total shape: (137614, 2)
train shape: (62977, 2)
valid shape: (74637, 2)

train weights shape: (62977,)
valid weights shape: (74637,)

nb train users: 23553
nb valid users: 28003
nb cold-start users: 7475

train items: 76
valid items: 75
number of cold-start items: 2
cold start items:  {'MDZ', 'DEN'}

training epoch: 0
log likelihood: -37265.671875

training epoch: 1
log likelihood: -37512.26171875

training epoch: 2
log likelihood: -37480.671875

training epoch: 3
log likelihood: -37099.2890625

training epoch: 4
log likelihood: -37030.19921875

training epoch: 5
log likelihood: -36859.26171875

training epoch: 6
log likelihood: -36397.1796875

training epoch: 7
log likelihood: -35771.08984375

training epoch: 8
log likelihood: -35179.3203125

training epoch: 9
log likelihood: -34285.69921875

training epoch: 10
log likelihood: -33515.10937

{'keep_nb_users': None,
 'filter_previous': True,
 'year1': '2016',
 'year2': '2017',
 'nb_epochs': '2017',
 'seed': None,
 'sparsity_all': 0.9548386510497471,
 'sparsity': 0.9548364917708736,
 'topN=k': 3,
 'most_popular': product_id
 PTY    4293
 BOG    4269
 MIA    3916
 Name: user_id, dtype: int64,
 'base_hrt': 0.4536656786772846,
 'base_pre': 0.17300527324453333,
 'base_rec': 0.23133585480159166,
 'model_rnk': 0.1934674590802806,
 'model_pre': 0.1028189140036373,
 'model_rec': 0.09841447867847926,
 'model_hrt': {0: 0.2720187061574435, 1: 0.0338074824629774}}

In [23]:
# Same evaluation on the 2020/2021 year pair, with keep_nb_users=1000 and 30 epochs.
hr.evaluate_hit_rate(
    model, '2020', '2021',
    filter_previous=True,
    keep_nb_users=1000,
    verbose=False,
    nb_epochs=30,
)

Evaluate_hit_rate: years: 2020, 2021
             0    1    2
100050464  PTY  MIA  MEX
100136144  PTY  GYE  IAD
100234188  PTY  MEX  MIA
100243513  MIA  SJO  SAL
230003246  PTY  MIA  GYE


{'keep_nb_users': 1000,
 'filter_previous': True,
 'year1': '2020',
 'year2': '2021',
 'nb_epochs': '2021',
 'seed': None,
 'sparsity_all': 0.9764312181826373,
 'sparsity': 0.9730416666666667,
 'topN=k': 3,
 'most_popular': product_id
 PTY    127
 MIA     89
 SJO     60
 Name: user_id, dtype: int64,
 'base_hrt': 0.40136986301369865,
 'base_pre': 0.14063926940639268,
 'base_rec': 0.280440313111546,
 'model_rnk': 0.14469696969696969,
 'model_pre': 0.07424242424242423,
 'model_rec': 0.11243506493506493,
 'model_hrt': {0: 0.21136363636363636, 1: 0.011363636363636364}}

In [25]:
# Evaluation with the year arguments swapped: '2021' first, '2020' second.
# NOTE(review): presumably a deliberate reverse-direction experiment — confirm.
hr.evaluate_hit_rate(
    model, '2021', '2020',
    filter_previous=True,
    keep_nb_users=1000,
    verbose=False,
    nb_epochs=30,
)

Evaluate_hit_rate: years: 2021, 2020
             0    1    2
100041644  MIA  PTY  MCO
100131008  MCO  MEX  PTY
100184503  MDE  MIA  CUN
230002638  MIA  PTY  CUN
230004134  MDE  PTY  SDQ


{'keep_nb_users': 1000,
 'filter_previous': True,
 'year1': '2021',
 'year2': '2020',
 'nb_epochs': '2020',
 'seed': None,
 'sparsity_all': 0.9764312181826373,
 'sparsity': 0.9725,
 'topN=k': 3,
 'most_popular': product_id
 MIA    160
 PTY    120
 MCO     79
 Name: user_id, dtype: int64,
 'base_hrt': 0.323943661971831,
 'base_pre': 0.11032863849765258,
 'base_rec': 0.2646948356807512,
 'model_rnk': 0.09323254139668825,
 'model_pre': 0.05039596832253419,
 'model_rec': 0.08668106551475882,
 'model_hrt': {0: 0.1447084233261339, 1: 0.0064794816414686825}}