In [1]:
# Reload imported modules automatically before each cell executes, so edits to
# the local helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
import hit_rates as hr
import rankfmlib as fmlib
# new library for performance studies built from rankfmlib to read a single file rather than yearly files.
import newlib
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from rankfm.rankfm import RankFM
import function_lib as flib

from rankfm.evaluation import hit_rate

# Read member attributes

In [5]:
%%time
# Read the single-file activity data; the result is a dict (its keys are
# listed in a later cell). age_cuts gives the age-bracket boundaries.
# TODO: perhaps add an argument stating which columns are attributes.
interaction_dct = newlib.read_data_attributes_single_file("activity_reduced_with_attributes.csv",
                                                        age_cuts=[0,30,50,70,150])

CPU times: user 917 ms, sys: 170 ms, total: 1.09 s
Wall time: 1.09 s


The problem to solve: Why are results worse with filter=False? To figure this out, I may have to write my own recommender and hit_rate functions. 
 1) optimize parameter without attributes
 2) optimize age brackets using just one attribute
 3) optimize temperature brackets (destination features) using just one attribute
 4) add gender, country of origin to the member attributes
 
 One problem found was that I was removing duplicate Member/Destinations too early. They must be removed from 
 the training and validation sets separately. Now I get up to 40% if data is not filtered. However, I only get 10% accuracy
 when the previous data is filtered. WHY? 

___

# Create newlib.recommender(data_train)

In [6]:
interaction_dct.keys()

dict_keys(['age_cuts', 'long_cuts', 'lat_cuts', 'yr_l_cuts', 'yr_h_cuts', 'altitude_cuts', 'df_members', 'df_user_attr', 'df_item_attr'])

In [7]:
%%time
# Split the interactions. The return value is not used: the call adds
# 'data_train', 'data_valid' and 'data_test' to interaction_dct in place,
# as the keys printed afterwards show.
# TODO: add a percentage offset that will not be used
# (presumably a leading slice of the data to skip -- confirm).
newlib.train_valid_dct(interaction_dct, train_perc=0.3, valid_perc=0.3, temporal=False)
print(interaction_dct.keys())

(714086, 2)
dict_keys(['age_cuts', 'long_cuts', 'lat_cuts', 'yr_l_cuts', 'yr_h_cuts', 'altitude_cuts', 'df_members', 'df_user_attr', 'df_item_attr', 'data_train', 'data_valid', 'data_test'])
CPU times: user 85.4 ms, sys: 4.88 ms, total: 90.3 ms
Wall time: 89.8 ms


In [8]:
# Loss toggle: the second assignment overwrites the first, so 'bpr' is the
# active loss. Comment out the 'bpr' line to switch to 'warp'.
loss = 'warp'
loss = 'bpr'  # nb neg samples = 1
# Global `model` is consumed by the run_model / recommender cells that follow.
model = RankFM(factors=20, loss=loss, max_samples=10000, alpha=0.05, beta=0.1, learning_rate=0.1, learning_schedule='constant')

1. Create a dataframe with the list of destinations for each member. Do this for both training and validation.

In [3]:
%%time
# NOTE: needs the global `model` created in the RankFM construction cell;
# executing this cell before that one raises NameError.
newlib.run_model(model, interaction_dct, nb_epochs=100, topN=5, with_attrib=True)
# Earlier result: topN=5, nb_epochs=500, with_attrib=True -> hr=0.16 (filtered), hr=0.64 (not filtered).
# Better results without attributes (age). Perhaps the subdivision must be improved. Start experiments.
# Add gender and country of origin.
# Experiment with the training/validation split. Two different types:
#  1) Keep training fixed, vary validation (should not change much).
#  2) Keep training and validation the same, vary the initial offset (set test_error to zero).
# Test different age splits.
# Put read data, train_valid, run_model into a single method.
# Create a dictionary for each run with all the parameters (train_perc, valid_perc, temporal, use_attrib,
# learning_schedule, learning_rate, alpha, beta, loss, factors, run_nb if repeated for mean/variance).
# No temporal (lose 3% on both filtered and non-filtered).
# I might have to run multiple times to estimate variance. Say, run 10 times.

# Compare against the neural net version of FM, which can run AdamW. See if I get the same results.
# Interface this program with the other one that uses a DataLoader.

NameError: name 'model' is not defined

In [47]:
%%time
# Evaluate top-5 recommendations for all members (keep_nb_members=None).
# Figures noted from earlier runs (they differ from the output recorded below):
#   no filter: 63% accurate. That makes sense.
#   with filter of hits in training set: 11% (same results as those of rankfm)
#   Same answers when repeated multiple times.
result = newlib.recommender(model, interaction_dct, keep_nb_members=None, topN=5)
# Earlier run: topN=5, nb_epochs=500, hr=0.11 (filtered), hr=0.6 (not filtered), with_attrib=True

hit rate (without previous filter) =  0.5982003026039869
hit rate (with previous filter) =  0.36060784983995775
CPU times: user 16.3 s, sys: 201 ms, total: 16.5 s
Wall time: 16.4 s


In [None]:
%%time
# Repeat of the previous recommender cell (never executed: In [None]).
# Figures noted from earlier runs:
#   no filter: 63% accurate. That makes sense.
#   with filter of hits in training set: 11% (same results as those of rankfm)
result = newlib.recommender(model, interaction_dct, keep_nb_members=None, topN=5)

# Set up a series of runs to test age brackets. Store results in "age_brackets.csv"
Age brackets will have 4 categories: 0-x, x-y, y-z, z+. Each age_bracket should be at least 10 years. 
Generate x,y,z via sampling, and only keep the choices that satisfy the constraints. 
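* x is in [20, 40] (note: the code below actually draws x from [15, 40) — confirm which lower bound is intended)
* z is in [60, 80] (the code draws from [60, 80), i.e. 80 is excluded)
* y is in [x+10, z-10] (the code draws from [x+10, z-10), i.e. the upper bound is excluded)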
Choose 20 intervals, and run simulation. 
* Also execute the model without attributes as a base line. Run 5 times. 

In [9]:
def generate_age_cuts(nb_cuts, seed=None):
    """
    Generate random age cuts and return a list of lists.

    Each age cut has the form [0, x, y, z, 150], where x, y, z are Python
    ints drawn uniformly from (upper bounds exclusive):
        x in [15, 40),  z in [60, 80),  y in [x + 10, z - 10)
    so every bracket spans at least 10 years.

    Parameters
    ----------
    nb_cuts : int
        Number of age cuts to generate; 0 yields an empty list.
    seed : int or None, optional
        If given, the draws come from a private np.random.RandomState(seed),
        so the result is reproducible and the global NumPy RNG is untouched.
        If None (default), the global np.random state is used, as before.

    Returns
    -------
    list of list of int
        nb_cuts cuts, each a 5-element list of plain Python ints.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)
    age_cuts = []
    for _ in range(nb_cuts):
        # x and z are drawn first because y's range depends on both.
        # int(...) strips the NumPy scalar type so that str(cut) stays a
        # plain list literal when the cut is stored or written to CSV.
        x = int(rng.randint(15, 40, 1)[0])
        z = int(rng.randint(60, 80, 1)[0])
        y = int(rng.randint(x + 10, z - 10, 1)[0])
        age_cuts.append([0, x, y, z, 150])
    return age_cuts

In [10]:
# Build the experiment wrapper with a fixed seed, then read the data with an
# explicit set of age cuts and an epoch count passed through dct_update.
rankfm = newlib.Rankfm(dct_new={}, infile='activity_reduced_with_attributes.csv', seed=1)
cut = [0, 30, 50, 80, 150]
dct = {'age_cuts':cut ,'nb_epochs':600}
# NOTE(review): continuous_attr=True presumably keeps attributes continuous
# rather than bracketed -- confirm against newlib.
rankfm.read_data(dct_update=dct, continuous_attr=True)

Set up multiple experiments
1. without any attributes, run 10 times
2. with all attributes, categorical, run 10 times
    * issues: convergence criteria (when to stop?) 
    * How to define the categories. Do some exploratory analysis to find out how many flights/members/destinations are in each class. 
         Summarize this information, and then compute it for different cutting assumptions. 
3. Run with torchfm, which uses neural networks to implement FM. This leads to more flexibility on the handling of nonlinear terms. 
  * In particular, an attention model might prove useful. 

In [11]:
# Age-bracket experiment: draw nb_cuts random age cuts and run the full
# read -> split -> build -> train/evaluate pipeline once per cut.
# NOTE(review): generate_age_cuts uses the unseeded global NumPy RNG here, so
# the cuts differ between executions.
nb_cuts = 20
cuts = generate_age_cuts(nb_cuts)
rankfm = newlib.Rankfm(dct_new={}, infile='activity_reduced_with_attributes.csv', seed=1)

for cut in cuts:
    dct = {'age_cuts':cut, 'nb_epochs':300 }
    print(cut)
    rankfm.read_data(dct_update=dct, continuous_attr=True)
    rankfm.train_valid()
    rankfm.create_model()
    rankfm.run_model()
    rankfm.update_storage()
    # Overwrite file on each call. Ideally, I should append, but not worth it. 
    # Saving inside the loop keeps the partial results if a later run fails.
    rankfm.save('age_cuts.csv')

[0, 20, 42, 71, 150]
(714086, 2)
hr (previous filtered):  0.36599638921745625
hr (previous not filtered):  0.7599140882774077
[0, 23, 44, 69, 150]
(714086, 2)
hr (previous filtered):  0.36539717083786727
hr (previous not filtered):  0.757438209233639
[0, 20, 30, 75, 150]
(714086, 2)
hr (previous filtered):  0.34381896739299367
hr (previous not filtered):  0.7369059090485219
[0, 31, 48, 61, 150]
(714086, 2)
hr (previous filtered):  0.36451292862900725
hr (previous not filtered):  0.7549141191790165
[0, 28, 43, 66, 150]
(714086, 2)
hr (previous filtered):  0.36826905161170115
hr (previous not filtered):  0.7706353642630892
[0, 33, 53, 71, 150]
(714086, 2)
hr (previous filtered):  0.3621220126177083
hr (previous not filtered):  0.7303042545917892
[0, 29, 43, 78, 150]
(714086, 2)
hr (previous filtered):  0.3349632490345085
hr (previous not filtered):  0.6885199950168183
[0, 38, 49, 69, 150]
(714086, 2)
hr (previous filtered):  0.337039222384398
hr (previous not filtered):  0.70305270022670

In [43]:
# Baseline without attributes: 20 runs on the `rankfm` object left over from
# the previous cell; results are stored after each run and saved once at the end.
# NOTE(review): create_model() is not called between runs -- confirm that
# run_model re-initialises the model, otherwise the runs are not independent.
for run in range(20):
    rankfm.run_model(with_attrib=False)
    rankfm.update_storage()
rankfm.save('no_attrib.csv')

hr (previous filtered):  0.3734639912894696
hr (previous not filtered):  0.7758282781147924
hr (previous filtered):  0.3629491367242184
hr (previous not filtered):  0.773712863586872
hr (previous filtered):  0.3632913361331467
hr (previous not filtered):  0.7600559962669156
hr (previous filtered):  0.3556696220251983
hr (previous not filtered):  0.7591227251516566
hr (previous filtered):  0.3744283714419039
hr (previous not filtered):  0.7774459480479079
hr (previous filtered):  0.36979312490278426
hr (previous not filtered):  0.7800279981334578
hr (previous filtered):  0.36991756105148543
hr (previous not filtered):  0.7214807901695443
hr (previous filtered):  0.34854565251205477
hr (previous not filtered):  0.7511277025976046
hr (previous filtered):  0.3632913361331467
hr (previous not filtered):  0.776823767304402
hr (previous filtered):  0.3708508321667444
hr (previous not filtered):  0.7606159589360709
hr (previous filtered):  0.3701664333488878
hr (previous not filtered):  0.7612

In [423]:
# Same no-attribute baseline as the previous cell, from a different execution
# (In [423]); it overwrites 'no_attrib.csv'.
# NOTE(review): relies on the state of the global `rankfm` at the time it ran.
for run in range(20):
    rankfm.run_model(with_attrib=False)
    rankfm.update_storage()
rankfm.save('no_attrib.csv')

hr (previous filtered):  0.3496614275952041
hr (previous not filtered):  0.675343231658073
hr (previous filtered):  0.36752189849040195
hr (previous not filtered):  0.7051003292538982
hr (previous filtered):  0.36631049263837984
hr (previous not filtered):  0.7032055662545816
hr (previous filtered):  0.36121637572218424
hr (previous not filtered):  0.6808100888364291
hr (previous filtered):  0.3641672361309561
hr (previous not filtered):  0.6982667577809529
hr (previous filtered):  0.3567434925762564
hr (previous not filtered):  0.6729825433310554
hr (previous filtered):  0.34655525874386534
hr (previous not filtered):  0.6739765173634839
hr (previous filtered):  0.360315586755296
hr (previous not filtered):  0.6679816114804
hr (previous filtered):  0.36568925886811204
hr (previous not filtered):  0.6709324718891719
hr (previous filtered):  0.35279865813505623
hr (previous not filtered):  0.6753121699695596
hr (previous filtered):  0.3625830900167733
hr (previous not filtered):  0.6704

In [439]:
cuts

[[0, 26, 53, 69, 150],
 [0, 34, 61, 77, 150],
 [0, 30, 52, 76, 150],
 [0, 23, 48, 60, 150],
 [0, 31, 51, 67, 150],
 [0, 27, 43, 73, 150],
 [0, 30, 53, 70, 150],
 [0, 33, 49, 62, 150],
 [0, 31, 48, 70, 150],
 [0, 22, 49, 75, 150],
 [0, 19, 48, 62, 150],
 [0, 19, 58, 79, 150],
 [0, 16, 47, 76, 150],
 [0, 31, 56, 69, 150],
 [0, 38, 68, 79, 150],
 [0, 37, 48, 61, 150],
 [0, 25, 65, 77, 150],
 [0, 17, 50, 72, 150],
 [0, 24, 38, 67, 150],
 [0, 15, 38, 77, 150]]

In [438]:
# Load the per-run results written by rankfm.save(...) in the experiment cells.
df_no = pd.read_csv("no_attrib.csv")
df_age = pd.read_csv("age_cuts.csv")

def mean_var(df, cols, msg):
    """
    Print the mean and standard deviation of selected columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding one row per run.
    cols : iterable of str
        Names of the columns to summarise; one line is printed per column.
    msg : str
        Label placed at the start of every printed line.

    Returns
    -------
    None

    Notes
    -----
    Despite the function name, the spread reported is the sample standard
    deviation (``Series.std``), not the variance; the printed label says so.
    """
    for col in cols:
        print(f"{msg}, {col}, mean/std: {df[col].mean()}, {df[col].std()}")
              
# Compare hit-rate statistics of the no-attribute baseline with the age-cut runs.
mean_var(df_no, ['hr_run_notfiltered','hr_run_filtered'], "No attrib")
mean_var(df_age, ['hr_run_notfiltered','hr_run_filtered'], "Age cuts")
# Earlier mean-only version, superseded by the mean_var calls above:
#mean_hit_no = df_no["hr_run_notfiltered"].mean(), df_no["hr_run_filtered"].mean()
#mean_hit_age = df_age["hr_run_notfiltered"].mean(), df_age["hr_run_filtered"].mean()
#print("mean hits[no attrib]: ", mean_hit_no)
#print("mean hits[age]: ", mean_hit_age)

No attrib, hr_run_notfiltered, mean/var: 0.6725904527658357, 0.022704056242912347
No attrib, hr_run_filtered, mean/var: 0.35562808812081315, 0.008416685798878342
Age cuts, hr_run_notfiltered, mean/var: 0.670979394549858, 0.022524922616978613
Age cuts, hr_run_filtered, mean/var: 0.3535377690957051, 0.009193800912547886


In [383]:
# Build the wrapper with default parameters, split the data and create the model.
# NOTE(review): this cell passes `dct={}` while other cells in this notebook
# pass `dct_new={}` -- confirm which keyword the current newlib.Rankfm accepts.
rankfm = newlib.Rankfm(dct={}, infile='activity_reduced_with_attributes.csv', seed=1)
rankfm.train_valid()
# NOTE(review): the next cell calls create_model() again before run_model().
rankfm.create_model()

(714086, 2)


In [384]:
rankfm.create_model()
rankfm.run_model()

hr (previous filtered):  0.3615451659092324
hr (previous not filtered):  0.6677146236693021


In [385]:
rankfm.create_model()
rankfm.run_model()

hr (previous filtered):  0.3619498225736164
hr (previous not filtered):  0.6879163294527797


In [386]:
rankfm.run_model()

hr (previous filtered):  0.3651248210172446
hr (previous not filtered):  0.6913714748179045


In [374]:
# Seed NumPy's global RNG right before training: the recorded hit rates are
# identical to those of the next cell, which repeats the same two statements.
np.random.seed(1)
rankfm.run_model()

hr (previous filtered):  0.35766105177943147
hr (previous not filtered):  0.6785814366223495


In [375]:
# Repeat of the seeded run to check reproducibility: same seed, same recorded
# hit rates as the previous cell.
np.random.seed(1)
rankfm.run_model()

hr (previous filtered):  0.35766105177943147
hr (previous not filtered):  0.6785814366223495


In [388]:
# Change the epoch count directly in the wrapper's parameter dict, then rerun.
# The mutation persists for every later cell that uses `rankfm`.
rankfm.dct['nb_epochs'] = 300
rankfm.run_model()

hr (previous filtered):  0.36783290792504514
hr (previous not filtered):  0.7592604121272489


In [376]:
rankfm.recommender()

hit rate (without previous filter) =  0.5888800212822559
hit rate (with previous filter) =  0.35766105177943147
recommend, dct=  {'train_offset': 0.0, 'train_perc': 0.3, 'valid_perc': 0.3, 'nb_epochs': 100, 'with_attrib': True, 'factors': 20, 'loss': 'bpr', 'max_samples': 300, 'alpha': 0.05, 'beta': 0.1, 'learning_rate': 0.1, 'learning_schedule': 'constant', 'keep_nb_members': None, 'temporal': True, 'age_cuts': [0, 30, 60, 70, 150], 'topN': 5, 'df_members':         MEMBER_ID    D
184610  230144517  LIM
376198  230153282  HAV
556112  231307987  PTY
539512  230098494  GUA
60815   231183678  PTY
...           ...  ...
473973  232648465  SNU
705784  247791775  CUN
150110  230149314  SAL
683002  234902782  PTY
400597  231333036  MDE

[714086 rows x 2 columns], 'df_user_attr':         MEMBER_ID  age_dep_(0, 30]  age_dep_(30, 60]  age_dep_(60, 70]  \
184610  230144517                0                 1                 0   
376198  230153282                0                 1                 

In [366]:
rankfm.update_storage()

after pd.concat


In [367]:
rankfm.save("gordon.csv")

In [340]:
rankfm.dct.keys()

dict_keys(['train_offset', 'train_perc', 'valid_perc', 'nb_epochs', 'with_attrib', 'factors', 'loss', 'max_samples', 'alpha', 'beta', 'learning_rate', 'learning_schedule', 'keep_nb_members', 'temporal', 'age_cuts', 'topN', 'df_members', 'df_user_attr', 'df_item_attr', 'data_train', 'data_valid', 'data_test', 'model'])

In [253]:
# Turn the stored age_cuts string back into a list.
# NOTE(review): eval executes arbitrary code; if rankfm.df can be loaded from
# an external file, ast.literal_eval is the safe way to parse a list literal.
eval(rankfm.df.age_cuts.values[0])

[0, 30, 60, 70, 150]

In [144]:
len(rankfm.dct.keys()), len(set(rankfm.dct.keys()))

(19, 19)

In [320]:
rankfm.dct.keys()

dict_keys(['train_offset', 'train_perc', 'valid_perc', 'nb_epochs', 'with_attrib', 'factors', 'loss', 'max_samples', 'alpha', 'beta', 'learning_rate', 'learning_schedule', 'keep_nb_members', 'temporal', 'age_cuts', 'topN', 'df_members', 'df_user_attr', 'df_item_attr', 'data_train', 'data_valid', 'data_test', 'model'])

In [322]:
rankfm.dct.keys()

dict_keys(['train_offset', 'train_perc', 'valid_perc', 'nb_epochs', 'with_attrib', 'factors', 'loss', 'max_samples', 'alpha', 'beta', 'learning_rate', 'learning_schedule', 'keep_nb_members', 'temporal', 'age_cuts', 'topN', 'df_members', 'df_user_attr', 'df_item_attr', 'data_train', 'data_valid', 'data_test', 'model', 'hr_run_notfiltered', 'hr_run_filtered'])

In [323]:
rankfm.update_storage()

dct_copy:  {'train_offset': 0.0, 'train_perc': 0.3, 'valid_perc': 0.3, 'nb_epochs': 100, 'with_attrib': True, 'factors': 20, 'loss': 'bpr', 'max_samples': 300, 'alpha': 0.05, 'beta': 0.1, 'learning_rate': 0.1, 'learning_schedule': 'constant', 'keep_nb_members': None, 'temporal': True, 'age_cuts': '[0, 30, 60, 70, 150]', 'topN': 5, 'model': <rankfm.rankfm.RankFM object at 0x7fb7f069e3d0>, 'hr_run_notfiltered': 0.6563160512246674, 'hr_run_filtered': 0.3474449832152182}
save, 
df updated


In [324]:
rankfm.dct

{'train_offset': 0.0,
 'train_perc': 0.3,
 'valid_perc': 0.3,
 'nb_epochs': 100,
 'with_attrib': True,
 'factors': 20,
 'loss': 'bpr',
 'max_samples': 300,
 'alpha': 0.05,
 'beta': 0.1,
 'learning_rate': 0.1,
 'learning_schedule': 'constant',
 'keep_nb_members': None,
 'temporal': True,
 'age_cuts': [0, 30, 60, 70, 150],
 'topN': 5,
 'df_members':         MEMBER_ID    D
 184610  230144517  LIM
 376198  230153282  HAV
 556112  231307987  PTY
 539512  230098494  GUA
 60815   231183678  PTY
 ...           ...  ...
 473973  232648465  SNU
 705784  247791775  CUN
 150110  230149314  SAL
 683002  234902782  PTY
 400597  231333036  MDE
 
 [714086 rows x 2 columns],
 'df_user_attr':         MEMBER_ID  age_dep_(0, 30]  age_dep_(30, 60]  age_dep_(60, 70]  \
 184610  230144517                0                 1                 0   
 376198  230153282                0                 1                 0   
 556112  231307987                0                 1                 0   
 539512  23009849

In [307]:
rankfm.dct['hr_run_filtered']

0.3534019975031211

In [288]:
# Append the latest run to the results table and write it to disk.
# NOTE(review): the recorded execution of this cell (In [288]) raised
# "ValueError: Buffer has wrong number of dimensions"; which of the two calls
# raised is not visible here -- check whether current newlib still fails.
rankfm.update_storage()
rankfm.save("gordon1.csv")

ValueError: Buffer has wrong number of dimensions (expected 1, got 2)

In [48]:
# Train/evaluate, then evaluate again through the custom recommender.
# NOTE(review): in the recorded output the two report different hit rates for
# the same model -- the discrepancy this notebook is investigating.
rankfm.run_model()
rankfm.recommender()

hr (previous filtered):  0.3554201889607161
hr (previous not filtered):  0.6575397812033814
hit rate (without previous filter) =  0.46593932943054817
hit rate (with previous filter) =  0.3057869219293884


In [16]:
# Loss toggle: the second assignment overwrites the first, so 'bpr' is the
# active loss. Comment out the 'bpr' line to switch to 'warp'.
loss = 'warp'
loss = 'bpr'  # nb neg samples = 1
# Same construction as the earlier model cell. The call was previously written
# as `(RankFMfactors=20, ...)`, which is a SyntaxError.
model = RankFM(factors=20, loss=loss, max_samples=10000, alpha=0.05, beta=0.1, learning_rate=0.1, learning_schedule='constant')

In [33]:
%%time
rankfm.recommender()

hit rate (without previous filter) =  0.5381852048962213
hit rate (with previous filter) =  0.31147439085032325
CPU times: user 16.5 s, sys: 216 ms, total: 16.7 s
Wall time: 16.6 s


In [32]:
%%time
rankfm.run_model()

hr (previous filtered):  0.35980233714569865
hr (previous not filtered):  0.69039035305818
CPU times: user 8 s, sys: 15.1 ms, total: 8.02 s
Wall time: 8 s


In [44]:
rankfm.dct.keys()

dict_keys(['train_offset', 'train_perc', 'valid_perc', 'nb_epochs', 'with_attrib', 'factors', 'loss', 'max_samples', 'alpha', 'beta', 'learning_rate', 'learning_schedule', 'keep_nb_members', 'temporal', 'age_cuts', 'topN', 'df_members', 'df_user_attr', 'df_item_attr', 'data_train', 'data_valid', 'data_test', 'model', 'train_dest_sets', 'valid_dest_sets'])

In [45]:
rankfm.dct

{'train_offset': 0.0,
 'train_perc': 0.3,
 'valid_perc': 0.3,
 'nb_epochs': 100,
 'with_attrib': True,
 'factors': 20,
 'loss': 'bpr',
 'max_samples': 300,
 'alpha': 0.05,
 'beta': 0.1,
 'learning_rate': 0.1,
 'learning_schedule': 'constant',
 'keep_nb_members': None,
 'temporal': True,
 'age_cuts': [0, 30, 60, 70, 150],
 'topN': 5,
 'df_members':         MEMBER_ID    D
 184610  230144517  LIM
 376198  230153282  HAV
 556112  231307987  PTY
 539512  230098494  GUA
 60815   231183678  PTY
 ...           ...  ...
 473973  232648465  SNU
 705784  247791775  CUN
 150110  230149314  SAL
 683002  234902782  PTY
 400597  231333036  MDE
 
 [714086 rows x 2 columns],
 'df_user_attr':         MEMBER_ID  age_dep_(0, 30]  age_dep_(30, 60]  age_dep_(60, 70]  \
 184610  230144517                0                 1                 0   
 376198  230153282                0                 1                 0   
 556112  231307987                0                 1                 0   
 539512  23009849

In [325]:
rankfm.save("xxx.csv")