In [35]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [36]:
import hit_rates as hr
import rankfmlib as fmlib
# new library for performance studies built from rankfmlib to read a single file rather than yearly files.
import newlib
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from rankfm.rankfm import RankFM
import function_lib as flib

from rankfm.evaluation import hit_rate

Read member attributes

In [37]:
%%time
# Perhaps add an argument stating which colums are attributes
interaction_dct = newlib.read_data_attributes_single_file("activity_reduced_with_attributes.csv",
                                                        age_cuts=[0,30,50,70,150])

CPU times: user 44.7 ms, sys: 24 µs, total: 44.7 ms
Wall time: 44.1 ms


The problem to solve: Why are results worse with filter=False? To figure this out, I may have to write my own recommender and hit_rate functions. 
 1) optimize parameter without attributes
 2) optimize age brackets using just one attribute
 3) optimize temperature brackets (destination features) using just one attribute
 4) add gender, country of origin to the member attributes
 
 One problem found was that I was removing duplicate Member/Destinations too early. They must be removed from 
 the training and validation sets separately. Now I get up to 40% if data is not filtered. However, I only get 10% accuracy
 when the previous data is filtered. WHY? 

___

# Create newlib.recommender(data_train)

In [38]:
interaction_dct.keys()

dict_keys(['df_members', 'df_user_attr', 'df_item_attr'])

In [39]:
%%time
# Add a percentage offset that will not be used. 
newlib.train_valid_dct(interaction_dct, train_perc=0.3, valid_perc=0.3, temporal=False)
print(interaction_dct.keys())

(714086, 2)
dict_keys(['df_members', 'df_user_attr', 'df_item_attr', 'data_train', 'data_valid', 'data_test'])
CPU times: user 91.5 ms, sys: 3.83 ms, total: 95.3 ms
Wall time: 94.2 ms


In [40]:
loss = 'warp'
loss = 'bpr'  # nb neg samples = 1
model = RankFM(factors=20, loss=loss, max_samples=10000, alpha=0.05, beta=0.1, learning_rate=0.1, learning_schedule='constant')

1. create dataframe list list of destinations for each member. Do this for training and validation

In [46]:
%%time
newlib.run_model(model, interaction_dct, nb_epochs=100, topN=5, with_attrib=True)
#topN=5, nb_epochs=500, hr=0.16 (filtered), hr=0.64 (not filtered), with_attrib=True`
# better results without attributes (age). Perhaps must improve on subdivision. Start experiments. 
# Add gender and country of origin. 
# Experiment with training/validation split. Two different types: 
#  1) Keeping training fixed, vary validation (should not change much)
#  2) Keep training and validation the same, vary initial offset. (set test_error to zero)
# Test different age splits
# Put Read data, train_valid, run_model into a single method. 
# Create dictionary for each run with all the parameters (train_perc, valid_perc, temporal, use_attrib, learning_schedule, 
# learning_rate, alpha, beta, loss, factors, run_nb if repeated for mean/variance)
# No temporal (lose 3% on both filtered and non-filtered)
# I might have to run multiple times to estimate variance. Say, run 10 times.

# Compare against the neural net version of fm, which can run adamW. See if I get the same results. 
# Interface this program with the otherone that uses a DataLoader

hr (previous filtered):  0.36060784983995775
hr (previous not filtered):  0.6895180086391746
CPU times: user 8.03 s, sys: 49.2 ms, total: 8.08 s
Wall time: 8.01 s


In [47]:
%%time
# no filter: 63% accurate. That makes sense. 
# with filter of hits in training set: 11% (same results as those of rankfm)
# Same answers when repeated multiple times
result = newlib.recommender(model, interaction_dct, keep_nb_members=None, topN=5)
# topN=5, nb_epochs=500, hr=0.11 (filtered), hr=0.6 (not filtered), with_attrib=True

hit rate (without previous filter) =  0.5982003026039869
hit rate (with previous filter) =  0.36060784983995775
CPU times: user 16.3 s, sys: 201 ms, total: 16.5 s
Wall time: 16.4 s


In [None]:
%%time
# no filter: 63% accurate. That makes sense. 
# with filter of hits in training set: 11% (same results as those of rankfm)
result = newlib.recommender(model, interaction_dct, keep_nb_members=None, topN=5)

In [296]:
class Rankfm:
    """
    Collect useful routines, and create a dictionary with parameter for every run
    Keys for dct: otpN, keep_nb_members, age_cuts, 
    offset_perc (?), train_perc, valid_perc, temporal
    factors, loss, alpha, beta, learning_rate, learning_schedule
    """
    def __init__(self, dct, infile):
        # dct: dictionary of model parameters and results
        dct['train_offset'] = 0.0
        dct['train_perc'] = 0.3
        dct['valid_perc'] = 0.3
        dct['nb_epochs'] = 100
        dct['with_attrib'] = True
        dct['factors'] = 20   # embedding size
        dct['loss'] = 'bpr'
        dct['max_samples'] = 300 # not used
        dct['alpha'] = 0.05
        dct['beta'] = 0.1
        dct['learning_rate'] = 0.1
        dct['learning_schedule'] = 'constant'
        # if None, keep all members
        dct['keep_nb_members'] = None
        dct['temporal'] = True
        dct['age_cuts'] = [0,30,60,70,150]
        dct['topN'] = 5
        
        self.df = pd.DataFrame()

        self.dct = newlib.read_data_attributes_single_file("activity_reduced_with_attributes.csv",
                        age_cuts=dct['age_cuts'], dct = dct, overwrite_cache = False)
    
    def update_storage(self):
        # Append dictionary into the class dataframe
        dct_copy = self.dct.copy()
        dct_copy.pop('df_members')
        dct_copy.pop('df_user_attr')
        dct_copy.pop('df_item_attr')
        dct_copy['age_cuts'] = str(dct_copy['age_cuts'])
                    
        self.df = pd.concat([self.df, pd.DataFrame(dct_copy, index=[0])])
        
    def save(self, out_file):
        self.df.to_csv(out_file, index=0)
    
    def train_valid(self):
        dct = self.dct
        newlib.train_valid_dct(dct, train_perc=dct['train_perc'], 
                                    valid_perc=dct['valid_perc'], 
                                    temporal=dct['temporal'])
    
    def update_dict(self, dct):
        for k,v in dct.items():
            self.dct[k] = v
            
    def create_model(self):
        dct = self.dct
        dct['model'] = RankFM(factors=dct['factors'], loss=dct['loss'], 
                max_samples=dct['max_samples'], alpha=dct['alpha'], beta=dct['beta'], 
                learning_rate=dct['learning_rate'], 
                learning_schedule=dct['learning_schedule'])
        
    def run_model(self):
        dct = self.dct
        hr_notfiltered, hr_filtered = newlib.run_model(dct['model'], dct, nb_epochs=dct['nb_epochs'], 
                 topN=dct['topN'], with_attrib=dct['with_attrib'])
        self.dct['hr_run_notfiltered'] = hr_notfiltered
        self.dct['hr_run_filtered'] = hr_filtered
        print("run_model, dct= ", dct)
    
    def recommender(self):
        dct = self.dct
        pairs, hr_notfiltered, hr_filtered = newlib.recommender(dct['model'], dct, keep_nb_members=dct['keep_nb_members'], topN=dct['topN'])
        self.dct['hr_rec_notfiltered'] = hr_notfiltered
        self.dct['hr_rec_filtered'] = hr_filtered
        print("recommend, dct= ", dct)
    

In [302]:
rankfm = Rankfm(dct={}, infile='activity_reduced_with_attributes.csv')
rankfm.train_valid()
rankfm.create_model()

(714086, 2)


In [301]:
rankfm.dct.keys()

dict_keys(['train_offset', 'train_perc', 'valid_perc', 'nb_epochs', 'with_attrib', 'factors', 'loss', 'max_samples', 'alpha', 'beta', 'learning_rate', 'learning_schedule', 'keep_nb_members', 'temporal', 'age_cuts', 'topN', 'df_members', 'df_user_attr', 'df_item_attr'])

In [284]:
rankfm.update_storage()

In [285]:
rankfm.save("gordon.csv")

In [253]:
eval(rankfm.df.age_cuts.values[0])

[0, 30, 60, 70, 150]

In [144]:
len(rankfm.dct.keys()), len(set(rankfm.dct.keys()))

(19, 19)

In [303]:
rankfm.create_model()
rankfm.run_model()

hr (previous filtered):  0.3534019975031211
hr (previous not filtered):  0.6948501872659176
run_model, dct=  {'train_offset': 0.0, 'train_perc': 0.3, 'valid_perc': 0.3, 'nb_epochs': 100, 'with_attrib': True, 'factors': 20, 'loss': 'bpr', 'max_samples': 300, 'alpha': 0.05, 'beta': 0.1, 'learning_rate': 0.1, 'learning_schedule': 'constant', 'keep_nb_members': None, 'temporal': True, 'age_cuts': [0, 30, 60, 70, 150], 'topN': 5, 'df_members':         MEMBER_ID    D
184610  230144517  LIM
376198  230153282  HAV
556112  231307987  PTY
539512  230098494  GUA
60815   231183678  PTY
...           ...  ...
473973  232648465  SNU
705784  247791775  CUN
150110  230149314  SAL
683002  234902782  PTY
400597  231333036  MDE

[714086 rows x 2 columns], 'df_user_attr':         MEMBER_ID  age_dep_(0, 30]  age_dep_(30, 60]  age_dep_(60, 70]  \
184610  230144517                0                 1                 0   
376198  230153282                0                 1                 0   
556112  2313079

In [307]:
rankfm.dct['hr_run_filtered']

0.3534019975031211

In [288]:
rankfm.update_storage()
rankfm.save("gordon1.csv")

ValueError: Buffer has wrong number of dimensions (expected 1, got 2)

In [48]:
rankfm.run_model()
rankfm.recommender()

hr (previous filtered):  0.3554201889607161
hr (previous not filtered):  0.6575397812033814
hit rate (without previous filter) =  0.46593932943054817
hit rate (with previous filter) =  0.3057869219293884


In [16]:
loss = 'warp'
loss = 'bpr'  # nb neg samples = 1
model = (RankFMfactors=20, loss=loss, max_samples=10000, alpha=0.05, beta=0.1, learning_rate=0.1, learning_schedule='constant')

In [33]:
%%time
rankfm.recommender()

hit rate (without previous filter) =  0.5381852048962213
hit rate (with previous filter) =  0.31147439085032325
CPU times: user 16.5 s, sys: 216 ms, total: 16.7 s
Wall time: 16.6 s


In [32]:
%%time
rankfm.run_model()

hr (previous filtered):  0.35980233714569865
hr (previous not filtered):  0.69039035305818
CPU times: user 8 s, sys: 15.1 ms, total: 8.02 s
Wall time: 8 s


In [44]:
rankfm.dct.keys()

dict_keys(['train_offset', 'train_perc', 'valid_perc', 'nb_epochs', 'with_attrib', 'factors', 'loss', 'max_samples', 'alpha', 'beta', 'learning_rate', 'learning_schedule', 'keep_nb_members', 'temporal', 'age_cuts', 'topN', 'df_members', 'df_user_attr', 'df_item_attr', 'data_train', 'data_valid', 'data_test', 'model', 'train_dest_sets', 'valid_dest_sets'])

In [45]:
rankfm.dct

{'train_offset': 0.0,
 'train_perc': 0.3,
 'valid_perc': 0.3,
 'nb_epochs': 100,
 'with_attrib': True,
 'factors': 20,
 'loss': 'bpr',
 'max_samples': 300,
 'alpha': 0.05,
 'beta': 0.1,
 'learning_rate': 0.1,
 'learning_schedule': 'constant',
 'keep_nb_members': None,
 'temporal': True,
 'age_cuts': [0, 30, 60, 70, 150],
 'topN': 5,
 'df_members':         MEMBER_ID    D
 184610  230144517  LIM
 376198  230153282  HAV
 556112  231307987  PTY
 539512  230098494  GUA
 60815   231183678  PTY
 ...           ...  ...
 473973  232648465  SNU
 705784  247791775  CUN
 150110  230149314  SAL
 683002  234902782  PTY
 400597  231333036  MDE
 
 [714086 rows x 2 columns],
 'df_user_attr':         MEMBER_ID  age_dep_(0, 30]  age_dep_(30, 60]  age_dep_(60, 70]  \
 184610  230144517                0                 1                 0   
 376198  230153282                0                 1                 0   
 556112  231307987                0                 1                 0   
 539512  23009849