# Kiva collaborative filtering
## Polara framework

***
### Imports

In [None]:
# essentials
import sys
import csv
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc
from scipy.sparse import csr_matrix, lil_matrix

# polara framework
from polara.recommender.data import RecommenderData
from polara.recommender.models import SVDModel, PopularityModel, RandomModel
from polara.recommender.external.implicit.ialswrapper import ImplicitALS
from polara.datasets.movielens import get_movielens_data
from polara.evaluation import evaluation_engine as ee

# utilities
import codecs
import logging
import time
import tqdm

# visualization libraries
import seaborn as sns
import matplotlib.pyplot as plt

# serialization
import pickle

In [None]:
# Let pandas display up to 40 columns before truncating a DataFrame.
pd.set_option('display.max_columns', 40)

In [None]:
# IPython magic: set the MKL_NUM_THREADS environment variable to 1.
# NOTE(review): presumably this stops MKL from oversubscribing cores while
# the ALS solver runs its own threads — confirm.
%env MKL_NUM_THREADS=1
# Configure the root logger to emit records at DEBUG level and above.
logging.basicConfig(level=logging.DEBUG)

***
### Pickle loads

In [None]:
# Restore the cached loan tables and the set of funded loan ids.
# Each file is opened in a `with` block so the handle is closed right after
# loading instead of being left to the garbage collector.
# NOTE: pickle can execute arbitrary code while loading — only load files
# that were produced by this notebook.
with open("pickle/loans_table.p", "rb") as pickle_file:
    loans_table = pickle.load(pickle_file)
with open("pickle/funded_loans_table.p", "rb") as pickle_file:
    funded_loans_table = pickle.load(pickle_file)
with open("pickle/funded_loan_ids_set.p", "rb") as pickle_file:
    funded_loan_ids_set = pickle.load(pickle_file)

In [None]:
# Restore the cached utility matrix; the `with` block closes the file handle
# as soon as loading finishes.
# NOTE: pickle can execute arbitrary code while loading — only load files
# that were produced by this notebook.
with open("pickle/utility_matrix.p", "rb") as pickle_file:
    utility_matrix = pickle.load(pickle_file)

In [None]:
# Restore the cached DataFrame form of the utility matrix; the `with` block
# closes the file handle as soon as loading finishes.
# NOTE: pickle can execute arbitrary code while loading — only load files
# that were produced by this notebook.
with open("pickle/utility_matrix_df.p", "rb") as pickle_file:
    utility_matrix_df = pickle.load(pickle_file)

In [None]:
# Preview the first rows of the restored frame.
utility_matrix_df.head()

***
### Dataset reading

In [None]:
# Read the loans snapshot and order its rows by the 'raised_time' column.
loans_table = (
    pd.read_csv('additional-kiva-snapshot/loans.csv')
    .sort_values(by='raised_time')
)

In [None]:
# Keep only the rows whose 'status' column equals 'funded'.
funded_loans_table = loans_table.loc[loans_table['status'] == 'funded']

In [None]:
# Half-open window on 'raised_time': START_DATE is excluded, END_DATE included.
START_DATE = '2013-10-01'
END_DATE = '2015-05-01'

mask = funded_loans_table['raised_time'] > START_DATE
mask &= funded_loans_table['raised_time'] <= END_DATE
funded_loans_table = funded_loans_table[mask]
del mask  # the boolean mask is as long as the table; drop it right away

# Ids of the loans that survived the filter, for fast membership tests.
funded_loan_ids_set = set(funded_loans_table['loan_id'])

In [None]:
# pickle.dump(loans_table, open("pickle/loans_table.p", "wb"))
# pickle.dump(funded_loans_table, open("pickle/funded_loans_table.p", "wb"))
# pickle.dump(funded_loan_ids_set, open("pickle/funded_loan_ids_set.p", "wb"))

In [None]:
# Release the two large loan frames to free RAM.
del loans_table, funded_loans_table

In [None]:
# Build the loan -> lenders mapping for the loans in funded_loan_ids_set.
loans_lenders_dict = {}
lenders = set()

with open('additional-kiva-snapshot/loans_lenders.csv', newline='', encoding="utf8") as csvfile:
    csv_reader = csv.reader(csvfile)
    next(csv_reader, None)  # skip the header row; harmless on an empty file
    for loan_id, lender_ids in csv_reader:
        loan_id = int(loan_id)
        if loan_id not in funded_loan_ids_set:
            continue

        # "".split(", ") yields [''], so empty tokens are dropped to avoid
        # registering a phantom lender with an empty id.
        new_lenders = {lender for lender in lender_ids.split(", ") if lender}
        loans_lenders_dict[loan_id] = new_lenders
        lenders.update(new_lenders)

# Sorted lists give every loan and lender a positional index that is the
# same in every interpreter session (set iteration order of strings changes
# with hash randomisation), so matrices built from them are reproducible.
loans = sorted(loans_lenders_dict)
lenders = sorted(lenders)

print('Loans-lenders dict filled')

***
### Utility matrix creation

In [None]:
# Map each lender id to its position in `lenders` (its matrix column).
lenders_reverse_index = dict(zip(lenders, range(len(lenders))))
# Empty loans x lenders matrix in LIL format.
utility_matrix = lil_matrix(
    (len(loans), len(lenders)),
    dtype=np.float64,
)

In [None]:
# Store 1.0 for every (loan, lender) pair found in loans_lenders_dict.
for row, loan_id in enumerate(loans):
    for lender_id in loans_lenders_dict[loan_id]:
        column = lenders_reverse_index[lender_id]
        utility_matrix[row, column] = 1.0

print('Filled utility matrix')

In [None]:
# Convert the filled matrix to CSR format.
utility_matrix = utility_matrix.tocsr()

In [None]:
# pickle.dump(utility_matrix, open("pickle/utility_matrix.p", "wb"))

#### Conversion to SparseDataFrame

In [None]:
# pd.SparseDataFrame was removed in pandas 1.0. Use the DataFrame.sparse
# accessor when this pandas provides it, and fall back to the legacy class
# only on older releases that lack the accessor.
# NOTE(review): the two paths may differ in the fill value used for missing
# entries (0 vs NaN) — confirm if downstream code depends on it.
if hasattr(pd.DataFrame, 'sparse'):
    utility_matrix_df = pd.DataFrame.sparse.from_spmatrix(utility_matrix)
else:
    utility_matrix_df = pd.SparseDataFrame(utility_matrix)

In [None]:
# pickle.dump(utility_matrix_df, open("pickle/utility_matrix_df.p", "wb"))

In [None]:
def write_matrix_to_csv(matrix, filename):
    """Write the stored entries of a sparse matrix to a CSV file.

    The file starts with the header ``itemid,userid,feedback`` followed by
    one ``row,column,value`` line per stored entry, so matrix rows are
    written as item ids and matrix columns as user ids.

    Args:
        matrix: A SciPy sparse matrix; anything that is not already a
            ``coo_matrix`` is converted with ``tocoo()``.
        filename: Path of the CSV file to write (opened in ``'w'`` mode).
    """
    # Imported locally: the file only imports csr_matrix and lil_matrix from
    # scipy.sparse, so the bare name `scipy` is not defined at module level.
    from scipy.sparse import coo_matrix

    if not isinstance(matrix, coo_matrix):
        matrix = matrix.tocoo()

    with open(filename, 'w', encoding='utf-8') as f:
        f.write("itemid,userid,feedback\n")
        f.writelines(
            "%s,%s,%s\n" % entry
            for entry in zip(matrix.row, matrix.col, matrix.data)
        )

In [None]:
# Load the interaction table from CSV using pandas' C parser.
# NOTE(review): presumably this is the file produced by write_matrix_to_csv
# (columns itemid, userid, feedback) — confirm.
kiva_dataframe = pd.read_csv('kiva_dataframe.csv', engine='c')

In [None]:
# Display the (rows, columns) dimensions of the loaded table.
kiva_dataframe.shape

***
### Polara example

In [None]:
# Evaluation settings.
metrics = ['ranking', 'relevance']  # metric groups: nDCG, precision, recall, etc.
folds = list(range(1, 6))  # use all 5 folds for cross-validation (default)
topk_values = [1, 5, 10, 20, 50]  # values of k to experiment with

# Data: MovieLens ratings loaded through polara's dataset helper.
ml_data = get_movielens_data(get_genres=False)
data_model = RecommenderData(ml_data, 'userid', 'movieid', 'rating')

# data_model = RecommenderData(kiva_dataframe, 'userid', 'itemid', 'feedback')

# Models to compare: SVD against the popularity and random baselines.
svd = SVDModel(data_model)
popular = PopularityModel(data_model)
random = RandomModel(data_model)
models = [svd, popular, random]

# Run the cross-validation experiment over all folds and all values of k.
result = ee.run_cv_experiment(
    models,
    folds,
    metrics,
    fold_experiment=ee.topk_test,
    topk_list=topk_values,
)

In [None]:
# Average every metric across folds for each (top-n, model) pair.
# groupby(level=...) replaces DataFrame.mean(level=...), which pandas
# deprecated in 1.3 and removed in 2.0; sort=False keeps the groups in their
# original order, as mean(level=...) did.
# Use .std() instead of .mean() for the standard deviation.
scores = result.groupby(level=['top-n', 'model'], sort=False).mean()
# Show the nDCG (ranking) metric with one column per model.
scores.xs('nDCG', level='metric', axis=1).unstack('model')

***
### Cross-validation

In [41]:
# Wrap the Kiva interactions in a polara data model; seed=0 is passed to the
# constructor (presumably to make the data split reproducible — confirm).
data_model = RecommenderData(kiva_dataframe, 'userid', 'itemid', 'feedback', seed=0)

# Time the preparation step; the warm_start assignment falls inside the
# timed region.
start = time.time()
# NOTE(review): warm_start semantics are defined by polara — confirm what
# disabling it implies for the holdout users.
data_model.warm_start = False

data_model.prepare()
logging.debug("Prepared data in %0.2fs", time.time() - start)

Preparing data...
207 unique itemid's within 228 holdout interactions were filtered. Reason: not in the training data.
85452 unique userid's within 136096 holdout interactions were filtered. Reason: not in the training data.
114 of 44225 userid's were filtered out from holdout. Reason: incompatible number of items.
Done.
There are 5070574 events in the training and 132333 events in the holdout.


DEBUG:root:Prepared data in 136.31s


In [42]:
# Implicit-feedback ALS model with alpha = 1.
# NOTE(review): the meaning of each hyper-parameter below is defined by
# polara's ImplicitALS wrapper — confirm against its documentation.
als_default = ImplicitALS(data_model) # create model
als_default.rank = 10
als_default.alpha = 1
als_default.epsilon = 1
als_default.weight_func = np.log2
als_default.regularization = 0.1
# NOTE(review): 0 presumably means "use all available cores" — confirm.
als_default.num_threads = 0
# NOTE(review): the recorded output below reports 15 ALS iterations, which
# does not match this value; the output may come from an earlier run.
als_default.num_epochs = 30

# Standalone fit/evaluation is disabled; the model is built by the
# experiment cell that runs ee.run_cv_experiment.
# als_default.build() # fit model

# start = time.time()
# als_default.evaluate() # by default it calculates the total number of hits
# logging.debug("Evaluated model ALS in %0.2fs", time.time() - start)

DEBUG:implicit:Converting input to CSR format
DEBUG:implicit:Converted input to CSR in 0.169s
DEBUG:implicit:Calculated transpose in 0.132s
DEBUG:implicit:Initialized factors in 0.43894195556640625
DEBUG:implicit:Running 15 ALS iterations
100%|████████████████████████████████████████████████████████████████████████████████| 15.0/15 [00:02<00:00,  7.23it/s]


iALS training time: 4.291s


DEBUG:root:Evaluated model ALS in 159.35s


In [43]:
# Second implicit-feedback ALS model; it differs from als_default only in
# alpha.
# NOTE(review): the variable name says "forty" but alpha is set to 100 —
# confirm which of the two is intended.
als_alpha_forty = ImplicitALS(data_model) # create model
als_alpha_forty.rank = 10
als_alpha_forty.alpha = 100
als_alpha_forty.epsilon = 1
als_alpha_forty.weight_func = np.log2
als_alpha_forty.regularization = 0.1
# NOTE(review): 0 presumably means "use all available cores" — confirm.
als_alpha_forty.num_threads = 0
# NOTE(review): the recorded output below reports 15 ALS iterations, which
# does not match this value; the output may come from an earlier run.
als_alpha_forty.num_epochs = 30

# Standalone fit/evaluation is disabled; the model is built by the
# experiment cell that runs ee.run_cv_experiment.
# als_alpha_forty.build() # fit model

# start = time.time()
# als_alpha_forty.evaluate() # by default it calculates the total number of hits
# logging.debug("Evaluated model ALS in %0.2fs", time.time() - start)

DEBUG:implicit:Converting input to CSR format
DEBUG:implicit:Converted input to CSR in 0.163s
DEBUG:implicit:Calculated transpose in 0.132s
DEBUG:implicit:Initialized factors in 0.3855011463165283
DEBUG:implicit:Running 15 ALS iterations
100%|████████████████████████████████████████████████████████████████████████████████| 15.0/15 [00:02<00:00,  7.23it/s]


iALS training time: 4.187s


DEBUG:root:Evaluated model ALS in 221.60s


In [48]:
# Baselines on the Kiva data model: most-popular and random recommenders.
popular, random = PopularityModel(data_model), RandomModel(data_model)

In [49]:
# NOTE(review): both ALS models appear under the same 'iALS' label in the
# recorded results table, so their rows cannot be told apart by name —
# consider giving each model a distinct label before running.
models = [als_default, popular, random, als_alpha_forty]

metrics = ['ranking', 'relevance'] # metric groups for evaluation: nDCG, precision, recall, etc.
folds = [1] # only the first fold is evaluated here, not all 5
topk_values = [10, 20, 30] # values of k to experiment with

# run the experiment on the selected fold(s)
result = ee.run_cv_experiment(models, folds, metrics,
                              fold_experiment=ee.topk_test,
                              topk_list=topk_values)

DEBUG:implicit:Converting input to CSR format
DEBUG:implicit:Converted input to CSR in 0.187s
DEBUG:implicit:Calculated transpose in 0.172s
DEBUG:implicit:Initialized factors in 0.4842336177825928
DEBUG:implicit:Running 15 ALS iterations
100%|████████████████████████████████████████████████████████████████████████████████| 15.0/15 [00:02<00:00,  7.39it/s]


iALS training time: 4.151s


DEBUG:implicit:Converting input to CSR format
DEBUG:implicit:Converted input to CSR in 0.219s
DEBUG:implicit:Calculated transpose in 0.141s
DEBUG:implicit:Initialized factors in 0.39055919647216797
DEBUG:implicit:Running 15 ALS iterations
100%|████████████████████████████████████████████████████████████████████████████████| 15.0/15 [00:02<00:00,  7.49it/s]


iALS training time: 3.026s


In [50]:
# Average every metric across folds for each (top-n, model) pair.
# groupby(level=...) replaces DataFrame.mean(level=...), which pandas
# deprecated in 1.3 and removed in 2.0; sort=False keeps the groups in their
# original order, as mean(level=...) did.
# Use .std() instead of .mean() for the standard deviation.
scores = result.groupby(level=['top-n', 'model'], sort=False).mean()
# scores.xs('recall', level='metric', axis=1).unstack('model')
scores

Unnamed: 0_level_0,type,relevance,relevance,relevance,ranking
Unnamed: 0_level_1,metric,precision,recall,miss_rate,nDCG
top-n,model,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
10,iALS,9e-06,3e-05,0.99997,2e-05
10,MP,0.000834,0.002779,0.997221,0.002047
10,RND,1.8e-05,6e-05,0.99994,3.5e-05
10,iALS,9e-06,3e-05,0.99997,2.1e-05
20,iALS,1.2e-05,8.3e-05,0.999917,3.8e-05
20,MP,0.000896,0.005974,0.994026,0.003209
20,RND,1.5e-05,9.8e-05,0.999902,4.9e-05
20,iALS,9e-06,6e-05,0.99994,3.2e-05
30,iALS,1.2e-05,0.000121,0.999879,5e-05
30,MP,0.000751,0.007507,0.992493,0.00368


In [None]:
# Load the MovieLens data through polara's helper (get_genres=False) and
# display it.
ml_data = get_movielens_data(get_genres=False)
ml_data

***
### Variable sizes

In [None]:
import sys
def sizeof_fmt(num, suffix='B'):
    """Format a size as a human-readable string with binary (1024) prefixes.

    By Fred Cirera, after https://stackoverflow.com/a/1094933/1870254

    The value is divided by 1024 until its magnitude drops below 1024 and is
    then rendered with one decimal place, the matching prefix and `suffix`.
    Values too large for the 'Zi' prefix are reported in 'Yi'.
    """
    prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    value = num
    position = 0
    while position < len(prefixes):
        if abs(value) < 1024.0:
            return "{:3.1f}{}{}".format(value, prefixes[position], suffix)
        value /= 1024.0
        position += 1
    return "{:.1f}{}{}".format(value, 'Yi', suffix)

# Print the ten largest objects in the current namespace, biggest first.
# sys.getsizeof is shallow: containers are measured without their contents.
for name, size in sorted(
        ((name, sys.getsizeof(value)) for name, value in locals().items()),
        key=lambda entry: entry[1],
        reverse=True)[:10]:
    print("{:>30}: {:>8}".format(name, sizeof_fmt(size)))