# Kiva collaborative filtering
## Polara framework

***
### Imports

In [21]:
# essentials
import os
import sys
import csv
import itertools
import copy
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc
from scipy.sparse import csr_matrix, lil_matrix, coo_matrix

# polara framework
from polara.recommender.data import RecommenderData
from polara.recommender.models import SVDModel, PopularityModel, RandomModel, CooccurrenceModel
from polara.recommender.external.implicit.ialswrapper import ImplicitALS
from polara.recommender.external.implicit.ibprwrapper import ImplicitBPR
from polara.datasets.movielens import get_movielens_data
from polara.evaluation import evaluation_engine as ee
from polara.evaluation.plotting import show_hit_rates, show_precision_recall, show_ranking, show_relevance

# utilities
import codecs
import logging
import time
import tqdm

# visualization libraries
import seaborn as sns
import matplotlib.pyplot as plt

# serialization
import pickle

In [4]:
# raise the notebook's DataFrame rendering limits: up to 40 columns and 300 rows
for option_name, limit in (('display.max_columns', 40), ('display.max_rows', 300)):
    pd.set_option(option_name, limit)

In [5]:
%env MKL_NUM_THREADS=1
# configure the root logger at DEBUG level so the logging.debug timing messages in this notebook are shown
logging.basicConfig(level=logging.DEBUG)

env: MKL_NUM_THREADS=1


In [23]:
def write_matrix_to_csv(matrix, filename):
    """Dump the stored entries of a sparse matrix to a CSV file.

    The file starts with the header ``itemid,userid,feedback`` and then holds
    one ``row,column,value`` line per stored entry of the matrix.

    Args:
        matrix: a scipy sparse matrix; anything that is not already a
            ``coo_matrix`` is converted with ``tocoo()`` first.
        filename: path of the CSV file to write (opened in ``'w'`` mode).
    """
    coo = matrix if isinstance(matrix, coo_matrix) else matrix.tocoo()
    entries = zip(coo.row, coo.col, coo.data)

    with open(filename, 'w', encoding='utf-8') as out:
        out.write("%s,%s,%s\n" % ('itemid', 'userid', 'feedback'))
        out.writelines("%s,%s,%s\n" % entry for entry in entries)

***
## Hyperparameter evaluation

### Pickle loads

In [None]:
# Reload the cached loan tables and the funded-loan id set from the pickle/ directory.
# Each file is opened in a `with` block so its handle is closed right after loading
# (the previous `pickle.load(open(...))` form left the handles open).
# NOTE(review): unpickling can execute arbitrary code -- only load cache files you created yourself.
with open("pickle/loans_table.p", "rb") as f:
    loans_table = pickle.load(f)
with open("pickle/funded_loans_table.p", "rb") as f:
    funded_loans_table = pickle.load(f)
with open("pickle/funded_loan_ids_set.p", "rb") as f:
    funded_loan_ids_set = pickle.load(f)

In [None]:
# Reload the cached utility matrix; the `with` block closes the file handle after loading.
# NOTE(review): unpickling can execute arbitrary code -- only load cache files you created yourself.
with open("pickle/utility_matrix.p", "rb") as f:
    utility_matrix = pickle.load(f)

In [4]:
# Reload the cached utility-matrix frame; the `with` block closes the file handle after loading.
# NOTE(review): unpickling can execute arbitrary code -- only load cache files you created yourself.
with open("pickle/utility_matrix_df.p", "rb") as f:
    utility_matrix_df = pickle.load(f)

In [None]:
# preview the first rows of the reloaded utility-matrix frame
utility_matrix_df.head()

***
### Dataset reading

In [4]:
# load the raw loans snapshot and order its rows by the 'raised_time' column
loans_table = pd.read_csv('additional-kiva-snapshot/loans.csv').sort_values(by='raised_time')

In [5]:
# keep only the loans whose status is exactly 'funded'
funded_loans_table = loans_table.loc[loans_table['status'] == 'funded']

In [6]:
# evaluation window: raised_time must be strictly after START_DATE and at most END_DATE
START_DATE = '2013-10-01'
END_DATE = '2015-05-01'

# the bounds are compared against the 'raised_time' column exactly as loaded from the CSV
raised_time = funded_loans_table['raised_time']
funded_loans_table = funded_loans_table.loc[(raised_time > START_DATE) & (raised_time <= END_DATE)]
del raised_time

# ids of the loans inside the window, used to filter loans_lenders.csv
funded_loan_ids_set = set(funded_loans_table['loan_id'])

In [None]:
# pickle.dump(loans_table, open("pickle/loans_table.p", "wb"))
# pickle.dump(funded_loans_table, open("pickle/funded_loans_table.p", "wb"))
# pickle.dump(funded_loan_ids_set, open("pickle/funded_loan_ids_set.p", "wb"))

In [7]:
# free ram
# drop both loan tables; only funded_loan_ids_set is needed from here on
del loans_table, funded_loans_table

In [8]:
# Collect, for every funded loan in the selected window, the set of lenders that backed it.
loans = set()
lenders = set()
loans_lenders_dict = {}

with open('additional-kiva-snapshot/loans_lenders.csv', newline='', encoding="utf8") as csvfile:
    csv_reader = csv.reader(csvfile)
    next(csv_reader, None)  # skip the header row (no-op if the file is empty)
    for row in csv_reader:
        loan_id, lender_ids = row
        loan_id = int(loan_id)
        if loan_id not in funded_loan_ids_set:
            continue

        loans.add(loan_id)
        # lender ids are stored in one field, separated by ", "
        new_lenders = set(lender_ids.split(", "))
        loans_lenders_dict[loan_id] = new_lenders
        lenders.update(new_lenders)

# Sort instead of relying on set iteration order: the order of a set of strings depends on
# per-process string hashing, so plain list(lenders) gave different column indices on
# different runs. Sorted lists make the matrix row/column indices reproducible.
loans = sorted(loans)
lenders = sorted(lenders)

print('Loans-lenders dict filled')

Loans-lenders dict filled


***
### Utility matrix creation

In [9]:
# lender id -> column position (its position in the `lenders` list)
lenders_reverse_index = dict(zip(lenders, range(len(lenders))))
# empty loans x lenders matrix in LIL format, filled entry by entry in the next cell
utility_matrix = lil_matrix((len(loans), len(lenders)), dtype=np.float64)

In [10]:
# mark every (loan, lender) pair found in loans_lenders_dict with 1.0
for row, loan_id in enumerate(loans):
    for lender_id in loans_lenders_dict[loan_id]:
        utility_matrix[row, lenders_reverse_index[lender_id]] = 1.0

print('Filled utility matrix')

Filled utility matrix


In [11]:
# convert the filled matrix to CSR format and rebind the name to the converted matrix
utility_matrix = utility_matrix.tocsr()

In [None]:
# pickle.dump(utility_matrix, open("pickle/utility_matrix.p", "wb"))

In [18]:
# number of stored interactions, followed by the size of each axis
print(utility_matrix.nnz)
for label, collection in (('lenders: ', lenders), ('loans: ', loans)):
    print(label, len(collection))

5339455
lenders:  648650
loans:  250888


In [None]:
# export the interactions as itemid,userid,feedback rows (itemid = matrix row, userid = matrix column)
write_matrix_to_csv(utility_matrix, "kiva_dataframe.csv")

In [19]:
n_rows, n_cols = utility_matrix.shape
matrix_size = n_rows * n_cols  # number of possible (loan, lender) cells
num_interactions = utility_matrix.nnz  # number of stored (non-zero) cells
# share of empty cells, as a percentage
sparsity = 100 * (1 - (num_interactions / matrix_size))
print('Sparsitiy: %f %%' % sparsity)

Sparsitiy: 99.996719 %


In [17]:
# print the sparse matrix; the captured output lists (row, column) value entries
print(utility_matrix)

  (0, 6640)	1.0
  (0, 10202)	1.0
  (0, 28107)	1.0
  (0, 28187)	1.0
  (0, 35318)	1.0
  (0, 40001)	1.0
  (0, 50325)	1.0
  (0, 57890)	1.0
  (0, 64050)	1.0
  (0, 68006)	1.0
  (0, 69985)	1.0
  (0, 74717)	1.0
  (0, 75169)	1.0
  (0, 75666)	1.0
  (0, 80041)	1.0
  (0, 82347)	1.0
  (0, 83544)	1.0
  (0, 85627)	1.0
  (0, 99992)	1.0
  (0, 100921)	1.0
  (0, 104778)	1.0
  (0, 112377)	1.0
  (0, 113820)	1.0
  (0, 116622)	1.0
  (0, 132796)	1.0
  :	:
  (250887, 528884)	1.0
  (250887, 530003)	1.0
  (250887, 537902)	1.0
  (250887, 539431)	1.0
  (250887, 539635)	1.0
  (250887, 547342)	1.0
  (250887, 553134)	1.0
  (250887, 557078)	1.0
  (250887, 575728)	1.0
  (250887, 578309)	1.0
  (250887, 579099)	1.0
  (250887, 591684)	1.0
  (250887, 593250)	1.0
  (250887, 594308)	1.0
  (250887, 595907)	1.0
  (250887, 599768)	1.0
  (250887, 603712)	1.0
  (250887, 608322)	1.0
  (250887, 612239)	1.0
  (250887, 613717)	1.0
  (250887, 619613)	1.0
  (250887, 634168)	1.0
  (250887, 638143)	1.0
  (250887, 643444)	1.0
  (250887, 6

#### Conversion to SparseDataFrame

In [None]:
# utility_matrix_df = pd.SparseDataFrame(utility_matrix)

In [None]:
# pickle.dump(utility_matrix_df, open("pickle/utility_matrix_df.p", "wb"))

#### Load Kiva dataframe

In [4]:
# reload the interaction rows exported by write_matrix_to_csv (columns: itemid, userid, feedback)
kiva_dataframe = pd.read_csv('kiva_dataframe.csv', engine='c')

In [5]:
# (number of interaction rows, number of columns)
kiva_dataframe.shape

(5339455, 3)

In [6]:
# preview the first interaction rows
kiva_dataframe.head()

Unnamed: 0,itemid,userid,feedback
0,0,7703,1.0
1,0,8354,1.0
2,0,9000,1.0
3,0,10247,1.0
4,0,21284,1.0


***
### Polara example

In [None]:
# define models
ml_data = get_movielens_data(get_genres=False)
data_model = RecommenderData(ml_data, 'userid', 'movieid', 'rating')

# data_model = RecommenderData(kiva_dataframe, 'userid', 'itemid', 'feedback')

svd = SVDModel(data_model)
popular = PopularityModel(data_model)
random = RandomModel(data_model)
models = [svd, popular, random]

metrics = ['ranking', 'relevance'] # metrics for evaluation: NDGC, Precision, Recall, etc.
folds = [1, 2, 3, 4, 5] # use all 5 folds for cross-validation (default)
topk_values = [1, 5, 10, 20, 50] # values of k to experiment with

# run 5-fold CV experiment
result = ee.run_cv_experiment(models, folds, metrics,
                              fold_experiment=ee.topk_test,
                              topk_list=topk_values)

In [None]:
# calculate average values across all folds for e.g. relevance metrics
# Average every metric across folds for each ('top-n', 'model') pair.
# DataFrame.mean(level=...) was deprecated in pandas 1.3 and removed in 2.0; grouping on the
# index levels is the equivalent that works on old and new pandas. sort=False keeps the
# groups in order of first appearance, as the level-based mean did.
# Use .std() instead of .mean() for the standard deviation.
scores = result.groupby(level=['top-n', 'model'], sort=False).mean()
# scores.xs('nDCG', level='metric', axis=1).unstack('model')

In [None]:
# display the fold-averaged scores
scores

***
### Cross-validation

In [7]:
# Wrap the Kiva interactions for polara using the 'userid', 'itemid' and 'feedback' columns.
# NOTE(review): seed=0 presumably fixes the train/holdout split for reproducibility -- confirm in the polara docs.
data_model = RecommenderData(kiva_dataframe, 'userid', 'itemid', 'feedback', seed=0)
# NOTE(review): warm_start is a polara setting; its exact effect on the split is not visible here -- confirm.
data_model.warm_start = False

# time the data preparation step and report the duration through the root logger
start = time.time()
data_model.prepare()
logging.debug("Prepared data in %0.2fs", time.time() - start)

Preparing data...
207 unique itemid's within 228 holdout interactions were filtered. Reason: not in the training data.
85452 unique userid's within 136096 holdout interactions were filtered. Reason: not in the training data.
114 of 44225 userid's were filtered out from holdout. Reason: incompatible number of items.
Done.
There are 5070574 events in the training and 132333 events in the holdout.


DEBUG:root:Prepared data in 152.57s


In [8]:
# baseline models sharing the prepared data model; they are the fixed (non-grid) part of the comparison
random = RandomModel(data_model)
popular = PopularityModel(data_model)
svd = SVDModel(data_model)

# bpr = ImplicitBPR(data_model)
# bpr.rank = 200
# bpr.num_epochs = 100
# bpr.use_gpu = False

Generate a list of models (ALS or BPR) from a parameter grid:

In [9]:
def get_base_model(model_name):
    """Create an untuned implicit-feedback model on the global ``data_model``.

    Args:
        model_name: ``'bpr'`` selects ``ImplicitBPR``; any other value
            selects ``ImplicitALS``.

    Returns:
        The new model with ``use_gpu`` set to False. ALS models additionally
        get ``epsilon = 1e-8`` and ``weight_func = np.log2``.
    """
    if model_name != 'bpr':
        model = ImplicitALS(data_model)
        model.epsilon = 1e-8
        model.weight_func = np.log2
    else:
        model = ImplicitBPR(data_model)

    # applied to both model types
    model.use_gpu = False
    return model

def get_grid_models(cv_param_grid, model_name):
    """Build one model per combination of the values in a parameter grid.

    Every combination gets a fresh model from ``get_base_model``; each
    parameter is set on it with ``setattr`` and appended to the model's
    ``method`` name as ``_<param>-<value>`` so results can be told apart.

    Args:
        cv_param_grid: dict mapping attribute name -> list of candidate values.
            An empty grid yields a single base model with its default
            settings (previously this raised ``ValueError``).
        model_name: forwarded to ``get_base_model`` (``'bpr'`` or anything
            else for ALS).

    Returns:
        List of configured models, in ``itertools.product`` order of the grid.
    """
    param_names = list(cv_param_grid)
    models = []
    for combination in itertools.product(*cv_param_grid.values()):
        next_model = get_base_model(model_name)
        name = next_model.method
        # distinct names here: the old inner loop reused `v` and shadowed the outer loop variable
        for param_name, param_value in zip(param_names, combination):
            setattr(next_model, param_name, param_value)
            name += '_%s-%s' % (param_name, param_value)

        next_model.method = name
        models.append(next_model)
    return models

Cross-validation setup:

In [17]:
# parameter grid consumed by get_grid_models: attribute name -> candidate values
# (one value per parameter here, so the grid expands to a single configuration)
cv_param_grid = dict(
    rank=[100],
    regularization=[0.01],
    num_epochs=[60],
    alpha=[100],
)

In [18]:
# baselines that are evaluated as-is (no hyperparameter grid)
basic_models = [random, popular, svd]

# one ALS model per combination in cv_param_grid, evaluated together with the baselines
als_models = get_grid_models(cv_param_grid, 'als')
models = basic_models + als_models

# alternative: evaluate BPR models built from the same grid instead
# bpr_models = get_grid_models(cv_param_grid, 'bpr')
# models = basic_models + bpr_models

metrics = ['ranking', 'relevance'] # metric groups for evaluation: nDCG, precision, recall, etc.
folds = [1, 2, 3]
topk_values = [5, 10, 20] # values of k to experiment with

In [19]:
# wall-clock start, used for the duration logged at the end of the cell
start = time.time()

# run 3-fold CV experiment over `models`, evaluating with ee.topk_test for each k in topk_values
result = ee.run_cv_experiment(models, folds, metrics,
                              fold_experiment=ee.topk_test,
                              topk_list=topk_values)


logging.debug("Cross-validation experiment finished in %0.2fs", time.time() - start)

Preparing data...
211 unique itemid's within 234 holdout interactions were filtered. Reason: not in the training data.
85443 unique userid's within 136172 holdout interactions were filtered. Reason: not in the training data.
100 of 44237 userid's were filtered out from holdout. Reason: incompatible number of items.
Done.
There are 5070446 events in the training and 132411 events in the holdout.
PureSVD training time: 3.694s


DEBUG:implicit:Converting input to CSR format
DEBUG:implicit:Converted input to CSR in 0.203s
DEBUG:implicit:Calculated transpose in 0.156s
DEBUG:implicit:Initialized factors in 1.345975637435913
DEBUG:implicit:Running 60 ALS iterations
100%|████████████████████████████████████████████████████████████████████████████████| 60.0/60 [02:51<00:00,  2.77s/it]


iALS_rank-100_regularization-0.01_num_epochs-60_alpha-100 training time: 02m:53s
Evaluated model RND in 282.35s
Evaluated model MP in 122.57s


KeyboardInterrupt: 

In [20]:
# persist the evaluation results; the `with` block guarantees the file is flushed and closed
# (the previous `pickle.dump(result, open(...))` form never closed the handle explicitly)
with open("eval_results/result_7.p", "wb") as f:
    pickle.dump(result, f)

In [21]:
# display the per-fold results table (indexed by fold, top-n and model)
result

Unnamed: 0_level_0,Unnamed: 1_level_0,type,relevance,relevance,relevance,relevance,relevance,ranking,ranking
Unnamed: 0_level_1,Unnamed: 1_level_1,metric,precision,recall,fallout,specifity,miss_rate,nDCG,nDCL
fold,top-n,model,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
1,5,RND,1.4e-05,2.3e-05,,,0.999977,2.7e-05,
1,5,MP,0.001115,0.001858,,,0.998142,0.001654,
1,5,PureSVD,0.001355,0.002258,,,0.997742,0.002124,
1,5,BPR_learning_rate-0.05,0.002388,0.00398,,,0.99602,0.003662,
1,5,BPR_learning_rate-0.1,0.00266,0.004433,,,0.995567,0.004208,
1,5,BPR_learning_rate-0.5,0.0,0.0,,,1.0,0.0,
1,5,BPR_learning_rate-1,0.0,0.0,,,1.0,0.0,
1,10,RND,9e-06,3e-05,,,0.99997,3e-05,
1,10,MP,0.000834,0.002779,,,0.997221,0.002047,
1,10,PureSVD,0.000931,0.003104,,,0.996896,0.002503,


In [None]:
# calculate average values across all folds for e.g. relevance metrics
# Average every metric across folds for each ('top-n', 'model') pair.
# DataFrame.mean(level=...) was deprecated in pandas 1.3 and removed in 2.0; grouping on the
# index levels is the equivalent that works on old and new pandas. sort=False keeps the
# groups in order of first appearance, as the level-based mean did.
# Use .std() instead of .mean() for the standard deviation.
scores = result.groupby(level=['top-n', 'model'], sort=False).mean()
# scores.xs('recall', level='metric', axis=1).unstack('model')
scores

***
### Variable sizes

In [14]:
import sys
def sizeof_fmt(num, suffix='B'):
    """Format a byte count with binary (1024-based) unit prefixes.

    By Fred Cirera, after https://stackoverflow.com/a/1094933/1870254

    Args:
        num: the size to format; it is divided by 1024 until its magnitude
            drops below 1024 or the prefixes up to 'Zi' are exhausted.
        suffix: unit text appended after the prefix.

    Returns:
        A string such as ``'1.5KiB'``; values beyond 'Zi' are reported in 'Yi'.
    """
    prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    position = 0
    while position < len(prefixes):
        if abs(num) < 1024.0:
            return "%3.1f%s%s" % (num, prefixes[position], suffix)
        num /= 1024.0
        position += 1
    return "%.1f%s%s" % (num, 'Yi', suffix)

# Print the ten names in the current namespace with the largest sys.getsizeof value.
# The generator is passed straight to sorted() so the namespace is fully read before
# the loop binds `name` and `size`.
for name, size in sorted(
        ((var_name, sys.getsizeof(obj)) for var_name, obj in locals().items()),
        key=lambda entry: entry[1], reverse=True)[:10]:
    print("{:>30}: {:>8}".format(name, sizeof_fmt(size)))

               RecommenderData:   2.0KiB
                   ImplicitALS:   2.0KiB
                      SVDModel:   1.4KiB
               PopularityModel:   1.4KiB
                   RandomModel:   1.4KiB
                    csr_matrix:   1.0KiB
                    lil_matrix:   1.0KiB
             CooccurrenceModel:   1.0KiB
                           _i1:   917.0B
                    als_models:   768.0B


In [13]:
# drop the notebook's reference to the interaction frame
del kiva_dataframe

## Final Testing

### Dataset reading

In [45]:
# load the raw loans snapshot for the final test and order its rows by the 'raised_time' column
loans_table_f = pd.read_csv('additional-kiva-snapshot/loans.csv').sort_values(by='raised_time')

In [46]:
# keep only the loans whose status is exactly 'funded'
funded_loans_table_f = loans_table_f.loc[loans_table_f['status'] == 'funded']

In [47]:
# final-test window: raised_time must be strictly after START_DATE_f and at most END_DATE_f
START_DATE_f = '2011-11-01'
END_DATE_f = '2013-09-30'

# the bounds are compared against the 'raised_time' column exactly as loaded from the CSV
raised_time_f = funded_loans_table_f['raised_time']
funded_loans_table_f = funded_loans_table_f.loc[(raised_time_f > START_DATE_f) & (raised_time_f <= END_DATE_f)]
del raised_time_f

# ids of the loans inside the window, used to filter loans_lenders.csv
funded_loan_ids_set_f = set(funded_loans_table_f['loan_id'])

In [48]:
# pickle.dump(loans_table_f, open("pickle/loans_table_f.p", "wb"))
# pickle.dump(funded_loans_table_f, open("pickle/funded_loans_table_f.p", "wb"))
# pickle.dump(funded_loan_ids_set_f, open("pickle/funded_loan_ids_set_f.p", "wb"))

In [49]:
# free ram
# drop both loan tables; only funded_loan_ids_set_f is needed from here on
del loans_table_f, funded_loans_table_f

In [50]:
# Collect, for every funded loan in the final-test window, the set of lenders that backed it.
loans_f = set()
lenders_f = set()
loans_lenders_dict_f = {}

with open('additional-kiva-snapshot/loans_lenders.csv', newline='', encoding="utf8") as csvfile:
    csv_reader = csv.reader(csvfile)
    next(csv_reader, None)  # skip the header row (no-op if the file is empty)
    for row in csv_reader:
        loan_id, lender_ids = row
        loan_id = int(loan_id)
        if loan_id not in funded_loan_ids_set_f:
            continue

        loans_f.add(loan_id)
        # lender ids are stored in one field, separated by ", "
        new_lenders = set(lender_ids.split(", "))
        loans_lenders_dict_f[loan_id] = new_lenders
        lenders_f.update(new_lenders)

# Sort instead of relying on set iteration order: the order of a set of strings depends on
# per-process string hashing, so plain list(lenders_f) gave different column indices on
# different runs. Sorted lists make the matrix row/column indices reproducible.
loans_f = sorted(loans_f)
lenders_f = sorted(lenders_f)

print('Loans-lenders dict filled')

Loans-lenders dict filled


### Utility matrix creation

In [51]:
# lender id -> column position (its position in the `lenders_f` list)
lenders_reverse_index_f = dict(zip(lenders_f, range(len(lenders_f))))
# empty loans x lenders matrix in LIL format, filled entry by entry in the next cell
utility_matrix_f = lil_matrix((len(loans_f), len(lenders_f)), dtype=np.float64)

In [52]:
# mark every (loan, lender) pair found in loans_lenders_dict_f with 1.0
for row, loan_id in enumerate(loans_f):
    for lender_id in loans_lenders_dict_f[loan_id]:
        utility_matrix_f[row, lenders_reverse_index_f[lender_id]] = 1.0

print('Filled utility matrix')

Filled utility matrix


In [53]:
# convert the filled matrix to CSR format and rebind the name to the converted matrix
utility_matrix_f = utility_matrix_f.tocsr()

In [54]:
# pickle.dump(utility_matrix_f, open("pickle/utility_matrix_f.p", "wb"))

In [55]:
# number of stored interactions, followed by the size of each axis
print(utility_matrix_f.nnz)
for label, collection in (('lenders: ', lenders_f), ('loans: ', loans_f)):
    print(label, len(collection))

5944688
lenders:  649453
loans:  248040


In [56]:
# export the final-test interactions as itemid,userid,feedback rows (itemid = matrix row, userid = matrix column)
write_matrix_to_csv(utility_matrix_f, "kiva_dataframe_f.csv")

#### Load Kiva dataframe

In [57]:
# reload the final-test interaction rows exported by write_matrix_to_csv (columns: itemid, userid, feedback)
kiva_dataframe_f = pd.read_csv('kiva_dataframe_f.csv', engine='c')

In [58]:
# (number of interaction rows, number of columns)
kiva_dataframe_f.shape

(5944688, 3)

In [59]:
# preview the first interaction rows
kiva_dataframe_f.head()

Unnamed: 0,itemid,userid,feedback
0,0,33826,1.0
1,0,61025,1.0
2,0,337745,1.0
3,0,413925,1.0
4,0,510382,1.0


In [None]:
# Wrap the final-test interactions for polara using the 'userid', 'itemid' and 'feedback' columns.
# NOTE(review): seed=0 presumably fixes the train/holdout split for reproducibility -- confirm in the polara docs.
data_model = RecommenderData(kiva_dataframe_f, 'userid', 'itemid', 'feedback', seed=0)
# NOTE(review): warm_start is a polara setting; its exact effect on the split is not visible here -- confirm.
data_model.warm_start = False

# time the data preparation step and report the duration through the root logger
start = time.time()
data_model.prepare()
logging.debug("Prepared data in %0.2fs", time.time() - start)

In [None]:
# baseline models on the final-test data model
random = RandomModel(data_model)
popular = PopularityModel(data_model)
svd = SVDModel(data_model)

# BPR with fixed settings: rank 200, 100 epochs, GPU disabled
bpr = ImplicitBPR(data_model)
bpr.rank = 200
bpr.num_epochs = 100
bpr.use_gpu = False

# ALS with fixed settings: rank 200, 60 epochs, alpha 50, regularization 0.01.
# epsilon and weight_func match the values get_base_model assigns to ALS models.
# NOTE(review): unlike get_base_model, use_gpu is not set on this ALS model -- confirm that is intended.
als = ImplicitALS(data_model)
als.epsilon = 1e-8
als.weight_func = np.log2
als.rank = 200
als.num_epochs = 60
als.alpha = 50
als.regularization = 0.01

# everything evaluated in the final test
models = [random, popular, svd, bpr, als]

In [None]:
metrics = ['ranking', 'relevance'] # metric groups for evaluation: nDCG, precision, recall, etc.
folds = [1] # a single fold is passed for the final test
topk_values = [5, 10, 20] # values of k to experiment with

# wall-clock start, used for the duration logged at the end of the cell
start = time.time()

# evaluate every model with ee.topk_test for each k in topk_values
result = ee.run_cv_experiment(models, folds, metrics,
                              fold_experiment=ee.topk_test,
                              topk_list=topk_values)


logging.debug("Experiment finished in %0.2fs", time.time() - start)

In [None]:
# persist the final evaluation results; the `with` block guarantees the file is flushed and closed
# (the previous `pickle.dump(result, open(...))` form never closed the handle explicitly)
with open("eval_results/result_f_2.p", "wb") as f:
    pickle.dump(result, f)

In [None]:
# calculate average values across all folds for e.g. relevance metrics
# Average every metric across folds for each ('top-n', 'model') pair.
# DataFrame.mean(level=...) was deprecated in pandas 1.3 and removed in 2.0; grouping on the
# index levels is the equivalent that works on old and new pandas. sort=False keeps the
# groups in order of first appearance, as the level-based mean did.
# Use .std() instead of .mean() for the standard deviation.
scores = result.groupby(level=['top-n', 'model'], sort=False).mean()
# scores.xs('recall', level='metric', axis=1).unstack('model')
scores

#### Test data sparsity

In [None]:
n_rows, n_cols = utility_matrix_f.shape
matrix_size = n_rows * n_cols  # number of possible (loan, lender) cells
num_interactions = utility_matrix_f.nnz  # number of stored (non-zero) cells
# share of empty cells, as a percentage
sparsity = 100 * (1 - (num_interactions / matrix_size))
print('Sparsitiy: %f %%' % sparsity)