# Kiva collaborative filtering
## Polara framework

***
### Imports

In [1]:
# essentials
import os
import sys
import csv
import itertools
import copy
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc
from scipy.sparse import csr_matrix, lil_matrix

# polara framework
from polara.recommender.data import RecommenderData
from polara.recommender.models import SVDModel, PopularityModel, RandomModel, CooccurrenceModel
from polara.recommender.external.implicit.ialswrapper import ImplicitALS
from polara.datasets.movielens import get_movielens_data
from polara.evaluation import evaluation_engine as ee
from polara.evaluation.plotting import show_hit_rates, show_precision_recall, show_ranking, show_relevance

# utilities
import codecs
import logging
import time
import tqdm

# visualization libraries
import seaborn as sns
import matplotlib.pyplot as plt

# serialization
import pickle

In [2]:
pd.set_option('display.max_columns', 40)

In [3]:
%env MKL_NUM_THREADS=1
logging.basicConfig(level=logging.DEBUG)

env: MKL_NUM_THREADS=1


***
### Pickle loads

In [None]:
# Restore the cached loan tables and the funded-loan id set from disk.
# NOTE: unpickling can execute arbitrary code; only load files you produced yourself.
# Context managers close each file handle as soon as its object is loaded.
with open("pickle/loans_table.p", "rb") as f:
    loans_table = pickle.load(f)
with open("pickle/funded_loans_table.p", "rb") as f:
    funded_loans_table = pickle.load(f)
with open("pickle/funded_loan_ids_set.p", "rb") as f:
    funded_loan_ids_set = pickle.load(f)

In [None]:
# Restore the cached utility matrix; the context manager closes the file handle.
# NOTE: unpickling can execute arbitrary code; only load files you produced yourself.
with open("pickle/utility_matrix.p", "rb") as f:
    utility_matrix = pickle.load(f)

In [None]:
# Restore the cached utility-matrix frame; the context manager closes the file handle.
# NOTE: unpickling can execute arbitrary code; only load files you produced yourself.
with open("pickle/utility_matrix_df.p", "rb") as f:
    utility_matrix_df = pickle.load(f)

In [None]:
utility_matrix_df.head()

***
### Dataset reading

In [None]:
loans_table = pd.read_csv('additional-kiva-snapshot/loans.csv')
loans_table = loans_table.sort_values(by='raised_time')

In [None]:
funded_loans_table = loans_table[loans_table.status == 'funded']

In [None]:
START_DATE = '2013-10-01'
END_DATE = '2015-05-01'

# Keep loans with START_DATE < raised_time <= END_DATE.
# NOTE(review): the bounds are plain strings, so this presumably relies on
# raised_time being stored as ISO-formatted text — confirm against the CSV.
mask = funded_loans_table['raised_time'] > START_DATE
mask &= funded_loans_table['raised_time'] <= END_DATE
funded_loans_table = funded_loans_table[mask]
del mask

# ids of the loans that passed the filter, for fast membership tests
funded_loan_ids_set = set(funded_loans_table['loan_id'])

In [None]:
# pickle.dump(loans_table, open("pickle/loans_table.p", "wb"))
# pickle.dump(funded_loans_table, open("pickle/funded_loans_table.p", "wb"))
# pickle.dump(funded_loan_ids_set, open("pickle/funded_loan_ids_set.p", "wb"))

In [None]:
# free ram
del loans_table
del funded_loans_table

In [None]:
loans = set()
lenders = set()
loans_lenders_dict = {}

# Every data row is unpacked as (loan_id, lender_ids), where lender_ids is a
# single string of lender names separated by ", ".
with open('additional-kiva-snapshot/loans_lenders.csv', newline='', encoding="utf8") as csvfile:
    csv_reader = csv.reader(csvfile)
    next(csv_reader, None)  # skip the header row (no-op on an empty file)
    for loan_id, lender_ids in csv_reader:
        loan_id = int(loan_id)
        # ignore loans whose id is not in funded_loan_ids_set
        if loan_id not in funded_loan_ids_set:
            continue

        loans.add(loan_id)
        new_lenders = set(lender_ids.split(", "))
        loans_lenders_dict[loan_id] = new_lenders
        lenders.update(new_lenders)

# Convert to lists so every loan / lender gets a fixed position.
loans = list(loans)
lenders = list(lenders)

print('Loans-lenders dict filled')

***
### Utility matrix creation

In [None]:
lenders_reverse_index = {k: v for v, k in enumerate(lenders)}
utility_matrix = lil_matrix((len(loans), len(lenders)), dtype=np.float64)

In [None]:
# Set a 1.0 at (loan row, lender column) for every lender recorded for a loan.
for loan_index, loan in enumerate(loans):
    # translate this loan's lender names to their column positions, lazily
    for lender_index in map(lenders_reverse_index.__getitem__, loans_lenders_dict[loan]):
        utility_matrix[loan_index, lender_index] = 1.0

print('Filled utility matrix')

In [None]:
utility_matrix = utility_matrix.tocsr()

In [None]:
# pickle.dump(utility_matrix, open("pickle/utility_matrix.p", "wb"))

#### Conversion to SparseDataFrame

In [None]:
# Wrap the sparse utility matrix in a pandas sparse frame.
# NOTE(review): pd.SparseDataFrame was removed in pandas 1.0; on newer pandas the
# replacement is pd.DataFrame.sparse.from_spmatrix(utility_matrix) — confirm which
# pandas version this notebook is pinned to before switching.
utility_matrix_df = pd.SparseDataFrame(utility_matrix)

In [None]:
# pickle.dump(utility_matrix_df, open("pickle/utility_matrix_df.p", "wb"))

In [None]:
def write_matrix_to_csv(matrix, filename):
    """Write the stored entries of a sparse matrix to a CSV file.

    The file starts with the header ``itemid,userid,feedback`` followed by one
    ``row,column,value`` line per stored entry of ``matrix``.

    Parameters
    ----------
    matrix : scipy sparse matrix
        Any object exposing ``tocoo()``; a ``coo_matrix`` is used as is.
    filename : str
        Destination path; an existing file is overwritten.
    """
    # Imported locally: the file only imports csr_matrix / lil_matrix, so the
    # previous ``scipy.sparse.coo_matrix`` reference raised NameError.
    from scipy.sparse import coo_matrix

    if not isinstance(matrix, coo_matrix):
        matrix = matrix.tocoo()

    with open(filename, 'w', encoding='utf-8') as f:
        f.write("%s,%s,%s\n" % ('itemid', 'userid', 'feedback'))
        # one line per (row, col, value) triple, written in a single batch
        f.writelines("%s,%s,%s\n" % entry
                     for entry in zip(matrix.row, matrix.col, matrix.data))

#### Load Kiva dataframe

In [4]:
kiva_dataframe = pd.read_csv('kiva_dataframe.csv', engine='c')

In [5]:
kiva_dataframe.shape

(5339455, 3)

In [6]:
kiva_dataframe.head()

Unnamed: 0,itemid,userid,feedback
0,0,7703,1.0
1,0,8354,1.0
2,0,9000,1.0
3,0,10247,1.0
4,0,21284,1.0


***
### Polara example

In [None]:
# Polara example run on MovieLens data; the Kiva variant of the data model is
# left commented out below.
# define models
ml_data = get_movielens_data(get_genres=False)
data_model = RecommenderData(ml_data, 'userid', 'movieid', 'rating')

# data_model = RecommenderData(kiva_dataframe, 'userid', 'itemid', 'feedback')

svd = SVDModel(data_model)
popular = PopularityModel(data_model)
random = RandomModel(data_model)
models = [svd, popular, random]

metrics = ['ranking', 'relevance'] # metrics for evaluation: nDCG, Precision, Recall, etc.
folds = [1, 2, 3, 4, 5] # use all 5 folds for cross-validation (default)
topk_values = [1, 5, 10, 20, 50] # values of k to experiment with

# run 5-fold CV experiment
result = ee.run_cv_experiment(models, folds, metrics,
                              fold_experiment=ee.topk_test,
                              topk_list=topk_values)

In [None]:
# Average every metric across folds for each (top-n, model) pair.
# groupby(level=...) replaces DataFrame.mean(level=...), which was deprecated in
# pandas 1.3 and removed in 2.0; sort=False keeps the groups in order of first
# appearance, as mean(level=...) did.
# Use .std() instead of .mean() for the standard deviation.
scores = result.groupby(level=['top-n', 'model'], sort=False).mean()
# scores.xs('nDCG', level='metric', axis=1).unstack('model')

In [None]:
scores

***
### Cross-validation

In [None]:
data_model = RecommenderData(kiva_dataframe, 'userid', 'itemid', 'feedback', seed=0)
data_model.warm_start = False

start = time.time()
data_model.prepare()
logging.debug("Prepared data in %0.2fs", time.time() - start)

In [None]:
random = RandomModel(data_model)
popular = PopularityModel(data_model)
svd = SVDModel(data_model)

Generate a list of ALS models, one for each combination of values in the parameter grid:

In [None]:
def get_base_model():
    """Create an ImplicitALS model on ``data_model`` with the shared base settings.

    Returns the model with ``epsilon``, ``weight_func`` and ``use_gpu`` preset.
    """
    base_settings = (
        ('epsilon', 1e-8),
        ('weight_func', np.log2),
        ('use_gpu', False),
    )
    als = ImplicitALS(data_model)
    for attribute, setting in base_settings:
        setattr(als, attribute, setting)
    return als

def get_grid_models(cv_param_grid):
    """Build one ALS model per combination of values in ``cv_param_grid``.

    Parameters
    ----------
    cv_param_grid : dict
        Maps a model attribute name to the iterable of values to try.

    Returns
    -------
    list
        Models created by ``get_base_model()``, one per element of the
        Cartesian product of the value lists. Each grid value is set as an
        attribute on the model and ``method`` is set to
        ``'ALS_<key>-<value>_<key>-<value>...'``. An empty grid yields a
        single base model named ``'ALS'`` instead of raising.
    """
    keys = list(cv_param_grid)
    models = []
    # product() of zero value lists yields one empty combination, so the
    # empty-grid case needs no special handling.
    for combination in itertools.product(*cv_param_grid.values()):
        next_model = get_base_model()
        name = 'ALS'
        # distinct names here: the old inner loop reused ``v`` and shadowed
        # the combination tuple being iterated
        for key, value in zip(keys, combination):
            setattr(next_model, key, value)
            name += '_%s-%s' % (key, value)

        next_model.method = name
        models.append(next_model)
    return models

Cross-validation setup:

In [None]:
cv_param_grid = {
    'rank': [10, 50, 100],
    'regularization': [0.01, 0.1, 1],
    'alpha': [1, 50, 100],
    'num_epochs': [15, 30, 40]
}

In [None]:
# Baseline models plus one ALS model per parameter-grid combination.
basic_models = [random, popular, svd]
als_models = get_grid_models(cv_param_grid)
models = basic_models + als_models

metrics = ['ranking', 'relevance'] # metrics for evaluation: nDCG, Precision, Recall, etc.
folds = [1, 2, 3]
topk_values = [5, 10, 20] # values of k to experiment with

In [None]:
# wall-clock timer for the whole experiment
start = time.time()

# run the CV experiment for every model on the folds listed in `folds`
result = ee.run_cv_experiment(models, folds, metrics,
                              fold_experiment=ee.topk_test,
                              topk_list=topk_values)


logging.debug("Cross-validation experiment finished in %0.2fs", time.time() - start)

In [None]:
# Persist the CV results; the context manager guarantees the file is flushed
# and closed once the dump completes.
with open("pickle/result.p", "wb") as f:
    pickle.dump(result, f)

In [None]:
# holdout_sizes = [1, 2, 5]

# result = ee.run_cv_experiment(models, folds, metrics,
#                               fold_experiment=ee.holdout_test,
#                               holdout_sizes=holdout_sizes)

In [None]:
# Average every metric across folds for each (top-n, model) pair.
# groupby(level=...) replaces DataFrame.mean(level=...), which was deprecated in
# pandas 1.3 and removed in 2.0; sort=False keeps the groups in order of first
# appearance, as mean(level=...) did.
# Use .std() instead of .mean() for the standard deviation.
scores = result.groupby(level=['top-n', 'model'], sort=False).mean()
# scores.xs('recall', level='metric', axis=1).unstack('model')
scores

***
### Variable sizes

In [None]:
import sys
def sizeof_fmt(num, suffix='B'):
    """Format a byte count with a binary (1024-based) unit prefix.

    By Fred Cirera, after https://stackoverflow.com/a/1094933/1870254

    ``num`` is divided by 1024 until its magnitude drops below 1024 or the
    prefixes up to 'Zi' are exhausted, in which case 'Yi' is used.
    """
    scaled = num
    for unit in ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi'):
        if abs(scaled) < 1024.0:
            break
        scaled /= 1024.0
    else:
        # ran out of prefixes without dropping below 1024
        return "%.1f%s%s" % (scaled, 'Yi', suffix)
    return "%3.1f%s%s" % (scaled, unit, suffix)

# Print the ten names in the current namespace with the largest sys.getsizeof
# value, biggest first. sys.getsizeof is shallow: it does not follow references
# into contained objects, so container sizes are underestimates.
for name, size in sorted(((name, sys.getsizeof(value)) for name,value in locals().items()),
                         key= lambda x: -x[1])[:10]:
    print("{:>30}: {:>8}".format(name,sizeof_fmt(size)))