# Kiva collaborative filtering
## Polara framework

***
### Imports

In [14]:
# essentials
import sys
import csv
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc
from scipy.sparse import csr_matrix, lil_matrix

# polara framework
from polara.recommender.data import RecommenderData
from polara.recommender.models import SVDModel, PopularityModel, RandomModel
from polara.datasets.movielens import get_movielens_data
from polara.evaluation import evaluation_engine as ee

# utilities
import codecs
import logging
import time
import tqdm

# visualization libraries
import seaborn as sns
import matplotlib.pyplot as plt

# serialization
import pickle

In [2]:
# Show up to 40 columns when a DataFrame is rendered.
pd.options.display.max_columns = 40

In [3]:
# IPython magic: sets the MKL_NUM_THREADS environment variable to 1
# (presumably to keep MKL-backed linear algebra single-threaded — confirm).
%env MKL_NUM_THREADS=1
# Configure the root logger so that DEBUG-level messages are emitted.
logging.basicConfig(level=logging.DEBUG)

env: MKL_NUM_THREADS=1


***
### Pickle loads

In [None]:
# Restore the cached loan tables and the funded-loan id set from disk.
# NOTE: pickle.load can execute arbitrary code; only load files that were
# written by this notebook.
# Each file is opened in a `with` block so the handle is closed right after
# loading instead of being left open.
with open("pickle/loans_table.p", "rb") as pickle_file:
    loans_table = pickle.load(pickle_file)
with open("pickle/funded_loans_table.p", "rb") as pickle_file:
    funded_loans_table = pickle.load(pickle_file)
with open("pickle/funded_loan_ids_set.p", "rb") as pickle_file:
    funded_loan_ids_set = pickle.load(pickle_file)

In [4]:
# Restore the cached utility matrix.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
# The `with` block closes the file handle as soon as loading finishes.
with open("pickle/utility_matrix.p", "rb") as pickle_file:
    utility_matrix = pickle.load(pickle_file)

In [None]:
# Restore the cached DataFrame form of the utility matrix.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
# The `with` block closes the file handle as soon as loading finishes.
with open("pickle/utility_matrix_df.p", "rb") as pickle_file:
    utility_matrix_df = pickle.load(pickle_file)

***
### Dataset reading

In [None]:
# Read the loans snapshot and order the rows by their `raised_time` column.
loans_table = (
    pd.read_csv('additional-kiva-snapshot/loans.csv')
    .sort_values(by='raised_time')
)

In [None]:
# Keep only the rows whose `status` column equals 'funded'.
funded_loans_table = loans_table.loc[loans_table['status'] == 'funded']

In [None]:
# Restrict the funded loans to START_DATE < raised_time <= END_DATE,
# then collect the ids of the loans that remain.
START_DATE = '2013-10-01'
END_DATE = '2015-05-01'

funded_loans_table = funded_loans_table.loc[
    lambda df: (df['raised_time'] > START_DATE) & (df['raised_time'] <= END_DATE)
]

funded_loan_ids_set = set(funded_loans_table.loc[:, 'loan_id'])

In [None]:
# pickle.dump(loans_table, open("pickle/loans_table.p", "wb"))
# pickle.dump(funded_loans_table, open("pickle/funded_loans_table.p", "wb"))
# pickle.dump(funded_loan_ids_set, open("pickle/funded_loan_ids_set.p", "wb"))

In [None]:
# Drop the two large tables so their memory can be reclaimed.
del loans_table, funded_loans_table

In [None]:
# Parse loans_lenders.csv and, for every loan whose id is in
# `funded_loan_ids_set`, record the set of lender ids attached to it.
# Produces:
#   loans              - list of the selected loan ids
#   lenders            - list of every lender id seen on a selected loan
#   loans_lenders_dict - loan id -> set of lender ids
loans = set()
lenders = set()
loans_lenders_dict = {}

with open('additional-kiva-snapshot/loans_lenders.csv', newline='', encoding="utf8") as csvfile:
    csv_reader = csv.reader(csvfile)
    next(csv_reader, None)  # skip the header row (no-op on an empty file)
    for loan_id, lender_ids in csv_reader:
        loan_id = int(loan_id)
        if loan_id not in funded_loan_ids_set:
            continue

        # Lender ids are stored in a single cell, separated by ", ".
        new_lenders = set(lender_ids.split(", "))
        loans.add(loan_id)
        loans_lenders_dict[loan_id] = new_lenders
        lenders.update(new_lenders)

# Sort instead of relying on set iteration order: the order of a set of
# strings changes between interpreter runs (hash randomisation), which would
# make positions derived from these lists irreproducible.
loans = sorted(loans)
lenders = sorted(lenders)

print('Loans-lenders dict filled')

***
### Utility matrix creation

In [None]:
# Map each lender id to its position in `lenders`.
lenders_reverse_index = dict(zip(lenders, range(len(lenders))))
# Empty loans-by-lenders matrix; LIL format supports incremental assignment.
utility_matrix = lil_matrix(
    (len(loans), len(lenders)),
    dtype=np.float64,
)

In [None]:
# Set entry (loan row, lender column) to 1.0 for every lender of every loan.
for row_position, loan_id in enumerate(loans):
    lender_columns = (
        lenders_reverse_index[lender_id]
        for lender_id in loans_lenders_dict[loan_id]
    )
    for column_position in lender_columns:
        utility_matrix[row_position, column_position] = 1.0

print('Filled utility matrix')

In [None]:
# Convert the filled matrix to CSR format.
utility_matrix = csr_matrix(utility_matrix)

In [None]:
# pickle.dump(utility_matrix, open("pickle/utility_matrix.p", "wb"))

#### Conversion to SparseDataFrame

In [None]:
# Wrap the sparse utility matrix in a pandas frame.
# pd.SparseDataFrame was removed in pandas 1.0, so it is used only where it
# still exists; on newer pandas a DataFrame with sparse columns is built
# from the scipy matrix instead.
if hasattr(pd, 'SparseDataFrame'):
    utility_matrix_df = pd.SparseDataFrame(utility_matrix)
else:
    utility_matrix_df = pd.DataFrame.sparse.from_spmatrix(utility_matrix)

In [None]:
# pickle.dump(utility_matrix_df, open("pickle/utility_matrix_df.p", "wb"))

***
### Polara example

In [27]:
def write_matrix_to_csv(matrix, filename):
    """Write the stored entries of a sparse matrix to a CSV file.

    The file starts with the header ``itemid,userid,feedback``; each stored
    entry then produces one line holding its row index, column index and
    value, in that order.

    Args:
        matrix: a scipy sparse matrix; anything that is not already a
            ``coo_matrix`` is converted with ``.tocoo()``.
        filename: path of the CSV file to create (overwritten if it exists).
    """
    # Imported locally: the module only imports specific names from
    # scipy.sparse, so the bare name `scipy` is not available here.
    from scipy.sparse import coo_matrix

    if not isinstance(matrix, coo_matrix):
        matrix = matrix.tocoo()

    with open(filename, 'w', encoding='utf-8') as f:
        f.write("%s,%s,%s\n" % ('itemid', 'userid', 'feedback'))
        for i, j, v in zip(matrix.row, matrix.col, matrix.data):
            f.write("%s,%s,%s\n" % (i, j, v))

In [30]:
# Load the interaction table from CSV with the C parser, then display it.
kiva_dataframe = pd.read_csv(filepath_or_buffer='kiva_dataframe.csv', engine='c')
kiva_dataframe

Unnamed: 0,itemid,userid,feedback
0,0,7703,1.0
1,0,8354,1.0
2,0,9000,1.0
3,0,10247,1.0
4,0,21284,1.0
5,0,24672,1.0
6,0,26363,1.0
7,0,28667,1.0
8,0,31213,1.0
9,0,31577,1.0


In [39]:
# define models
ml_data = get_movielens_data(get_genres=False)
data_model = RecommenderData(ml_data, 'userid', 'movieid', 'rating')

# data_model = RecommenderData(kiva_dataframe, 'userid', 'itemid', 'feedback')

# three recommenders built on the same data model
svd = SVDModel(data_model)
popular = PopularityModel(data_model)
random = RandomModel(data_model)
models = [svd, popular, random]

metrics = ['ranking', 'relevance'] # metrics for evaluation: NDCG, Precision, Recall, etc.
folds = [1, 2, 3, 4, 5] # use all 5 folds for cross-validation (default)
topk_values = [1, 5, 10, 20, 50] # values of k to experiment with

# run 5-fold CV experiment
result = ee.run_cv_experiment(models, folds, metrics,
                              fold_experiment=ee.topk_test,
                              topk_list=topk_values)

# calculate average values across all folds for e.g. relevance metrics
# (groupby(level=...) replaces DataFrame.mean(level=...), which was deprecated
# in pandas 1.3 and removed in 2.0; sort=False matches what mean(level=...) did)
scores = result.groupby(level=['top-n', 'model'], sort=False).mean() # use .std instead of .mean for standard deviation
scores.xs('recall', level='metric', axis=1).unstack('model')

DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): files.grouplens.org:80
DEBUG:urllib3.connectionpool:http://files.grouplens.org:80 "GET /datasets/movielens/ml-1m.zip HTTP/1.1" 200 5917549


Preparing data...
24 unique movieid's within 28 testset interactions were filtered. Reason: not in the training data.
Done.
There are 803312 events in the training and 3624 events in the holdout.
PureSVD training time: 0.201s
Preparing data...
48 unique movieid's within 58 testset interactions were filtered. Reason: not in the training data.
Done.
There are 792890 events in the training and 3624 events in the holdout.
PureSVD training time: 0.223s
Preparing data...
20 unique movieid's within 22 testset interactions were filtered. Reason: not in the training data.
Done.
There are 808443 events in the training and 3624 events in the holdout.
PureSVD training time: 0.105s
Preparing data...
30 unique movieid's within 41 testset interactions were filtered. Reason: not in the training data.
Done.
There are 788733 events in the training and 3624 events in the holdout.
PureSVD training time: 0.098s
Preparing data...
18 unique movieid's within 26 testset interactions were filtered. Reason: not 

type,relevance,relevance,relevance
model,MP,PureSVD,RND
top-n,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,0.017828,0.079428,0.00011
5,0.086604,0.219408,0.001159
10,0.138546,0.300658,0.002208
20,0.232384,0.399184,0.004581
50,0.351167,0.549595,0.012585


***
### Variable sizes

In [31]:
import sys
def sizeof_fmt(num, suffix='B'):
    '''Format a size as a short string with a binary (1024-based) prefix.

    The value is divided by 1024 until its magnitude drops below 1024, then
    rendered with one decimal place followed by the matching prefix and
    `suffix`. Values too large for 'Zi' are reported in 'Yi'.

    By Fred Cirera, after https://stackoverflow.com/a/1094933/1870254
    '''
    prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    scaled = num
    for prefix in prefixes:
        if abs(scaled) < 1024.0:
            return "%3.1f%s%s" % (scaled, prefix, suffix)
        scaled = scaled / 1024.0
    # Ran out of prefixes: whatever is left is expressed in 'Yi'.
    return "%.1f%s%s" % (scaled, 'Yi', suffix)

# Print the ten namespace entries with the largest shallow size
# (sys.getsizeof), biggest first.
for name, size in sorted(
        ((name, sys.getsizeof(value)) for name, value in locals().items()),
        key=lambda entry: entry[1],
        reverse=True)[:10]:
    print("{:>30}: {:>8}".format(name, sizeof_fmt(size)))

                           ___: 278.3MiB
                            ss: 278.3MiB
                           _19: 278.3MiB
                          ssdf: 278.3MiB
                           _23: 278.3MiB
                             _: 122.2MiB
                kiva_dataframe: 122.2MiB
                           _30: 122.2MiB
                           _17:  98.3MiB
                       ml_data:  22.9MiB
