# Kiva collaborative filtering

### Imports

In [None]:
import csv
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc
import implicit
from scipy.sparse import csr_matrix, lil_matrix

# visualization libraries
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Raise pandas' 'display.max_columns' option to 40 for DataFrame output.
pd.set_option('display.max_columns', 40)

In [None]:
import codecs
import logging
import time
import tqdm

In [None]:
from implicit.als import AlternatingLeastSquares
from implicit.approximate_als import (AnnoyAlternatingLeastSquares, FaissAlternatingLeastSquares,
                                      NMSLibAlternatingLeastSquares)
from implicit.bpr import BayesianPersonalizedRanking
from implicit.nearest_neighbours import (BM25Recommender, CosineRecommender,
                                         TFIDFRecommender, bm25_weight)

from implicit.datasets.movielens import get_movielens

In [None]:
# Set the MKL_NUM_THREADS environment variable to 1 through the %env magic.
%env MKL_NUM_THREADS=1
# Configure the root logger at DEBUG level; the training and evaluation
# helpers in this notebook report their timings through logging.debug.
logging.basicConfig(level=logging.DEBUG)

### Dataset reading

In [None]:
# Load the loans snapshot and order its rows by the 'raised_time' column.
loans_table = pd.read_csv('additional-kiva-snapshot/loans.csv').sort_values(by='raised_time')

In [None]:
# Keep only the rows whose 'status' column equals 'funded'.
is_funded = loans_table['status'] == 'funded'
funded_loans_table = loans_table[is_funded]

In [None]:
# Time window for the experiment: START_DATE is exclusive, END_DATE inclusive.
START_DATE = '2017-01-01'
END_DATE = '2019-01-01'

# Select the funded loans whose 'raised_time' falls inside the window.
raised_time = funded_loans_table['raised_time']
mask = raised_time.gt(START_DATE) & raised_time.le(END_DATE)
funded_loans_table = funded_loans_table.loc[mask]

# Ids of the selected loans, kept as a set for membership tests.
funded_loan_ids_set = set(funded_loans_table['loan_id'])

In [None]:
# with open('additional-kiva-snapshot/lenders.csv', newline='', encoding="utf8") as csvfile:
#     csv_reader = csv.reader(csvfile)
#     line_num = 0
#     for row in csv_reader:
#         if line_num == 0:
#             line_num += 1
#             continue
#         lenders.add(row[0])
#         line_num += 1

# print('Lenders filled.')

In [None]:
# Read loans_lenders.csv and, for every loan whose id is in
# funded_loan_ids_set, record the set of its lenders. Three structures are
# filled along the way:
#   loans              - ids of the retained loans
#   lenders            - every lender name seen on a retained loan
#   loans_lenders_dict - loan id -> set of lender names
loans = set()
lenders = set()
loans_lenders_dict = {}

with open('additional-kiva-snapshot/loans_lenders.csv', newline='', encoding="utf8") as csvfile:
    csv_reader = csv.reader(csvfile)
    # Skip the header row; the default keeps this a no-op on an empty file.
    next(csv_reader, None)
    for loan_id, lender_ids in csv_reader:
        loan_id = int(loan_id)
        if loan_id not in funded_loan_ids_set:
            continue

        # The lenders column is a single ", "-separated string.
        new_lenders = set(lender_ids.split(", "))
        loans.add(loan_id)
        loans_lenders_dict[loan_id] = new_lenders
        lenders.update(new_lenders)

print('Loans-lenders dict filled')
print('Loans set filled')
print('Lenders set filled')

### Utility matrix creation

In [None]:
# Freeze the loan and lender sets into lists so that each element gets a
# stable position, then allocate an empty loans x lenders int8 matrix in LIL
# format.
loans_list = [*loans]
lenders_list = [*lenders]
matrix_shape = (len(loans), len(lenders))
utility_matrix = lil_matrix(matrix_shape, dtype=np.int8)

In [None]:
# Map each lender name to its position in lenders_list (its matrix column).
lenders_reverse_index = dict(zip(lenders_list, range(len(lenders_list))))

In [None]:
# Set entry (loan row, lender column) to 1 for every lender recorded for the
# loan in loans_lenders_dict.
for loan_index, loan in enumerate(loans_list):
    for lender in loans_lenders_dict[loan]:
        lender_index = lenders_reverse_index[lender]
        utility_matrix[loan_index, lender_index] = 1

print('Filled utility matrix')

In [None]:
# Convert the matrix (built above as a lil_matrix) to CSR format.
utility_matrix = utility_matrix.tocsr()

In [None]:
# Matrix dimensions: rows are loans, columns are lenders.
print(utility_matrix.shape)

### Pandas dataset reading
###### (used only for data analysis purposes)

In [None]:
# Load the lenders snapshot into a DataFrame.
lenders_table = pd.read_csv('additional-kiva-snapshot/lenders.csv')

In [None]:
# Take the 'permanent_name' column of the lenders table and drop missing values.
# NOTE(review): this rebinds the global `lenders`, which up to here was the set
# of lender names collected from loans_lenders.csv. Code that reads `lenders`
# after this cell (e.g. `len(lenders)` in calculate_recommendations) sees this
# Series instead of that set.
lenders = lenders_table['permanent_name']
lenders = lenders.dropna()

In [None]:
# Load the loan -> lenders mapping file into a DataFrame.
loans_lenders_table = pd.read_csv('additional-kiva-snapshot/loans_lenders.csv')

In [None]:
# Bare expression: displays the DataFrame as the cell's output.
loans_lenders_table

### Utility functions

In [None]:
# Registry of the recommender classes that get_model can instantiate, keyed by
# the short name passed as `model_name`.
MODELS = {
    "als":  AlternatingLeastSquares,
    "nmslib_als": NMSLibAlternatingLeastSquares,
    "annoy_als": AnnoyAlternatingLeastSquares,
    "faiss_als": FaissAlternatingLeastSquares,
    "tfidf": TFIDFRecommender,
    "cosine": CosineRecommender,
    "bpr": BayesianPersonalizedRanking,
    "bm25": BM25Recommender
}

In [None]:
def get_model(model_name):
    """Instantiate the recommender registered under ``model_name``.

    The class is looked up in ``MODELS`` and constructed with a set of default
    hyper-parameters that depends on the kind of model.

    Args:
        model_name: key into ``MODELS``.

    Returns:
        A new, unfitted model instance.

    Raises:
        ValueError: if ``MODELS`` has no usable entry for ``model_name``.
    """
    print("getting model %s" % model_name)
    model_class = MODELS.get(model_name)
    if not model_class:
        raise ValueError("Unknown Model '%s'" % model_name)

    # ALS variants are recognised through the class hierarchy, the remaining
    # models by their registry name; anything else uses the class defaults.
    if issubclass(model_class, AlternatingLeastSquares):
        return model_class(factors=16, dtype=np.float32, use_gpu=True)
    if model_name == "bm25":
        return model_class(K1=100, B=0.5)
    if model_name == "bpr":
        return model_class(factors=63)
    return model_class()

In [None]:
def train_model(model_name="als", alpha=40):
    """Build the model named ``model_name`` and fit it on ``utility_matrix``.

    For ALS-based models the matrix is first multiplied by ``alpha`` and then
    re-weighted with ``bm25_weight``, and the model's
    ``approximate_similar_items`` attribute is set to False. Other models are
    fitted on the matrix as is.

    Args:
        model_name: key understood by ``get_model``.
        alpha: scaling factor applied to the matrix for ALS-based models.

    Returns:
        The fitted model.
    """
    model = get_model(model_name)
    training_matrix = utility_matrix

    if isinstance(model, AlternatingLeastSquares):
        logging.debug("scaling matrix by alpha")
        scaled_matrix = training_matrix.multiply(alpha)

        logging.debug("weighting matrix by bm25_weight")
        training_matrix = bm25_weight(scaled_matrix)

        # Turn off the approximate similar-items index on the ALS variants.
        model.approximate_similar_items = False

    logging.debug("training model %s", model_name)
    started_at = time.time()
    model.fit(training_matrix)
    elapsed = time.time() - started_at
    logging.debug("trained model '%s' in %0.2fs", model_name, elapsed)

    return model

In [None]:
def calculate_recommendations(model, output_filename, N=10):
    """Write the top-N loan recommendations of every lender to a TSV file.

    For each lender in ``lenders_list`` the model is asked for ``N``
    recommendations against the transposed utility matrix (lenders x loans),
    and one line ``lender<TAB>loan_id<TAB>score`` is written per recommendation.

    Args:
        model: fitted model exposing ``recommend(user_index, user_items, N=...)``
            that yields ``(loan_index, score)`` pairs.
        output_filename: path of the UTF-8 file to (over)write.
        N: number of recommendations requested per lender.
    """
    start = time.time()
    lenders_loans = utility_matrix.T.tocsr()
    # The bar is sized from lenders_list, the sequence actually iterated below.
    # The global `lenders` is rebound to a pandas Series of lender names in the
    # data-analysis cells, so its length does not match this loop.
    with tqdm.tqdm(total=len(lenders_list)) as progress:
        with codecs.open(output_filename, "w", "utf8") as o:
            for lender_index, lender in enumerate(lenders_list):
                for loan_index, score in model.recommend(lender_index, lenders_loans, N=N):
                    o.write("%s\t%s\t%s\n" % (lender, loans_list[loan_index], score))
                progress.update(1)
    logging.debug("generated recommendations in %0.2fs", time.time() - start)

### Training model

In [None]:
# Train the plain ALS model on the Kiva utility matrix with alpha=100.
model = train_model(alpha=100, model_name="als")

In [None]:
# Write 20 recommendations per lender to output.tsv.
calculate_recommendations(model, "output.tsv", N=20)

#### This section is a usage example on the MovieLens dataset.

In [None]:
# Fetch the MovieLens '20m' variant through implicit's dataset helper.
titles, ratings = get_movielens('20m')

# Turn the explicit ratings into implicit feedback: entries below 4.0 are
# zeroed and removed from the sparse structure, and every remaining entry is
# replaced by 1 (binary preference).
ratings.data[ratings.data < 4.0] = 0
ratings.eliminate_zeros()
ratings.data = np.ones(len(ratings.data))
print(ratings)

In [None]:
# Re-weight the binary matrix with bm25_weight (B=0.9), multiply by 5 and
# convert the result to CSR format.
ratings = (bm25_weight(ratings, B=0.9) * 5).tocsr()
print(ratings)

### Testing environment
###### (skip for now)

The train/test splitting script is taken from [this gist](https://gist.github.com/tgsmith61591/ce7d614d7a0442f94cd5ae5d1e51d3c2).

In [None]:
from collab_split import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
# COO view of the utility matrix, exposing its row/col/data arrays.
coo_mat = utility_matrix.tocoo()

In [None]:
# Unpack the COO matrix into parallel row-index, column-index and value arrays.
# NOTE(review): the rows of utility_matrix are loans and the columns are
# lenders, so `users` holds loan indices and `items` lender indices here.
# Confirm that this naming is what the split script expects.
# NOTE: this rebinds `ratings`, the name also used by the MovieLens cells.
users, items, ratings = coo_mat.row, coo_mat.col, coo_mat.data
# Re-encode both index arrays with LabelEncoder.
users = LabelEncoder().fit_transform(users)
items = LabelEncoder().fit_transform(items)

Splitting the train/test samples 80/20.

In [None]:
# Split the interactions with train_size=0.8.
# NOTE(review): this call uses the (users, items, ratings, train_size)
# signature of collab_split.train_test_split. Later cells import a different
# train_test_split from implicit.evaluation under the same name, so re-running
# this cell after them presumably fails. Confirm the execution order.
train, test = train_test_split(users, items, ratings, train_size=0.8)

### Implicit testing environment

In [None]:
from implicit.evaluation import precision_at_k, train_test_split
from implicit.datasets.movielens import get_movielens

# movies, ratings = get_movielens("20m")
# train, test = train_test_split(ratings)

# Split the loans x lenders matrix with implicit's train_test_split (imported
# at the top of this cell), using its default split settings.
coo_mat = utility_matrix.tocoo()
train, test = train_test_split(coo_mat)

# Fit ALS with default hyper-parameters on the training part.
# NOTE(review): use_gpu=True presumably requires a CUDA-enabled implicit build.
model = AlternatingLeastSquares(use_gpu=True)
model.fit(train)

# Evaluate precision@10 on the transposed (lenders x loans) matrices.
precision = precision_at_k(model, train.T.tocsr(), test.T.tocsr(), K=10, num_threads=4)
#map_measure = mean_average_precision_at_k(model, train.T.tocsr(), test.T.tocsr(), K=10, num_threads=4)

In [None]:
# Report the precision@10 computed in the previous cell.
print('Precision @10: %f' % precision)

### AUC development

In [None]:
def mean_roc_auc_at_k(model, train_user_items, test_user_items, K=10, show_progress=True):
    """Return the mean per-lender ROC AUC of the model's top-K recommendations.

    For every row (lender) of ``train_user_items`` a dense score vector over
    all loans is built: zero everywhere except at the ``K`` loans returned by
    ``model.recommend``, which carry the recommendation score. That vector is
    compared against the lender's row of ``test_user_items`` with
    ``roc_auc_score``. Lenders whose test row has no non-zero entry are
    skipped.

    Args:
        model: fitted model exposing ``recommend(user_index, user_items, N=...)``
            that yields ``(loan_index, score)`` pairs.
        train_user_items: sparse lenders x loans matrix passed to ``recommend``.
        test_user_items: sparse lenders x loans matrix holding the labels.
        K: number of recommendations requested per lender.
        show_progress: whether to display the tqdm progress bar.

    Returns:
        The mean of the per-lender AUC values, or NaN when no lender could be
        evaluated.
    """
    auc_list = []
    lenders_count, loans_count = train_user_items.shape
    start = time.time()

    with tqdm.tqdm(total=lenders_count, disable=not show_progress) as progress:
        for lender_index in range(lenders_count):
            test_lender_row = test_user_items[lender_index, :].toarray().flatten()

            # Check the labels first so that no recommendations are computed
            # for lenders that end up being skipped.
            if np.count_nonzero(test_lender_row) > 0:
                lender_row = np.zeros(loans_count)
                for loan_index, score in model.recommend(lender_index, train_user_items, N=K):
                    lender_row[loan_index] = score

                auc_list.append(roc_auc_score(test_lender_row, lender_row))

            # Skipped lenders advance the bar too, so it reaches its total.
            progress.update(1)

    logging.debug("generated mean ROC AUC in %0.2fs", time.time() - start)
    # Return NaN explicitly rather than taking the mean of an empty list.
    return np.mean(auc_list) if auc_list else float("nan")

In [None]:
def mean_prec_auc_at_k(model, train_user_items, test_user_items, K=10, show_progress=True):
    """Return the mean per-lender area under the precision/recall curve.

    For every row (lender) of ``train_user_items`` a dense score vector over
    all loans is built: zero everywhere except at the ``K`` loans returned by
    ``model.recommend``, which carry the recommendation score. The
    precision/recall curve of that vector against the lender's row of
    ``test_user_items`` is integrated with ``auc``. Lenders whose test row has
    no non-zero entry are skipped, as in ``mean_roc_auc_at_k``: recall is
    undefined for them.

    Args:
        model: fitted model exposing ``recommend(user_index, user_items, N=...)``
            that yields ``(loan_index, score)`` pairs.
        train_user_items: sparse lenders x loans matrix passed to ``recommend``.
        test_user_items: sparse lenders x loans matrix holding the labels.
        K: number of recommendations requested per lender.
        show_progress: whether to display the tqdm progress bar.

    Returns:
        The mean of the per-lender AUC values, or NaN when no lender could be
        evaluated.
    """
    auc_list = []
    lenders_count, loans_count = train_user_items.shape
    start = time.time()

    with tqdm.tqdm(total=lenders_count, disable=not show_progress) as progress:
        for lender_index in range(lenders_count):
            test_lender_row = test_user_items[lender_index, :].toarray().flatten()

            # Without a positive label the curve carries no information.
            if np.count_nonzero(test_lender_row) > 0:
                lender_row = np.zeros(loans_count)
                for loan_index, score in model.recommend(lender_index, train_user_items, N=K):
                    lender_row[loan_index] = score

                precision, recall, _ = precision_recall_curve(test_lender_row, lender_row, pos_label=1)
                auc_list.append(auc(recall, precision))

            progress.update(1)

    logging.debug("generated mean Precision/Recall curve AUC in %0.2fs", time.time() - start)
    # Return NaN explicitly rather than taking the mean of an empty list.
    return np.mean(auc_list) if auc_list else float("nan")

In [None]:
from implicit.evaluation import train_test_split

# Split the loans x lenders matrix and build the lenders x loans training view.
coo_mat = utility_matrix.tocoo()
train, test = train_test_split(coo_mat)
train_user_items = train.T.tocsr()
# test_user_items = test.T.tocsr()
# NOTE(review): the labels come from the full utility matrix, not from the
# held-out `test` split (see the commented line above), so they include the
# interactions the model was trained on. Confirm this is intended.
test_user_items = utility_matrix.T.tocsr()

# Fit ALS with default hyper-parameters on the training part.
model = AlternatingLeastSquares(use_gpu=True)
model.fit(train)

# Mean ROC AUC of the top-10 recommendations over the evaluated lenders.
mean_roc_auc = mean_roc_auc_at_k(model, train_user_items, test_user_items, K=10)
print('Mean ROC AUC score: ', mean_roc_auc)

# mean_prec_auc = mean_prec_auc_at_k(model, train_user_items, test_user_items, K=10)
# print('Mean Prec AUC score: ', mean_prec_auc)

In [None]:
from implicit.evaluation import precision_at_k, train_test_split
from implicit.datasets.movielens import get_movielens

# Same evaluation on the MovieLens '1m' variant: keep ratings of at least 4.0
# and turn them into binary preferences.
movies, ratings = get_movielens("1m")
ratings.data[ratings.data < 4.0] = 0
ratings.eliminate_zeros()
ratings.data = np.ones(len(ratings.data))

# Split with implicit's train_test_split using its default settings.
train, test = train_test_split(ratings)

# Fit ALS with default hyper-parameters on the training part.
model = AlternatingLeastSquares(use_gpu=True)
model.fit(train)

# precision = precision_at_k(model, train.T.tocsr(), test.T.tocsr(), K=10, num_threads=4)
# map_measure = mean_average_precision_at_k(model, train.T.tocsr(), test.T.tocsr(), K=10, num_threads=4)
# Here the labels come from the held-out `test` split (transposed).
mean_roc_auc = mean_roc_auc_at_k(model, train.T.tocsr(), test.T.tocsr(), K=10)
print('Mean ROC AUC score: ', mean_roc_auc)

# mean_prec_auc = mean_prec_auc_at_k(model, train.T.tocsr(), test.T.tocsr(), K=10)
# print('Mean Precision/Recall curve AUC score: ', mean_prec_auc)

### Visualizations

In [None]:
# START_DATE = '2000-01-01'
# END_DATE = '2015-01-01'

# mask = (loans_table['raised_time'] > START_DATE) & (loans_table['raised_time'] <= END_DATE)
# plot_data = loans_table.loc[mask]

# Work on a copy of the two plotted columns. Assigning to an alias of
# loans_table would change the dtype of its 'raised_time' column and add a
# helper column to the table used by the rest of the notebook.
plot_data = loans_table[['raised_time', 'loan_amount']].copy()

# Bucket every loan by the month of its 'raised_time'.
plot_data['raised_time'] = pd.to_datetime(plot_data['raised_time'])
plot_data['date_month_year'] = plot_data['raised_time'].dt.to_period("M")

# One point per month; the title and axis labels are set below.
plt.figure(figsize=(20,10))
g1 = sns.pointplot(x='date_month_year', y='loan_amount', 
                   data=plot_data)
g1.set_xticklabels(g1.get_xticklabels(),rotation=90)
g1.set_title("Mean Loan by Month Year", fontsize=15)
g1.set_xlabel("")
g1.set_ylabel("Loan Amount", fontsize=12)
plt.show()