# Kiva collaborative filtering

### Imports

In [None]:
import csv
import pandas as pd
import numpy as np
import implicit
from scipy.sparse import csr_matrix, lil_matrix

In [None]:
import codecs
import logging
import time
import tqdm

In [None]:
from implicit.als import AlternatingLeastSquares
from implicit.approximate_als import (AnnoyAlternatingLeastSquares, FaissAlternatingLeastSquares,
                                      NMSLibAlternatingLeastSquares)
from implicit.bpr import BayesianPersonalizedRanking
from implicit.nearest_neighbours import (BM25Recommender, CosineRecommender,
                                         TFIDFRecommender, bm25_weight)

In [None]:
# Limit MKL to a single thread for this kernel (presumably to avoid BLAS
# thread oversubscription while implicit trains -- confirm against implicit's docs).
%env MKL_NUM_THREADS=1
# Show DEBUG-level records so the logging.debug() progress/timing messages are visible.
logging.basicConfig(level=logging.DEBUG)

### Inits

In [None]:
# Accumulators filled in by the dataset-reading cells.
loans, lenders = set(), set()  # unique loan ids / unique lender identifiers
loans_lenders_dict = dict()    # loan id -> set of lender identifiers for that loan

### Dataset reading

In [None]:
# Collect the first column of every data row of lenders.csv into the `lenders` set.
with open('additional-kiva-snapshot/lenders.csv', newline='', encoding="utf8") as csvfile:
    csv_reader = csv.reader(csvfile)
    # Skip the header row; the default value keeps an empty file from raising StopIteration.
    next(csv_reader, None)
    # csv.reader yields blank lines as empty lists; `if row` skips them so
    # row[0] cannot raise IndexError.
    lenders.update(row[0] for row in csv_reader if row)

print('Lenders filled.')

In [None]:
# Build the loan -> lenders mapping from loans_lenders.csv.
# Each data row is expected to hold exactly two fields: the loan id and a
# ", "-separated string of lender identifiers.
with open('additional-kiva-snapshot/loans_lenders.csv', newline='', encoding="utf8") as csvfile:
    csv_reader = csv.reader(csvfile)
    # Skip the header row; the default value keeps an empty file from raising StopIteration.
    next(csv_reader, None)
    for row in csv_reader:
        if not row:
            # csv.reader yields blank lines as empty lists; unpacking them would fail.
            continue
        loan_id, lender_ids = row
        loans.add(loan_id)
        # A set collapses lenders listed more than once for the same loan.
        # A loan id that appears on several rows keeps only its last row.
        loans_lenders_dict[loan_id] = set(lender_ids.split(", "))

print('Loans-lenders dict filled')

### Utility matrix creation

In [None]:
# Freeze both sets into lists so each loan / lender has a fixed positional index.
loans_list, lenders_list = list(loans), list(lenders)
# Empty loans x lenders matrix; LIL format is used while it is filled entry by entry.
matrix_shape = (len(loans_list), len(lenders_list))
utility_matrix = lil_matrix(matrix_shape, dtype=np.int8)

In [None]:
# Map each lender identifier to its position (column index) in lenders_list.
lenders_reverse_index = dict(zip(lenders_list, range(len(lenders_list))))

In [None]:
# Mark every (loan, lender) pair found in loans_lenders_dict with a 1.
for row_idx, loan_id in enumerate(loans_list):
    # Lazily translate this loan's lender identifiers into column indices.
    column_ids = (lenders_reverse_index[name] for name in loans_lenders_dict[loan_id])
    for col_idx in column_ids:
        utility_matrix[row_idx, col_idx] = 1

print('Filled utiility matrix')

In [None]:
# Convert the filled matrix from LIL (used while assigning entries) to CSR.
utility_matrix = utility_matrix.tocsr()

### Pandas dataset reading

In [None]:
# Load the same lenders.csv file as a pandas DataFrame.
lenders_data = pd.read_csv('additional-kiva-snapshot/lenders.csv')

In [None]:
# Keep only the rows that have a permanent_name.
# NOTE(review): this rebinds `lenders`, which earlier cells use as the set of
# lender identifiers read with csv.reader; from here on it is a pandas Series.
lenders = lenders_data['permanent_name'].dropna()

In [None]:
# Load the same loans_lenders.csv file as a pandas DataFrame.
loans_data = pd.read_csv('additional-kiva-snapshot/loans_lenders.csv')

In [None]:
# Select the loan_id column.
# NOTE(review): this rebinds `loans`, which earlier cells use as the set of
# loan ids read with csv.reader; from here on it is a pandas Series.
loans = loans_data['loan_id']

### Utility functions

In [None]:
# Registry of recommender classes, keyed by the short name accepted by get_model().
MODELS = {
    # Alternating least squares and the classes from implicit.approximate_als.
    "als": AlternatingLeastSquares,
    "nmslib_als": NMSLibAlternatingLeastSquares,
    "annoy_als": AnnoyAlternatingLeastSquares,
    "faiss_als": FaissAlternatingLeastSquares,
    # Recommenders from implicit.nearest_neighbours and implicit.bpr.
    "tfidf": TFIDFRecommender,
    "cosine": CosineRecommender,
    "bpr": BayesianPersonalizedRanking,
    "bm25": BM25Recommender,
}

In [None]:
def get_model(model_name):
    """Instantiate the recommender registered under ``model_name``.

    Args:
        model_name: Key of the ``MODELS`` registry.

    Returns:
        A new instance of the selected class, built with this notebook's
        default parameters for that kind of model.

    Raises:
        ValueError: If ``model_name`` has no entry in ``MODELS``.
    """
    print("getting model %s" % model_name)

    model_class = MODELS.get(model_name)
    if model_class is None:
        raise ValueError("Unknown Model '%s'" % model_name)

    # Defaults looked up by name for the non-ALS models; unlisted names get none.
    named_defaults = {
        "bm25": {'K1': 100, 'B': 0.5},
        "bpr": {'factors': 63},
    }

    if issubclass(model_class, AlternatingLeastSquares):
        # Shared defaults for AlternatingLeastSquares and any subclass of it.
        kwargs = dict(factors=16, dtype=np.float32, use_gpu=True)
    else:
        kwargs = named_defaults.get(model_name, {})

    return model_class(**kwargs)

In [None]:
def calculate_recommendations(output_filename, alpha=40, model_name="als"):
    """Train a recommender and write the top 5 loan recommendations per lender.

    Reads the module-level ``utility_matrix`` (loans x lenders),
    ``loans_list`` and ``lenders_list``. For every lender in
    ``lenders_list`` it writes one ``lender<TAB>loan<TAB>score`` line per
    recommendation to ``output_filename``.

    Args:
        output_filename: Path of the UTF-8 text file to write (opened in "w" mode).
        alpha: Factor the matrix is multiplied by before BM25 weighting;
            only used when the model is ALS-based.
        model_name: Key of ``MODELS`` selecting the recommender to train.

    Raises:
        ValueError: Propagated from ``get_model`` when ``model_name`` is unknown.
    """
    model = get_model(model_name)
    data_matrix = utility_matrix

    # ALS-based models are trained on an alpha-scaled, BM25-weighted matrix.
    if isinstance(model, AlternatingLeastSquares):
        logging.debug("weighting matrix by bm25_weight")
        # The utility matrix is created with dtype int8, so scale it in
        # floating point: an alpha above 127 does not fit in int8.
        data_matrix = data_matrix.astype(np.float64).multiply(alpha)
        data_matrix = bm25_weight(data_matrix)

        # also disable building approximate recommend index
        model.approximate_similar_items = False

    logging.debug("training model %s", model_name)
    start = time.time()
    model.fit(data_matrix)
    logging.debug("trained model '%s' in %0.2fs", model_name, time.time() - start)

    # generate recommendations for each lender and write out to a file
    start = time.time()
    # Transpose to lenders x loans, the orientation passed to model.recommend().
    lenders_loans = data_matrix.T.tocsr()
    # Size the progress bar from lenders_list, the sequence actually iterated:
    # the global `lenders` is rebound to a NaN-dropped pandas Series by a later
    # cell, so its length need not match.
    with tqdm.tqdm(total=len(lenders_list)) as progress, \
            codecs.open(output_filename, "w", "utf8") as o:
        for lender_index, lender in enumerate(lenders_list):
            for loan_index, score in model.recommend(lender_index, lenders_loans, N=5):
                o.write("%s\t%s\t%s\n" % (lender, loans_list[loan_index], score))
            progress.update(1)
    logging.debug("generated recommendations in %0.2fs", time.time() - start)

### Alternating least squares

In [None]:
# Train the "nmslib_als" model with alpha=100 and write its recommendations to output.tsv.
calculate_recommendations("output.tsv", alpha=100, model_name="nmslib_als")