# Kiva collaborative filtering

### Imports

In [1]:
import csv
import pandas as pd
import numpy as np
import implicit
from scipy.sparse import csr_matrix, lil_matrix

In [2]:
import codecs
import logging
import time
import tqdm

In [3]:
from implicit.als import AlternatingLeastSquares
from implicit.approximate_als import (AnnoyAlternatingLeastSquares, FaissAlternatingLeastSquares,
                                      NMSLibAlternatingLeastSquares)
from implicit.bpr import BayesianPersonalizedRanking
from implicit.nearest_neighbours import (BM25Recommender, CosineRecommender,
                                         TFIDFRecommender, bm25_weight)

from implicit.datasets.movielens import get_movielens

In [4]:
# Export MKL_NUM_THREADS=1 into the kernel's environment.
# NOTE(review): presumably this keeps MKL single-threaded so it does not compete with
# implicit's own multithreaded training — confirm against the implicit documentation.
%env MKL_NUM_THREADS=1
# Show DEBUG-level log records, which is the level the training helpers below log at.
logging.basicConfig(level=logging.DEBUG)

env: MKL_NUM_THREADS=1


### Inits

In [5]:
# Shared containers that the dataset-reading cells populate:
#   loans              -- loan ids (first column of loans_lenders.csv)
#   lenders            -- first column of every lenders.csv data row
#   loans_lenders_dict -- loan id -> set of lender identifiers parsed for that loan
loans, lenders = set(), set()
loans_lenders_dict = dict()

### Dataset reading

In [6]:
# Collect the first column of every data row of lenders.csv into the `lenders` set.
with open('additional-kiva-snapshot/lenders.csv', newline='', encoding="utf8") as csvfile:
    csv_reader = csv.reader(csvfile)
    # Skip the header row; the None default keeps an empty file from raising StopIteration.
    next(csv_reader, None)
    # csv.reader yields an empty list for a blank line; such rows carry no lender.
    lenders.update(row[0] for row in csv_reader if row)

print('Lenders filled.')

Lenders filled.


In [7]:
# Build the loan -> lenders mapping from loans_lenders.csv.
# Each data row is expected to hold exactly two fields: the loan id and a
# ", "-separated list of lender identifiers.
with open('additional-kiva-snapshot/loans_lenders.csv', newline='', encoding="utf8") as csvfile:
    csv_reader = csv.reader(csvfile)
    # Skip the header row; the None default keeps an empty file from raising StopIteration.
    next(csv_reader, None)
    for row in csv_reader:
        # csv.reader yields an empty list for a blank line; nothing to unpack there.
        if not row:
            continue
        loan_id, lender_ids = row
        loans.add(loan_id)
        # A set drops repeated lenders within the same loan.
        loans_lenders_dict[loan_id] = set(lender_ids.split(", "))

print('Loans-lenders dict filled')

Loans-lenders dict filled


### Utility matrix creation

In [8]:
# Freeze the id sets into lists so every loan / lender gets an integer position.
# Sets of strings iterate in a hash-dependent order that can change between
# interpreter sessions, so sort to make the index assignment reproducible.
loans_list = sorted(loans)
lenders_list = sorted(lenders)
# Rows index loans, columns index lenders. The shape is taken from the lists that
# define the indices; int8 is sufficient for the 0/1 entries stored in this matrix.
utility_matrix = lil_matrix((len(loans_list), len(lenders_list)), dtype=np.int8)

In [9]:
# Map each lender identifier to its position (column index) in lenders_list.
lenders_reverse_index = dict(zip(lenders_list, range(len(lenders_list))))

In [10]:
# Gather the (loan row, lender column) coordinates of every loan-lender pair and
# build the sparse matrix in a single call, instead of assigning one element at a
# time into a LIL matrix from a Python loop (slow for large inputs).
pair_rows = []
pair_cols = []
for loan_index, loan in enumerate(loans_list):
    lender_columns = [lenders_reverse_index[lender] for lender in loans_lenders_dict[loan]]
    pair_rows.extend([loan_index] * len(lender_columns))
    pair_cols.extend(lender_columns)

# Every coordinate occurs once (the lenders of a loan are stored as a set), so each
# stored entry is exactly 1, as with the element-wise assignment.
utility_matrix = csr_matrix(
    (np.ones(len(pair_rows), dtype=np.int8), (pair_rows, pair_cols)),
    shape=(len(loans_list), len(lenders_list)),
    dtype=np.int8,
)

print('Filled utiility matrix')

Filled utiility matrix


In [11]:
# Switch to CSR format; the later cells multiply, transpose and convert this matrix.
utility_matrix = utility_matrix.tocsr()

### Pandas dataset reading
###### (used only for visualization purposes)

In [None]:
# Load the whole lenders CSV as a DataFrame (this section is for inspection only).
lenders_table = pd.read_csv('additional-kiva-snapshot/lenders.csv')

In [None]:
# Keep only the non-null permanent names.
# NOTE(review): this rebinds `lenders`, which the CSV-reading cell filled as a set, to a
# pandas Series — any code that reads `lenders` after this cell sees the Series instead.
lenders = lenders_table['permanent_name'].dropna()

In [None]:
# Load the loan -> lenders CSV as a DataFrame (this section is for inspection only).
loans_lenders_table = pd.read_csv('additional-kiva-snapshot/loans_lenders.csv')

In [None]:
# Select the loan_id column.
# NOTE(review): this rebinds `loans`, which the CSV-reading cell filled as a set, to a
# pandas Series — any code that reads `loans` after this cell sees the Series instead.
loans = loans_lenders_table['loan_id']

### Utility functions

In [12]:
# Registry of recommender classes, keyed by the short name passed to get_model().
MODELS = dict(
    als=AlternatingLeastSquares,
    nmslib_als=NMSLibAlternatingLeastSquares,
    annoy_als=AnnoyAlternatingLeastSquares,
    faiss_als=FaissAlternatingLeastSquares,
    tfidf=TFIDFRecommender,
    cosine=CosineRecommender,
    bpr=BayesianPersonalizedRanking,
    bm25=BM25Recommender,
)

In [13]:
def get_model(model_name):
    """Instantiate the recommender registered under ``model_name``.

    The class is looked up in ``MODELS`` and constructed with a small set of
    default hyper-parameters that depend on the model family.

    Args:
        model_name: key into ``MODELS`` (for example ``"als"``, ``"bm25"``, ``"bpr"``).

    Returns:
        A newly constructed instance of the selected model class.

    Raises:
        ValueError: if ``model_name`` is not a key of ``MODELS``.
    """
    print("getting model %s" % model_name)

    if model_name not in MODELS:
        raise ValueError("Unknown Model '%s'" % model_name)
    model_class = MODELS[model_name]

    # Defaults are chosen by family first (every ALS variant), then by name.
    if issubclass(model_class, AlternatingLeastSquares):
        # NOTE(review): use_gpu=True is requested unconditionally — confirm a GPU-enabled
        # build of implicit is available wherever this notebook runs.
        return model_class(factors=16, dtype=np.float32, use_gpu=True)
    if model_name == "bm25":
        return model_class(K1=100, B=0.5)
    if model_name == "bpr":
        return model_class(factors=63)
    return model_class()

In [14]:
def train_model(model_name="als", alpha=40):
    """Build the model named ``model_name`` and fit it on ``utility_matrix``.

    For ALS-family models the matrix is first scaled by ``alpha`` and then
    re-weighted with ``bm25_weight``; every other model is fitted on the
    matrix unchanged.

    Args:
        model_name: key understood by ``get_model``.
        alpha: factor the stored entries are multiplied by before BM25
            weighting (only applied to ALS-family models).

    Returns:
        The fitted model instance.

    Raises:
        ValueError: propagated from ``get_model`` for an unknown ``model_name``.
    """
    # create a model from the input data
    model = get_model(model_name)
    data_matrix = utility_matrix

    # if we're training an ALS based model, scale by alpha and weight input by bm25
    if isinstance(model, AlternatingLeastSquares):
        # The utility matrix is stored as int8. Promote it to float64 before scaling so
        # the product cannot overflow (or be rejected) when an integer alpha does not
        # fit into int8; for values that do fit, the scaled entries are unchanged.
        logging.debug("scaling matrix by alpha")
        data_matrix = data_matrix.astype(np.float64).multiply(alpha)

        logging.debug("weighting matrix by bm25_weight")
        data_matrix = bm25_weight(data_matrix)

        # also disable building approximate recommend index
        model.approximate_similar_items = False

    logging.debug("training model %s", model_name)
    start = time.time()
    model.fit(data_matrix)
    logging.debug("trained model '%s' in %0.2fs", model_name, time.time() - start)

    return model

In [15]:
def calculate_recommendations(model, output_filename):
    """Generate loan recommendations for each lender and write them to a file.

    For every entry of ``lenders_list`` the model is asked for 5 recommendations;
    each one is written as a ``lender<TAB>loan<TAB>score`` line.

    Args:
        model: fitted recommender whose ``recommend(index, matrix, N=...)`` yields
            ``(loan_index, score)`` pairs.
        output_filename: path of the UTF-8 text file to write (opened in "w" mode).
    """
    start = time.time()
    # utility_matrix is loans x lenders; recommend() is handed the lenders x loans transpose.
    lenders_loans = utility_matrix.T.tocsr()
    # Size the progress bar from the list that is actually iterated. The name `lenders`
    # is rebound to a NaN-dropped pandas Series by the visualization cells, so its
    # length need not match the number of lenders processed here.
    with tqdm.tqdm(total=len(lenders_list)) as progress:
        with codecs.open(output_filename, "w", "utf8") as o:
            for lender_index, lender in enumerate(lenders_list):
                for loan_index, score in model.recommend(lender_index, lenders_loans, N=5):
                    o.write("%s\t%s\t%s\n" % (lender, loans_list[loan_index], score))
                progress.update(1)
    logging.debug("generated recommendations in %0.2fs", time.time() - start)

### Training model

In [16]:
# Fit the plain ALS model on the utility matrix, scaling its entries with alpha=100.
model = train_model(model_name="als", alpha=100)

getting model als


DEBUG:root:scaling matrix by alpha
DEBUG:root:weighting matrix by bm25_weight
DEBUG:root:training model als
DEBUG:implicit:Converting input to CSR format
DEBUG:implicit:Converted input to CSR in 0.344s
DEBUG:implicit:Calculated transpose in 3.277s
DEBUG:implicit:Initialized factors in 2.0001230239868164
DEBUG:implicit:Running 15 ALS iterations
100%|████████████████████████████████████████████████████████████████████████████████| 15.0/15 [00:27<00:00,  1.76s/it]
DEBUG:root:trained model 'als' in 36.18s


In [None]:
# Write the recommendations produced by the trained model to output.tsv.
calculate_recommendations(model, "output.tsv")

#### This section is a usage example on the MovieLens dataset.

In [None]:
titles, ratings = get_movielens('20m')

# Turn the explicit ratings into implicit feedback: entries rated below 4.0 are
# zeroed and dropped from the sparse structure, and every remaining entry is
# replaced by a weight of 1.
below_threshold = ratings.data < 4.0
ratings.data[below_threshold] = 0
ratings.eliminate_zeros()
ratings.data = np.ones(ratings.data.shape[0])
print(ratings)

In [None]:
# Re-weight the binary matrix with BM25 (B=0.9), scale the weights by 5 and
# store the result in CSR format.
weighted_ratings = bm25_weight(ratings, B=0.9) * 5
ratings = weighted_ratings.tocsr()
print(ratings)

In [None]:
# Load the loans CSV as a DataFrame for inspection.
loans_table = pd.read_csv('additional-kiva-snapshot/loans.csv')

In [None]:
# Display the loans DataFrame as the cell output.
loans_table

In [None]:
# Keep only the rows whose status column equals 'funded'.
funded_loans_table = loans_table.loc[loans_table['status'] == 'funded']

In [None]:
# Select the loan_id column of the funded-loans subset.
funded_loans = funded_loans_table['loan_id']

In [None]:
# List the distinct values that occur in the status column.
loans_table['status'].unique()

### Testing environment

The train/test splitting script is taken from [this gist](https://gist.github.com/tgsmith61591/ce7d614d7a0442f94cd5ae5d1e51d3c2).

In [17]:
from collab_split import train_test_split
from sklearn.preprocessing import LabelEncoder

In [18]:
# Convert to COO format to get parallel row / column / value arrays of the stored entries.
coo_mat = utility_matrix.tocoo()

In [19]:
# Unpack the COO triplets. utility_matrix has loans as rows and lenders as columns,
# so `users` receives the loan row indices and `items` the lender column indices.
# NOTE(review): the names look swapped relative to the data — confirm this is the
# orientation collab_split.train_test_split expects.
# NOTE(review): `ratings` rebinds the name used by the MovieLens example cells.
users, items, ratings = coo_mat.row, coo_mat.col, coo_mat.data
# Re-encode both index arrays as integer labels.
# NOTE(review): rows / columns without any stored entry presumably get no label, so the
# encoded ids may not line up with positions in loans_list / lenders_list — verify
# before mapping results back to loan or lender identifiers.
users = LabelEncoder().fit_transform(users)
items = LabelEncoder().fit_transform(items)

Splitting the train/test samples 80/20.

In [20]:
# Split the triplets with train_size=0.8 (the 80/20 split described above).
# NOTE(review): no seed is passed, so the split is presumably different on every run —
# check whether collab_split.train_test_split accepts a random_state argument.
train, test = train_test_split(users, items, ratings, train_size=0.8)