# Kiva collaborative filtering

### Imports

In [1]:
import csv
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
import implicit
from scipy.sparse import csr_matrix, lil_matrix

# visualization libraries
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
pd.set_option('display.max_columns', 40)

In [3]:
import codecs
import logging
import time
import tqdm

In [4]:
from implicit.als import AlternatingLeastSquares
from implicit.approximate_als import (AnnoyAlternatingLeastSquares, FaissAlternatingLeastSquares,
                                      NMSLibAlternatingLeastSquares)
from implicit.bpr import BayesianPersonalizedRanking
from implicit.nearest_neighbours import (BM25Recommender, CosineRecommender,
                                         TFIDFRecommender, bm25_weight)

from implicit.datasets.movielens import get_movielens

In [5]:
%env MKL_NUM_THREADS=1
logging.basicConfig(level=logging.DEBUG)

env: MKL_NUM_THREADS=1


### Dataset reading

In [6]:
# Read the full loans snapshot and order its rows by the `raised_time` column.
loans_table = (
    pd.read_csv('additional-kiva-snapshot/loans.csv')
    .sort_values(by='raised_time')
)

In [7]:
funded_loans_table = loans_table[loans_table.status == 'funded']

In [8]:
# Time window applied to `raised_time`: exclusive lower bound, inclusive upper bound.
START_DATE = '2017-01-01'
END_DATE = '2019-01-01'

mask = (
    funded_loans_table['raised_time'].gt(START_DATE)
    & funded_loans_table['raised_time'].le(END_DATE)
)
funded_loans_table = funded_loans_table.loc[mask]

# Ids of the loans that survived the filter, kept as a set for fast membership tests.
funded_loan_ids_set = set(funded_loans_table['loan_id'])

In [9]:
# with open('additional-kiva-snapshot/lenders.csv', newline='', encoding="utf8") as csvfile:
#     csv_reader = csv.reader(csvfile)
#     line_num = 0
#     for row in csv_reader:
#         if line_num == 0:
#             line_num += 1
#             continue
#         lenders.add(row[0])
#         line_num += 1

# print('Lenders filled.')

In [10]:
# Build, for every loan id present in `funded_loan_ids_set`:
#   loans              - the set of loan ids that appear in loans_lenders.csv
#   lenders            - the set of every lender id attached to those loans
#   loans_lenders_dict - loan id -> set of lender ids
loans = set()
lenders = set()
loans_lenders_dict = {}

with open('additional-kiva-snapshot/loans_lenders.csv', newline='', encoding="utf8") as csvfile:
    csv_reader = csv.reader(csvfile)
    next(csv_reader, None)  # skip the header row (no-op on an empty file)
    for loan_id, lender_ids in csv_reader:
        loan_id = int(loan_id)
        if loan_id not in funded_loan_ids_set:
            continue

        # lender ids are stored in one cell, separated by ", "
        new_lenders = set(lender_ids.split(", "))
        loans.add(loan_id)
        loans_lenders_dict[loan_id] = new_lenders
        lenders.update(new_lenders)

print('Loans-lenders dict filled')
print('Loans set filled')
print('Lenders set filled')

Loans-lenders dict filled
Loans set filled
Lenders set filled


### Utility matrix creation

In [11]:
# Freeze the sets into lists so every loan / lender gets a positional index.
# Sorting makes the index assignment reproducible: iteration order of a set of
# strings depends on per-process hash randomisation and would otherwise change
# between kernel sessions.
loans_list = sorted(loans)
lenders_list = sorted(lenders)

# Empty loans x lenders interaction matrix (rows: loans, columns: lenders).
utility_matrix = lil_matrix((len(loans_list), len(lenders_list)), dtype=np.int8)

In [12]:
lenders_reverse_index = {k: v for v, k in enumerate(lenders_list)}

In [13]:
# Gather the coordinates of every (loan, lender) interaction first and build
# the sparse matrix in a single call; assigning the cells one by one into a
# lil_matrix is far slower for millions of entries.
row_indices = []
col_indices = []
for loan_index, loan in enumerate(loans_list):
    loan_lenders = loans_lenders_dict[loan]
    col_indices.extend(lenders_reverse_index[lender] for lender in loan_lenders)
    row_indices.extend([loan_index] * len(loan_lenders))

# Each loan's lenders form a set, so no coordinate is repeated and every
# stored value is exactly 1.
utility_matrix = csr_matrix(
    (np.ones(len(row_indices), dtype=np.int8), (row_indices, col_indices)),
    shape=(len(loans_list), len(lenders_list)),
    dtype=np.int8,
)

print('Filled utiility matrix')

Filled utiility matrix


In [14]:
utility_matrix = utility_matrix.tocsr()

In [15]:
print(utility_matrix.shape)

(197221, 457022)


### Pandas dataset reading
###### (used only for data analysis purposes)

In [None]:
lenders_table = pd.read_csv('additional-kiva-snapshot/lenders.csv')

In [None]:
# Use a dedicated name here: `lenders` already holds the set of lender ids
# built from loans_lenders.csv, and rebinding it to a Series would silently
# change what later cells see.
lender_names = lenders_table['permanent_name'].dropna()

In [None]:
loans_lenders_table = pd.read_csv('additional-kiva-snapshot/loans_lenders.csv')

In [None]:
loans_lenders_table

### Utility functions

In [16]:
# Registry mapping a short model name to the recommender class it selects.
MODELS = dict(
    als=AlternatingLeastSquares,
    nmslib_als=NMSLibAlternatingLeastSquares,
    annoy_als=AnnoyAlternatingLeastSquares,
    faiss_als=FaissAlternatingLeastSquares,
    tfidf=TFIDFRecommender,
    cosine=CosineRecommender,
    bpr=BayesianPersonalizedRanking,
    bm25=BM25Recommender,
)

In [17]:
def get_model(model_name):
    """Instantiate the recommender registered under ``model_name`` in ``MODELS``.

    ALS-based classes, "bm25" and "bpr" are created with the notebook's
    default hyper-parameters; every other model uses its own defaults.

    Raises:
        ValueError: if ``model_name`` is not a key of ``MODELS``.
    """
    print("getting model %s" % model_name)
    model_class = MODELS.get(model_name)
    if not model_class:
        raise ValueError("Unknown Model '%s'" % model_name)

    # ALS and all of its approximate variants share the same defaults
    if issubclass(model_class, AlternatingLeastSquares):
        return model_class(factors=16, dtype=np.float32, use_gpu=True)
    if model_name == "bm25":
        return model_class(K1=100, B=0.5)
    if model_name == "bpr":
        return model_class(factors=63)
    return model_class()

In [18]:
def train_model(model_name="als", alpha=40):
    """Create the model named ``model_name`` and fit it on ``utility_matrix``.

    For ALS-based models the interactions are first multiplied by ``alpha``
    and then re-weighted with ``bm25_weight``; all other models are fitted on
    the unmodified matrix.

    Args:
        model_name: key of ``MODELS`` selecting the recommender to train.
        alpha: scaling factor applied to the interactions (ALS models only).

    Returns:
        The fitted model instance.

    Raises:
        ValueError: propagated from ``get_model`` for an unknown model name.
    """
    # create a model from the input data
    model = get_model(model_name)
    data_matrix = utility_matrix

    # if we're training an ALS based model, scale the input by alpha and
    # weight it by bm25
    if isinstance(model, AlternatingLeastSquares):
        # The utility matrix is stored as int8: promote it to float before
        # scaling so that large alpha values cannot overflow the storage type.
        logging.debug("scaling matrix by alpha")
        data_matrix = data_matrix.astype(np.float64).multiply(alpha)

        logging.debug("weighting matrix by bm25_weight")
        data_matrix = bm25_weight(data_matrix)

        # also disable building approximate recommend index
        model.approximate_similar_items = False

    logging.debug("training model %s", model_name)
    start = time.time()
    model.fit(data_matrix)
    logging.debug("trained model '%s' in %0.2fs", model_name, time.time() - start)

    return model

In [23]:
def calculate_recommendations(model, output_filename, N=10):
    """Generates loan recommendations for each lender in the dataset.

    For every lender in ``lenders_list`` the top ``N`` recommendations are
    requested from ``model`` and written to ``output_filename`` as
    tab-separated lines of ``lender<TAB>loan_id<TAB>score``.

    Args:
        model: fitted recommender exposing ``recommend(user, user_items, N=...)``.
        output_filename: path of the UTF-8 text file to (over)write.
        N: number of recommendations requested per lender.
    """
    start = time.time()
    # utility_matrix is loans x lenders; recommend() is given lenders x loans
    lenders_loans = utility_matrix.T.tocsr()

    # Size the progress bar from the same list the loop iterates over, so it
    # stays correct even if the global `lenders` name is rebound elsewhere.
    with tqdm.tqdm(total=len(lenders_list)) as progress:
        with codecs.open(output_filename, "w", "utf8") as o:
            for lender_index, lender in enumerate(lenders_list):
                for loan_index, score in model.recommend(lender_index, lenders_loans, N=N):
                    o.write("%s\t%s\t%s\n" % (lender, loans_list[loan_index], score))
                progress.update(1)
    logging.debug("generated recommendations in %0.2fs", time.time() - start)

### Training model

In [24]:
model = train_model(alpha=100, model_name="als")

getting model als


DEBUG:root:scaling matrix by alpha
DEBUG:root:weighting matrix by bm25_weight
DEBUG:root:training model als
DEBUG:implicit:Converting input to CSR format
DEBUG:implicit:Converted input to CSR in 0.031s
DEBUG:implicit:Calculated transpose in 0.109s
DEBUG:implicit:Initialized factors in 0.31866908073425293
DEBUG:implicit:Running 15 ALS iterations
100%|████████████████████████████████████████████████████████████████████████████████| 15.0/15 [00:03<00:00,  4.02it/s]
DEBUG:root:trained model 'als' in 5.49s


In [None]:
calculate_recommendations(model, "output.tsv", N=20)

#### This section is a usage example on the MovieLens dataset.

In [None]:
titles, ratings = get_movielens('20m')

# remove things < min_rating, and convert to implicit dataset
# by considering ratings as a binary preference only
ratings.data[ratings.data < 4.0] = 0
ratings.eliminate_zeros()
ratings.data = np.ones(len(ratings.data))
print(ratings)

In [None]:
ratings = (bm25_weight(ratings, B=0.9) * 5).tocsr()
print(ratings)

### Testing environment
###### (skip for now)

The train/test splitting script is taken from [here](https://gist.github.com/tgsmith61591/ce7d614d7a0442f94cd5ae5d1e51d3c2).

In [None]:
from collab_split import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
coo_mat = utility_matrix.tocoo()

In [None]:
# utility_matrix is built as loans x lenders, so the lenders (the users that
# receive recommendations) are the column indices and the loans (the items)
# are the row indices.
users, items, ratings = coo_mat.col, coo_mat.row, coo_mat.data
users = LabelEncoder().fit_transform(users)
items = LabelEncoder().fit_transform(items)

Splitting the train/test samples 80/20.

In [None]:
train, test = train_test_split(users, items, ratings, train_size=0.8)

### Implicit testing environment

In [16]:
# NOTE: this import rebinds `train_test_split`, shadowing the helper of the
# same name imported from `collab_split` in the previous section.
from implicit.evaluation import precision_at_k, train_test_split
from implicit.datasets.movielens import get_movielens

# movies, ratings = get_movielens("20m")
# train, test = train_test_split(ratings)

# utility_matrix has shape (loans, lenders), so `train` and `test` are
# loans x lenders as well.
coo_mat = utility_matrix.tocoo()
train, test = train_test_split(coo_mat)

# NOTE(review): fit() receives the loans x lenders matrix, i.e. presumably the
# item x user layout this implicit version expects - confirm against the
# installed release.
model = AlternatingLeastSquares(use_gpu=True)
model.fit(train)

# The evaluation helper is given the transposed (lenders x loans) matrices.
precision = precision_at_k(model, train.T.tocsr(), test.T.tocsr(), K=10, num_threads=4)
#map_measure = mean_average_precision_at_k(model, train.T.tocsr(), test.T.tocsr(), K=10, num_threads=4)

DEBUG:implicit:Calculated transpose in 0.100s
DEBUG:implicit:Initialized factors in 1.9693121910095215
DEBUG:implicit:Running 15 ALS iterations
100%|████████████████████████████████████████████████████████████████████████████████| 15.0/15 [00:12<00:00,  1.25it/s]
100%|█████████████████████████████████████████████████████████████████████████| 457022/457022 [32:14<00:00, 236.22it/s]


In [28]:
print('Precision @10: %f' % precision)

Precision @10: 0.015661


### ROC AUC development

In [None]:
# def calc_mean_auc(model, train_user_items, test_user_items, K=10, show_progress=True):    
#     user_aucs = [] # An empty list to store the AUC for each user
    
#     for user in altered_users: # Iterate through each user that had an item altered
#         training_row = training_set[user,:].toarray().reshape(-1) # Get the training set row
#         zero_inds = np.where(training_row == 0) # Find where the interaction had not yet occurred
#         # Get the predicted values based on our user/item vectors
#         user_vec = predictions[0][user,:]
#         pred = user_vec.dot(item_vecs).toarray()[0,zero_inds].reshape(-1)
#         # Get only the items that were originally zero
#         # Select all ratings from the MF prediction for this user that originally had no iteraction
#         actual = test_set[user,:].toarray()[0,zero_inds].reshape(-1) 
#         # Select the binarized yes/no interaction pairs from the original full data
#         # that align with the same pairs in training 
#         pop = pop_items[zero_inds] # Get the item popularity for our chosen items
#         store_auc.append(auc_score(pred, actual)) # Calculate AUC for the given user and store
#         popularity_auc.append(auc_score(pop, actual)) # Calculate AUC using most popular and score
#     # End users iteration
    
#     return float('%.3f'%np.mean(store_auc)), float('%.3f'%np.mean(popularity_auc))  
#    # Return the mean AUC rounded to three decimal places for both test and popularity benchmark


In [None]:
# def make_train(ratings, pct_test = 0.2):
#     '''
#     This function will take in the original user-item matrix and "mask" a percentage of the original ratings where a
#     user-item interaction has taken place for use as a test set. The test set will contain all of the original ratings, 
#     while the training set replaces the specified percentage of them with a zero in the original ratings matrix. 
    
#     parameters: 
    
#     ratings - the original ratings matrix from which you want to generate a train/test set. Test is just a complete
#     copy of the original set. This is in the form of a sparse csr_matrix. 
    
#     pct_test - The percentage of user-item interactions where an interaction took place that you want to mask in the 
#     training set for later comparison to the test set, which contains all of the original ratings. 
    
#     returns:
    
#     training_set - The altered version of the original data with a certain percentage of the user-item pairs 
#     that originally had interaction set back to zero.
    
#     test_set - A copy of the original ratings matrix, unaltered, so it can be used to see how the rank order 
#     compares with the actual interactions.
    
#     user_inds - From the randomly selected user-item indices, which user rows were altered in the training data.
#     This will be necessary later when evaluating the performance via AUC.
#     '''
#     test_set = ratings.copy() # Make a copy of the original set to be the test set. 
#     test_set[test_set != 0] = 1 # Store the test set as a binary preference matrix
#     training_set = ratings.copy() # Make a copy of the original data we can alter as our training set. 
#     nonzero_inds = training_set.nonzero() # Find the indices in the ratings data where an interaction exists
#     nonzero_pairs = list(zip(nonzero_inds[0], nonzero_inds[1])) # Zip these pairs together of user,item index into list
#     random.seed(0) # Set the random seed to zero for reproducibility
#     num_samples = int(np.ceil(pct_test*len(nonzero_pairs))) # Round the number of samples needed to the nearest integer
#     samples = random.sample(nonzero_pairs, num_samples) # Sample a random number of user-item pairs without replacement
#     user_inds = [index[0] for index in samples] # Get the user row indices
#     item_inds = [index[1] for index in samples] # Get the item column indices
#     training_set[user_inds, item_inds] = 0 # Assign all of the randomly chosen user-item pairs to zero
#     training_set.eliminate_zeros() # Get rid of zeros in sparse array storage after update to save space
#     return training_set, test_set, list(set(user_inds)) # Output the unique list of user rows that were altered  

# This will return our training set, a test set that has been binarized to 0/1 for purchased/not 

In [None]:
from implicit.evaluation import train_test_split

coo_mat = utility_matrix.tocoo()
train, test = train_test_split(coo_mat)
train_user_items = train.T.tocsr()
test_user_items = test.T.tocsr()

In [None]:
def mean_roc_auc_at_k(model, train_user_items, test_user_items, K=10, show_progress=True):
    """Mean per-lender ROC AUC of the model's top-``K`` recommendations.

    For every lender (row of ``train_user_items``) the top ``K`` recommended
    loans are marked with 1 in an otherwise all-zero prediction vector, which
    is scored against that lender's row of ``test_user_items``.

    Lenders whose test row contains only one class (no held-out loan at all,
    or every loan held out) are skipped, because ROC AUC is undefined for them
    and ``roc_auc_score`` would raise.

    Args:
        model: fitted recommender exposing ``recommend(user, user_items, N=...)``.
        train_user_items: sparse lenders x loans matrix used for training.
        test_user_items: sparse lenders x loans matrix of held-out interactions.
        K: number of recommendations requested per lender.
        show_progress: whether to display a progress bar.

    Returns:
        The mean ROC AUC over all scorable lenders as a float, or ``nan`` when
        no lender could be scored.
    """
    auc_list = []
    lenders_count, loans_count = train_user_items.shape
    start = time.time()

    with tqdm.tqdm(total=lenders_count, disable=not show_progress) as progress:
        for lender_index in range(lenders_count):
            progress.update(1)

            # binarise the held-out interactions of this lender
            test_lender_row = test_user_items[lender_index, :].toarray()[0] != 0
            if not test_lender_row.any() or test_lender_row.all():
                # only one class present: AUC is undefined, skip this lender
                continue

            lender_row = np.zeros(loans_count)
            for loan_index, score in model.recommend(lender_index, train_user_items, N=K):
                lender_row[loan_index] = 1

            auc_list.append(roc_auc_score(test_lender_row, lender_row))
    logging.debug("generated mean ROC AUC in %0.2fs", time.time() - start)

    return float(np.mean(auc_list)) if auc_list else float("nan")

### Visualizations

In [None]:
# START_DATE = '2000-01-01'
# END_DATE = '2015-01-01'

# mask = (loans_table['raised_time'] > START_DATE) & (loans_table['raised_time'] <= END_DATE)
# plot_data = loans_table.loc[mask]

# Build a small derived frame holding only what the plot needs, instead of
# aliasing `loans_table`: the original frame keeps its columns and dtypes, so
# re-running earlier cells still sees the data exactly as it was loaded.
raised_time = pd.to_datetime(loans_table['raised_time'])
plot_data = pd.DataFrame({
    'date_month_year': raised_time.dt.to_period("M"),
    'loan_amount': loans_table['loan_amount'],
})

fig, ax = plt.subplots(figsize=(20, 10))
g1 = sns.pointplot(x='date_month_year', y='loan_amount',
                   data=plot_data, ax=ax)
ax.tick_params(axis='x', labelrotation=90)
ax.set_title("Mean Loan by Month Year", fontsize=15)
ax.set_xlabel("")
ax.set_ylabel("Loan Amount", fontsize=12)
plt.show()