# Collaborative filtering for beer recommendations

In [None]:
# importing the required libraries

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# NOTE(review): machine-specific absolute path. The CSV files further down are
# opened with relative names, so they are resolved against this directory.
os.chdir("C:/Users/Rishi's pet dragon/Desktop/6740/Project")

In [None]:
# Read the train and test data sets from the current working directory.
# NOTE(review): `result` is not defined anywhere in the visible source, so this
# cell raises NameError when run on its own. Presumably it was the output of an
# encoding-detection step in a removed cell - TODO confirm and restore it.

train_df = pd.read_csv('train.csv', encoding=result['encoding'])
test_df = pd.read_csv('test.csv' , encoding = result['encoding'])

In [None]:
# Build the dense training rating matrix: one row per user, one column per bottle.

# The "- 1" below treats labels as 1-based, so the largest label is used as the
# matrix dimension.
n_users = max(train_df['user_label'].unique())
n_items = max(train_df['bottle_label'].unique())

A = np.zeros(shape=(n_users, n_items))
for record in train_df.itertuples():
    # Position 0 of each tuple is the DataFrame index. Positions 6, 7 and 8 are
    # presumably user_label, bottle_label and the rating - TODO confirm against
    # the CSV column order.
    user, bottle, rating = record[6], record[7], record[8]
    A[int(user) - 1, int(bottle) - 1] = int(rating)

In [None]:
# Display the dimensions of the training matrix built as (n_users, n_items).
A.shape

In [None]:
# Similarly, build the dense test rating matrix (rows: users, columns: bottles).
# NOTE(review): the dimensions come from the test split alone, so A_t can have a
# different shape than A. knn_mae indexes A and the similarity matrix with the
# row/column positions of A_t, which fails if the test split contains a label
# larger than any seen in training.
n_users = max(test_df['user_label'].unique())
n_items = max(test_df['bottle_label'].unique())

A_t = np.zeros(shape=(n_users, n_items))
for record in test_df.itertuples():
    # Position 0 of each tuple is the DataFrame index. Positions 6, 7 and 8 are
    # presumably user_label, bottle_label and the rating - TODO confirm against
    # the CSV column order.
    user, bottle, rating = record[6], record[7], record[8]
    A_t[int(user) - 1, int(bottle) - 1] = int(rating)


In [None]:
# Build the bottle-bottle similarity matrices (adjusted cosine and Pearson
# correlation) from the training matrix A.

from scipy import spatial

# A zero in A means "not rated".
rated = A != 0

# Mean rating of every user over the bottles they actually rated. A user with
# no ratings gets NaN (0/0); such users never appear in a co-rating mask below,
# so the NaN is never read there and the warning is silenced.
with np.errstate(divide='ignore', invalid='ignore'):
    users_avg = np.sum(A, axis=1) / np.sum(rated, axis=1)

n_bottles = A.shape[1]

# Start from zeros: pairs of bottles without a common rater keep similarity 0.
sim_cos = np.zeros((n_bottles, n_bottles))
sim_corr = np.zeros((n_bottles, n_bottles))

# Only the upper triangle is computed because both matrices are symmetric.
for row in range(n_bottles - 1):
    for col in range(row + 1, n_bottles):

        # users who rated both bottles of the (row, col) pair
        common = rated[:, row] & rated[:, col]
        if not common.any():
            continue

        # centre each rating on the rating user's own mean
        ru1 = A[common, row] - users_avg[common]
        ru2 = A[common, col] - users_avg[common]
        sim_cos[row, col] = 1 - spatial.distance.cosine(ru1, ru2)
        sim_corr[row, col] = np.corrcoef(ru1, ru2)[0, 1]

# Indices of the strict lower triangle, used to mirror the upper triangle.
i_lower = np.tril_indices(n_bottles, -1)

for _sim_matrix in (sim_cos, sim_corr):
    # NaN results (produced, for example, when a centred rating vector has zero
    # norm or zero variance) are mapped to a similarity of 1.
    _sim_matrix[np.isnan(_sim_matrix)] = 1
    # Every bottle is fully similar to itself. The original code filled the
    # diagonal of sim_cos twice and never that of sim_corr.
    np.fill_diagonal(_sim_matrix, 1)
    # setting the lower triangle to the values in the upper triangle
    _sim_matrix[i_lower] = _sim_matrix.T[i_lower]

In [None]:
# Performing k nearest neighbour to make the final predictions

def knn_mae(k, A, A_t, sim):
    """Return the mean absolute error of item-based k-NN rating predictions.

    For every non-zero entry (u, i) of ``A_t`` the rating is predicted as the
    similarity-weighted average of user u's ratings in ``A`` on the (at most)
    ``k`` items most similar to item i, item i itself excluded. When that
    average is undefined (no other rated item, or the absolute similarities
    sum to zero or NaN) the prediction falls back to the mean of the user's
    non-zero ratings in ``A``, or to 0 when the user has none.

    Parameters
    ----------
    k : int
        Maximum number of neighbouring items used per prediction.
    A : numpy.ndarray
        Training user x item rating matrix; 0 marks a missing rating.
    A_t : numpy.ndarray
        Held-out user x item rating matrix; 0 marks a missing rating.
    sim : numpy.ndarray
        Item x item similarity matrix.

    Returns
    -------
    float
        Mean of ``|actual - predicted|`` over all non-zero entries of ``A_t``,
        including entries that were predicted exactly. NaN when ``A_t`` has
        no non-zero entry.
    """
    uu, ii = np.nonzero(A_t)
    if uu.size == 0:
        return np.nan

    predictions = np.zeros(uu.size)

    for n, (u, i) in enumerate(zip(uu, ii)):
        user_ratings = A[u, :]

        # items the user rated in training, other than the target item
        x = np.nonzero(user_ratings)[0]
        x = x[x != i]

        # rank those items by similarity to the target, most similar first
        neighbour_sims = sim[i, x]
        top = np.argsort(neighbour_sims)[::-1][:k]
        k_i = x[top]
        k_dist = neighbour_sims[top]

        weight = np.sum(np.absolute(k_dist))
        if weight > 0:
            prediction = np.sum(user_ratings[k_i] * k_dist) / weight
        else:
            prediction = np.nan

        if np.isnan(prediction):
            # no usable neighbour: fall back to the user's mean training rating
            known = user_ratings[user_ratings != 0]
            prediction = known.mean() if known.size else 0.0

        predictions[n] = prediction

    # Average over every held-out rating. Filtering on "error != 0" would also
    # drop exactly-correct predictions and overstate the error.
    return np.mean(np.absolute(A_t[uu, ii] - predictions))

In [None]:
# Grid search over the neighbourhood size k: one knn_mae evaluation per
# candidate value and per similarity matrix.

k = [1, 5, 10, 20, 30, 40, 50]

# MAE per candidate k with the cosine similarity matrix
mae_cos = np.zeros(len(k))
for position, neighbours in enumerate(k):
    mae_cos[position] = knn_mae(neighbours, A, A_t, sim_cos)

# MAE per candidate k with the pearson correlation similarity matrix
mae_corr = np.zeros(len(k))
for position, neighbours in enumerate(k):
    mae_corr[position] = knn_mae(neighbours, A, A_t, sim_corr)

In [None]:
# creating the result plots

import matplotlib.pyplot as plt

# Neighbourhood sizes used as x-axis labels; same values as the grid-search list.
K = [1, 5, 10, 20, 30, 40, 50]

# Hard-coded MAE values, one per entry of K.
# NOTE(review): presumably pasted from earlier runs (the CF series from the grid
# search, the LSI/LDA series from other experiments), with "corr" meaning
# Pearson correlation and "con" meaning cosine - TODO confirm.
MAE_corr_CF = [
    1.42829121, 0.8954746, 0.89530384, 0.98537369,
    1.0760362, 1.17499251, 1.24868311,
]
MAE_con_CF = [
    1.44819743, 0.92803631, 0.93786554, 1.06331131,
    1.19974013, 1.34069922, 1.44821194,
]
MAE_con_lsi = [0.5504, 0.4611, 0.4486, 0.4441, 0.4437, 0.4442, 0.4449]
MAE_con_lda = [0.5904, 0.4870, 0.4705, 0.4632, 0.4610, 0.4606, 0.4603]
MAE_corr_lsi = [0.5501, 0.4611, 0.4484, 0.4439, 0.4435, 0.4441, 0.4447]
MAE_corr_lda = [0.5904, 0.4886, 0.4724, 0.4680, 0.4655, 0.4656, 0.4689]

In [None]:
def plotting(lsi, lda, CF, K, width=8, height=6):
    """Plot the MAE curves of the LSI, LDA and CF models on one figure.

    Parameters
    ----------
    lsi, lda, CF : sequence of float
        MAE values of each model; each series is drawn against its own
        positional index (0, 1, 2, ...).
    K : sequence
        Labels shown on the x axis, one per position. The number of ticks now
        follows ``len(K)`` instead of being fixed at 7.
    width, height : float
        Figure size passed to ``plt.figure(figsize=...)``.
    """
    plt.figure(figsize=(width, height))

    # Draw order must match the legend labels below.
    for series, colour in ((lsi, "navy"), (lda, "deepskyblue"), (CF, "blue")):
        plt.plot(series, color=colour, lw=3)

    plt.legend(['LSI', 'LDA', 'CF'], loc='upper right')
    plt.xlabel('K', fontsize=12)
    plt.ylabel('MAE', fontsize=12)
    # one tick per entry of K, labelled with the corresponding value
    plt.xticks(range(len(K)), K, fontsize=12)
    plt.yticks(fontsize=12)
    plt.grid()
    plt.show()

In [None]:
# Compare the three MAE_corr_* series (LSI, LDA, CF) across K.
plotting(lsi=MAE_corr_lsi, lda=MAE_corr_lda, CF=MAE_corr_CF, K=K, width=8, height=6)

In [None]:
# Compare the three MAE_con_* series (LSI, LDA, CF) across K.
plotting(lsi=MAE_con_lsi, lda=MAE_con_lda, CF=MAE_con_CF, K=K, width=8, height=6)