# This notebook implements two kinds of collaborative filtering:
## 1.user-to-user: (1) cosine similarity, (2) KNN
## 2.item-to-item: cosine

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import random

In [2]:
# Load the raw dataset; the file is decoded with the 'unicode_escape' codec.
# Later cells rely on its CustomerID, Description and Quantity columns.
df = pd.read_csv('../data/x.csv',encoding= 'unicode_escape')

FileNotFoundError: [Errno 2] File b'../data/xyz.csv' does not exist: b'../data/xyz.csv'

In [None]:
# Preview the first rows of the raw frame.
df.head()

In [None]:
# Number of missing values per column, largest count first.
df.isnull().sum().sort_values(ascending=False)

In [None]:
# Keep only the rows that have no missing value in any column.
df_new = df.dropna()

In [None]:
# Summary statistics after dropping incomplete rows.
df_new.describe()

# Quantity contains some negative values. These are incorrect data, so we drop those rows.

In [None]:
# Retain only the rows whose Quantity is strictly positive.
df_new = df_new.loc[df_new['Quantity'] > 0]

In [None]:
# Summary statistics again, now that non-positive quantities are removed.
df_new.describe()

## User-to-User Collaborative Filtering

In [None]:
# Build a customer x item table: one row per CustomerID, one column per
# Description, each cell holding the total Quantity bought (0 when the
# customer never bought that item).

purchase = df_new.groupby(['CustomerID', 'Description'])['Quantity'].sum()
purchase = purchase.unstack().reset_index().fillna(0)
purchase = purchase.set_index('CustomerID')

In [None]:
# Preview the first 30 customers of the quantity table.
purchase.head(30)

In [None]:
# The table holds the quantity ordered (e.g. 48, 24, 126), but we only need to know whether an item was purchased at all,
# so we encode each cell as 1 (purchased) or 0 (not purchased).

def encode_units(x):
    """Binarise a purchased quantity.

    Args:
        x: Quantity bought (any value comparable with ``1``).

    Returns:
        ``1`` when ``x >= 1``, otherwise ``0``. Values for which the
        comparison is false in both directions (e.g. NaN) map to ``0``
        instead of falling through and returning ``None``.
    """
    return 1 if x >= 1 else 0


# Apply encode_units to every cell, turning quantities into 0/1 flags.
purchase = purchase.applymap(encode_units)

In [None]:
# Preview the first 30 customers of the encoded table.
purchase.head(30)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Pairwise cosine similarity between the rows of `purchase` (one row per customer).
user_similarity = cosine_similarity(purchase)

In [None]:
# Label both axes of the similarity matrix with the customer ids.
user_similarity_df = pd.DataFrame(user_similarity,index=purchase.index,columns=purchase.index)

In [None]:
# Display the customer-by-customer similarity table.
user_similarity_df

In [None]:
def similar_users(user_id,k=5):
    """Return the ids of the ``k`` users most similar to ``user_id``.

    Every user is represented by their row of ``user_similarity_df``. The
    target row is compared with each other user's row using cosine
    similarity, and the ids with the highest scores are returned.

    Args:
        user_id: Index label of the target user in ``user_similarity_df``.
        k: Number of similar users to return.

    Returns:
        A list of at most ``k`` user ids, most similar first.
    """
    # Row of the similarity table belonging to the requested user.
    user = user_similarity_df[user_similarity_df.index == user_id]

    # Rows of every other user.
    other_users = user_similarity_df[user_similarity_df.index != user_id]

    # NOTE(review): the rows compared here are themselves similarity scores,
    # so this is a similarity of similarity profiles - confirm that is intended.
    similarities = cosine_similarity(user, other_users)[0].tolist()
    indices = other_users.index.tolist()

    # Rank on the score (second element of each pair). Sorting the bare
    # (id, score) pairs would order them by id and ignore the similarity.
    ranked = sorted(zip(indices, similarities), key=lambda pair: pair[1], reverse=True)

    # Keep the ids of the k best-scoring users.
    return [uid for uid, _ in ranked[:k]]

In [None]:
# Example: the users most similar to customer 12347 (default k=5).
simu = similar_users(12347)

simu
# The similar users can be kept in a list so that the items they purchased can be displayed later.

In [None]:
def simu_recommendation(userid, n_recs=10):
    """Recommend random items bought by the users most similar to ``userid``.

    Args:
        userid: Customer id passed on to ``similar_users``.
        n_recs: Maximum number of recommendations to return (default 10).

    Returns:
        A list of up to ``n_recs`` distinct item descriptions, drawn at
        random from everything the similar users bought. Fewer items are
        returned when the similar users bought fewer distinct items.
    """
    simu = similar_users(userid)

    # Every description bought by each similar user, in neighbour order.
    bought = [
        desc
        for neighbour in simu
        for desc in df_new[df_new["CustomerID"] == neighbour]['Description'].to_list()
    ]

    # Drop repeats while keeping first-seen order.
    final_list = list(dict.fromkeys(bought))

    # random.sample raises ValueError when asked for more items than exist,
    # so never request more than the number of candidates.
    recs = random.sample(final_list, min(n_recs, len(final_list)))

    print('Items bought by Similar users based on Cosine Similarity')

    return recs

In [None]:
# Example: recommendations for customer 12347.
simu_recommendation(12347)

## User-to-User Collaborative Filtering using KNN

In [None]:
# To pass our sparse purchase table into KNN we first convert it to CSR format.
# CSR stores a sparse matrix as 3 arrays: the values, the extent of each row, and the column indices.

from scipy.sparse import csr_matrix

purchase_matrix = csr_matrix(purchase.values)

from sklearn.neighbors import NearestNeighbors


# Brute-force neighbour search using the cosine metric, fitted on the
# rows of `purchase` (one row per customer).
model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
model_knn.fit(purchase_matrix)

In [None]:
# Module-level list that similar_users_knn appends neighbour ids to.
simu_knn = []

In [None]:
def similar_users_knn(purchase,query_index, k=5):
    """Print and collect the ``k`` nearest neighbours of one row of ``purchase``.

    Args:
        purchase: Table whose rows are queried against ``model_knn``.
            Assumed to be the same table the model was fitted on, so that
            the positions returned by the model index ``purchase.index`` -
            TODO confirm for other callers.
        query_index: Row *position* (not index label) of the query user.
        k: Number of neighbours to report (default 5).

    Returns:
        The list of neighbour ids found by this call. The same ids are also
        appended to the module-level ``simu_knn`` list, which is not cleared
        between calls.
    """
    query_vector = purchase.iloc[query_index,:].values.reshape(1, -1)

    # Ask for one extra neighbour because the query row is normally part of
    # its own result set.
    distances, indices = model_knn.kneighbors(query_vector, n_neighbors = k + 1)
    distances = distances.flatten()
    indices = indices.flatten()

    print('Recommendations for {0}:\n'.format(purchase.index[query_index]))

    # Drop the query row by position rather than assuming it is the first
    # hit: other rows at distance 0 can be listed ahead of it.
    neighbours = [
        (pos, dist) for pos, dist in zip(indices, distances) if pos != query_index
    ][:k]

    found = []
    for rank, (pos, dist) in enumerate(neighbours, start=1):
        neighbour_id = purchase.index[pos]
        print('{0}: {1}, with distance of {2}:'.format(rank, neighbour_id, dist))
        simu_knn.append(neighbour_id)
        found.append(neighbour_id)
    return found

In [None]:
# Example: neighbours of the row at position 1497 (a row position, not a CustomerID).
similar_users_knn(purchase,1497)

In [None]:
# Show the neighbour ids collected so far.
simu_knn

In [None]:
def simu_recommendation_knn(simu_knn, n_recs=10):
    """Recommend random items bought by the given similar users.

    Args:
        simu_knn: Iterable of customer ids whose purchases form the
            candidate pool.
        n_recs: Maximum number of recommendations to return (default 10).

    Returns:
        A list of up to ``n_recs`` distinct item descriptions, drawn at
        random from everything those users bought. Fewer items are returned
        when the pool holds fewer distinct items.
    """
    # Every description bought by each similar user, in the given order.
    bought = [
        desc
        for neighbour in simu_knn
        for desc in df_new[df_new["CustomerID"] == neighbour]['Description'].to_list()
    ]

    # Drop repeats while keeping first-seen order.
    final_list = list(dict.fromkeys(bought))

    # random.sample raises ValueError when asked for more items than exist,
    # so never request more than the number of candidates.
    recs = random.sample(final_list, min(n_recs, len(final_list)))

    print('Items bought by Similar users based on KNN')

    return recs

In [None]:
# Example: recommendations drawn from the collected KNN neighbours.
simu_recommendation_knn(simu_knn)

## Item-to-Item Collaborative Filtering

In [None]:
# Build an item x customer table: one row per Description, one column per
# CustomerID, each cell holding the total Quantity bought (0 when that
# customer never bought the item).

items_purchase = df_new.groupby(['Description','CustomerID'])['Quantity'].sum()
items_purchase = items_purchase.unstack().reset_index().fillna(0)
items_purchase = items_purchase.set_index('Description')

In [None]:
# Preview the first 30 items of the quantity table.
items_purchase.head(30)

In [None]:
# Apply encode_units to every cell, turning quantities into 0/1 flags.
items_purchase = items_purchase.applymap(encode_units)

In [None]:
# Pairwise cosine similarity between the rows of `items_purchase` (one row per item).
item_similarity = cosine_similarity(items_purchase)

In [None]:
# Label both axes of the similarity matrix with the item descriptions.
item_similarity_df = pd.DataFrame(item_similarity,index=items_purchase.index,columns=items_purchase.index)

In [None]:
# Preview the first 10 rows of the item-by-item similarity table.
item_similarity_df.head(10)

In [None]:
def similar_items(item,k=10):
    """Return the ``k`` items most similar to ``item``.

    Every item is represented by its row of ``item_similarity_df``. The
    selected row is compared with each other item's row using cosine
    similarity, and the names with the highest scores are returned.

    Args:
        item: Index label (description) of the selected item.
        k: Number of similar items to return.

    Returns:
        A list of at most ``k`` item descriptions, most similar first. The
        selected item itself is never included.
    """
    # Row of the similarity table belonging to the selected item.
    item_row = item_similarity_df[item_similarity_df.index == item]

    # Rows of every other item; the selected item is left out so that it
    # cannot be returned as its own best match.
    other_items = item_similarity_df[item_similarity_df.index != item]

    # NOTE(review): the rows compared here are themselves similarity scores,
    # so this is a similarity of similarity profiles - confirm that is intended.
    similarities = cosine_similarity(item_row, other_items)[0].tolist()
    indices = other_items.index.tolist()

    # Rank on the score, highest first. Sorting the bare (name, score) pairs
    # would order them alphabetically by name and ignore the similarity.
    ranked = sorted(zip(indices, similarities), key=lambda pair: pair[1], reverse=True)

    # Keep the names of the k best-scoring items.
    items = [name for name, _ in ranked[:k]]

    print('Similar items based on purchase behaviour (item-to-item collaborative filtering)')
    return items

In [None]:
# Example: items similar to one product description (note the leading space in the label).
similar_items(' 4 PURPLE FLOCK DINNER CANDLES')