In [1]:
import pandas as pd 
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse 
import heapq

In [2]:
data = pd.read_csv("./data_cleaned.csv")
# ultility matrix
ultil_matrix = data.groupby(["StockCode", "Description", "CustomerID"])["Quantity"].sum().reset_index()

In [3]:
ultil_matrix["product_id"] = pd.factorize(ultil_matrix["StockCode"])[0] 
ultil_matrix["customer_id"] = pd.factorize(ultil_matrix["CustomerID"])[0]

In [4]:
ultil_matrix.head()

Unnamed: 0,StockCode,Description,CustomerID,Quantity,product_id,customer_id
0,10002,INFLATABLE POLITICAL GLOBE,12451.0,12,0,0
1,10002,INFLATABLE POLITICAL GLOBE,12510.0,24,0,1
2,10002,INFLATABLE POLITICAL GLOBE,12637.0,12,0,2
3,10002,INFLATABLE POLITICAL GLOBE,12673.0,1,0,3
4,10002,INFLATABLE POLITICAL GLOBE,12681.0,12,0,4


In [5]:
Y_data = ultil_matrix[["customer_id", "product_id", "Quantity"]].to_numpy()
Y_data

array([[   0,    0,   12],
       [   1,    0,   24],
       [   2,    0,   12],
       ...,
       [ 612, 3530,   -1],
       [2110, 3530,   -1],
       [3429, 3530,   -1]], dtype=int64)

In [6]:
# create dict product_id with product name and customer_id with CustomerID
df_product = ultil_matrix[["product_id", "Description"]].drop_duplicates().set_index("product_id")
dict_product = df_product.to_dict()["Description"]

# dict customer
df_customer = ultil_matrix[["CustomerID", "customer_id"]].drop_duplicates().set_index("customer_id")
dict_customer = df_customer.to_dict()["CustomerID"]

print(len(dict_product), len(dict_customer))

3531 4235


In [7]:
class CF():
    """Collaborative Filtering class for user-user and item-item recommendations."""
    
    def __init__(self, Y_data, k, dist_func=cosine_similarity, uuCF=1):
        self.uuCF = uuCF  # user-user (1) or item-item (0) CF
        self.Y_data = Y_data if uuCF else Y_data[:, [1, 0, 2]]
        self.k = k
        self.dist_func = dist_func
        self.Ybar_data = None
        self.n_users = int(np.max(self.Y_data[:, 0])) + 1 
        self.n_items = int(np.max(self.Y_data[:, 1])) + 1
    
    
    def normalize_Y(self):
        users = self.Y_data[:, 0]  # all users - first col of the Y_data
        self.Ybar_data = self.Y_data.copy()
        self.mu = np.zeros((self.n_users,))

        for n in range(self.n_users):
            ids = np.where(users == n)[0].astype(np.int32)
            ordered = self.Y_data[ids, 2]

            if ordered.size > 0:  # Check if there are ordered
                m = np.mean(ordered)
            else:
                m = 0  # Default to 0 if no ordered

            self.mu[n] = m
            self.Ybar_data[ids, 2] = ordered - self.mu[n]

        self.Ybar = sparse.coo_matrix((self.Ybar_data[:, 2],
                                        (self.Ybar_data[:, 1], self.Ybar_data[:, 0])), 
                                        (self.n_items, self.n_users)).tocsr()

    def similarity(self):
        eps = 1e-6
        self.S = self.dist_func(self.Ybar.T, self.Ybar.T)
    
    def refresh(self):
        """Normalize data and calculate similarity matrix again."""
        self.normalize_Y()
        self.similarity() 
        
    def fit(self):
        self.refresh()
        
    def __pred(self, u, i, normalized=1):
        """Predict the rating of user u for item i (normalized)."""
        ids = np.where(self.Y_data[:, 1] == i)[0].astype(np.int32)
        users_rated_i = (self.Y_data[ids, 0]).astype(np.int32)
        sim = self.S[u, users_rated_i]
        a = np.argsort(sim)[-self.k:] 
        nearest_s = sim[a]
        r = self.Ybar[i, users_rated_i[a]]
        
        if normalized:
            return (r * nearest_s)[0] / (np.abs(nearest_s).sum() + 1e-8)
        return (r * nearest_s)[0] / (np.abs(nearest_s).sum() + 1e-8) + self.mu[u]
    
            
    def recommend(self, u):
        """Determine all items that should be recommended for user u."""
        ids = np.where(self.Y_data[:, 0] == u)[0]
        items_rated_by_u = self.Y_data[ids, 1].tolist()              
        list_recommended_items = []
        
        for i in range(self.n_items):
            # print(self.n_items)
            if i not in items_rated_by_u:
                rating = self.__pred(u, i)
                if rating > 0: 
                    list_recommended_items.append((i, rating))
        
        top_5_largest = heapq.nlargest(5, list_recommended_items, key=lambda x: x[1])
        top_5_items = [item[0] for item in top_5_largest]
        return top_5_items 

    def print_recommendation(self):
        
        """Print all items which should be recommended for each user."""
        print('Recommendation: ')
        for i, u in enumerate(range(self.n_users)):
            recommended_items = self.recommend(u)
            if self.uuCF:
                print('Top 5 items for user:', dict_customer[u], [dict_product[item] for item in recommended_items])
            else: 
                print('Top 5 users for item:', dict_product[u], [dict_customer[item] for item in recommended_items])
            if i >= 5:
                break

In [8]:
rs = CF(Y_data, k = 30, uuCF = 1)
rs.fit()

In [9]:
rs.print_recommendation()

Recommendation: 
Top 5 item(s) for user: 12451.0 ['CREAM SWEETHEART MINI CHEST', 'ASSORTED COLOUR LIZARD SUCTION HOOK', 'LARGE ROUND CUTGLASS CANDLESTICK', 'FAMILY PHOTO FRAME CORNICE', 'DISCO BALL CHRISTMAS DECORATION']
Top 5 item(s) for user: 12510.0 ['BEADED CRYSTAL HEART GREEN ON STICK', 'BEADED CRYSTAL HEART PINK ON STICK', 'SKULLS AND CROSSBONES WRAP', 'WOOLLY HAT SOCK GLOVE ADVENT STRING', 'SPACEBOY GIFT WRAP']
Top 5 item(s) for user: 12637.0 ['BEADED CRYSTAL HEART GREEN ON STICK', 'Discount', 'MERCHANT CHANDLER CREDIT ERROR, STO', 'SKULLS AND CROSSBONES WRAP', 'RED TOADSTOOL LED NIGHT LIGHT']
Top 5 item(s) for user: 12673.0 ['BEADED CRYSTAL HEART GREEN ON STICK', 'ASSORTED COLOUR LIZARD SUCTION HOOK', 'WOOLLY HAT SOCK GLOVE ADVENT STRING', 'MERCHANT CHANDLER CREDIT ERROR, STO', 'DISCO BALL CHRISTMAS DECORATION']
Top 5 item(s) for user: 12681.0 ['BEADED CRYSTAL HEART GREEN ON STICK', 'CREAM SWEETHEART MINI CHEST', 'ASSORTED COLOUR LIZARD SUCTION HOOK', 'PACK OF 60 PINK PAISLEY C

In [10]:
rs = CF(Y_data, k = 30, uuCF = 0)
rs.fit()

rs.print_recommendation()

Recommendation: 
Top 5 users for item: INFLATABLE POLITICAL GLOBE  [13743.0, 14434.0, 16989.0, 14051.0, 16672.0]
Top 5 users for item: GROOVY CACTUS INFLATABLE [15061.0, 14434.0, 16672.0, 13767.0, 13743.0]
Top 5 users for item: DOGGY RUBBER [13743.0, 13767.0, 15078.0, 14051.0, 16681.0]
Top 5 users for item: HEARTS WRAPPING TAPE  [17045.0, 17980.0, 18220.0, 14960.0, 16018.0]
Top 5 users for item: SPOTS ON RED BOOKCOVER TAPE [15061.0, 18229.0, 18139.0, 16401.0, 13767.0]
Top 5 users for item: ARMY CAMO BOOKCOVER TAPE []
