1. Tạo ma trận tương quan giữa từng user - item
2. Chuẩn hoá ma trận dựa trên số min-max của từng user (item), điền những giá trị còn trống
3. Tạo similarity matrix dùng Cosine Similarity, thể hiện độ tương đồng giữa từng user (item) với nhau 
4. Dự báo về mức độ quan tâm của user với item hoặc item cho user và đề xuất

In [1]:
import pandas as pd 
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse 
import heapq

In [2]:
data = pd.read_csv("./data_cleaned.csv")
# ultility matrix
util_matrix = data.groupby(["StockCode", "Description", "CustomerID"])["Quantity"].sum().reset_index()

In [3]:
util_matrix["product_id"] = pd.factorize(util_matrix["StockCode"])[0] 
util_matrix["customer_id"] = pd.factorize(util_matrix["CustomerID"])[0]

In [4]:
util_matrix.head()

Unnamed: 0,StockCode,Description,CustomerID,Quantity,product_id,customer_id
0,10002,INFLATABLE POLITICAL GLOBE,12451.0,12,0,0
1,10002,INFLATABLE POLITICAL GLOBE,12510.0,24,0,1
2,10002,INFLATABLE POLITICAL GLOBE,12637.0,12,0,2
3,10002,INFLATABLE POLITICAL GLOBE,12673.0,1,0,3
4,10002,INFLATABLE POLITICAL GLOBE,12681.0,12,0,4


In [5]:
Y_data = util_matrix[["customer_id", "product_id", "Quantity"]].to_numpy()
Y_data

array([[   0,    0,   12],
       [   1,    0,   24],
       [   2,    0,   12],
       ...,
       [ 612, 3530,   -1],
       [2110, 3530,   -1],
       [3429, 3530,   -1]], dtype=int64)

In [6]:
# create dict product_id with product name and customer_id with CustomerID
df_product = util_matrix[["product_id", "Description"]].drop_duplicates().set_index("product_id")
dict_product = df_product.to_dict()["Description"]

# dict customer
df_customer = util_matrix[["CustomerID", "customer_id"]].drop_duplicates().set_index("customer_id")
dict_customer = df_customer.to_dict()["CustomerID"]

print("Number of products: {} \nNumber of users: {}".format(len(dict_product), len(dict_customer)))

Number of products: 3531 
Number of users: 4235


In [7]:
# idea get from this blog: https://machinelearningcoban.com/2017/05/24/collaborativefiltering/
class CF():
    """Collaborative Filtering class for user-user and item-item recommendations."""
    
    def __init__(self, Y_data, k, dist_func=cosine_similarity, uuCF=1):
        self.uuCF = uuCF  # user-user (1) or item-item (0) CF
        self.Y_data = Y_data if uuCF else Y_data[:, [1, 0, 2]]
        self.k = k
        self.dist_func = dist_func
        self.Ybar_data = None
        self.n_users = int(np.max(self.Y_data[:, 0])) + 1 
        self.n_items = int(np.max(self.Y_data[:, 1])) + 1
    
    
    def normalize_Y(self):
        users = self.Y_data[:, 0]  # all users - first col of the Y_data
        self.Ybar_data = self.Y_data.copy()
        self.mu = np.zeros((self.n_users,))

        for n in range(self.n_users):
            ids = np.where(users == n)[0].astype(np.int32)
            ordered = self.Y_data[ids, 2]

            if ordered.size > 0:  # Check if there are ordered
                m = np.mean(ordered)
            else:
                m = 0  # Default to 0 if no ordered

            self.mu[n] = m
            self.Ybar_data[ids, 2] = ordered - self.mu[n]

        self.Ybar = sparse.coo_matrix((self.Ybar_data[:, 2],
                                        (self.Ybar_data[:, 1], self.Ybar_data[:, 0])), 
                                        (self.n_items, self.n_users)).tocsr()

    def similarity(self):
        eps = 1e-6
        self.S = self.dist_func(self.Ybar.T, self.Ybar.T)
    
    def refresh(self):
        """Normalize data and calculate similarity matrix again."""
        self.normalize_Y()
        self.similarity() 
        
    def fit(self):
        self.refresh()
        
    def __pred(self, u, i, normalized=1):
        """Predict the rating of user u for item i (normalized)."""
        ids = np.where(self.Y_data[:, 1] == i)[0].astype(np.int32)
        users_rated_i = (self.Y_data[ids, 0]).astype(np.int32)
        sim = self.S[u, users_rated_i]
        a = np.argsort(sim)[-self.k:] 
        nearest_s = sim[a]
        r = self.Ybar[i, users_rated_i[a]]
        
        if normalized:
            return (r * nearest_s)[0] / (np.abs(nearest_s).sum() + 1e-8)
        return (r * nearest_s)[0] / (np.abs(nearest_s).sum() + 1e-8) + self.mu[u]
    
            
    def recommend(self, u):
        """Determine all items that should be recommended for user u."""
        ids = np.where(self.Y_data[:, 0] == u)[0]
        items_rated_by_u = self.Y_data[ids, 1].tolist()              
        list_recommended_items = []
        
        for i in range(self.n_items):
            if i not in items_rated_by_u:
                rating = self.__pred(u, i)
                if rating > 0: 
                    list_recommended_items.append((i, rating))
        
        top_3_largest = heapq.nlargest(3, list_recommended_items, key=lambda x: x[1])
        top_3_largest = [item[0] for item in top_3_largest]   
        return top_3_largest 

    
    def export_recommendation(self):
        
        """Suggest top 3 items which should be recommended for each user."""
        print('Recommendation: ')
        list_uu = []
        for i, u in enumerate(range(self.n_users)):
            if i == 5: # run for first 5 items
                break
                
            recommended_items = self.recommend(u)
            if self.uuCF:
                list_uu.append({
                    "user": dict_customer[u],
                    "product_suggest_1": dict_product[recommended_items[0]] if len(recommended_items) > 0 else None,
                    "product_suggest_2": dict_product[recommended_items[1]] if len(recommended_items) > 1 else None, 
                    "product_suggest_3": dict_product[recommended_items[2]] if len(recommended_items) > 2 else None
                })
                
            else:      
                list_uu.append({
                    "product_name": dict_product[u],
                    "user_suggest_1": dict_customer[recommended_items[0]] if len(recommended_items) > 0 else None,
                    "user_suggest_2": dict_customer[recommended_items[1]] if len(recommended_items) > 1 else None,
                    "user_suggest_3": dict_customer[recommended_items[2]] if len(recommended_items) > 2 else None
                })
                
        return pd.DataFrame(list_uu)

In [8]:
rs = CF(Y_data, k = 30, uuCF = 1)
rs.fit()

rs.export_recommendation()

Recommendation: 


Unnamed: 0,user,product_suggest_1,product_suggest_2,product_suggest_3
0,12451.0,CREAM SWEETHEART MINI CHEST,ASSORTED COLOUR LIZARD SUCTION HOOK,LARGE ROUND CUTGLASS CANDLESTICK
1,12510.0,BEADED CRYSTAL HEART GREEN ON STICK,BEADED CRYSTAL HEART PINK ON STICK,SKULLS AND CROSSBONES WRAP
2,12637.0,BEADED CRYSTAL HEART GREEN ON STICK,Discount,"MERCHANT CHANDLER CREDIT ERROR, STO"
3,12673.0,BEADED CRYSTAL HEART GREEN ON STICK,ASSORTED COLOUR LIZARD SUCTION HOOK,WOOLLY HAT SOCK GLOVE ADVENT STRING
4,12681.0,BEADED CRYSTAL HEART GREEN ON STICK,CREAM SWEETHEART MINI CHEST,ASSORTED COLOUR LIZARD SUCTION HOOK


In [9]:
rs = CF(Y_data, k = 30, uuCF = 0)
rs.fit()

rs.export_recommendation()

Recommendation: 


Unnamed: 0,product_name,user_suggest_1,user_suggest_2,user_suggest_3
0,INFLATABLE POLITICAL GLOBE,13743.0,14434.0,16989.0
1,GROOVY CACTUS INFLATABLE,15061.0,14434.0,16672.0
2,DOGGY RUBBER,13743.0,13767.0,15078.0
3,HEARTS WRAPPING TAPE,17045.0,17980.0,18220.0
4,SPOTS ON RED BOOKCOVER TAPE,15061.0,18229.0,18139.0
