In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.decomposition import NMF
from sklearn.metrics import precision_score, ndcg_score, f1_score, accuracy_score
from sklearn.metrics import average_precision_score
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import mean_absolute_error as mae

In [2]:
# read the data for both training and testing
df = pd.read_csv('Dataset/u.data',sep='\t',names=["user id", "item id", "rating", "timestamp"])
print("the whole data has {} users and {} items with {} rates".format(len(df['user id'].unique()), len(df['item id'].unique()), len(df)))

#read the training data
df_train = pd.read_csv('Dataset/ua.base',sep='\t',names=["user id", "item id", "rating", "timestamp"])
print("number of training data samples: ", len(df_train))
print('number of users: ', len(df_train['user id'].unique()))
print("number of items: ", len(df_train['item id'].unique()))

#read the testing data
df_test = pd.read_csv('Dataset/ua.test',sep='\t',names=["user id", "item id", "rating", "timestamp"])
print("number of training data samples: ", len(df_test))
print('number of users: ', len(df_test['user id'].unique()))
print("number of items: ", len(df_test['item id'].unique()))

#constructing the matrix T for training data
T_train = np.zeros((len(df['user id'].unique()), len(df['item id'].unique())))
print("The matrix T(train) is getting made\nthe shape of T is ",T_train.shape)
for i in tqdm(range(len(df_train))):
    T_train[df_train['user id'].iloc[i]-1, df_train['item id'].iloc[i]-1] = df_train['rating'].iloc[i]

#constructing the matrix T for testing data
T_test = np.zeros((len(df['user id'].unique()), len(df['item id'].unique())))
print("The matrix T(test) is getting made\nthe shape of T is ",T_test.shape)
for i in tqdm(range(len(df_test))):
    T_test[df_test['user id'].iloc[i]-1, df_test['item id'].iloc[i]-1] = df_test['rating'].iloc[i]

print("matrix T(train) is ",T_train.shape)

the whole data has 943 users and 1682 items with 100000 rates
number of training data samples:  90570
number of users:  943
number of items:  1680
number of training data samples:  9430
number of users:  943
number of items:  1129
The matrix T(train) is getting made
the shape of T is  (943, 1682)


100%|█████████████████████████████████████████████████████████████████████████| 90570/90570 [00:03<00:00, 28866.39it/s]


The matrix T(test) is getting made
the shape of T is  (943, 1682)


100%|███████████████████████████████████████████████████████████████████████████| 9430/9430 [00:00<00:00, 28612.15it/s]


matrix T(train) is  (943, 1682)


In [3]:
#perform NMF
k = 5
print("The NMF is running on {} clusters".format(k))
model = NMF(n_components=k)
V = model.fit_transform(T_train)
H = model.components_
print("The NMF is done!\nV.shape = {} H.shape = {}".format(V.shape, H.shape))

#construct the matrix P
P = np.concatenate((V, H.T), axis=0)
print(P.shape)

#normalize the rows of P to make U
sum_of_rows = P.sum(axis=1)
U = P / sum_of_rows[:, np.newaxis]
U[np.isnan(U)] = 0

Pusers = U[:len(df['user id'].unique()), :]
Pitems = U[len(df['user id'].unique()):, :]

The NMF is running on 5 clusters




The NMF is done!
V.shape = (943, 5) H.shape = (5, 1682)
(2625, 5)


  from ipykernel import kernelapp as app


In [4]:
#calculate the delta (the weight)
delta = np.zeros((T_train.shape[0], T_train.shape[1], k))
addresses = T_test.nonzero()

for i,j in zip(addresses[0], addresses[1]):
    for l in range(k):
        delta[i,j,l] = (Pusers[i,l] + Pitems[j,l]) / np.sum(Pusers[i]+Pitems[j])

In [5]:
#calculate the delta (the weight) (the second weight )
delta = np.zeros((T_train.shape[0], T_train.shape[1], k))
addresses = T_test.nonzero()

for i,j in zip(addresses[0], addresses[1]):
    for l in range(k):
        delta[i,j,l] = (Pusers[i,l] * Pitems[j,l]) / np.sum(Pusers[i] * Pitems[j])

  import sys


In [7]:
#Decompose multiple clusters for recommender system methods

threshold = 0.1
clusters = []
overlap_identifier = np.ndarray(shape = T_train.shape, dtype=list)

for c in tqdm(range(k)):
    users = np.where(Pusers[:,c] > threshold)[0]
    items = np.where(Pitems[:,c] > threshold)[0]

    #capture the overlaped recommenders
    oi = np.zeros(T_train.shape)
    oi[users,:]+=1
    oi[:,items]+=1

    addresses = np.where(oi == 2) 
    for i,j in zip(addresses[0], addresses[1]):
        if overlap_identifier[i,j] == None:
            overlap_identifier[i,j] = []
    overlap_identifier[i,j].append(c)

    T_temp = T_train[users,:][:,items]

    df = pd.DataFrame(T_temp, columns = items, index = users)

    #store all the subgroups
    clusters.append(df)

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:04<00:00,  1.22it/s]


In [8]:
#perform recommender systems to each subgroup (User & item memory based)
def eval(prediction, truth):
    prediction = prediction[truth.nonzero()].flatten()
    truth = truth[truth.nonzero()].flatten()
    return mae(prediction, truth)

def predict(rating, similarity,typ):
    if typ == 'user':
        mean = rating.mean(axis = 1)
        rating_deff = (rating - mean[:, np.newaxis])
        pred = mean[:, np.newaxis] + similarity.dot(rating_deff)/ np.array([np.abs(similarity).sum(axis = 1)]).reshape(similarity.shape[0],1)
    elif typ == 'item':
        pred = rating.dot(similarity)/ np.array([np.abs(similarity).sum(axis = 1)]).reshape(1, similarity.shape[0])
    return pred

"""
it just needs to put all the subgoups together
"""

metric = ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']
metric = metric[1]

user_preds = []
item_preds = []
for dfs in clusters:
    user_similarity = pairwise_distances(dfs.to_numpy(), metric= metric)
    item_similarity = pairwise_distances(dfs.to_numpy().T, metric= metric)

    user_prediction = predict(dfs.to_numpy(), user_similarity, typ='user')
    item_prediction = predict(dfs.to_numpy(), item_similarity, typ='item')

    user_preds.append(pd.DataFrame(user_prediction,columns=dfs.columns, index = dfs.index))
    item_preds.append(pd.DataFrame(item_prediction,columns=dfs.columns, index = dfs.index))

In [9]:
#apply merge

Y_userBased = np.zeros(T_train.shape)
Y_itemBased = np.zeros(T_train.shape)

addresses = np.where(overlap_identifier != None)
for i,j in tqdm(zip(addresses[0], addresses[1]), total=len(addresses[0])):
    temp_score_user = 0
    temp_score_item = 0
    for Cs in overlap_identifier[i,j]:
        temp_score_user += user_preds[Cs][j][i] * delta[i, j, Cs]
        temp_score_item += item_preds[Cs][j][i] * delta[i, j, Cs]
    Y_userBased[i,j] = temp_score_user
    Y_itemBased[i,j] = temp_score_item

100%|████████████████████████████████████████████████████████████████████| 1091741/1091741 [00:04<00:00, 252605.23it/s]


In [10]:
eval(Y_userBased, T_test)

3.5878048780487806

In [11]:
eval(Y_itemBased, T_test)

3.5878048780487806

In [6]:
df_test

Unnamed: 0,user id,item id,rating,timestamp
0,1,20,4,887431883
1,1,33,4,878542699
2,1,61,4,878542420
3,1,117,3,874965739
4,1,155,2,878542201
5,1,160,4,875072547
6,1,171,5,889751711
7,1,189,3,888732928
8,1,202,5,875072442
9,1,265,4,878542441
