# Install and load necessary packages

In [5]:
# Please don't change this cell

import pandas as pd
import numpy as np  

import warnings
warnings.filterwarnings("ignore")

## Load the dataset using pandas

In [2]:
# Please don't change this cell

# Read the tab-separated ratings file; column names are supplied explicitly.
df = pd.read_csv(
    'ml-100k/u.data',
    sep='\t',
    names=['user_id', 'item_id', 'rating', 'timestamp'],
)

# Keep only the 500 users and the 500 items that have the most ratings.
user_ids = (
    df.groupby('user_id')
    .count()
    .sort_values(by='rating', ascending=False)
    .head(500)
    .index
)
item_ids = (
    df.groupby('item_id')
    .count()
    .sort_values(by='rating', ascending=False)
    .head(500)
    .index
)
df = df[
    df['user_id'].isin(user_ids)
    & df['item_id'].isin(item_ids)
]


In [3]:
# Preview the first two rows of the filtered ratings.
df.head(2)

Unnamed: 0,user_id,item_id,rating,timestamp
1,186,302,3,891717742
3,244,51,2,880606923


# Split dataset

## Randomly select one rating from each user as test set

In [3]:
# Please don't change this cell

# Remap user and item IDs to consecutive group numbers starting at 0, so they
# can be used directly as row / column positions of the rating matrices built
# in the next cell.
df['user_id'] = df.groupby('user_id').ngroup()
df['item_id'] = df.groupby('item_id').ngroup()

# Hold out exactly one rating per user as the test set (fixed seed, so the
# split is reproducible); every remaining row forms the training set.
test_df = df.groupby('user_id').sample(1, random_state=1024)
train_df = df[~df.index.isin(test_df.index)]

In [4]:
# Please don't change this cell

# Summary statistics of the filtered rating data.
n_users = df['user_id'].nunique()
n_items = df['item_id'].nunique()
avg_num = df.groupby('user_id').size().mean()   # mean number of ratings per user
density = len(df) / (n_users * n_items)         # share of filled user-item cells
min_ratings = df['rating'].min()
max_ratings = df['rating'].max()

print("The number of users: {}".format(n_users))
print("The number of items: {}".format(n_items))
print("Avg. # of rated Items/User: {}".format(avg_num))
print("Density of data: {}".format(density))
print("Ratings Range: {} - {}".format(min_ratings, max_ratings))

The number of users: 500
The number of items: 500
Avg. # of rated Items/User: 129.914
Density of data: 0.259828
Ratings Range: 1 - 5


In [5]:
# Please don't change this cell

# Convert the format of datasets to matrices
# Train dataset
# df_zeros enumerates every (user_id, item_id) pair once with a placeholder
# rating of 0, so the merged frames below contain a cell for every pair.
df_zeros = pd.DataFrame({
    'user_id': np.tile(np.arange(0, n_users), n_items), 
    'item_id': np.repeat(np.arange(0, n_items), n_users), 
    'rating': 0})
# The left merge keeps every pair of the grid. Both frames have a 'rating'
# column, so the observed rating ends up in 'rating_y'; pairs without an
# observed rating are NaN there and become 0 through fillna. pivot_table then
# lays the values out as a user x item table and .values yields a NumPy array
# in which 0 stands for "not rated".
train_ds = df_zeros.merge(train_df, 
                          how='left', 
                          on=['user_id', 'item_id']).fillna(0.).pivot_table(
                              values='rating_y', 
                              index='user_id', 
                              columns='item_id').values
                           
# Test dataset
# Same construction from test_df: only the held-out ratings are non-zero.
test_ds = df_zeros.merge(test_df, 
                         how='left', 
                         on=['user_id', 'item_id']).fillna(0.).pivot_table(
                             values='rating_y', 
                             index='user_id', 
                             columns='item_id').values

print("Construct the rating matrix based on train_df:")
print(train_ds)

print("Construct the rating matrix based on test_df:")
print(test_ds)

Construct the rating matrix based on train_df:
[[5. 3. 4. ... 0. 0. 0.]
 [4. 0. 0. ... 0. 0. 0.]
 [4. 3. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 5. 0. ... 0. 4. 0.]]
Construct the rating matrix based on test_df:
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


# Utils

In [6]:
# Please don't change this cell
# Small constant added to denominators to avoid division by zero.
EPSILON = 1e-9

def user_corr(imputed_train_ds):
    '''
    Compute the pairwise user-user Pearson similarity matrix.

    Parameters
    ----------
    imputed_train_ds : 2-D numpy array with one row per user and one column
        per item. Entries > 0 are treated as ratings, all other entries as
        "not rated".

    Returns
    -------
    numpy array of shape (n_users, n_users). Entry [i, j] is the correlation
    of users i and j computed over the items both of them rated, where each
    user's deviations are taken from that user's mean over ALL of their
    rated items. Pairs without any co-rated item keep a similarity of 0.
    '''
    n_users = imputed_train_ds.shape[0]
    active_user_pearson_corr = np.zeros((n_users, n_users))

    # Quantities that depend on a single user only are computed once here
    # rather than once per pair inside the double loop.
    rated_items = [np.flatnonzero(user_vec > 0) for user_vec in imputed_train_ds]
    # Mean rating = sum of ratings / number of rated items; the count is the
    # sum of the ratings clipped to [0, 1].
    user_means = [
        np.sum(user_vec) / (np.sum(np.clip(user_vec, 0, 1)) + EPSILON)
        for user_vec in imputed_train_ds
    ]

    # The similarity is symmetric (every product below is commutative), so
    # only pairs with j >= i are computed and the value is mirrored.
    for i in range(n_users):
        user_i_vec = imputed_train_ds[i]
        for j in range(i, n_users):
            user_j_vec = imputed_train_ds[j]

            # indices of co-rated items; skip the pair if there are none
            corated_index = np.intersect1d(rated_items[i], rated_items[j])
            if len(corated_index) == 0:
                continue

            # deviations from each user's mean, restricted to co-rated items
            user_i_sub_mean = user_i_vec[corated_index] - user_means[i]
            user_j_sub_mean = user_j_vec[corated_index] - user_means[j]

            r_ui_sum_sqrt = np.sqrt(np.sum(np.square(user_i_sub_mean)))
            r_uj_sum_sqrt = np.sqrt(np.sum(np.square(user_j_sub_mean)))

            sim = np.sum(user_i_sub_mean * user_j_sub_mean) / (r_ui_sum_sqrt * r_uj_sum_sqrt + EPSILON)
            active_user_pearson_corr[i, j] = sim
            active_user_pearson_corr[j, i] = sim

    return active_user_pearson_corr

def predict(test_ds, imputed_train_ds, user_corr, k=20):
    '''
    Predict a rating for every held-out entry of test_ds.

    For each (user, item) position where test_ds > 0, the k users with the
    largest similarity to that user (row of user_corr) are selected. The
    prediction is the user's own mean training rating plus the
    similarity-weighted average deviation of those neighbours who rated the
    item, clipped to [0, 5]. All other positions of the result stay 0.

    Returns an array with the same shape and dtype as test_ds.
    '''
    predictions = np.zeros_like(test_ds)

    # Visit the held-out entries only, in row-major order.
    for row, col in zip(*np.nonzero(test_ds > 0)):

        # k most similar users, most similar first
        neighbour_ids = np.argsort(user_corr[row])[::-1][:k]
        neighbour_sims = user_corr[row][neighbour_ids]
        neighbour_rows = imputed_train_ds[neighbour_ids]

        # mean of the current user's own non-zero training ratings
        own_rated = imputed_train_ds[row] != 0
        own_count = own_rated.astype(np.float32).sum()
        own_mean = np.sum(imputed_train_ds[row, own_rated]) / (own_count + EPSILON)

        # mean of every neighbour's non-zero training ratings
        neighbour_counts = (neighbour_rows != 0).astype(np.float32).sum(axis=1)
        neighbour_means = neighbour_rows.sum(axis=1) / (neighbour_counts + EPSILON)

        # only neighbours that actually rated the target item contribute
        rated_target = neighbour_rows[:, col] > 0

        # sim(u, v) * (r_vj - mean_v)
        weighted_dev = neighbour_sims[rated_target] * (
            neighbour_rows[rated_target, col] - neighbour_means[rated_target]
        )
        estimate = own_mean + np.sum(weighted_dev) / (np.sum(neighbour_sims[rated_target]) + EPSILON)

        predictions[row, col] = np.clip(estimate, 0, 5)

    return predictions

def evaluate(test_ds, predicted_ds):
    '''
    Score predictions against the held-out ratings.

    Only positions where test_ds > 0 are taken into account.

    Returns the tuple (MAE, RMSE).
    '''
    held_out = test_ds > 0
    n_scored = np.sum(held_out.astype(np.float32))
    errors = test_ds[held_out] - predicted_ds[held_out]

    # mean absolute error
    MAE = np.sum(np.abs(errors)) / n_scored

    # root mean squared error
    RMSE = np.sqrt(np.sum(np.square(errors)) / n_scored)

    return MAE, RMSE

# Baseline - KNN based recommendation (Similarity Metric: Pearson Correlation Coefficient)

In [7]:
# Please don't change this cell

# Baseline: user-user Pearson similarities computed from the training matrix,
# then predictions for the held-out ratings from the k=20 most similar users.
user_pearson_corr = user_corr(train_ds)
predicted_ds = predict(test_ds, train_ds, user_pearson_corr, k=20)

In [8]:
# Please don't change this cell

# Score the baseline predictions on the held-out ratings.
MAE, RMSE = evaluate(test_ds, predicted_ds)

print("===================== Baseline Result =====================")
print("MAE: {}, RMSE: {}" .format(MAE, RMSE))

MAE: 0.8471711011333851, RMSE: 1.092846045041526


# Your Solution
(Put the entire implementation of your solution in the following cell only.)

In [9]:
# Per-item statistics, computed from the training ratings only.
ratings_by_item = train_df.groupby('item_id')

# average_rating: mean training rating of each item (DataFrame indexed by item_id).
average_rating = ratings_by_item.rating.mean().to_frame('average_rating')

# popularity_movie: number of training ratings (i.e. users) of each item.
popularity_movie = ratings_by_item.user_id.count()

# df2: both statistics side by side, indexed by item_id,
# with the columns 'average_rating' and 'popularity'.
df2 = average_rating.join(popularity_movie.rename('popularity'))

# Item weight = log(number of training users / number of users who rated the item),
# so items rated by almost everybody contribute little to the similarity.
#
# `weight` is later indexed by the COLUMN position of train_ds. The popularity
# is therefore re-indexed to one entry per matrix column first: an item whose
# ratings all ended up in the test split would otherwise make the array too
# short and shift every later weight onto the wrong item. Such an item gets
# weight 0; it has no training rating, so it cannot contribute anyway.
n_train_users = train_df.user_id.nunique()
popularity_by_column = df2.popularity.reindex(np.arange(train_ds.shape[1]))
weight = np.log(n_train_users / popularity_by_column).fillna(0.0).to_numpy()


# Mean imputation of the training matrix.
#   avg[u]  : mean of user u's observed (non-zero) training ratings
#   df_mean : copy of train_ds in which every missing (zero) entry of row u is
#             replaced by avg[u]; observed ratings are left untouched, so
#             afterwards every user has a value for every item.
rating_sums = train_ds.sum(axis=1)
rating_counts = np.clip(train_ds, 0, 1).sum(axis=1)

# A user without any training rating would give 0/0 = NaN and poison every
# similarity that involves that user; fall back to the mean of all observed
# training ratings instead.
observed_ratings = train_ds[train_ds > 0]
fallback_mean = observed_ratings.mean() if observed_ratings.size else 0.0

avg = np.where(rating_counts > 0,
               rating_sums / np.maximum(rating_counts, 1),
               fallback_mean)
df_mean = np.where(train_ds == 0, avg[:, np.newaxis], train_ds)
    

# Weighted, mean-centred cosine similarity between every pair of users:
#
#   sim(a, u) =               sum_t w_t^2 (R_at - mean_a) (R_ut - mean_u)
#               ---------------------------------------------------------------------
#               sqrt(sum_t w_t^2 (R_at - mean_a)^2) * sqrt(sum_t w_t^2 (R_ut - mean_u)^2)
#
# where t runs over the items rated by a or by u, and w_t is the item weight.
#
# In df_mean every unrated entry equals the user's own mean, so its deviation
# is exactly 0. Summing over ALL items therefore gives the same terms as
# summing over the union of the two users' rated items, and the whole matrix
# can be computed with one matrix product instead of a Python loop over every
# user pair. The values agree with the pairwise loop up to floating-point
# rounding.
users_id = train_ds.shape[0]

# w_t * (R_ut - mean_u) for every user u (rows) and item t (columns)
weighted_deviation = (df_mean - avg[:, np.newaxis]) * weight

numerator = weighted_deviation @ weighted_deviation.T
norms = np.sqrt(np.sum(weighted_deviation ** 2, axis=1))
denominator = np.outer(norms, norms)

# A user whose weighted deviations are all zero (e.g. identical ratings for
# every item) has norm 0; use similarity 0 for such pairs rather than 0/0 = NaN.
similarity = np.divide(numerator, denominator,
                       out=np.zeros_like(numerator),
                       where=denominator > 0)
        
    
# Evaluate the implementation by predicting the held-out ratings of test_ds:
#
#   pred(u, t) = mean_u + sum_v sim(u, v) * (R_vt - mean_v) / sum_v sim(u, v)
#
# where v runs over the k users most similar to u.

# Shape of the test matrix, kept in variables for convenience.
users_id_test = test_ds.shape[0]
items_id_test = test_ds.shape[1]

prediction_test_ds = np.zeros((users_id_test, items_id_test))
k = 20  # neighbourhood size required by the task

# Visit only the held-out (non-zero) entries of the test matrix.
for i, j in zip(*np.nonzero(test_ds > 0)):
    # Users ordered from most to least similar. The user itself is removed
    # explicitly instead of assuming it is always ranked first.
    ranked_users = np.argsort(similarity[i])[::-1]
    neighbour_ids = ranked_users[ranked_users != i][:k]
    sim_val = similarity[i][neighbour_ids]

    # deviation of every neighbour's (imputed) rating of item j from that
    # neighbour's own mean
    subtraction_ratings = df_mean[neighbour_ids, j] - avg[neighbour_ids]

    # With a zero similarity sum the weighted average is undefined;
    # predict the user's own mean in that case.
    sim_total = np.sum(sim_val)
    pred = np.sum(sim_val * subtraction_ratings) / sim_total if sim_total != 0 else 0.0

    # NOTE: the result is deliberately not stored in a variable called
    # `predict`, which would overwrite the predict() function of the Utils
    # cell and break a later re-run of the baseline cell.
    predicted_rating = avg[i] + pred
    prediction_test_ds[i][j] = np.clip(predicted_rating, 0, 5)
        
# Composite recommendation value, used to recommend movies to a new user:
#
#   ComRV(t) = P_t * R_t
#
# df2 holds, per item, the mean training rating ('average_rating', R_t) and
# the number of users who rated it ('popularity', P_t). Their product is stored
# in the new column 'Composite_Recommendation'. ComRV_t contains the 10 items
# with the largest composite value.

# Turn the item_id index into a regular column.
df2 = df2.reset_index(drop=False)

# Compute the product next to the real item_id of each row, rather than
# rebuilding item ids from row positions and merging them back.
comp_recom = df2.assign(
    Composite_Recommendation=df2.average_rating * df2.popularity)
ComRV_t = comp_recom.sort_values('Composite_Recommendation', ascending=False).head(10)


# MAE and RMSE of the predictions on the held-out ratings.
# evaluate() from the Utils cell implements exactly these two formulas, so it
# is reused here instead of repeating them.
MAE, RMSE = evaluate(test_ds, prediction_test_ds)


## Print the MAE and RMSE of Your Implementation

In [10]:
# Please don't change this cell

# Report the current values of MAE and RMSE (last assigned in the solution cell above).
print("===================== The MAE and RMSE of Your Implementation =====================")
print("MAE: {}, RMSE: {}" .format(MAE, RMSE))

MAE: 0.7688823150284251, RMSE: 0.9658601563338831
