In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import nan_euclidean_distances


In [2]:
# mount drive to access files
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
data = pd.read_csv('/content/drive/MyDrive/DSA4212/dataset/ratings.dat', header = None, sep='::', engine = 'python', encoding='ISO-8859-1', names = ['UserIDs', 'MovieIDs', 'Ratings', 'Timestamp'])

In [4]:
train, test = train_test_split(data, test_size=0.1, random_state=1)

print('Number of obseravtions in the original dataset: ', len(data))
print('Number of obseravtions in the training dataset: ', len(train))
print('Number of obseravtions in the testing dataset: ', len(test))

Number of obseravtions in the original dataset:  1000209
Number of obseravtions in the training dataset:  900188
Number of obseravtions in the testing dataset:  100021


# EDA

In [None]:
train

Unnamed: 0,UserIDs,MovieIDs,Ratings,Timestamp
316448,1883,2020,3,974876148
82748,549,930,5,976116072
215642,1306,3510,3,1015051441
148022,953,3911,5,975265409
522854,3224,2599,4,968520931
...,...,...,...,...
491263,3020,1219,4,970511154
791624,4732,1254,5,963636212
470924,2899,2664,4,971920791
491755,3025,1296,5,970464063


In [None]:
train.info() # No Na

<class 'pandas.core.frame.DataFrame'>
Int64Index: 900188 entries, 316448 to 128037
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype
---  ------     --------------   -----
 0   UserIDs    900188 non-null  int64
 1   MovieIDs   900188 non-null  int64
 2   Ratings    900188 non-null  int64
 3   Timestamp  900188 non-null  int64
dtypes: int64(4)
memory usage: 34.3 MB


In [None]:
train.describe()

Unnamed: 0,UserIDs,MovieIDs,Ratings,Timestamp
count,900188.0,900188.0,900188.0,900188.0
mean,3024.149099,1865.734558,3.581696,972242600.0
std,1727.894528,1096.042156,1.117043,12144200.0
min,1.0,1.0,1.0,956703900.0
25%,1507.0,1030.0,3.0,965303300.0
50%,3069.0,1835.0,4.0,972989800.0
75%,4473.0,2770.0,4.0,975221100.0
max,6040.0,3952.0,5.0,1046455000.0


# Naive Models

In [5]:
def rmse(y_true,y_pred):
  return np.sqrt(np.square(np.subtract(y_true,y_pred)).mean())

In [11]:
mean_rating = train['Ratings'].mean()

In [12]:
pred_1 = [mean_rating] * len(test)

In [13]:
rmse_1 = rmse(list(test['Ratings']), pred_1)

In [14]:
user_mean = pd.DataFrame(train.groupby('UserIDs')['Ratings'].mean())

In [15]:
pred_2 = []
for i in range(len(test)):
  pred_2.append(user_mean.loc[test.iloc[i,0]]['Ratings'])

In [16]:
rmse_2 = rmse(list(test['Ratings']), pred_2)

In [17]:
film_mean = pd.DataFrame(train.groupby('MovieIDs')['Ratings'].mean())

In [18]:
pred_3 = []
for i in range(len(test)):
  pred_3.append(user_mean.loc[test.iloc[i,1]]['Ratings'])

In [19]:
rmse_3 = rmse(list(test['Ratings']), pred_3)

In [20]:
print('RMSE using mean ratings: ', rmse_1)
print('RMSE using mean user ratings: ', rmse_2)
print('RMSE using mean movie ratings: ', rmse_3)

RMSE using mean ratings:  1.1176358645764495
RMSE using mean user ratings:  1.0347350838474925
RMSE using mean movie ratings:  1.2032333066598133


# Memory Based approach

## User Based

### Cosine Distance

In [22]:
user_item = data.pivot('UserIDs','MovieIDs','Ratings').fillna(0)
print(f'Shape: {user_item.shape}')

Shape: (6040, 3706)


In [23]:
x_user = cosine_similarity(user_item)

In [24]:
# Using Weighted
def get_rating_user(userid, movieid, similarity_matrix):
  # Similarity of user to all other users
  user_similarities = similarity_matrix[userid - 1]
  # All movie ratings for the movie
  movie_ratings = user_item.loc[:,movieid]
  # users who watched the movie
  user_index = np.where(movie_ratings != 0)[0]
  # Remove users who did not watch the movie
  user_similarities = user_similarities[user_index]
  movie_ratings = movie_ratings[movie_ratings != 0]
  return (user_similarities@movie_ratings)/sum(user_similarities)
  #return np.mean(np.multiply(user_similarities,movie_ratings))

In [25]:
pred = []
for i in range(len(test)):
  test_user = test.iloc[i,0]
  test_movie = test.iloc[i,1]
  pred.append(get_rating_user(test_user, test_movie, x_user))

In [26]:
weighted_rmse_user = rmse(list(test['Ratings']), pred)

In [27]:
weighted_rmse_user

0.9572944961648672

### Euclidean Distance

In [33]:
user_item_1 = data.pivot('UserIDs','MovieIDs','Ratings')

In [134]:
x_user_ed = nan_euclidean_distances(user_item_1)/np.sqrt(3706)

In [None]:
pred_user_ed = []
actual_user_ed = []
for i in range(len(test)):
  test_user = test.iloc[i,0]
  test_movie = test.iloc[i,1]
  rating = get_rating_user(test_user, test_movie, x_user_ed)
  if math.isnan(rating) == False:
    pred_user_ed.append(rating)
    actual_user_ed.append(test.iloc[i,2])

In [38]:
rmse_user_ed = rmse(actual_user_ed, pred_user_ed)

In [39]:
rmse_user_ed

1.0129166217957524

### Pearson Correlation

In [None]:
x_user_pc = np.corrcoef(user_item)

In [None]:
pred_user_pc = []
for i in range(len(test)):
  test_user = test.iloc[i,0]
  test_movie = test.iloc[i,1]
  pred_user_pc.append(get_rating_user(test_user, test_movie, x_user_pc))

In [None]:
rmse_user_pc = rmse(list(test['Ratings']), pred_user_pc)

In [None]:
rmse_user_pc

## Item bases

### Cosine distance

In [152]:
x_item_cs = cosine_similarity(user_item.T)
x_item_cs.shape

(3706, 3706)

In [None]:
# Using Weighted

def get_rating_item(userid, movieid, similarity_matrix):
  # Similarity of user to all other users
  movie_index = user_item.columns.get_loc(movieid)
  item_similarities = similarity_matrix[movie_index]
  # All movie ratings from the user
  user_ratings = user_item.loc[userid,:]
  # movies the user has watched
  item_index = np.where(user_ratings != 0)[0]
  # Remove movie the user did not watch
  item_similarities = item_similarities[item_index]
  user_ratings = user_ratings[user_ratings != 0]
  return (item_similarities@user_ratings)/sum(item_similarities)


In [None]:
pred_item_cs = []
for i in range(len(test)):
  test_user = test.iloc[i,0]
  test_movie = test.iloc[i,1]
  pred_item_cs.append(get_rating_item(test_user, test_movie, x_item_cs))

In [None]:
weighted_rmse_item = rmse(list(test['Ratings']), pred_item_cs)

In [None]:
weighted_rmse_item

0.9744446220878337

### Euclidean distance

In [None]:
x_item_ed = nan_euclidean_distances(user_item_1.T)/np.sqrt(6040)
x_item_ed.shape

(3706, 3706)

In [None]:
pred_item_ed = []
actual_item_ed = []
for i in range(len(test)):
  test_user = test.iloc[i,0]
  test_movie = test.iloc[i,1]
  rating = get_rating_item(test_user, test_movie, x_item_ed)
  if math.isnan(rating) == False:
    pred_item_ed.append(rating)
    actual_item_ed.append(test.iloc[i,2])

In [None]:
rmse_item_ed = rmse(actual_item_ed, pred_item_ed)

In [None]:
rmse_item_ed

1.0854495106236097

### Pearson Correlation

In [None]:
x_item_pc = np.corrcoef(user_item.T)

In [None]:
pred_item_pc = []
for i in range(len(test)):
  test_user = test.iloc[i,0]
  test_movie = test.iloc[i,1]
  pred_item_pc.append(get_rating_item(test_user, test_movie, x_item_pc))

In [None]:
rmse_item_pc = rmse(list(test['Ratings']), pred_item_pc)

In [None]:
rmse_item_pc

0.9400995021161574

## Mean Centering

In [8]:
user_item_2 = data.pivot('UserIDs','MovieIDs','Ratings')

In [9]:
user_mean = user_item_2.mean(axis=1)

In [10]:
user_item_norm = user_item_2.subtract(user_mean, axis = 0)

In [43]:
# Using Weighted and mean centering

def get_rating_user_mc(userid, movieid, similarity_matrix):
  # Similarity of user to all other users
  user_similarities = similarity_matrix[userid - 1]
  # All movie ratings for the movie
  movie_ratings = user_item_norm.loc[:,movieid]
  # users who watched the movie
  user_index = np.where(np.isnan(movie_ratings) == False)[0]
  # Remove users who did not watch the movie
  user_similarities = user_similarities[user_index]
  movie_ratings = movie_ratings[np.isnan(movie_ratings) == False]
  return (user_similarities@movie_ratings)/sum(abs(user_similarities)) + user_mean.loc[userid]
  #return np.mean(np.multiply(user_similarities,movie_ratings))

In [6]:
# Using Weighted and mean centering
def get_rating_item_mc(userid, movieid, similarity_matrix):
  # Similarity of item to all other items
  movie_index = user_item_norm.columns.get_loc(movieid)
  item_similarities = similarity_matrix[movie_index]
  # All movie ratings from the user
  user_ratings = user_item_norm.loc[userid,:]
  # movies the user has watched
  item_index = np.where(np.isnan(user_ratings) == False)[0]
  # Remove users who did not watch the movie
  item_similarities = item_similarities[item_index]
  user_ratings = user_ratings[np.isnan(user_ratings) == False]
  return (item_similarities@user_ratings)/sum(abs(item_similarities)) + user_mean.loc[userid]


### Item Based

#### Cosine Similarity

In [11]:
x_item_cs_mc = cosine_similarity(user_item_norm.fillna(0).T)

In [15]:
pred_item_cs_mc = []
for i in range(len(test)):
  test_user = test.iloc[i,0]
  test_movie = test.iloc[i,1]
  pred_item_cs_mc.append(get_rating_item_mc(test_user, test_movie, x_item_cs_mc))

In [17]:
rmse_item_cs_mc = rmse(list(test['Ratings']), pred_item_cs_mc)

In [18]:
rmse_item_cs_mc

0.7555139715164364

#### Euclidean Distance

In [19]:
x_item_ed_mc = nan_euclidean_distances(user_item_norm.T)/np.sqrt(6040)

In [20]:
x_item_ed_mc.shape

(3706, 3706)

In [21]:
pred_item_ed_mc = []
actual_item_ed_mc = []
for i in range(len(test)):
  test_user = test.iloc[i,0]
  test_movie = test.iloc[i,1]
  rating = get_rating_item_mc(test_user, test_movie, x_item_ed_mc)
  if math.isnan(rating) == False:
    pred_item_ed_mc.append(rating)
    actual_item_ed_mc.append(test.iloc[i,2])

In [22]:
rmse_item_ed_mc = rmse(pred_item_ed_mc, actual_item_ed_mc)

In [23]:
rmse_item_ed_mc

1.0854495105344457

#### Pearson Correlation

In [27]:
x_item_pc_mc = user_item_norm.corr()

In [28]:
x_item_pc_mc = x_item_pc_mc.to_numpy()

In [29]:
x_item_pc_mc.shape

(3706, 3706)

In [30]:
pred_item_pc_mc = []
actual_item_pc_mc = []
for i in range(len(test)):
  test_user = test.iloc[i,0]
  test_movie = test.iloc[i,1]
  rating = get_rating_item_mc(test_user, test_movie, x_item_pc_mc)
  if math.isnan(rating) == False:
    pred_item_pc_mc.append(rating)
    actual_item_pc_mc.append(test.iloc[i,2])

In [31]:
rmse_item_pc_mc = rmse(pred_item_pc_mc, actual_item_pc_mc)

In [32]:
rmse_item_pc_mc

0.7992480677479128

### User Based

#### Cosine Similarity

In [16]:
user_item_norm_0 = user_item_norm.fillna(0)

In [50]:
x_user_cs_mc = cosine_similarity(user_item_norm_0)

In [89]:
pred_cs_mc = []
for i in range(len(test)):
  test_user = test.iloc[i,0]
  test_movie = test.iloc[i,1]
  pred_cs_mc.append(get_rating_user_mc(test_user, test_movie, x_user_cs_mc))

In [91]:
rmse_user_cs_mc = rmse(list(test['Ratings']), pred_cs_mc)

In [92]:
rmse_user_cs_mc

0.7693630555174746

#### Euclidean Distance

In [124]:
x_user_ed_mc = nan_euclidean_distances(user_item_norm)/np.sqrt(3706)

In [125]:
x_user_ed_mc.shape

(6040, 6040)

In [129]:
pred_user_ed_mc = []
actual_user_ed_mc = []
for i in range(len(test)):
  test_user = test.iloc[i,0]
  test_movie = test.iloc[i,1]
  rating = get_rating_user_mc(test_user, test_movie, x_user_ed_mc)
  if math.isnan(rating) == False:
    pred_user_ed_mc.append(rating)
    actual_user_ed_mc.append(test.iloc[i,2])

  del sys.path[0]


In [139]:
rmse_user_ed_mc = rmse(pred_user_ed_mc, actual_user_ed_mc)

In [140]:
rmse_user_ed_mc

0.9443797636588582

#### Pearson Correlation

In [110]:
x_user_pc_mc = user_item_norm.T.corr()

In [111]:
x_user_pc_mc = x_user_pc_mc.to_numpy()

In [112]:
x_user_pc_mc

array([[ 1.        ,  0.41666667, -0.33218192, ...,         nan,
         0.05685735, -0.04351941],
       [ 0.41666667,  1.        ,  0.23683386, ..., -0.5       ,
         0.57207755, -0.0271435 ],
       [-0.33218192,  0.23683386,  1.        , ...,  0.5       ,
         0.30927686, -0.39528471],
       ...,
       [        nan, -0.5       ,  0.5       , ...,  1.        ,
         0.27116307, -0.39712226],
       [ 0.05685735,  0.57207755,  0.30927686, ...,  0.27116307,
         1.        ,  0.24230884],
       [-0.04351941, -0.0271435 , -0.39528471, ..., -0.39712226,
         0.24230884,  1.        ]])

In [113]:
x_user_pc_mc.shape

(6040, 6040)

In [114]:
pred_user_pc_mc = []
actual_user_pc_mc = []
for i in range(len(test)):
  test_user = test.iloc[i,0]
  test_movie = test.iloc[i,1]
  rating = get_rating_user_mc(test_user, test_movie, x_user_pc_mc)
  if math.isnan(rating) == False:
    pred_user_pc_mc.append(rating)
    actual_user_pc_mc.append(test.iloc[i,2])

In [115]:
rmse_user_pc_mc = rmse(pred_user_pc_mc, actual_user_pc_mc)

In [116]:
rmse_user_pc_mc

0.8277507466576526