# Recommender Systems

In [48]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import chardet

## Loading data

The data files do not all appear to use the same text encoding. We therefore detect each file's encoding before reading it, so that its contents are decoded correctly.

In [49]:
def get_file_encoding(file_path):
    """
    Detect the text encoding used in a file.

    The file is read in full as raw bytes and passed to ``chardet.detect``;
    the ``'encoding'`` entry of the detection result is returned.

    :param file_path: Path of the file whose encoding should be detected
    :return: The ``'encoding'`` value reported by chardet for the file's bytes
    """
    with open(file_path, 'rb') as raw_file:
        raw_bytes = raw_file.read()

    detection = chardet.detect(raw_bytes)
    return detection['encoding']

In [50]:
# Read the ratings file: fields are separated by "::" and there is no header row
ratings_path = "./ratings.dat"
ratings_encoding = get_file_encoding(ratings_path)
ratings = pd.read_csv(
    ratings_path,
    delimiter="::",
    header=None,
    engine='python',
    encoding=ratings_encoding,
)

# Replace the default integer column labels with descriptive names
ratings_column_names = {0: "UserID", 1: "MovieID", 2: "Rating", 3: "Timestamp"}
ratings = ratings.rename(columns=ratings_column_names)

ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [51]:
#Loading movies data
# movies_path = "./movies.dat"
# movies = pd.read_csv(movies_path, delimiter="::", header=None, engine='python', encoding= get_file_encoding(movies_path))
# movies = movies.rename(columns={0: "MovieID", 1: "Title", 2: "Genres"})

# movies.head()

In [52]:
#Loading users data
# users_path = "./users.dat"
# users = pd.read_csv(users_path, delimiter="::", header=None, engine='python', encoding= get_file_encoding(users_path))
# users = users.rename(columns={0: "UserID", 1: "Gender", 2: "Age", 3: "Occupation", 4: "Zip-code"})

# users.head()

## Pre-processing Data

### Preparing Users

In [53]:
# # One Hot Encode Gender
# encoder = OneHotEncoder(sparse_output=False)

# # Encode genders
# encoded_gender = encoder.fit_transform(users[['Gender']])
# encoded_gender_df = pd.DataFrame(encoded_gender, columns = encoder.get_feature_names_out(['Gender']))

# # Concat new hot encoded columns
# users = pd.concat([users, encoded_gender_df], axis = 1)

# # Drop previous gender column
# users.drop(['Gender'], axis='columns', inplace=True)

In [54]:
# # Label Encode Zip-code
# le = LabelEncoder()

# # Update column
# users['Zip-code'] = le.fit_transform(users['Zip-code'])

In [55]:
# users.head()

In [56]:
# users.info()

# Naive Approaches

In [57]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [58]:
# FOUR FUNCTIONS => FOUR NAIVE APPROACHES

# global average
def global_average(train, test, is_train=False):
    """
    Predict the mean training rating for every target row.

    :param train: DataFrame with a 'Rating' column; its mean is the prediction
    :param test: DataFrame whose length sets the number of predictions
                 when ``is_train`` is False
    :param is_train: If True, produce one prediction per training row instead
    :return: List containing the training mean repeated once per target row
    """
    target = train if is_train else test
    overall_mean = train['Rating'].mean()
    return [overall_mean] * len(target)

# movie average
def movie_average(train, test, is_train=False):
    """
    Predict each rating as the mean training rating of its movie.

    :param train: DataFrame with 'MovieID' and 'Rating' columns used to
                  compute the per-movie means
    :param test: DataFrame with a 'MovieID' column to predict for when
                 ``is_train`` is False
    :param is_train: If True, predict for the training rows themselves
    :return: NumPy array (training rows) or pandas Series aligned with
             ``test`` (test rows); movies absent from ``train`` receive the
             overall training mean
    """
    per_movie_mean = train.groupby('MovieID')['Rating'].mean()

    if is_train:
        # Look up every training row's movie mean by label
        return per_movie_mean[train['MovieID']].to_numpy()

    # Movies never seen in training map to NaN; fall back to the global mean
    fallback = train['Rating'].mean()
    return test['MovieID'].map(per_movie_mean).fillna(fallback)

# user average
def user_average(train, test, is_train=False):
    """
    Predict each rating as the mean training rating of its user.

    :param train: DataFrame with 'UserID' and 'Rating' columns used to
                  compute the per-user means
    :param test: DataFrame with a 'UserID' column to predict for when
                 ``is_train`` is False
    :param is_train: If True, predict for the training rows themselves
    :return: NumPy array (training rows) or pandas Series aligned with
             ``test`` (test rows); users absent from ``train`` receive the
             overall training mean
    """
    per_user_mean = train.groupby('UserID')['Rating'].mean()

    if is_train:
        # Look up every training row's user mean by label
        return per_user_mean[train['UserID']].to_numpy()

    # Users never seen in training map to NaN; fall back to the global mean
    fallback = train['Rating'].mean()
    return test['UserID'].map(per_user_mean).fillna(fallback)


# linear combination of the three averages
def linear_combination(train, test, is_train=False):
    """
    Predict ratings as a least-squares blend of the user and movie averages.

    Fits ``rating ~ alpha * user_avg + beta * movie_avg + gamma`` on the
    training rows with ``np.linalg.lstsq`` and applies the fitted
    coefficients to either the training rows or the test rows. The per-user
    and per-movie means are computed once and reused for both the fit and
    the prediction.

    :param train: DataFrame with 'UserID', 'MovieID' and 'Rating' columns
    :param test: DataFrame with 'UserID' and 'MovieID' columns to predict
                 for when ``is_train`` is False
    :param is_train: If True, predict for the training rows themselves
    :return: Predictions clipped to the range [1, 5]: a NumPy array for the
             training rows, a pandas Series aligned with ``test`` otherwise
    """
    user_avg = train.groupby('UserID')['Rating'].mean()
    movie_avg = train.groupby('MovieID')['Rating'].mean()

    # Per-row features for the training set (label-based lookups)
    user_feature_train = user_avg[train['UserID']].to_numpy()
    movie_feature_train = movie_avg[train['MovieID']].to_numpy()

    # Design matrix: [user average, movie average, intercept]
    A = np.vstack([user_feature_train, movie_feature_train, np.ones(len(train))]).T
    b = train['Rating']

    # https://numpy.org/doc/stable/reference/generated/numpy.linalg.lstsq.html
    alpha, beta, gamma = np.linalg.lstsq(A, b, rcond=None)[0]

    if is_train:
        prediction = alpha * user_feature_train + beta * movie_feature_train + gamma
    else:
        # Users/movies that do not occur in the training set have no average;
        # they fall back to the overall training mean.
        global_mean = train['Rating'].mean()
        user_feature_test = test['UserID'].map(user_avg).fillna(global_mean)
        movie_feature_test = test['MovieID'].map(movie_avg).fillna(global_mean)
        prediction = alpha * user_feature_test + beta * movie_feature_test + gamma

    # Keep predictions inside the 1-5 rating scale
    return np.clip(prediction, 1, 5)
    


## 5-fold Cross-Validation

In [59]:
# 5-fold cross-validation of the four naive approaches.
# For every fold each approach is scored (RMSE and MAE) on both the training
# part and the held-out part; the per-fold scores are averaged when reported.

kf = KFold(n_splits=5, shuffle=True, random_state=42)   # fixed random_state keeps the folds reproducible

# Per-fold scores on the training set
rmse_global_train, mae_global_train = [], []
rmse_user_train, mae_user_train = [], []
rmse_movie_train, mae_movie_train = [], []
rmse_combination_train, mae_combination_train = [], []

# Per-fold scores on the test set
rmse_global_test, mae_global_test = [], []
rmse_user_test, mae_user_test = [], []
rmse_movie_test, mae_movie_test = [], []
rmse_combination_test, mae_combination_test = [], []

# (predictor, train RMSE list, train MAE list, test RMSE list, test MAE list)
approaches = (
    (global_average, rmse_global_train, mae_global_train, rmse_global_test, mae_global_test),
    (user_average, rmse_user_train, mae_user_train, rmse_user_test, mae_user_test),
    (movie_average, rmse_movie_train, mae_movie_train, rmse_movie_test, mae_movie_test),
    (linear_combination, rmse_combination_train, mae_combination_train,
     rmse_combination_test, mae_combination_test),
)

for train_index, test_index in kf.split(ratings):
    train_data, test_data = ratings.iloc[train_index], ratings.iloc[test_index]

    for predict, rmse_train, mae_train, rmse_test, mae_test in approaches:
        # Compute each prediction once and reuse it for both metrics
        train_pred = predict(train_data, test_data, is_train=True)
        rmse_train.append(np.sqrt(mean_squared_error(train_data['Rating'], train_pred)))
        mae_train.append(mean_absolute_error(train_data['Rating'], train_pred))

        test_pred = predict(train_data, test_data)
        rmse_test.append(np.sqrt(mean_squared_error(test_data['Rating'], test_pred)))
        mae_test.append(mean_absolute_error(test_data['Rating'], test_pred))


print("RMSE and MAE over training set:")
print("Global Average RMSE - training set:", np.mean(rmse_global_train).round(5))
print("Global Average MAE - training set:", np.mean(mae_global_train).round(5))

print("-----------------------------")

print("User Average RMSE - training set:", np.mean(rmse_user_train).round(5))
print("User Average MAE - training set:", np.mean(mae_user_train).round(5))

print("-----------------------------")

print("Movie Average RMSE - training set:", np.mean(rmse_movie_train).round(5))
print("Movie Average MAE - training set:", np.mean(mae_movie_train).round(5))

print("-----------------------------")

print("Linear Combination RMSE - training set:", np.mean(rmse_combination_train).round(5))
print("Linear Combination MAE - training set:", np.mean(mae_combination_train).round(5))

print("---------------------------------------------")

print("RMSE and MAE over test set:")
print("Global Average RMSE - test set:", np.mean(rmse_global_test).round(5))
print("Global Average MAE - test set:", np.mean(mae_global_test).round(5))

print("-----------------------------")

print("User Average RMSE - test set:", np.mean(rmse_user_test).round(5))
print("User Average MAE - test set:", np.mean(mae_user_test).round(5))

print("-----------------------------")

print("Movie Average RMSE - test set:", np.mean(rmse_movie_test).round(5))
print("Movie Average MAE - test set:", np.mean(mae_movie_test).round(5))

print("-----------------------------")

print("Linear Combination RMSE - test set:", np.mean(rmse_combination_test).round(5))
print("Linear Combination MAE - test set:", np.mean(mae_combination_test).round(5))


RMSE and MAE over training set:
Global Average RMSE - training set: 1.1171
Global Average MAE - training set: 0.93386
-----------------------------
User Average RMSE - training set: 1.02767
User Average MAE - training set: 0.82272
-----------------------------
Movie Average RMSE - training set: 0.97423
Movie Average MAE - training set: 0.77834
-----------------------------
Linear Combination RMSE - training set: 0.91456
Linear Combination MAE - training set: 0.72481
---------------------------------------------
RMSE and MAE over test set:
Global Average RMSE - test set: 1.1171
Global Average MAE - test set: 0.93386
-----------------------------
User Average RMSE - test set: 1.03548
User Average MAE: 0.82895
-----------------------------
Movie Average RMSE - test set: 0.97937
Movie Average MAE - test set: 0.78228
-----------------------------
Linear Combination RMSE - test set: 0.92426
Linear Combination MAE - test set: 0.73243


In [62]:
# Fold-averaged training scores, one column per approach (rows: RMSE, MAE)
train_scores_by_approach = {
    "Global Average": (rmse_global_train, mae_global_train),
    "User Average": (rmse_user_train, mae_user_train),
    "Movie Average": (rmse_movie_train, mae_movie_train),
    "Linear Combination": (rmse_combination_train, mae_combination_train),
}

d_train = {"Metric": ["RMSE", "MAE"]}
for approach, (rmse_folds, mae_folds) in train_scores_by_approach.items():
    d_train[approach] = [np.mean(rmse_folds), np.mean(mae_folds)]

df_train = pd.DataFrame(data=d_train)

print("RMSE and MAE table over training set:")
df_train

RMSE and MAE table over training set:


Unnamed: 0,Metric,Global Average,User Average,Movie Average,Linear Combination
0,RMSE,1.117101,1.027673,0.974228,0.914556
1,MAE,0.933861,0.822719,0.778336,0.72481


In [60]:
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html

# Fold-averaged test scores, one column per approach (rows: RMSE, MAE)
test_scores_by_approach = {
    "Global Average": (rmse_global_test, mae_global_test),
    "User Average": (rmse_user_test, mae_user_test),
    "Movie Average": (rmse_movie_test, mae_movie_test),
    "Linear Combination": (rmse_combination_test, mae_combination_test),
}

d_test = {"Metric": ["RMSE", "MAE"]}
for approach, (rmse_folds, mae_folds) in test_scores_by_approach.items():
    d_test[approach] = [np.mean(rmse_folds), np.mean(mae_folds)]

df_test = pd.DataFrame(data=d_test)

print("RMSE and MAE table over test set:")
df_test

Unnamed: 0,Metric,Global Average,User Average,Movie Average,Linear Combination
0,RMSE,1.117101,1.03548,0.979367,0.924256
1,MAE,0.933862,0.82895,0.782284,0.73243
