# Recommender Systems

In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MultiLabelBinarizer
from sklearn.metrics import mean_squared_error
from sklearn.utils import shuffle
import chardet

## Loading data

The data files do not all use the same text encoding, so we detect each file's encoding before reading it to make sure the data is decoded correctly.

In [2]:
def get_file_encoding(file_path):
    """
    Detect the text encoding used in a file.

    The whole file is read as raw bytes and handed to chardet.

    :param file_path: Path of the file whose encoding should be detected
    :return: The 'encoding' entry of chardet's detection result
    """
    with open(file_path, 'rb') as raw_file:
        raw_bytes = raw_file.read()

    detection = chardet.detect(raw_bytes)
    return detection['encoding']

In [3]:
# Read the "::"-delimited ratings file with its detected encoding,
# then give the positional columns their names.
ratings_path = "./data/ratings.dat"
ratings = pd.read_csv(
    ratings_path,
    delimiter="::",
    header=None,
    engine='python',
    encoding=get_file_encoding(ratings_path),
).rename(columns={0: "UserID", 1: "MovieID", 2: "Rating", 3: "Timestamp"})

ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [4]:
# Read the "::"-delimited movies file with its detected encoding,
# then give the positional columns their names.
movies_path = "./data/movies.dat"
movies = pd.read_csv(
    movies_path,
    delimiter="::",
    header=None,
    engine='python',
    encoding=get_file_encoding(movies_path),
).rename(columns={0: "MovieID", 1: "Title", 2: "Genres"})

movies.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
# Read the "::"-delimited users file with its detected encoding,
# then give the positional columns their names.
users_path = "./data/users.dat"
users = pd.read_csv(
    users_path,
    delimiter="::",
    header=None,
    engine='python',
    encoding=get_file_encoding(users_path),
).rename(columns={0: "UserID", 1: "Gender", 2: "Age", 3: "Occupation", 4: "Zip-code"})

users.head()

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


## k-fold Cross Validation

In [6]:
def rsme(y_true, y_pred):
    """
    Root of the mean squared difference between observed and predicted values.

    :param y_true: Observed values
    :param y_pred: Predicted values, array-like or a scalar that broadcasts against y_true
    :return: Float, square root of the mean of the squared differences
    """
    residuals = np.subtract(y_true, y_pred)
    mean_sq = np.square(residuals).mean()
    return math.sqrt(mean_sq)

In [7]:
def mae(y_true, y_pred):
    """
    Mean of the absolute differences between observed and predicted values.

    :param y_true: Observed values
    :param y_pred: Predicted values, array-like or a scalar that broadcasts against y_true
    :return: Mean absolute error
    """
    errors = np.subtract(y_true, y_pred)
    return np.abs(errors).mean()

In [8]:
def cross_validation(model, X, y, k = 5):
    """
    Estimate a model's RSME and MAE with shuffled k-fold cross validation.

    The rows are shuffled with a fixed seed and cut into k contiguous folds.
    Each fold is used once as the test set while the model is re-fitted on the
    remaining k - 1 folds; the per-fold errors are then averaged.

    :param model: Object exposing fit(X, y) and predict(X)
    :param X: DataFrame of features (rows are selected positionally with .iloc)
    :param y: Series of targets with one entry per row of X
    :param k: Number of folds, 2 <= k <= len(X)
    :return: ((train RSME, test RSME), (train MAE, test MAE)), each averaged over the k folds
    :raises ValueError: If k is smaller than 2 or larger than the number of rows
    """
    n = len(X)
    if not 2 <= k <= n:
        raise ValueError(f'k must be between 2 and the number of rows ({n}), got {k}')

    # Shuffle data (fixed seed so the folds are reproducible)
    X, y = shuffle(X, y, random_state=42)

    # Fold boundaries. (i * n) // k instead of i * (n // k) spreads the n % k
    # leftover rows over the folds instead of silently dropping them.
    bounds = [(i * n) // k for i in range(k + 1)]
    X_folds = [X.iloc[start:stop] for start, stop in zip(bounds[:-1], bounds[1:])]
    y_folds = [y.iloc[start:stop] for start, stop in zip(bounds[:-1], bounds[1:])]

    # One (train, test) pair per fold
    rsme_scores = []
    mae_scores = []

    for i, (X_test, y_test) in enumerate(zip(X_folds, y_folds)):
        # Everything except fold i is training data
        X_train = pd.concat([fold for j, fold in enumerate(X_folds) if j != i])
        y_train = pd.concat([fold for j, fold in enumerate(y_folds) if j != i])

        # Train model
        model.fit(X_train, y_train)

        # Evaluate on training and test set
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        # Calculate errors
        rsme_scores.append((rsme(y_train, y_train_pred), rsme(y_test, y_test_pred)))
        mae_scores.append((mae(y_train, y_train_pred), mae(y_test, y_test_pred)))

    # Column 0 holds the train errors, column 1 the test errors
    rsme_train_aver, rsme_test_aver = np.mean(rsme_scores, axis=0)
    mae_train_aver, mae_test_aver = np.mean(mae_scores, axis=0)

    return (rsme_train_aver, rsme_test_aver), (mae_train_aver, mae_test_aver)

## Naive Approaches

In [9]:
class NaiveApproach:
    """
    Minimal model wrapper exposing caller-supplied fit and predict callables.

    Both callables default to a no-op that accepts any positional arguments
    and returns None.
    """
    def __init__(self, fit = lambda *args: None, predict = lambda *args: None):
        # Stored as instance attributes, so they are called without an implicit self.
        self.fit = fit
        self.predict = predict

In [10]:
# Features are the (user, movie) pair; the target is the rating.
X = ratings.loc[:, ['UserID', 'MovieID']]
y = ratings.loc[:, 'Rating']

### Global average rating

$$
R_{global}(User, Item)=mean(\text{all ratings})
$$

In [11]:
# Mean of every rating in the dataset
r_global = ratings.loc[:, 'Rating'].mean()
r_global

3.581564453029317

In [12]:
# Cross validation
global_average_rating_model = NaiveApproach(predict=lambda *args: r_global)
(_, rsme_test) , (_, mae_test) = cross_validation(global_average_rating_model, X, y)

print(f'RSME: {round(rsme_test, 3)}\nMAE: {round(mae_test, 3)}')

RSME: 1.117
MAE: 0.934


### Average rating per Item

$$
R_{item}(User, Item)=mean(\text{all ratings for Item})
$$

In [13]:
def r_item(item):
    """
    Average rating of one movie.

    :param item: MovieID to look up in the ratings frame
    :return: Mean 'Rating' over the rows whose 'MovieID' equals item,
             or r_global when there are no such rows
    """
    item_ratings = ratings.loc[ratings['MovieID'] == item, 'Rating']
    if item_ratings.empty:
        return r_global
    return item_ratings.mean()

In [14]:
# Compute average rates of all items
R_i = {}

for movieID in ratings['MovieID'].unique():
    R_i[movieID] = r_item(movieID)

In [15]:
# Cross validation
average_rating_item_model = NaiveApproach(predict=lambda X: X['MovieID'].apply(lambda x: R_i[x]))
(_, rsme_test) , (_, mae_test) = cross_validation(average_rating_item_model, X, y)

print(f'RSME: {round(rsme_test, 3)}\nMAE: {round(mae_test, 3)}')

RSME: 0.975
MAE: 0.779


### Average rating per User

$$
R_{user}(User, Item)=mean(\text{all ratings by User})
$$

In [16]:
def r_user(user):
    """
    Average rating given by one user.

    :param user: UserID to look up in the ratings frame
    :return: Mean 'Rating' over the rows whose 'UserID' equals user,
             or r_global when there are no such rows
    """
    user_ratings = ratings.loc[ratings['UserID'] == user, 'Rating']
    if user_ratings.empty:
        return r_global
    return user_ratings.mean()

In [17]:
# Compute average rates of all users
R_u = {}

for userID in ratings['UserID'].unique():
    R_u[userID] = r_user(userID)

In [18]:
# Cross validation
average_rating_user_model = NaiveApproach(predict=lambda X: X['UserID'].apply(lambda x: R_u[x]))
(_, rsme_test) , (_, mae_test) = cross_validation(average_rating_user_model, X, y)

print(f'RSME: {round(rsme_test, 3)}\nMAE: {round(mae_test, 3)}')

RSME: 1.028
MAE: 0.823


### Optimal Linear Combination of 2 averages

$$
R_{user-item}(User, Item) = \alpha * R_{user}(User, Item) + \beta * R_{item}(User, Item) + \gamma
$$

In [59]:
class LinearCombination:
    """
    Linear blend of the per-user and per-item average ratings.

    Predicts alpha * R_u[user] + beta * R_i[item] + gamma, where the
    coefficients are the least-squares solution found by fit and R_u / R_i
    are the notebook-level dictionaries of average ratings.
    """
    # NOTE(review): R_u and R_i are built from all ratings, so when this model
    # is cross-validated the averages already include the test-fold ratings.

    def fit(self, X, y):
        """
        Solve for alpha, beta and gamma by least squares.

        :param X: DataFrame with 'UserID' and 'MovieID' columns
        :param y: Series of observed ratings, one per row of X
        """
        targets = y.to_numpy()
        r_u, r_i = self._lookup_averages(X)

        # Design matrix columns: user average, item average, constant 1 (for gamma)
        A = np.column_stack([r_u.to_numpy(), r_i.to_numpy(), np.ones(len(X))])
        self.alpha, self.beta, self.gamma = np.linalg.lstsq(A, targets, rcond=None)[0]

    def formula(self, r_u, r_i):
        """
        Apply the fitted linear combination.

        :param r_u: Average rating(s) of the user(s)
        :param r_i: Average rating(s) of the item(s)
        :return: alpha * r_u + beta * r_i + gamma
        """
        return self.alpha * r_u + self.beta * r_i + self.gamma

    def predict(self, X):
        """
        Predict a rating for every row of X using the fitted coefficients.

        :param X: DataFrame with 'UserID' and 'MovieID' columns
        :return: Series of predicted ratings indexed like X
        """
        r_u, r_i = self._lookup_averages(X)
        return self.formula(r_u, r_i)

    @staticmethod
    def _lookup_averages(X):
        """Return the (user average, item average) Series for the rows of X."""
        r_u = X['UserID'].apply(lambda user_id: R_u[user_id])
        r_i = X['MovieID'].apply(lambda movie_id: R_i[movie_id])
        return r_u, r_i

In [60]:
# Cross validation
linear_combination_model = LinearCombination()
(_, rsme_test) , (_, mae_test) = cross_validation(linear_combination_model, X, y)

print(f'RSME: {round(rsme_test, 3)}\nMAE: {round(mae_test, 3)}')

RSME: 0.916
MAE: 0.726
