# Recommender Systems

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MultiLabelBinarizer
from sklearn.metrics import mean_squared_error
import chardet

## Loading data

It appears that there is inconsistency in the text encoding used in various data files. As a result, we must verify the encoding to ensure accurate data reading from these files.

In [2]:
def get_file_encoding(file_path):
    """
    This function checks the text enconding used in a particular file
    
    :param file_path: The file path you wish to examine for its encoding
    :return: String containing encoding type
    """
    with open(file_path, 'rb') as f:
        result = chardet.detect(f.read())
        return result['encoding']

In [3]:
# Loading ratings data
ratings_path = "./data/ratings.dat"
ratings = pd.read_csv(ratings_path, delimiter="::", header=None, engine='python', encoding= get_file_encoding(ratings_path))
ratings = ratings.rename(columns={0: "UserID", 1: "MovieID", 2: "Rating", 3:"Timestamp"})# Set ratings column names

ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [4]:
# Loading movies data
movies_path = "./data/movies.dat"
movies = pd.read_csv(movies_path, delimiter="::", header=None, engine='python', encoding= get_file_encoding(movies_path))
movies = movies.rename(columns={0: "MovieID", 1: "Title", 2: "Genres"})

movies.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
# Loading users data
users_path = "./data/users.dat"
users = pd.read_csv(users_path, delimiter="::", header=None, engine='python', encoding=get_file_encoding(users_path))
users = users.rename(columns={0: "UserID", 1: "Gender", 2: "Age", 3: "Occupation", 4: "Zip-code"})

users.head()

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


## Naive Approaches

### Global average rating

$$
R_{global}(User, Item)=mean(\text{all ratings})
$$

In [12]:
r_global = ratings['Rating'].mean()
r_global

3.581564453029317

### Average rating per Item

$$
R_{item}(User, Item)=mean(\text{all ratings for Item})
$$

In [13]:
def r_item(item):
    ratings_item = ratings[(ratings['MovieID'] == item)]
    return ratings_item['Rating'].mean()
    
r_item(1193)

4.390724637681159

### Average rating per User

$$
R_{User}(User, Item)=mean(\text{all ratings for User})
$$

In [14]:
def r_user(user):
    ratings_user = ratings[(ratings['UserID'] == user)]
    return ratings_user['Rating'].mean()
    
r_user(1)

4.188679245283019

### Optimal Linear Combination of 2 averages

In [15]:
# Compute average rates of all users
R_u = {}

for userID in ratings['UserID'].unique():
    R_u[userID] = r_user(userID)

In [16]:
# Compute average rates of all items
R_i = {}

for movieID in ratings['MovieID'].unique():
    R_i[movieID] = r_item(movieID)

In [17]:
# Params training data
X = ratings[['UserID', 'MovieID']].to_numpy()
Y = ratings['Rating'].to_numpy()

#### Including bias term

$$
R_{user-item}(User, Item) = \alpha * R_{user}(User, Item) + \beta * R_{item}(User, Item) + \gamma
$$

In [18]:
def r_user_item_with_bias(alpha, beta, gamma, user, item):
    r_u = R_u[user]
    r_i = R_i[item]
    return alpha * r_u + beta * r_i + gamma

In [68]:
def learn_params_with_bias(X, Y):
    gamma = np.random.uniform(0, 0.1)
    beta = np.random.uniform(0, 0.1)
    alpha = np.random.uniform(0, 0.1)
    lrate = 0.01
    
    for epoch in range(1, 10):
        for x, y in zip(X, Y):
            r_u = R_u[x[0]]
            r_i = R_i[x[1]]
            
            y_pred = alpha * r_u + beta * r_i + gamma

            d_l = -2 * (y - y_pred)
            d_alpha = r_u * d_l
            d_beta = r_i * d_l
            d_gamma = d_l

            # Update params
            alpha -= lrate * d_alpha
            beta -= lrate * d_beta
            gamma -= lrate * d_gamma
        
    
    return alpha, beta, gamma

In [69]:
# Estimate params with bias
alpha, beta, gamma = learn_params_with_bias(X, Y)

In [70]:
Y_pred = [r_user_item_with_bias(alpha, beta, gamma, x[0], x[1]) for x,y in zip(X,Y)]
print(f'MSE: {mean_squared_error(Y, Y_pred)}')

MSE: 0.9739831459715035


#### Excluding bias term

$$
R_{user^{*}-item}(User, Item) = \alpha * R_{user}(User, Item) + \beta_{user} * R_{item}(User, Item)
$$

In [22]:
def r_user_item_without_bias(alpha, beta, user, item):
    r_u = R_u[user]
    r_i = R_i[item]
    return alpha * r_u + beta * r_i

In [64]:
def learn_params_without_bias(X, Y):
    beta = np.random.uniform(0, 0.1)
    alpha = np.random.uniform(0, 0.1)
    lrate = 0.01
    
    for epoch in range(1, 10):
        for x, y in zip(X, Y):
            r_u = R_u[x[0]]
            r_i = R_i[x[1]]
            
            y_pred = alpha * r_u + beta * r_i + gamma

            d_l = -2 * (y -  y_pred)
            d_alpha = r_u * d_l
            d_beta = r_i * d_l

            # Update params
            alpha -= lrate * d_alpha
            beta -= lrate * d_beta
        
        Y_pred = [r_user_item_without_bias(alpha, beta, x[0], x[1]) for x,y in zip(X,Y)]
    
    return alpha, beta

In [65]:
# Estimate params without bias
alpha, beta = learn_params_without_bias(X, Y)

In [66]:
Y_pred = [r_user_item_without_bias(alpha, beta, x[0], x[1]) for x,y in zip(X,Y)]
print(f'MSE: {mean_squared_error(Y, Y_pred)}')

MSE: 12.83186075662626
