# Recommender Systems

In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MultiLabelBinarizer
from sklearn.metrics import mean_squared_error
from sklearn.utils import shuffle
import chardet
from tqdm import tqdm

np.random.seed(42)

## Loading data

It appears that there is inconsistency in the text encoding used in various data files. As a result, we must verify the encoding to ensure accurate data reading from these files.

In [2]:
def get_file_encoding(file_path):
    """
    This function checks the text enconding used in a particular file
    
    :param file_path: The file path you wish to examine for its encoding
    :return: String containing encoding type
    """
    with open(file_path, 'rb') as f:
        result = chardet.detect(f.read())
        return result['encoding']

In [3]:
# Loading ratings data
ratings_path = "./data/ratings.dat"
ratings = pd.read_csv(ratings_path, delimiter="::", header=None, engine='python', encoding= get_file_encoding(ratings_path))
ratings = ratings.rename(columns={0: "UserID", 1: "MovieID", 2: "Rating", 3:"Timestamp"})# Set ratings column names

ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [4]:
# Loading movies data
movies_path = "./data/movies.dat"
movies = pd.read_csv(movies_path, delimiter="::", header=None, engine='python', encoding= get_file_encoding(movies_path))
movies = movies.rename(columns={0: "MovieID", 1: "Title", 2: "Genres"})

movies.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
# Loading users data
users_path = "./data/users.dat"
users = pd.read_csv(users_path, delimiter="::", header=None, engine='python', encoding=get_file_encoding(users_path))
users = users.rename(columns={0: "UserID", 1: "Gender", 2: "Age", 3: "Occupation", 4: "Zip-code"})

users.head()

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


## k-fold Cross Validation

In [6]:
def rsme(y_true, y_pred):
    return math.sqrt(np.square(np.subtract(y_true, y_pred)).mean())

In [7]:
def mae(y_true, y_pred):
    return np.absolute(np.subtract(y_true, y_pred)).mean()

In [8]:
def cross_validation(model, X, y, k = 5):
    # Shuffle data
    X, y = shuffle(X, y, random_state=42)
    
    # Split data
    n = len(X)
    X_folds = [X.iloc[(i - 1) * (n // k):i * (n // k),:] for i in range(1, k + 1)]
    y_folds = [y.iloc[(i - 1) * (n // k):i * (n // k)] for i in range(1, k + 1)]
    
    # Initialize array to store RSME calculations
    rmse_ = np.empty((0, 2), float)
    mae_ = np.empty((0, 2), float)
    
    
    for i, (X_test, y_test) in enumerate(zip(X_folds, y_folds)):
        X_train = pd.concat([X for j, X in enumerate(X_folds) if i != j])
        y_train = pd.concat([y for j, y in enumerate(y_folds) if i != j])
        
        # Train model
        model.fit(X_train, y_train)
        
        # Evaluate on training and test set
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)
        
        # Calculate errors
        rmse_train = rsme(y_train, y_train_pred) 
        rmse_test = rsme(y_test, y_test_pred)
        mae_train = mae(y_train, y_train_pred)
        mae_test = mae(y_test, y_test_pred)
        
        rmse_ = np.append(rmse_, np.array([[rmse_train, rmse_test]]), axis = 0)
        mae_ = np.append(mae_, np.array([[mae_train, mae_test]]), axis = 0)
    
    rmse_train_aver = rmse_[:, 0].mean()
    rmse_test_aver = rmse_[:, 1].mean()
    mae_train_aver = mae_[:, 0].mean()
    mae_test_aver = mae_[:, 1].mean()
    
    return (rmse_train_aver, rmse_test_aver), (mae_train_aver, mae_test_aver)

## Naive Approaches

In [9]:
class NaiveApproach:
    def __init__(self, fit = lambda *args: None, predict = lambda *args: None):
        self.predict = predict
        self.fit = fit

In [10]:
X = ratings[['UserID', 'MovieID']]
y = ratings['Rating']

### Global average rating

$$
R_{global}(User, Item)=mean(\text{all ratings})
$$

In [11]:
r_global = ratings['Rating'].mean()
r_global

3.581564453029317

In [12]:
# Cross validation
global_average_rating_model = NaiveApproach(predict=lambda *args: r_global)
(_, rmse_test) , (_, mae_test) = cross_validation(global_average_rating_model, X, y)

print(f'RSME: {round(rmse_test, 3)}\nMAE: {round(mae_test, 3)}')

RSME: 1.117
MAE: 0.934


### Average rating per Item

$$
R_{item}(User, Item)=mean(\text{all ratings for Item})
$$

In [13]:
def r_item(item):
    ratings_item = ratings[(ratings['MovieID'] == item)]
    return ratings_item['Rating'].mean() if len(ratings_item) > 0 else r_global

In [14]:
# Compute average rates of all items
R_i = {}

for movieID in ratings['MovieID'].unique():
    R_i[movieID] = r_item(movieID)

In [15]:
# Cross validation
average_rating_item_model = NaiveApproach(predict=lambda X: X['MovieID'].apply(lambda x: R_i[x]))
(_, rmse_test) , (_, mae_test) = cross_validation(average_rating_item_model, X, y)

print(f'RSME: {round(rmse_test, 3)}\nMAE: {round(mae_test, 3)}')

RSME: 0.975
MAE: 0.779


### Average rating per User

$$
R_{User}(User, Item)=mean(\text{all ratings for User})
$$

In [16]:
def r_user(user):
    ratings_user = ratings[(ratings['UserID'] == user)]
    return ratings_user['Rating'].mean() if len(ratings_user) > 0 else r_global

In [17]:
# Compute average rates of all users
R_u = {}

for userID in ratings['UserID'].unique():
    R_u[userID] = r_user(userID)

In [18]:
# Cross validation
average_rating_user_model = NaiveApproach(predict=lambda X: X['UserID'].apply(lambda x: R_u[x]))
(_, rmse_test) , (_, mae_test) = cross_validation(average_rating_user_model, X, y)

print(f'RSME: {round(rmse_test, 3)}\nMAE: {round(mae_test, 3)}')

RSME: 1.028
MAE: 0.823


### Optimal Linear Combination of 2 averages

$$
R_{user-item}(User, Item) = \alpha * R_{user}(User, Item) + \beta * R_{item}(User, Item) + \gamma
$$

In [19]:
class LinearCombination:
    def fit(self, X, y):
        y = y.to_numpy()
        
        X_u = X['UserID'].apply(lambda x: R_u[x]).to_numpy()
        X_i = X['MovieID'].apply(lambda x: R_i[x]).to_numpy()
        X = np.vstack([X_u, X_i])
        
        A = np.vstack([X, np.ones(X.shape[1])]).T
        self.alpha, self.beta, self.gamma = np.linalg.lstsq(A, y, rcond=None)[0]
    
    def formula(self, r_u, r_i):
        return 
    
    def predict(self, X):
        r_u = X['UserID'].apply(lambda x: R_u[x])
        r_i = X['MovieID'].apply(lambda x: R_i[x])
        
        return self.alpha * r_u + self.beta * r_i + self.gamma

In [20]:
# Cross validation
linear_combination_model = LinearCombination()
(_, rmse_test) , (_, mae_test) = cross_validation(linear_combination_model, X, y)

print(f'RSME: {round(rmse_test, 3)}\nMAE: {round(mae_test, 3)}')

RSME: 0.916
MAE: 0.726


## UV Decomposition

In [21]:
class roundrobin:
    def __init__(self, U, V):
        self.U = U
        self.V = V
        
        self.maxrows1, self.maxcols1 = U.shape[0], U.shape[1]
        self.maxrows2, self.maxcols2 = V.shape[0], V.shape[1]
        
        
    def __iter__(self):
        self.iterU = True
        self.row1 = self.col1 = 0
        self.row2 = self.col2 = 0
        return self

    def __next__(self):
        if self.iterU:
            if self.row1 > self.maxrows1 - 1:
                self.row1 = 0
                self.col1 += 1
                if self.col1 > self.maxcols1 - 1:
                    raise StopIteration

            next_ = (self.iterU, self.row1, self.col1)
            self.iterU = False
            self.row1 += 1
            return next_
        else:    
            if self.row2 > self.maxrows2 - 1:
                self.row2 = 0
                self.col2 += 1
                if self.col2 > self.maxcols2 - 1:
                    raise StopIteration

            next_ = (self.iterU, self.row2, self.col2)
            self.iterU = True
            self.row2 += 1
            return next_

### Preprocessing

In [22]:
# n Users
unique_users = ratings['UserID'].unique()
n = len(unique_users)

# m Items
unique_items = ratings['MovieID'].unique()
m = len(unique_items)

# Initialize utility matrix
M = np.full((n,m), np.nan)

In [23]:
# Map ids to index & vice-versa
user_to_index = {user_id: idx for idx, user_id in enumerate(unique_users)}
index_to_user = {idx: user_id for idx, user_id in enumerate(unique_users)}

item_to_index = {item_id: idx for idx, item_id in enumerate(unique_items)}
index_to_item = {idx: item_id for idx, item_id in enumerate(unique_items)}

In [24]:
# Populate matrix M
for _, row in ratings.iterrows():
    user_id = row['UserID']
    item_id = row['MovieID']
    
    norm = (R_u[user_id] + R_i[item_id]) / 2
    
    M[user_to_index[user_id], item_to_index[item_id]] = row['Rating'] - norm

In [25]:
# Define loss function
def rmse_2d(M, M_pred):
    mask = ~np.isnan(M)
    return math.sqrt(np.mean(np.square(M - M_pred), where=~np.isnan(M)))

### Initialization

In [26]:
# Length of short sizes of U & V
d = 10

# Mean of non-blank values in M
a = np.mean(M, where=~np.isnan(M))

initial_value = math.sqrt(a/d)

U = np.full((n, d), initial_value)
V = np.full((d, m), initial_value)

### Performing the Optimization

#### Optimizing elements of U
$$
x = \frac{\sum_{j} v_{sj}(m_{rj} - \sum_{k \neq s} u_{rk}v_{kj})}{\sum_{j} v^{2}_{sj}}
$$


In [27]:
def optimize_x(M, U, V, r, s):
    urk = np.delete(U[r, :], s, axis=0)
    vkj = np.delete(V, s, axis=0)

    vsj = V[s, :]

    numerator = vsj * (M[r, :] - np.dot(urk.reshape(-1,1).T, vkj))
    numerator = np.sum(numerator, where=~np.isnan(numerator))

    denominator = np.square(vsj)
    denominator = np.sum(denominator)

    return numerator / denominator

#### Optimizing elements of V
$$
y = \frac{\sum_{i} u_{ir}(m_{is} - \sum_{k \neq r} u_{ik}v_{ks})}{\sum_{i} u^{2}_{ir}}
$$

In [28]:
def optimize_y(M, U, V, r, s):   
    uik = np.delete(U, r, axis=1)
    vks = np.delete(V[:,s], r, axis=0)
    
    uir = U[:,r]
   
    numerator = uir * (M[:,s] - np.dot(uik, vks))
    numerator = np.sum(numerator, where=~np.isnan(numerator))

    denominator = np.square(uir)
    denominator = np.sum(denominator, where=~np.isnan(denominator))

    return numerator / denominator

#### Optimizing decompositions U and V

In [29]:
UV = np.dot(U, V)
print(f'Initial RMSE: {rmse_2d(M, UV)}')

Initial RMSE: 0.9515933193173758


In [30]:
same_err = 0
last_err = float('inf')
for _ in tqdm(range(15)):
    # Stop if error doesn't improve for 3 epochs
    if same_err >= 3:
      break

    # Training
    for isU, r, s in roundrobin(U, V):
        if isU:
            U[r,s] = optimize_x(M, U, V, r, s)
        else:
            V[r,s] = optimize_y(M, U, V, r, s)
    
    # Monitoring
    UV = np.dot(U, V)
    err = round(rmse_2d(M, UV),3)
    if err == last_err:
      same_err += 1
    last_err = err

    print(f'RMSE: {err}')

  7%|██▉                                         | 1/15 [00:14<03:16, 14.05s/it]

RMSE: 0.914


 13%|█████▊                                      | 2/15 [00:26<02:53, 13.32s/it]

RMSE: 0.903


 20%|████████▊                                   | 3/15 [00:40<02:39, 13.33s/it]

RMSE: 0.9


 27%|███████████▋                                | 4/15 [00:52<02:22, 12.98s/it]

RMSE: 0.898


 33%|██████████████▋                             | 5/15 [01:04<02:05, 12.56s/it]

RMSE: 0.897


 40%|█████████████████▌                          | 6/15 [01:18<01:56, 12.94s/it]

RMSE: 0.895


 47%|████████████████████▌                       | 7/15 [01:33<01:49, 13.75s/it]

RMSE: 0.894


 53%|███████████████████████▍                    | 8/15 [01:47<01:35, 13.68s/it]

RMSE: 0.893


 60%|██████████████████████████▍                 | 9/15 [02:00<01:21, 13.54s/it]

RMSE: 0.892


 67%|████████████████████████████▋              | 10/15 [02:12<01:05, 13.20s/it]

RMSE: 0.892


 73%|███████████████████████████████▌           | 11/15 [02:25<00:52, 13.07s/it]

RMSE: 0.891


 80%|██████████████████████████████████▍        | 12/15 [02:39<00:39, 13.21s/it]

RMSE: 0.891


 87%|█████████████████████████████████████▎     | 13/15 [02:51<00:26, 13.19s/it]

RMSE: 0.891





In [31]:
UV = np.dot(U, V)
print(f'Optimized RMSE: {rmse_2d(M, UV)}')

Optimized RMSE: 0.8910636397215524


In [39]:
# Cross validation
def decomp_predict(X_test):
    y_pred = []
    for _, x in X_test.iterrows():
        user_id = x['UserID']
        item_id = x['MovieID']
        norm = (R_u[user_id] + R_i[item_id]) / 2
        y_pred.append(UV[user_to_index[user_id], item_to_index[item_id]] + norm)
        
    return y_pred
    
uv_decomposition_model = NaiveApproach(predict=decomp_predict)
(_, rmse_test) , (_, mae_test) = cross_validation(uv_decomposition_model, X, y)

print(f'RSME: {round(rmse_test, 3)}\nMAE: {round(mae_test, 3)}')

RSME: 0.891
MAE: 0.708
