# Recommender Systems

In [30]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import chardet
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns
import umap
from sklearn.preprocessing import StandardScaler

np.random.seed(42)

## Loading data

The data files do not all use the same text encoding, so we detect each file's encoding before reading it to make sure the data is read correctly.

In [31]:
def get_file_encoding(file_path):
    """
    Detect the text encoding used by a file.

    The whole file is read as raw bytes and handed to ``chardet.detect``.

    :param file_path: Path of the file whose encoding should be detected
    :return: The value of the 'encoding' entry reported by chardet
    """
    with open(file_path, 'rb') as handle:
        raw_bytes = handle.read()

    detection = chardet.detect(raw_bytes)
    return detection['encoding']

In [32]:
# Load the ratings file ("::"-separated, no header row) and name its columns.
ratings_path = "./ml-1m/ml-1m/ratings.dat"
ratings_encoding = get_file_encoding(ratings_path)
ratings = pd.read_csv(
    ratings_path,
    delimiter="::",
    header=None,
    engine='python',
    encoding=ratings_encoding,
).rename(columns={0: "UserID", 1: "MovieID", 2: "Rating", 3: "Timestamp"})

ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [33]:
# Load the movies file ("::"-separated, no header row) and name its columns.
movies_path = "./ml-1m/ml-1m/movies.dat"
movies_encoding = get_file_encoding(movies_path)
movies = pd.read_csv(
    movies_path,
    delimiter="::",
    header=None,
    engine='python',
    encoding=movies_encoding,
).rename(columns={0: "MovieID", 1: "Title", 2: "Genres"})

movies.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [34]:
# Load the users file ("::"-separated, no header row) and name its columns.
users_path = "./ml-1m/ml-1m/users.dat"
users_encoding = get_file_encoding(users_path)
users = pd.read_csv(
    users_path,
    delimiter="::",
    header=None,
    engine='python',
    encoding=users_encoding,
).rename(columns={0: "UserID", 1: "Gender", 2: "Age", 3: "Occupation", 4: "Zip-code"})

users.head()

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


## Pre-processing Data

In [35]:
# Single LabelEncoder instance; the following cells call fit_transform on it
# (re-fitting it each time) for every ID column / axis they encode.
le = LabelEncoder()

In [36]:
# Keep only the users that appear in the ratings table.
# .copy() makes the result an independent frame, so the .loc assignment below
# modifies it unambiguously instead of writing into a slice of `users`
# (which triggers pandas' SettingWithCopyWarning).
unique_users_ids = ratings['UserID'].unique()
filtered_users = users[users['UserID'].isin(unique_users_ids)].copy()

# Keep only the movies that appear in the ratings table (same reasoning for .copy()).
unique_movies_ids = ratings['MovieID'].unique()
filtered_movies = movies[movies['MovieID'].isin(unique_movies_ids)].copy()

# Replace the original IDs with label-encoded IDs (the encoder is re-fitted per column).
filtered_movies.loc[:, 'MovieID'] = le.fit_transform(filtered_movies['MovieID'])
filtered_users.loc[:, 'UserID'] = le.fit_transform(filtered_users['UserID'])

In [None]:
# Renumber the rows 0..n-1 after filtering. Rebinding (instead of inplace=True)
# avoids mutating a frame that was derived by slicing and keeps the cell safe to re-run.
filtered_movies = filtered_movies.reset_index(drop=True)
filtered_users = filtered_users.reset_index(drop=True)

In [37]:
# Build the user x movie rating matrix; cells with no rating are filled with 0.
user_movie_ratings = (
    ratings
    .pivot(index='UserID', columns='MovieID', values='Rating')
    .fillna(0)
)

In [38]:
# Label-encode the column labels (MovieIDs) first ...
encoded_movie_labels = le.fit_transform(user_movie_ratings.columns)
user_movie_ratings.columns = encoded_movie_labels

# ... then re-fit the same encoder on the row labels (UserIDs).
encoded_user_labels = le.fit_transform(user_movie_ratings.index)
user_movie_ratings.index = encoded_user_labels

# Matrix Factorization

In [39]:
class MatrixFactorization:
    """Regularised matrix factorisation of a user x movie rating matrix.

    The rating matrix is approximated by ``U @ M.T``. Only cells whose value is
    greater than zero count as observed ratings; every other cell is ignored by
    the loss. Predictions are always clipped to the range [1, 5].

    Attributes set by :meth:`fit`:
        U: user factor matrix, shape (n_users, K).
        M: movie factor matrix, shape (n_movies, K).
        predictions: ``U @ M.T`` clipped to [1, 5].
        zero_rows / zero_columns: positional indices of the training rows /
            columns for which ``.any()`` is False (no non-zero entry).
        train_set_values: the raw value array of the training frame.
    """

    def __init__(self, K=10, max_iter=75, eta=0.005, lamda=0.05, max_gradient=1.0):
        """
        :param K: Number of latent features
        :param max_iter: Maximum number of full-batch gradient iterations
        :param eta: Learning rate
        :param lamda: Regularization parameter
        :param max_gradient: Element-wise bound; gradients are clipped to
            [-max_gradient, max_gradient] before each update
        """
        self.K = K  # Number of latent features
        self.max_iter = max_iter  # Maximum number of iterations
        self.eta = eta  # Learning rate
        self.lamda = lamda  # Regularization parameter
        self.max_gradient = max_gradient  # Element-wise gradient clipping bound

    def fit(self, train_set):
        """Fit the factor matrices to the observed (> 0) entries of ``train_set``.

        Prints the training RMSE/MAE after every iteration. Training stops early
        once the RMSE has failed to decrease for at least two consecutive
        iterations AND the RMSE is at most 1.

        :param train_set: DataFrame of shape (n_users, n_movies); values <= 0
            (and NaN) are treated as "not rated"
        :return: ``(rmse, mae)`` on the observed training entries for the final
            factors. With ``max_iter <= 0`` these are the metrics of the random
            initialisation; with no observed entries both are NaN.
        """
        raw_values = train_set.values
        self.train_set_values = raw_values

        # Rows / columns without any non-zero entry: the model has no signal
        # for them, so test_rmse_mae() leaves them out of the evaluation.
        self.zero_rows = np.where(~raw_values.any(axis=1))[0]
        self.zero_columns = np.where(~raw_values.any(axis=0))[0]

        # Loop-invariant: NaN-free training values and the mask of observed ratings.
        train_values = np.where(np.isnan(raw_values), 0, raw_values)
        observed = train_values > 0

        # Positions and values of all observed ratings (row-major order),
        # gathered once with array operations instead of per-cell lookups.
        probe_rows, probe_cols = np.nonzero(observed)
        probe_ratings = train_values[probe_rows, probe_cols]

        N, Z = train_set.shape  # Dimensions of the user-item matrix

        self.U = np.random.rand(N, self.K)  # Initialize user matrix randomly
        self.M = np.random.rand(Z, self.K)  # Initialize movie matrix randomly
        self.predictions = np.clip(np.dot(self.U, self.M.T), 1, 5)

        # Metrics of the initial factors; these are what is returned if the
        # loop below does not run at all (max_iter <= 0).
        probe_rmse, probe_mae = self._probe_metrics(probe_rows, probe_cols, probe_ratings)

        prev_rmse = float('inf')
        consecutive_increase_count = 0

        for step in range(self.max_iter):
            # Guard the product against NaNs that may have crept into the factors.
            U_values = np.where(np.isnan(self.U), 0, self.U)
            M_T_values = np.where(np.isnan(self.M.T), 0, self.M.T)

            # Residuals on observed cells only (unobserved cells contribute 0).
            errors = observed * (train_values - np.dot(U_values, M_T_values))

            # Both gradients are computed from the current factors before
            # either factor matrix is updated.
            gradient_U = 2 * (np.dot(errors, self.M) - self.lamda * self.U)
            gradient_M = 2 * (np.dot(errors.T, self.U) - self.lamda * self.M)

            gradient_U = np.clip(gradient_U, -self.max_gradient, self.max_gradient)
            gradient_M = np.clip(gradient_M, -self.max_gradient, self.max_gradient)

            self.U += self.eta * gradient_U
            self.M += self.eta * gradient_M

            self.predictions = np.clip(np.dot(self.U, self.M.T), 1, 5)

            # Training error on the observed ratings
            probe_rmse, probe_mae = self._probe_metrics(probe_rows, probe_cols, probe_ratings)
            print("Iteration:", step + 1, " RMSE:", probe_rmse, " MAE:", probe_mae)

            # Check for convergence by comparing RMSE with the previous iteration
            if probe_rmse >= prev_rmse:
                consecutive_increase_count += 1
                if consecutive_increase_count >= 2 and probe_rmse <= 1:
                    print("Converged. RMSE did not decrease for 2 consecutive iterations.")
                    print('Train RMSE:', probe_rmse)
                    print('Train MAE:', probe_mae)
                    break
            else:
                consecutive_increase_count = 0

            prev_rmse = probe_rmse

        self.predictions = np.clip(np.dot(self.U, self.M.T), 1, 5)

        return probe_rmse, probe_mae

    def predict(self):
        """Return the clipped prediction matrix computed by the last ``fit``."""
        return self.predictions

    def test_rmse_mae(self, test_data):
        """Evaluate the current predictions on held-out ratings.

        Only cells with a test rating > 0 are scored, and rows / columns that
        had no non-zero entry in the training set are excluded.

        :param test_data: DataFrame with the same shape as the training frame
        :return: ``(rmse, mae)``; ``(nan, nan)`` if no cell qualifies
        """
        test_values = test_data.values
        n_rows, n_cols = test_values.shape

        keep_rows = ~np.isin(np.arange(n_rows), self.zero_rows)
        keep_cols = ~np.isin(np.arange(n_cols), self.zero_columns)
        non_zero_mask = (test_values > 0) & keep_rows[:, np.newaxis] & keep_cols[np.newaxis, :]

        if not non_zero_mask.any():
            return float('nan'), float('nan')

        residuals = self.predictions[non_zero_mask] - test_values[non_zero_mask]

        rmse = np.sqrt(np.mean(residuals ** 2))
        mae = np.mean(np.abs(residuals))

        return rmse, mae

    def get_rmse(self, probe_subset):
        """RMSE of the current predictions over ``probe_subset``.

        :param probe_subset: iterable of ``(row, column, actual_rating)`` triples
        :return: the RMSE, or NaN for an empty subset
        """
        return self._probe_metrics(*self._unpack_probe(probe_subset))[0]

    def get_mae(self, probe_subset):
        """MAE of the current predictions over ``probe_subset``.

        :param probe_subset: iterable of ``(row, column, actual_rating)`` triples
        :return: the MAE, or NaN for an empty subset
        """
        return self._probe_metrics(*self._unpack_probe(probe_subset))[1]

    @staticmethod
    def _unpack_probe(probe_subset):
        """Split (row, column, rating) triples into three parallel arrays."""
        triples = list(probe_subset)
        if not triples:
            no_index = np.empty(0, dtype=np.intp)
            return no_index, no_index, np.empty(0)
        rows, cols, ratings = zip(*triples)
        return (
            np.asarray(rows, dtype=np.intp),
            np.asarray(cols, dtype=np.intp),
            np.asarray(ratings, dtype=float),
        )

    def _probe_metrics(self, rows, cols, ratings):
        """Return (rmse, mae) of ``self.predictions`` at the given positions."""
        if len(ratings) == 0:
            return float('nan'), float('nan')
        residuals = self.predictions[rows, cols] - ratings
        return np.sqrt(np.mean(residuals ** 2)), np.mean(np.abs(residuals))
    



In [40]:
# Test on a small subset (TODO: delete this cell afterwards)
# movies1=movies.loc[:80,:]
# users1=users.loc[:40,:]
# user_movie_ratings1=user_movie_ratings.loc[:80,:40]

# model_v1=MatrixFactorization()
# data_df_v1=user_movie_ratings1
# model_v1.fit(data_df_v1)
# # model_v.predict()

## Cross validation

In [41]:
def cross_validation(model, data_df, num_folds=5):
    """Run K-fold cross-validation of ``model`` over the ratings in ``data_df``.

    Every non-zero cell of ``data_df`` is one (user, movie, rating) sample.
    The samples are shuffled and split into ``num_folds`` folds (fixed
    ``random_state=42``). For each fold, a training matrix and a test matrix of
    the same shape as ``data_df`` are built (zero everywhere except at that
    fold's samples), ``model.fit`` is called on the training matrix and
    ``model.test_rmse_mae`` on the test matrix. Per-fold and averaged
    RMSE / MAE are printed.

    :param model: object providing ``fit(train_df) -> (rmse, mae)`` and
        ``test_rmse_mae(test_df) -> (rmse, mae)``; the same instance is re-fitted
        for every fold
    :param data_df: user x movie rating DataFrame, 0 meaning "no rating"
    :param num_folds: number of folds
    :return: None (results are printed)
    """
    ratings_matrix = data_df.values
    num_users, num_movies = data_df.shape

    # Positions and values of all non-zero cells, in row-major order. This is
    # the same sample order a nested user/movie loop would produce, so the fold
    # assignment is unchanged, but it avoids one .iloc lookup per matrix cell.
    rated_users, rated_movies = np.nonzero(ratings_matrix != 0)
    rated_values = ratings_matrix[rated_users, rated_movies]

    def build_fold_frame(sample_index):
        """Dense (num_users x num_movies) frame holding only the selected samples."""
        fold_matrix = np.zeros((num_users, num_movies))
        fold_matrix[rated_users[sample_index], rated_movies[sample_index]] = rated_values[sample_index]
        return pd.DataFrame(fold_matrix)

    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

    train_sum_rmse = 0
    train_sum_mae = 0

    test_sum_rmse = 0
    test_sum_mae = 0

    for fold, (train_index, test_index) in enumerate(kf.split(rated_values), 1):
        # Build this fold's matrices on demand so that only one train/test pair
        # is alive at a time (instead of 2 * num_folds dense matrices).
        train_set = build_fold_frame(train_index)
        test_set = build_fold_frame(test_index)

        mf = model
        (train_rmse, train_mae) = mf.fit(train_set)

        (test_rmse, test_mae) = mf.test_rmse_mae(test_set)

        print('Train RMSE for fold:',fold,':', train_rmse)
        print('Train MAE for fold:',fold,':', train_mae)

        print('Test RMSE for fold:',fold,':', test_rmse)
        print('Test MAE for fold:',fold,':', test_mae)

        train_sum_rmse = train_sum_rmse + train_rmse
        train_sum_mae = train_sum_mae + train_mae

        test_sum_rmse = test_sum_rmse + test_rmse
        test_sum_mae = test_sum_mae + test_mae

    overall_train_rmse = train_sum_rmse / num_folds
    overall_train_mae = train_sum_mae / num_folds

    overall_test_rmse = test_sum_rmse / num_folds
    overall_test_mae = test_sum_mae / num_folds

    print('Overall Train RMSE:',overall_train_rmse)
    print('Overall Train MAE:',overall_train_mae)

    print('Overall Test RMSE:',overall_test_rmse)
    print('Overall Test MAE:',overall_test_mae)




    

In [42]:
# model=MatrixFactorization()
# cross_validation(model, user_movie_ratings1, num_folds=5)

## Running the cross-validation

In [43]:
# 5-fold cross-validation with the default hyper-parameters.
data_df = user_movie_ratings
model = MatrixFactorization()
cross_validation(model, data_df, num_folds=5)


Iteration: 1  RMSE: 1.6783192258639683  MAE: 1.4147181773926547
Iteration: 2  RMSE: 1.6479166447322926  MAE: 1.3857431377176015
Iteration: 3  RMSE: 1.6178707885973316  MAE: 1.357268823823823
Iteration: 4  RMSE: 1.5882224957119933  MAE: 1.3293257874134052
Iteration: 5  RMSE: 1.5590149530691895  MAE: 1.3019322849275268
Iteration: 6  RMSE: 1.5302905248228964  MAE: 1.2751277983989482
Iteration: 7  RMSE: 1.5020821218525977  MAE: 1.2489478754249856
Iteration: 8  RMSE: 1.4744220095218623  MAE: 1.2234138218132764
Iteration: 9  RMSE: 1.4473403276580874  MAE: 1.1985290946243747
Iteration: 10  RMSE: 1.42086576380658  MAE: 1.1743041328817803
Iteration: 11  RMSE: 1.3950289983131319  MAE: 1.1507640889986444
Iteration: 12  RMSE: 1.3698550120932824  MAE: 1.1279264018827824
Iteration: 13  RMSE: 1.3453694190477081  MAE: 1.1058124670734306
Iteration: 14  RMSE: 1.3215944688080912  MAE: 1.084414304677851
Iteration: 15  RMSE: 1.2985525123978419  MAE: 1.0637476952380964
Iteration: 16  RMSE: 1.276262448338610

In [44]:
# 5-fold cross-validation with 15 latent features.
data_df = user_movie_ratings
model = MatrixFactorization(K=15)
cross_validation(model, data_df, num_folds=5)


Iteration: 1  RMSE: 1.3453559404161008  MAE: 1.0770126138689624
Iteration: 2  RMSE: 1.32039543990401  MAE: 1.0560947613667626
Iteration: 3  RMSE: 1.2960030970099228  MAE: 1.0357215511899154
Iteration: 4  RMSE: 1.2722463956755972  MAE: 1.015902835141665
Iteration: 5  RMSE: 1.249182908097547  MAE: 0.9966689711931591
Iteration: 6  RMSE: 1.2268901124617844  MAE: 0.9781283622528991
Iteration: 7  RMSE: 1.2054203327196817  MAE: 0.9603257278398791
Iteration: 8  RMSE: 1.1847990477101216  MAE: 0.9432601542019601
Iteration: 9  RMSE: 1.1650584435747835  MAE: 0.9269735809170294
Iteration: 10  RMSE: 1.1462091274139234  MAE: 0.9114550420897843
Iteration: 11  RMSE: 1.1282448199348882  MAE: 0.8967254542277439
Iteration: 12  RMSE: 1.1111736945531945  MAE: 0.8827723007761215
Iteration: 13  RMSE: 1.094988907985151  MAE: 0.8695611064521335
Iteration: 14  RMSE: 1.0796990366713926  MAE: 0.8570855178626773
Iteration: 15  RMSE: 1.0652742947671752  MAE: 0.8453170641291907
Iteration: 16  RMSE: 1.0517012165513717

In [45]:
# 5-fold cross-validation with 20 latent features.
data_df = user_movie_ratings
model = MatrixFactorization(K=20)
cross_validation(model, data_df, num_folds=5)


Iteration: 1  RMSE: 1.5917782026521192  MAE: 1.2389028045029917
Iteration: 2  RMSE: 1.5688163391733632  MAE: 1.219992904595072
Iteration: 3  RMSE: 1.5449106977266873  MAE: 1.2005362687852856
Iteration: 4  RMSE: 1.5201899683465296  MAE: 1.1805857948824034
Iteration: 5  RMSE: 1.4948153668572723  MAE: 1.1603015554848
Iteration: 6  RMSE: 1.4689630230381086  MAE: 1.1398556374970341
Iteration: 7  RMSE: 1.4427785322816773  MAE: 1.1193212860696626
Iteration: 8  RMSE: 1.4163766932856567  MAE: 1.0987898305554382
Iteration: 9  RMSE: 1.389861432769145  MAE: 1.0783104365241791
Iteration: 10  RMSE: 1.3633715666377673  MAE: 1.0578811905956766
Iteration: 11  RMSE: 1.3371074312238087  MAE: 1.0377568207379912
Iteration: 12  RMSE: 1.311200576754012  MAE: 1.0180264261387748
Iteration: 13  RMSE: 1.285822325139142  MAE: 0.998812307327654
Iteration: 14  RMSE: 1.2610787800747032  MAE: 0.9801515468825024
Iteration: 15  RMSE: 1.2370257721362765  MAE: 0.9620531432906503
Iteration: 16  RMSE: 1.2137491795877298  M

# Output for Visualization

In [49]:
# Fit one model on the full rating matrix and keep its factor matrices
# (U: users, M: movies) for the visualisation step.
data_df_v = user_movie_ratings
model_v = MatrixFactorization()
model_v.fit(data_df_v)
U, M = model_v.U, model_v.M

Iteration: 1  RMSE: 1.6723599110932035  MAE: 1.4053197534357433
Iteration: 2  RMSE: 1.6424333442076908  MAE: 1.3768959859366652
Iteration: 3  RMSE: 1.6128689302934212  MAE: 1.348949091650833
Iteration: 4  RMSE: 1.583708783100971  MAE: 1.3215138803466133
Iteration: 5  RMSE: 1.5549932054781697  MAE: 1.294630209223266
Iteration: 6  RMSE: 1.5267599777282561  MAE: 1.268345288749682
Iteration: 7  RMSE: 1.4990453458617221  MAE: 1.242674486855773
Iteration: 8  RMSE: 1.4718796548901336  MAE: 1.2176376408447118
Iteration: 9  RMSE: 1.4452912911184237  MAE: 1.1932584088508473
Iteration: 10  RMSE: 1.4193089893841595  MAE: 1.1695410544314078
Iteration: 11  RMSE: 1.3939595774282039  MAE: 1.1465210755674582
Iteration: 12  RMSE: 1.3692692765816574  MAE: 1.124199841761164
Iteration: 13  RMSE: 1.3452615794195448  MAE: 1.1026005948363748
Iteration: 14  RMSE: 1.3219653210024906  MAE: 1.0817466919554646
Iteration: 15  RMSE: 1.2993896546404233  MAE: 1.0616575961144805
Iteration: 16  RMSE: 1.2775467646916379 

In [None]:
# Write the user and movie factor matrices to comma-separated text files.
for factors_file, factors in (('users_mf.csv', U), ('movies_mf.csv', M)):
    np.savetxt(factors_file, factors, delimiter=',')