# **Latent Factor Based Model for Recommendation System**
### It uses Matrix Factorization

In [1]:
# Extract dataset

from zipfile import ZipFile

# Create a ZipFile Object and load sample.zip in it
with ZipFile('/content/drive/My Drive/datasets/ml-100k.zip', 'r') as zipObj:
   # Extract all the contents of zip file in current directory
   zipObj.extractall()

In [2]:
# we r interested in u.data. it has complete rating of movies along with userid
# this dataset contains 1682 movies and 943 users. It's a tab seperated file. cols are userid and itemid which is id for the movie.
# u.item contains information about the movies themselves. like title, genres, imdb url etc.
# u.data contains the information about the user. such as userid, movieid, the rating and date at which the rating was given.
# for our purpose we r gonna combine 2 files i.e. u.data and u.item


In [3]:
import pandas as pd
import numpy as np

datafile = "/content/ml-100k/u.data"
data = pd.read_csv(datafile, sep = '\t', header = None, names = ['userId', 'itemId', 'rating', 'timestamp'])
# header = None mean the file has no col name so col number will be asssigned.
data.head()

Unnamed: 0,userId,itemId,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [22]:
movieInfoFile = '/content/ml-100k/u.item'
movieInfo = pd.read_csv(movieInfoFile, sep = '|', header = None, index_col = False, encoding='latin-1',
                        names = ['itemId', 'title'], usecols = [0, 1])

# the datset was giving utf-8 encoding error therefore i used latin-1 encoding, on which the dataset worked perfectly ok
# usecols mean dont use all the columns present in the dataset. just use 0 and 1 column. 
# if we dont set index_col = False then pandas by default would have taken itemId col as an row index no col. 

movieInfo.head()

Unnamed: 0,itemId,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [5]:
# now merging both the tables with a common col name in both the dataframes

data = pd.merge(data, movieInfo, left_on='itemId', right_on= 'itemId')

data.head()

Unnamed: 0,userId,itemId,rating,timestamp,title
0,196,242,3,881250949,Kolya (1996)
1,63,242,3,875747190,Kolya (1996)
2,226,242,5,883888671,Kolya (1996)
3,154,242,3,879138235,Kolya (1996)
4,306,242,5,876503793,Kolya (1996)


In [6]:
data.loc[0: 10, ['userId']]
# loc will give u 11 values from the userId col. 

Unnamed: 0,userId
0,196
1,63
2,226
3,154
4,306
5,296
6,34
7,271
8,201
9,209


In [7]:
toyStoryUsers = data[data.title =='Toy Story (1995)']
# this will give us subset of dataframe with title as Toy story. It is a kind of filter. 
toyStoryUsers.head()

Unnamed: 0,userId,itemId,rating,timestamp,title
3397,308,1,4,887736532,Toy Story (1995)
3398,287,1,5,875334088,Toy Story (1995)
3399,148,1,4,877019411,Toy Story (1995)
3400,280,1,4,891700426,Toy Story (1995)
3401,66,1,3,883601324,Toy Story (1995)


In [8]:
# how to sort values in a dataframe
data = pd.DataFrame.sort_values(data, ['userId', 'itemId'], ascending = [0, 1])
# it will sort the dataframe data on the cols userId and itemId.
# ascending = [0, 1] mean 1st col will be sort in descending order and 2nd col will be sort in ascending order.
data.head()


Unnamed: 0,userId,itemId,rating,timestamp,title
23781,943,2,5,888639953,GoldenEye (1995)
65410,943,9,3,875501960,Dead Man Walking (1995)
35098,943,11,4,888639000,Seven (Se7en) (1995)
43773,943,12,5,888639093,"Usual Suspects, The (1995)"
57040,943,22,4,888639042,Braveheart (1995)


In [29]:
# for finding the max userId and itemId

numUsers = max(data.userId)
numMovies = max(data.itemId)

# we can also see how many movies were rated by each user and the number of users that rated each movie

moviesPerUser = data.userId.value_counts()
# this will tell how many times each user appears in the dataframe. i.e how many movies each user rated. 
usersPerMovie = data.title.value_counts()
# this will tell how many times each title appears in the dataframe. i.e. how many users rated each movie 

print(moviesPerUser)
print("-------------")
print(usersPerMovie)

In [10]:
# lets write a function to find top N favourite movies of a user.

def favoriteMovies(activeUser, N):
    user = data[data.userId == activeUser]
    top_movies = pd.DataFrame.sort_values(user, ['rating'], ascending = [0])[:N]
    # 0 mean sort in descending order.
    # fetch all the records/rows till N. 
    return list(top_movies.title) # we only need title col of that user. and this will return a list not df.

print(favoriteMovies(5, 3))  # 5 is the userId and 3 is the total top 3 rated movies by that user.

['Men in Black (1997)', 'Blade Runner (1982)', 'Empire Strikes Back, The (1980)']


In [11]:
# let's get down to finding some recommendations now using latent factor based model.

# **MATRIX FACTORIZATION**

In [12]:
# we will represent each user as a vector. each element in the vector will be the rating of the specific movie 
# by that user. since we have 1600 odd movies so each vector will contain 1600 elements. 

userItemRatingMatrix = pd.pivot_table(data, values = 'rating', index= ['userId'],
                                     columns=['itemId'])
# each row will be a user and cols will be rating for each movie
# index hoga userId and uski values hongi ratings. cols name will be itemId.
userItemRatingMatrix.head()

itemId,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,...,1643,1644,1645,1646,1647,1648,1649,1650,1651,1652,1653,1654,1655,1656,1657,1658,1659,1660,1661,1662,1663,1664,1665,1666,1667,1668,1669,1670,1671,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,2.0,5.0,5.0,5.0,5.0,5.0,3.0,4.0,5.0,4.0,1.0,4.0,4.0,3.0,4.0,3.0,2.0,4.0,1.0,3.0,3.0,5.0,4.0,2.0,1.0,2.0,2.0,3.0,4.0,3.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,4.0,,,,,,,,,2.0,,,4.0,4.0,,,,,3.0,,,,,,4.0,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,,,,,,,,,,,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5,4.0,3.0,,,,,,,,,,,,,,,4.0,,,,3.0,,,4.0,3.0,,,,4.0,,,,,,,,,,,4.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [13]:
# The idea here is to identify some hidden factors which influence the user rating of a movie.
# The way to do this is by decomposing user item matrix into 2 other matrices. One will be a user factor matrix
# and other will be an item-factor matrix. 
# each row in the user factor matrix maps the user onto the set of hidden factors 
# each col in the item factor matrix maps the item on to the hidden factors. 
# in the matrix factorization u find all the ratings for all the users in one go. 

In [17]:
def matrixFactorization(R, K, steps = 10, gamma= 0.001, lamda = 0.02):
    """
    inputs
      - R: user item rating matrix
      - K: the no of factors k
      - steps, gamma and lambda are used to optimize our error fucntion and find the right factor vectors
    """
    # we will be using sgd to find the factors vectors
    # SGD will use steps, gamma and lambda
    
    N = len(R.index) # Number of users
    M = len(R.columns) # Number of items
    P = pd.DataFrame(np.random.rand(N, K), index = R.index)
    # now the matrix R will be decomposed into 2 matrices. i.e P and Q
    # P is the user factor matrix and Q is the product factor matrix
    # dim of P: N x K where N is the no of users and K is the no of factors
    # we will initialize this matrix with some random numbers. then we will iteratively adjust the values of P
    # so that it moves to a place where the dot product of P and Q will be very close to user Item matrix R.
    
    Q = pd.DataFrame(np.random.rand(M, K), index = R.columns)
    # Q is the product factor matrix. Its dim will be M x K. 
    # Now we have P and Q with some initial values 
    # we need to find the final values of P and Q with SGD
    for step in range(steps):
        # find the slope and find errors at each point and move downward in the direction of slope.
        # steps will specify how many time will SGD will loop through each rating which is in the training set.
        for i in R.index:
            for j in R.columns:
                if R.loc[i, j] > 0:
                    # check for each user and his each rating is > 0 and if it exist then it will compute the 
                    # error for that rating. 
                    eij = R.loc[i, j] - np.dot(P.loc[i], Q.loc[j])
                    # this is the error difference blw the actual rating and predicted rating. 
                    # the predicted rating will be the dot product of corresponding user factor vector and the 
                    # corresponding product/item factor vector.
                    # now we should adjust the value of p and q so that we r moving towards the minimum
                    # first we need to find the slope of current point
                    P.loc[i] = P.loc[i] + gamma * (eij * Q.loc[j] - lamda * P.loc[i])
                    # (eij * Q.loc[j] - lamda * P.loc[i]) the values in the bracket is the partial derivative
                    # of the error function at this particular point. 
                    Q.loc[j] = Q.loc[j] + gamma * (eij * P.loc[i] - lamda * Q.loc[j])
        # at the end of this we have looped through all of the ratings. 
        # Let's check the current value  of the error. 
        # if the error value is less than 0.001 else we will continue
        error = 0
        for i in R.index:
            for j in R.columns:
                if R.loc[i, j] > 0:
                    # computing error
                    e = error + pow(R.loc[i, j]- np.dot(P.loc[i], Q.loc[j]), 2)
                    error = e + lamda*(pow(np.linalg.norm(P.loc[i]), 2)+ pow(np.linalg.norm(Q.loc[j]), 2))
                    # we r taking sum of the squares of the error. We will add lamda regularizer as well
                    # we use norm from numpy module to find magnituide of particular vector. 
        if error < 0.001:
            break
        print("Step No : ", step)
    return P, Q



In [25]:
# Let's call this fucntion now 
# (P, Q) = matrixFactorization(userItemRatingMatrix.iloc[:100, :100], K = 2, gamma = 0.001, lamda = 0.2, steps = 100)
(P, Q) = matrixFactorization(userItemRatingMatrix, K = 2, gamma = 0.001, lamda = 0.2, steps = 20)
# this will return 2 matrices 
# 1 will be product factor matrices and another will be user factor matrix
# ideally we should run it over the entire matrix for few thousand steps. But it can take few hours. 
# for instance we can perform the computation over small part of the rating matrix to see how it works. 
# we can do this by saying userItemRatingMatrix[: 100, : 100] i.e.  1st hundred users and 1st hundred items


Step No :  0
Step No :  1
Step No :  2
Step No :  3
Step No :  4
Step No :  5
Step No :  6
Step No :  7
Step No :  8
Step No :  9
Step No :  10
Step No :  11
Step No :  12
Step No :  13
Step No :  14
Step No :  15
Step No :  16
Step No :  17
Step No :  18
Step No :  19


In [28]:
# Let's quickly use these ratings to find recommendations for a user. 
# if u take any user i and want to find their rating for product j, u would take the ith row of matrix P and jth row of matrix Q and take a dot product of those
# 2 vectors. 
# if u want to find the predictive item rating for all the items for an  active user, take the row corresponding to the active user from the matrix P and do a dot 
# product with entire matrix Q. 
activeUser = 1
N = 3
predictItemRating = pd.DataFrame(np.dot(P.loc[activeUser], Q.T), index = Q.index, columns = ['Rating'])

# now we will remove the movies that the user has allready rated. 
# Or lets just drop the rating for the movies which the user has allready watched. 
single_row = userItemRatingMatrix.loc[activeUser]
# single_row
moviesAllreadyWatched = list(single_row.loc[single_row > 0].index) 
predictItemRating = predictItemRating.drop(moviesAllreadyWatched)
topRecommendations = pd.DataFrame.sort_values(predictItemRating,
                                                  ["Rating"], ascending = [0])[:N]

topRecommendationTitles = movieInfo.loc[movieInfo.itemId.isin(topRecommendations.index)]
print(list(topRecommendationTitles.title))


["Schindler's List (1993)", 'Close Shave, A (1995)', 'Casablanca (1942)']


# **Storing P and Q values**

In [67]:
P.to_csv("Q_values.csv", sep='\t')
Q.to_csv("Q_values.csv", sep='\t')