# Matrix Factorization

Most recommendation data is sparse because each user rates only a few of the thousands of available movies. Matrix factorization is used to deal with this sparsity. It is a method that generates latent factors from the user-movie rating data and maps both the movies and the users onto those factors.

In [6]:
import pandas as pd
import numpy as np

from scipy.sparse.linalg import svds

In [2]:
# Load the rating records from disk
df_ratings = pd.read_csv('datasets/user_ratings.csv')

# Prefix every user id with 'user_' so ids read as string labels
df_ratings['userId'] = df_ratings['userId'].map('user_{}'.format)
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,user_1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,user_5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,user_7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,user_15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,user_17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [4]:
# Build the user x movie rating matrix: one row per userId, one column per
# title, each cell holding the rating. User/title pairs with no rating
# stay NaN.
pivot_user_ratings = pd.pivot_table(
    df_ratings,
    values='rating',
    index='userId',
    columns='title',
)
pivot_user_ratings

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
user_1,,,,,,,,,,,...,,,,,,,,,4.0,
user_10,,,,,,,,,,,...,,,,,,,,,,
user_100,,,,,,,,,,,...,,,,,,,,,,
user_101,,,,,,,,,,,...,,,,,,4.0,,,,
user_102,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
user_95,,,,,,,3.0,,,,...,,,,,,2.5,3.0,,,
user_96,,,,,,,,,,,...,,,,,,,,,,
user_97,,,,,,,,,,,...,,,,,,,,,,
user_98,,,,,,,,,,,...,,,,,,,,,,


In [5]:
# Mean rating of each USER: axis=1 averages across the movie columns of a
# user's row, skipping the NaN entries of movies the user did not rate.
avg_ratings = pivot_user_ratings.mean(axis=1)

# Center every user's ratings on that user's own mean, then fill the
# remaining NaNs (unrated movies) with 0, i.e. "exactly average for this
# user". fillna is chained rather than applied in place, so the cell builds
# a fresh frame on every run.
pivot_user_ratings_sub = pivot_user_ratings.sub(avg_ratings, axis=0).fillna(0)
pivot_user_ratings_sub

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
user_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0000,0.0,-0.366379,0.0
user_10,0.0,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0000,0.0,0.000000,0.0
user_100,0.0,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0000,0.0,0.000000,0.0
user_101,0.0,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.442623,0.0000,0.0,0.000000,0.0
user_102,0.0,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0000,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
user_95,0.0,0.0,0.0,0.0,0.0,0.0,-1.0625,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-1.562500,-1.0625,0.0,0.000000,0.0
user_96,0.0,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0000,0.0,0.000000,0.0
user_97,0.0,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0000,0.0,0.000000,0.0
user_98,0.0,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0000,0.0,0.000000,0.0


In [11]:
# Select every row by its own index label: displays the whole centered matrix
pivot_user_ratings_sub.loc[pivot_user_ratings_sub.index]

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
user_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0000,0.0,-0.366379,0.0
user_10,0.0,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0000,0.0,0.000000,0.0
user_100,0.0,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0000,0.0,0.000000,0.0
user_101,0.0,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.442623,0.0000,0.0,0.000000,0.0
user_102,0.0,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0000,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
user_95,0.0,0.0,0.0,0.0,0.0,0.0,-1.0625,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-1.562500,-1.0625,0.0,0.000000,0.0
user_96,0.0,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0000,0.0,0.000000,0.0
user_97,0.0,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0000,0.0,0.000000,0.0
user_98,0.0,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0000,0.0,0.000000,0.0


### Singular Value Decomposition

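Singular Value Decomposition (SVD) finds latent factors by factoring the rating matrix into three matrices:
* U represents the user matrix (how strongly each user relates to each latent factor).
* sigma represents the weights of the latent factors.
* Vt represents the feature matrix (how strongly each movie relates to each latent factor).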

In [19]:
# Factor the centered user x movie matrix into U, sigma and Vt.
# svds is called without k, so it uses its default number of factors.
U, sigma, Vt = svds(pivot_user_ratings_sub.to_numpy())

# Expand the singular values into a diagonal matrix so they can take part
# in the matrix product below
sigma = np.diag(sigma)

# Low-rank reconstruction of the centered ratings: (U . sigma) . Vt
recalculated_ratings = U @ sigma @ Vt

In [26]:
# Undo the centering: the per-row means become a column vector, so each
# mean is broadcast across all movie columns of its row
uncentered_ratings = recalculated_ratings + avg_ratings.to_numpy()[:, np.newaxis]

# Wrap the predictions in a DataFrame carrying the same row and column
# labels as the centered matrix
df_predictions = pd.DataFrame(
    data=uncentered_ratings,
    index=pivot_user_ratings_sub.index,
    columns=pivot_user_ratings_sub.columns,
)
df_predictions.head()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
user_1,4.366758,4.366864,4.366117,4.366425,4.365874,4.367775,4.36411,4.365298,4.399567,4.366379,...,4.360731,4.365497,4.367345,4.36639,4.366333,4.387607,4.335274,4.357385,4.346337,4.366739
user_10,3.277002,3.27816,3.278756,3.278447,3.278661,3.273415,3.25931,3.274417,3.293472,3.270893,...,3.278355,3.273983,3.280404,3.283119,3.278677,3.267572,3.291666,3.295063,3.293902,3.277837
user_100,3.945609,3.945924,3.945975,3.945918,3.945924,3.942866,3.94242,3.943739,3.97144,3.946432,...,3.954521,3.946694,3.945764,3.94628,3.946025,3.940471,3.954394,3.944581,3.953173,3.945182
user_101,3.557679,3.557618,3.557232,3.557446,3.55714,3.560746,3.555661,3.55819,3.557041,3.561318,...,3.561151,3.555508,3.55849,3.557818,3.557388,3.57349,3.557671,3.555386,3.538731,3.558006
user_102,3.357724,3.357431,3.357008,3.357146,3.356893,3.357618,3.374846,3.358628,3.382811,3.358889,...,3.358963,3.359044,3.35671,3.355553,3.357102,3.361502,3.341253,3.345854,3.351404,3.357219


In [34]:
def recommend_unwatched(user_id, predictions, ratings, top_n=10):
    """Return the highest predicted movies that a user has not rated yet.

    Parameters
    ----------
    user_id : str
        Row label of the user in ``predictions`` (e.g. ``'user_194'``).
    predictions : pd.DataFrame
        Predicted ratings, indexed by user id with one column per title.
    ratings : pd.DataFrame
        Observed ratings with at least ``userId`` and ``title`` columns.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pd.DataFrame
        Columns ``title`` and ``<user_id>`` (the predicted rating), ordered
        from highest to lowest prediction. The index is each movie's
        position in the full ranking, before rated titles were removed.

    Raises
    ------
    KeyError
        If ``user_id`` is not a row label of ``predictions``.
    """
    # Titles the user has already rated
    watched_titles = ratings.loc[ratings['userId'] == user_id, 'title'].unique()

    # Rank every title by predicted rating, best first. rename_axis makes
    # sure the title column is called 'title' after reset_index even when
    # the columns of `predictions` carry no name.
    ranked = (
        predictions.loc[user_id]
        .sort_values(ascending=False)
        .rename_axis('title')
        .reset_index()
    )

    # Drop already-rated titles and keep the best remaining ones
    return ranked.loc[~ranked['title'].isin(watched_titles)].head(top_n)


main_user = 'user_194'
recommend_unwatched(main_user, df_predictions, df_ratings)

Unnamed: 0,title,user_194
0,Pulp Fiction (1994),3.501368
1,"Shawshank Redemption, The (1994)",3.497851
2,Forrest Gump (1994),3.496433
3,Fight Club (1999),3.494696
4,"Godfather, The (1972)",3.493774
5,Schindler's List (1993),3.493663
6,"Usual Suspects, The (1995)",3.492782
7,"Silence of the Lambs, The (1991)",3.492509
8,Star Wars: Episode IV - A New Hope (1977),3.491739
9,Back to the Future (1985),3.490464


In [38]:
main_user = 'user_234'

# Titles this user has already rated
user_watched_movies = df_ratings.loc[df_ratings['userId'] == main_user, 'title'].unique()

# Rank every title by this user's predicted rating, best first
recommendations = (
    df_predictions.loc[main_user]
    .sort_values(ascending=False)
    .reset_index()
)

# Leave out the already-rated titles and show the ten best remaining ones
recommendations.loc[~recommendations['title'].isin(user_watched_movies)].head(10)

Unnamed: 0,title,user_234
4,"Shawshank Redemption, The (1994)",3.770493
5,"Silence of the Lambs, The (1991)",3.74232
7,Back to the Future (1985),3.736432
8,"Princess Bride, The (1987)",3.731036
9,Schindler's List (1993),3.730251
10,"Lord of the Rings: The Return of the King, The...",3.724134
11,"Godfather, The (1972)",3.721017
12,"Lord of the Rings: The Fellowship of the Ring,...",3.714936
13,"Fugitive, The (1993)",3.712859
14,Forrest Gump (1994),3.710378


### Conclusion

Matrix factorization lets us fill in the sparse rating matrix more meaningfully. It also lets us generate personalized recommendations, although the results are somewhat biased toward popular movies.