# Singular Value Decomposition

In [1]:
import pandas as pd
import numpy as np
from numpy.linalg import svd
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler

!!!! SCALE DATA FIRST !!!!

## NumPy (full version)

Step 1. input matrix X

In [4]:
# size 7x5
rating_matrix = np.array([
              [1, 2, 2, 0, 0],
              [3, 5, 5, 0, 0],
              [4, 4, 4, 0, 0],
              [5, 5, 5, 0, 0],
              [0, 2, 0, 4, 4],
              [0, 0, 0, 5, 5],
              [0, 1, 0, 2, 2]])

movies = ['TheMatrix', 'Alien', 'StarWars', 'Casablanca', 'Titanic']
users = ['Alice', 'Bob', 'Cindy', 'Dan', 'Emily', 'Frank', 'Greg']
ratings_df = pd.DataFrame(rating_matrix, index=users, columns=movies)
ratings_df.head(7)

Unnamed: 0,TheMatrix,Alien,StarWars,Casablanca,Titanic
Alice,1,2,2,0,0
Bob,3,5,5,0,0
Cindy,4,4,4,0,0
Dan,5,5,5,0,0
Emily,0,2,0,4,4
Frank,0,0,0,5,5
Greg,0,1,0,2,2


Step 2. turn matrix X into three: U (user-indexed), $\Sigma$, I (item-indexed)

In [5]:
from numpy.linalg import svd

# Compute SVD
U, sigma, I = svd(M)

U, sigma, I = (np.around(x, 2) for x in (U, sigma, I))

U_df = pd.DataFrame(U, index=users)
I_df = pd.DataFrame(I, columns=movies)

In [8]:
# user-indexed matrix shape 7x7
U_df.shape

(7, 7)

In [9]:
# user-indexed
# movies turned into essential features
# ratings turned into weights showing how important that feature is to each user
# can take a new movie and calculate how much Alice will like it based on the weights from this table 
# if we have the features of that movie
U_df.head(7)

Unnamed: 0,0,1,2,3,4,5,6
Alice,-0.21,0.02,0.31,0.26,0.66,-0.51,0.33
Bob,-0.55,0.06,0.53,0.46,-0.33,0.25,-0.16
Cindy,-0.5,0.07,-0.31,-0.2,-0.37,-0.67,-0.18
Dan,-0.62,0.08,-0.39,-0.24,0.36,0.48,0.18
Emily,-0.12,-0.6,0.4,-0.52,0.2,-0.0,-0.4
Frank,-0.04,-0.73,-0.42,0.53,-0.0,0.0,-0.0
Greg,-0.06,-0.3,0.2,-0.26,-0.4,-0.0,0.8


In [10]:
# item-index matrix shape 5x5
I_df.shape

(5, 5)

In [11]:
# item-indexed
# users reduced to 5 essential types from 7 actual users
# numbers not actual ratings but weights
I_df.head()

Unnamed: 0,TheMatrix,Alien,StarWars,Casablanca,Titanic
0,-0.5,-0.62,-0.6,-0.06,-0.06
1,0.09,-0.05,0.11,-0.7,-0.7
2,-0.78,0.62,0.03,-0.07,-0.07
3,-0.36,-0.48,0.79,0.05,0.05
4,0.0,0.0,-0.0,-0.71,0.71


Step 3. use first two latent features to create new approximation

In [15]:
# Zero out all but the first two singular values
sigma_reduced = np.zeros(rating_matrix.shape)
np.fill_diagonal(sigma_reduced, sigma)
sigma_reduced[:, 2:] = 0

# Reconstruct the ratings matrix
ratings_reconstructed = np.dot(np.dot(U, sigma_reduced), I)

ratings_reconstructed_df = pd.DataFrame(ratings_reconstructed, index=users, columns=movies)
ratings_reconstructed_df.head(7)

Unnamed: 0,TheMatrix,Alien,StarWars,Casablanca,Titanic
Alice,1.470336,1.792448,1.764784,0.041104,0.041104
Bob,3.857408,4.69088,4.630032,0.05688,0.05688
Cindy,3.519976,4.25708,4.225304,-0.05128,-0.05128
Dan,4.358944,5.282016,5.232256,-0.018272,-0.018272
Emily,0.31632,1.315296,0.36816,4.098048,4.098048
Frank,-0.348664,0.690712,-0.432296,4.897936,4.897936
Greg,0.15816,0.657648,0.18408,2.049024,2.049024


In [16]:
# original ratings for comparison
ratings_df.head(7)

Unnamed: 0,TheMatrix,Alien,StarWars,Casablanca,Titanic
Alice,1,2,2,0,0
Bob,3,5,5,0,0
Cindy,4,4,4,0,0
Dan,5,5,5,0,0
Emily,0,2,0,4,4
Frank,0,0,0,5,5
Greg,0,1,0,2,2


## Using Sklearn (truncated version)

In [35]:
# n_components=number of latent features
from sklearn.decomposition import TruncatedSVD
model = TruncatedSVD(n_components=2, n_iter=7, random_state=42)
model.fit(rating_matrix)

# components_
print(model.components_)
# singular_values_ sigma
print(model.singular_values_)
X = model.transform(rating_matrix)
print(X)

[[ 0.50235233  0.61952676  0.59696793  0.06106564  0.06106564]
 [-0.09486849  0.04591414 -0.11077974  0.69879171  0.69879171]]
[13.83663983  9.52139961]
[[ 2.93534171 -0.22459968]
 [ 7.58953043 -0.60893346]
 [ 6.87538807 -0.63893635]
 [ 8.59423509 -0.79867044]
 [ 1.7275786   5.68216197]
 [ 0.61065635  6.98791711]
 [ 0.8637893   2.84108099]]


In [28]:
# user-indexed matrix (7x2) 
# 2 because of n_components
# 7 latent features truncated to 2
U = model.fit_transform(rating_matrix)

U_df = pd.DataFrame(U, index=users)
U_df.head(7)

Unnamed: 0,0,1
Alice,2.935342,-0.2246
Bob,7.58953,-0.608933
Cindy,6.875388,-0.638936
Dan,8.594235,-0.79867
Emily,1.727579,5.682162
Frank,0.610656,6.987917
Greg,0.863789,2.841081


In [33]:
# item-indexed matrix (5x5)
I = model.components_

i_df = pd.DataFrame(I, columns=movies)
I_df.head(7)

Unnamed: 0,TheMatrix,Alien,StarWars,Casablanca,Titanic
0,-0.5,-0.62,-0.6,-0.06,-0.06
1,0.09,-0.05,0.11,-0.7,-0.7
2,-0.78,0.62,0.03,-0.07,-0.07
3,-0.36,-0.48,0.79,0.05,0.05
4,0.0,0.0,-0.0,-0.71,0.71


In [29]:
# transformation back for check
ratings_reconstructed = model.inverse_transform(U)
ratings_reconstructed_df = pd.DataFrame(ratings_reconstructed, index=users, columns=movies)
ratings_reconstructed_df.head(7)

Unnamed: 0,TheMatrix,Alien,StarWars,Casablanca,Titanic
Alice,1.495883,1.80821,1.777186,0.0223,0.0223
Bob,3.870387,4.673959,4.598164,0.037942,0.037942
Cindy,3.514482,4.230151,4.175167,-0.026633,-0.026633
Dan,4.393103,5.287688,5.218959,-0.033292,-0.033292
Emily,0.328795,1.331173,0.401841,4.076143,4.076143
Frank,-0.356169,0.699162,-0.409577,4.920389,4.920389
Greg,0.164397,0.665586,0.20092,2.038072,2.038072


In [30]:
# original ratings for comparison
ratings_df.head(7)

Unnamed: 0,TheMatrix,Alien,StarWars,Casablanca,Titanic
Alice,1,2,2,0,0
Bob,3,5,5,0,0
Cindy,4,4,4,0,0
Dan,5,5,5,0,0
Emily,0,2,0,4,4
Frank,0,0,0,5,5
Greg,0,1,0,2,2
