## Creating a Machine Learning Recommendation System Using Model-Based Collaborative Filtering with Singular Value Decomposition (SVD)

In [2]:
# Import the pandas library for data manipulation and analysis
import pandas as pd 
# Import numpy for numerical operations
import numpy as np

# Import scikit-learn machine learning library
import sklearn 
# Import TruncatedSVD for dimensionality reduction (similar to PCA, but it also works with sparse matrices). Commonly used for latent semantic analysis of text; here it factorizes the ratings matrix
from sklearn.decomposition import TruncatedSVD

The dataset used to develop this movie recommendation system was obtained from: https://grouplens.org/datasets/movielens/100k/

In [4]:
# Column names for u.data, as listed in the dataset's README;
# the file itself has no header row, so they are supplied here.
columns = ['user_id', 'item_id', 'rating', 'timestamp']

# Load the tab-separated MovieLens 100K ratings file into a DataFrame.
frame = pd.read_csv(
    'ml-100k/u.data',
    sep='\t',
    names=columns,
)

# Preview the first five rows to sanity-check the load.
frame.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


### Preparing Data 

In [6]:
# Column names for u.item: id, title, dates and IMDb URL, followed by the genre columns.
columns = [
    'item_id', 'movie title', 'release date', 'video release date', 'IMDb URL',
    'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]

# Read the pipe-delimited movie metadata file.
# latin-1 is needed because some titles contain non-ASCII characters.
movies = pd.read_csv(
    'ml-100k/u.item',
    sep='|',
    names=columns,
    encoding='latin-1',
)

# Keep only the id and title columns.
movie_names = movies.loc[:, ['item_id', 'movie title']]

# Preview the first five rows of movie_names.
movie_names.head()

Unnamed: 0,item_id,movie title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [7]:
# Attach the movie title to every rating by joining the two frames
# on their shared 'item_id' column.
combined_movies_data = frame.merge(movie_names, on='item_id')
combined_movies_data.head()


Unnamed: 0,user_id,item_id,rating,timestamp,movie title
0,196,242,3,881250949,Kolya (1996)
1,63,242,3,875747190,Kolya (1996)
2,226,242,5,883888671,Kolya (1996)
3,154,242,3,879138235,Kolya (1996)
4,306,242,5,876503793,Kolya (1996)


In [8]:
# Number of ratings per movie id, largest count first;
# the default head() shows the five most-rated movies.
(
    combined_movies_data
    .groupby('item_id')['rating']
    .count()
    .sort_values(ascending=False)
    .head()
)

item_id
50     583
258    509
100    508
181    507
294    485
Name: rating, dtype: int64

In [9]:
# Boolean mask marking the rows whose item_id is 50.
Filter = combined_movies_data['item_id'].eq(50)
# Distinct titles among the rows selected by the mask.
combined_movies_data.loc[Filter, 'movie title'].unique()

array(['Star Wars (1977)'], dtype=object)

### Building Utility Matrix 

In [11]:
# Build the user x movie utility matrix from combined_movies_data:
#   rows    -> user_id
#   columns -> movie title
#   cells   -> rating (multiple ratings landing in the same cell are combined
#              by pivot_table's default aggregation, the mean)
#   missing -> 0 where a user has not rated a movie
rating_crosstab = pd.pivot_table(
    combined_movies_data,
    values='rating',
    index='user_id',
    columns='movie title',
    fill_value=0,
)
# Preview the first rows of the utility matrix.
rating_crosstab.head()

movie title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,2.0,5.0,0.0,0.0,3.0,4.0,0.0,0.0,...,0.0,0.0,0.0,5.0,3.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,2.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,4.0,0.0


### Transposing the Matrix 

In [28]:
# Dimensions of the utility matrix as a (rows, columns) tuple:
# one row per user_id and one column per movie title.
rating_crosstab.shape

(943, 1664)

### Anatomy of Truncated SVD
                 
     1664 * 943 (after transposing)                 1664 * 12 
     {       } (movies)         -->             U  =  {        } (movies)
        Users           n_components = 12     latent variables about movies

In [15]:
# Flip the utility matrix so that each row is a movie and each column is a user.
X = np.transpose(rating_crosstab.values)
# Dimensions after transposing, as (rows, columns).
X.shape

(1664, 943)

### Decomposing the Matrix

In [24]:
# Truncated SVD configured to keep 12 components;
# the fixed random_state makes repeated runs reproducible.
SVD = TruncatedSVD(
    n_components=12,
    random_state=17,
)

# Fit on X and project every row of X onto the 12 components.
resultant_matrix = SVD.fit_transform(X)

# Expected shape: (number of rows in X, 12).
resultant_matrix.shape

(1664, 12)

### Generating Correlation Matrix

In [19]:
# Pearson correlation between every pair of rows of resultant_matrix.
# rowvar=True (NumPy's default, spelled out here) treats each row as one
# variable and each column as an observation of it, so element (i, j)
# is the correlation between row i and row j.
corr_mat = np.corrcoef(resultant_matrix, rowvar=True)

# Square matrix with one row and one column per row of resultant_matrix.
corr_mat.shape

(1664, 1664)

### Isolating Star Wars From the Correlation Matrix

In [34]:
# Movie titles, in the column order of rating_crosstab.
# NOTE(review): this rebinds `movie_names`, which an earlier cell bound to the
# id/title DataFrame passed to the merge; re-running that merge cell after this
# one would receive this Index instead.
movie_names = rating_crosstab.columns
movies_list = movie_names.tolist()

# Position of Star Wars within the list of titles.
star_wars = movies_list.index('Star Wars (1977)')
print(star_wars)

1398


In [41]:
# Row of the correlation matrix at the Star Wars position:
# its correlation with every row of the matrix.
corr_star_wars = corr_mat[star_wars, :]
corr_star_wars.shape

(1664,)

### Recommending a Highly Correlated Movie


In [46]:
# Titles whose correlation with Star Wars exceeds 0.9.
# Star Wars itself is removed by position rather than by testing `< 1.0`:
# floating-point round-off means the self-correlation is not guaranteed to be
# exactly 1.0, and another title that correlates at exactly 1.0 should not be
# dropped from the recommendations.
similar_to_star_wars = corr_star_wars > 0.9
similar_to_star_wars[star_wars] = False
list(movie_names[similar_to_star_wars])

['Empire Strikes Back, The (1980)',
 'Men in Black (1997)',
 'Raiders of the Lost Ark (1981)',
 'Return of the Jedi (1983)',
 'Star Trek: First Contact (1996)',
 'Toy Story (1995)']