# Creating a Machine Learning Recommendation System Using Model-Based Collaborative Filtering with Singular Value Decomposition (SVD)

In [None]:
# Import the pandas library for data manipulation and analysis
import pandas as pd 
# Import numpy for numerical operations
import numpy as np

# Import scikit-learn machine learning library
import sklearn 
# Import TruncatedSVD for dimensionality reduction (similar to PCA but can work with sparse matrices). Also known as latent semantic analysis (LSA) when applied to text data
from sklearn.decomposition import TruncatedSVD

The dataset used to develop this movie recommendation system is MovieLens 100K, obtained from: https://grouplens.org/datasets/movielens/100k/

In [None]:
# Column layout of u.data, taken from the dataset's README file
columns = ['user_id', 'item_id', 'rating', 'timestamp']

# Load the MovieLens 100K ratings.
# The file is tab-delimited and has no header row, so the first line is
# treated as data (header=None) and the names above are applied instead.
frame = pd.read_csv(
    'ml-100k/u.data',
    sep='\t',
    header=None,
    names=columns,
)

# Preview the first 5 rows to sanity-check the parse
frame.head()

# Preparing Data 

In [None]:
# Column layout of u.item: five descriptive fields followed by one
# indicator column per genre
columns = [
    'item_id', 'movie title', 'release date', 'video release date', 'IMDb URL',
    'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]

# Read the movie metadata:
# - fields are pipe-delimited
# - the file has no header row, so our column names are supplied
# - latin-1 decoding copes with the special characters in the dataset
movies = pd.read_csv(
    'ml-100k/u.item',
    sep='|',
    names=columns,
    encoding='latin-1',
)

# Keep just the ID -> title mapping; the remaining columns are not needed here
movie_names = movies[['item_id', 'movie title']]

# Preview the first 5 rows of movie_names
movie_names.head()

In [None]:
# Attach the movie title to every rating by joining the two tables on their
# shared 'item_id' column (merge performs an inner join by default)
combined_movies_data = frame.merge(movie_names, on='item_id')

# Preview the first 5 rows of the joined data
combined_movies_data.head()

In [None]:
# Find the most frequently rated movies:
# one group per movie ID -> number of ratings in each group ->
# largest counts first -> keep the top 5
(
    combined_movies_data
    .groupby('item_id')['rating']
    .count()
    .sort_values(ascending=False)
    .head()
)

In [None]:
# Boolean mask that is True for the rows belonging to movie ID 50
Filter = combined_movies_data['item_id'].eq(50)

# Look up which title(s) that ID corresponds to (duplicates collapsed)
combined_movies_data.loc[Filter, 'movie title'].unique()

# Building Utility Matrix 

In [29]:
# Build the user x movie utility matrix from combined_movies_data:
# - one row per user_id
# - one column per movie title
# - each cell holds the user's rating for that title; if a (user, title)
#   pair occurs more than once, the ratings are averaged (pivot_table's
#   default aggregation, written out explicitly here)
# - pairs with no rating are filled with 0
rating_crosstab = combined_movies_data.pivot_table(
    index='user_id',
    columns='movie title',
    values='rating',
    aggfunc='mean',
    fill_value=0,
)

# Preview the first few rows of the matrix
rating_crosstab.head()

movie title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,2.0,5.0,0.0,0.0,3.0,4.0,0.0,0.0,...,0.0,0.0,0.0,5.0,3.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,2.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,4.0,0.0


In [None]:
# Transposing the Matrix 