# Creating a Machine Learning Recommendation System Using Model-Based Collaborative Filtering with Singular Value Decomposition (SVD)

In [57]:
# Import the pandas library for data manipulation and analysis
import pandas as pd 
# Import numpy for numerical operations
import numpy as np

# Import scikit-learn machine learning library
import sklearn 
# Import TruncatedSVD for dimensionality reduction (similar to PCA, but it also works on sparse matrices). On text data it is known as latent semantic analysis (LSA)
from sklearn.decomposition import TruncatedSVD

The dataset used to develop this movie recommendation system was obtained from: https://grouplens.org/datasets/movielens/100k/

In [61]:
# Column labels for the ratings file, in the order given by the dataset README
columns = [
    'user_id',
    'item_id',
    'rating',
    'timestamp',
]

# u.data is tab-separated and is read without a header row, so the labels
# above are supplied explicitly and every line is treated as data.
frame = pd.read_csv(
    'ml-100k/u.data',
    sep='\t',
    header=None,
    names=columns,
)

# Preview the first 5 ratings as a sanity check on the load
frame.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


# Preparing Data 

In [67]:
# Column labels for the movie metadata file: identifying fields first,
# followed by one column per genre.
columns = [
    'item_id', 'movie title', 'release date', 'video release date', 'IMDb URL',
    'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]

# Read u.item:
# - fields are pipe-delimited
# - no header row is read, so our own column labels are applied
# - latin-1 decoding handles the special characters in the dataset
movies = pd.read_csv(
    'ml-100k/u.item',
    sep='|',
    header=None,
    names=columns,
    encoding='latin-1',
)

# Keep only the movie ID and its title
movie_names = movies.loc[:, ['item_id', 'movie title']]

# Preview the first 5 rows of movie_names
movie_names.head()

Unnamed: 0,item_id,movie title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [73]:
# Attach each rating's movie title by joining the ratings with the
# ID/title lookup on their shared 'item_id' column (default inner join).
combined_movies_data = frame.merge(movie_names, on='item_id')

# Preview the first 5 rows of the joined table
combined_movies_data.head()

Unnamed: 0,user_id,item_id,rating,timestamp,movie title
0,196,242,3,881250949,Kolya (1996)
1,63,242,3,875747190,Kolya (1996)
2,226,242,5,883888671,Kolya (1996)
3,154,242,3,879138235,Kolya (1996)
4,306,242,5,876503793,Kolya (1996)


In [94]:
# Number of ratings per movie ID, ordered from most-rated to least-rated;
# only the top 5 entries are shown.
(
    combined_movies_data
    .groupby('item_id')['rating']
    .count()
    .sort_values(ascending=False)
    .head()
)

item_id
50     583
258    509
100    508
181    507
294    485
Name: rating, dtype: int64

In [100]:
# Boolean mask marking the rows whose item_id is 50
Filter = combined_movies_data['item_id'] == 50

# Look up the distinct title(s) attached to that ID, selecting the rows and
# the column in a single .loc step
combined_movies_data.loc[Filter, 'movie title'].unique()

array(['Star Wars (1977)'], dtype=object)

# Building Utility Matrix 