In [1]:
import pandas as pd
from surprise import Reader, Dataset
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import SVD
from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNBaseline
from surprise.model_selection import GridSearchCV
import numpy as np

In [2]:
movies = pd.read_csv('Data/ml-25m/movies.csv')

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Use str.extract to extract the year from the 'Title' column
movies['Year'] = movies['title'].str.extract(r'\((\d{4})\)', expand=False)

# Convert the 'Year' column to numeric 
movies['Year'] = pd.to_datetime(movies['Year'], format='%Y', errors='coerce')

# Use str.replace to remove the year string in parentheses from the 'Title' column
movies['title'] = movies['title'].str.replace(r'\(\d{4}\)', '').str.strip()

movies.head()

Unnamed: 0,movieId,title,genres,Year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995-01-01
1,2,Jumanji,Adventure|Children|Fantasy,1995-01-01
2,3,Grumpier Old Men,Comedy|Romance,1995-01-01
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995-01-01
4,5,Father of the Bride Part II,Comedy,1995-01-01


In [5]:
# Split the 'Genre' column and one-hot encode
genres = movies['genres'].str.get_dummies('|')

# Concatenate the new one-hot encoded columns with the original DataFrame
movies = pd.concat([movies, genres], axis=1)

# Drop the original 'Genre' column
movies = movies.drop('genres', axis=1)

# Reorder the columns
movies = movies[['movieId', 'title', 'Year'] + list(genres.columns)]

In [6]:
movies.head()

Unnamed: 0,movieId,title,Year,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story,1995-01-01,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji,1995-01-01,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men,1995-01-01,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale,1995-01-01,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II,1995-01-01,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
movies2 = pd.read_csv('Data/ml-25m/movies2.csv', encoding='latin-1')


In [8]:
movies2.head()


Unnamed: 0,movieId,title,Year,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story,1995,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji,1995,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men,1995,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale,1995,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II,1995,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
ratings = pd.read_csv('Data/ml-25m/ratings.csv', encoding='latin-1')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [10]:
ratings.drop('timestamp', axis=1)

Unnamed: 0,userId,movieId,rating
0,1,296,5.0
1,1,306,3.5
2,1,307,5.0
3,1,665,5.0
4,1,899,3.5
...,...,...,...
25000090,162541,50872,4.5
25000091,162541,55768,2.5
25000092,162541,56176,2.0
25000093,162541,58559,4.0


In [11]:
master_ratings = pd.merge(movies2, ratings, on='movieId')
master_ratings.head()

Unnamed: 0,movieId,title,Year,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,userId,rating,timestamp
0,1,Toy Story,1995,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,2,3.5,1141415820
1,1,Toy Story,1995,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,3,4.0,1439472215
2,1,Toy Story,1995,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,4,3.0,1573944252
3,1,Toy Story,1995,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,5,4.0,858625949
4,1,Toy Story,1995,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,8,4.0,890492517


In [12]:
master_ratings.drop('timestamp', axis=1)

Unnamed: 0,movieId,title,Year,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,userId,rating
0,1,Toy Story,1995,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,2,3.5
1,1,Toy Story,1995,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,3,4.0
2,1,Toy Story,1995,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,4,3.0
3,1,Toy Story,1995,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,5,4.0
4,1,Toy Story,1995,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,8,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24987339,209157,We,2018,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,119571,1.5
24987340,209159,Window of the Soul,2001,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,115835,3.0
24987341,209163,Bad Poems,2018,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,6964,4.5
24987342,209169,A Girl Thing,2001,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,119571,3.0


In [13]:
count_no_genre = master_ratings['(no genres listed)'].sum()
print(f"Number of rows with 'no genre listed' = 1: {count_no_genre}")

Number of rows with 'no genre listed' = 1: 23254


In [14]:
master_ratings.drop(master_ratings[master_ratings['(no genres listed)'] == 1].index, inplace=True)


In [15]:
master_ratings.drop('(no genres listed)', axis=1)

Unnamed: 0,movieId,title,Year,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,userId,rating,timestamp
0,1,Toy Story,1995,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,2,3.5,1141415820
1,1,Toy Story,1995,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,3,4.0,1439472215
2,1,Toy Story,1995,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,4,3.0,1573944252
3,1,Toy Story,1995,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,5,4.0,858625949
4,1,Toy Story,1995,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,8,4.0,890492517
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24987338,209155,Santosh Subramaniam,2008,1,0,0,0,1,0,0,...,0,0,1,0,0,0,0,134916,5.0,1574272160
24987339,209157,We,2018,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,119571,1.5,1574280748
24987340,209159,Window of the Soul,2001,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,115835,3.0,1574280985
24987341,209163,Bad Poems,2018,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,6964,4.5,1574284913


In [17]:
master_ratings.to_csv('Data\ml-25m\master_ratings2.csv', index=False)


In [18]:
master_ratings.drop('timestamp', axis=1)

Unnamed: 0,movieId,title,Year,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,userId,rating
0,1,Toy Story,1995,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,2,3.5
1,1,Toy Story,1995,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,3,4.0
2,1,Toy Story,1995,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,4,3.0
3,1,Toy Story,1995,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,5,4.0
4,1,Toy Story,1995,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,8,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24987338,209155,Santosh Subramaniam,2008,0,1,0,0,0,1,0,...,0,0,0,1,0,0,0,0,134916,5.0
24987339,209157,We,2018,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,119571,1.5
24987340,209159,Window of the Soul,2001,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,115835,3.0
24987341,209163,Bad Poems,2018,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,6964,4.5


In [19]:
master_rating_small = master_ratings.sample(frac=.9)

In [20]:
master_rating_small.to_csv('Data\ml-25m\master_rating_small.csv', index=False)

In [21]:
 master_rating_small.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22467681 entries, 13066827 to 9859737
Data columns (total 26 columns):
 #   Column              Dtype  
---  ------              -----  
 0   movieId             int64  
 1   title               object 
 2   Year                int64  
 3   (no genres listed)  int64  
 4   Action              int64  
 5   Adventure           int64  
 6   Animation           int64  
 7   Children            int64  
 8   Comedy              int64  
 9   Crime               int64  
 10  Documentary         int64  
 11  Drama               int64  
 12  Fantasy             int64  
 13  Film-Noir           int64  
 14  Horror              int64  
 15  IMAX                int64  
 16  Musical             int64  
 17  Mystery             int64  
 18  Romance             int64  
 19  Sci-Fi              int64  
 20  Thriller            int64  
 21  War                 int64  
 22  Western             int64  
 23  userId              int64  
 24  rating          