In [1]:
# Import dependencies 
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from datetime import datetime
# from scipy.sparse import csr_matrix

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.preprocessing import StandardScaler

In [2]:
#import our input dataset
#SAMPLES from MovieLens

In [3]:
links_df = pd.read_csv("Resources/ml-latest-small/links.csv")
links_df.head()

#example of imdb id for toy story is tt0114709 -- could use with OMDB API for extra info (excerpt etc) once presenting reccs?

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [4]:
movies_df = pd.read_csv("Resources/ml-latest-small/movies.csv")
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
ratings_df = pd.read_csv("Resources/ml-latest-small/ratings.csv")
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [6]:
tags_df = pd.read_csv("Resources/ml-latest-small/tags.csv")
tags_df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [7]:
#merge rating + movies to have titles ready
# merged_movie_ratings = pd.merge(movies_df, ratings_df, on='movieId', how='outer')
merged_movie_ratings = pd.merge(movies_df, ratings_df, on='movieId', how='inner')
merged_movie_ratings.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483


In [8]:
#check shape of new merged movies+ratings
print("Rows, Columns:", merged_movie_ratings.shape)

#outer join: rows = 100854 ; include all and give nulls to those with no ratings. keeps individual user ratings!
# inner join: rows = 100836 ; exclude those with no ratings so no nulls

Rows, Columns: (100836, 6)


In [9]:
# count nulls
null_counts = merged_movie_ratings.isnull().sum()
print(null_counts)

has_nulls = merged_movie_ratings.isnull().values.any()
print(has_nulls)

movieId      0
title        0
genres       0
userId       0
rating       0
timestamp    0
dtype: int64
False


In [10]:
## 1. 
# a. change timestamp (orig rating csv) to year [see Brendan's method]
# b. separate year from title to get year_released

In [11]:
year_rated = 1970 + (merged_movie_ratings['timestamp'] / 31540000)
year_rated = np.floor(year_rated).astype('int')
year_rated

0         2000
1         1996
2         2005
3         2017
4         2011
          ... 
100831    2018
100832    2018
100833    2018
100834    2018
100835    2018
Name: timestamp, Length: 100836, dtype: int32

In [12]:
merged_movie_ratings['year_rated'] = year_rated
merged_movie_ratings.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp,year_rated
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703,2000
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962,1996
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946,2005
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970,2017
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483,2011


In [13]:
# change dtype of year_rated to categorical
merged_movie_ratings['year_rated'] = merged_movie_ratings['year_rated'].astype('category')
merged_movie_ratings.dtypes

movieId          int64
title           object
genres          object
userId           int64
rating         float64
timestamp        int64
year_rated    category
dtype: object

In [14]:
# separate year from title to get year_released column
# find the four-digit year within parenthese
merged_movie_ratings['year_released'] = merged_movie_ratings['title'].str.extract(r'\((\d{4})\)')
merged_movie_ratings.tail()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp,year_rated,year_released
100831,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,184,4.0,1537109082,2018,2017
100832,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,184,3.5,1537109545,2018,2017
100833,193585,Flint (2017),Drama,184,3.5,1537109805,2018,2017
100834,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,184,3.5,1537110021,2018,2018
100835,193609,Andrew Dice Clay: Dice Rules (1991),Comedy,331,4.0,1537157606,2018,1991


In [15]:
# check year released dtype and change to categorical if needed
# merged_movie_ratings.dtypes
merged_movie_ratings['year_released'] = merged_movie_ratings['year_released'].astype('category')

merged_movie_ratings.dtypes

movieId             int64
title              object
genres             object
userId              int64
rating            float64
timestamp           int64
year_rated       category
year_released    category
dtype: object

In [16]:
# # change rating from continuous to discrete to avoid error with KNN model
# bins = [0, 0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5.0, 5.5]
# labels = [0, 0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5]

# # Convert continuous ratings to discrete classes
# merged_movie_ratings['discrete_rating'] = pd.cut(merged_movie_ratings['rating'], bins=bins, labels=labels, right=False)

# merged_movie_ratings.head(20)

In [17]:
# format genre as a list. may help with getting dummies
merged_movie_ratings['genres'] = merged_movie_ratings['genres'].str.split('|')
merged_movie_ratings.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp,year_rated,year_released
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",1,4.0,964982703,2000,1995
1,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",5,4.0,847434962,1996,1995
2,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",7,4.5,1106635946,2005,1995
3,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",15,2.5,1510577970,2017,1995
4,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",17,4.5,1305696483,2011,1995


In [18]:
## 2. Scaling and Categorizing (prep for cosine ML)
# a. genres - dummies (one-hot encode (binary 0/1 for each))
# b. rating - numerical scale?
# c. year_rated - dummies (one-hot encode)
# d. year_released - dummies (one-hot encode)

In [19]:
# Convert categorical data to numeric with `pd.get_dummies`
genres_encoded = merged_movie_ratings['genres'].str.get_dummies(sep=', ')
merged_dummies_df = pd.concat([merged_movie_ratings, genres_encoded], axis=1)
merged_dummies_df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp,year_rated,year_released,'Adventure','Adventure'],...,['Musical'],['Mystery',['Mystery'],['Romance',['Romance'],['Sci-Fi',['Sci-Fi'],['Thriller'],['War'],['Western']
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",1,4.0,964982703,2000,1995,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",5,4.0,847434962,1996,1995,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",7,4.5,1106635946,2005,1995,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",15,2.5,1510577970,2017,1995,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",17,4.5,1305696483,2011,1995,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
print(merged_dummies_df.isna().count())
# print(merged_dummies_df.isnull())

movieId         100836
title           100836
genres          100836
userId          100836
rating          100836
                 ...  
['Sci-Fi'       100836
['Sci-Fi']      100836
['Thriller']    100836
['War']         100836
['Western']     100836
Length: 77, dtype: int64


In [22]:
rows_with_nan = merged_dummies_df[merged_dummies_df.isna().any(axis=1)]
print(rows_with_nan)

# IMPORTANT: not all movies have (year) which creates null values for year released :(
# Need to choose whether to drop year_released as a feature column OR drop all movies without year in title. Leaning towards rmv year_released
# ALT: supplement with info from a sep API/source to get full year_released column?

        movieId                                              title  \
81292     40697                                          Babylon 5   
81293     40697                                          Babylon 5   
99325    140956                                   Ready Player One   
99326    140956                                   Ready Player One   
99327    140956                                   Ready Player One   
99328    140956                                   Ready Player One   
99459    143410                                         Hyena Road   
99526    147250  The Adventures of Sherlock Holmes and Doctor W...   
99617    149334                                  Nocturnal Animals   
99799    156605                                           Paterson   
100051   162414                                          Moonlight   
100269   167570                                             The OA   
100426   171495                                             Cosmos   
100427   171495     

In [23]:
##3. basic ML model - cosine similarity

In [24]:
# # create features set - drop nulls due to year_released issue
# merged_dummies_df = merged_dummies_df.dropna()
# features = merged_dummies_df.drop(columns=['genres', 'title', 'movieId', 'userId', 'timestamp'])  # Drop non-feature columns
# # drop 'rating' if use 'discrete_rating'
# features.head()

In [26]:
# create features set - drop release_year due to issue with new NaNs if no year in title
features = merged_dummies_df.drop(columns=['genres', 'title', 'movieId', 'userId', 'timestamp', 'year_released'])  # Drop non-feature columns
features.head()

Unnamed: 0,rating,year_rated,'Adventure','Adventure'],'Animation','Animation'],'Children','Children'],'Comedy','Comedy'],...,['Musical'],['Mystery',['Mystery'],['Romance',['Romance'],['Sci-Fi',['Sci-Fi'],['Thriller'],['War'],['Western']
0,4.0,2000,0,0,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,4.0,1996,0,0,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,4.5,2005,0,0,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,2.5,2017,0,0,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,4.5,2011,0,0,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
# Normalize the Data
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)
features_scaled[:1]


array([[ 0.47811176, -1.12310485, -0.3891499 , -0.07439387,  4.69275162,
        -0.0241961 ,  3.77223059, -0.06987921,  2.97994472, -0.19324434,
        -0.28751204, -0.15520961, -0.02559212, -0.03701938, -0.44016958,
        -0.30328888, -0.25773426,  4.27210242, -0.07871898, -0.03688482,
        -0.18889487, -0.11675136, -0.20704714, -0.12969172, -0.15399914,
        -0.23527634, -0.11904285, -0.19407117, -0.40715419, -0.28686464,
        -0.31748014, -0.13181022, -0.56044451, -0.04718447, -0.21927681,
        -0.01259757, -0.13339775, -0.02159446, -0.65771885, -0.04298823,
         2.96077668, -0.01372808, -0.15990325, -0.01863381, -0.14544784,
        -0.00890747, -0.46648205, -0.27721382, -0.26440885, -0.01443268,
        -0.03508894, -0.09441877, -0.34592773, -0.25795299, -0.05398311,
        -0.0147724 , -0.03228591, -0.00833213, -0.13561398, -0.07992172,
        -0.0307085 , -0.01967019, -0.09783397, -0.01408479, -0.02500335,
        -0.02480399, -0.04469135, -0.0375528 , -0.0

In [28]:
# Calculate Cosine Similarity
cosine_sim = cosine_similarity(features_scaled)
cosine_sim[:1]

# not enough memory... may need to eith use AWS (also crashed free Google Colab) or vetcorize??

MemoryError: Unable to allocate 75.8 GiB for an array with shape (100836, 100836) and data type float64

In [None]:
# Create a Function to Get Recommendations
# You can create a function that takes a movie title as input and returns similar movies based on the cosine similarity.

def get_recommendations(title, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    title_index = df.index[df['title'] == title].tolist()[0]

    # Get the pairwise similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[title_index]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return df['title'].iloc[movie_indices]



In [None]:
# Use the Function (which utilizes fitted cosine similarity model)
# Now you can use the function to get recommendations for a specific movie.
recommended_movies = get_recommendations('Your Movie Title')
print(recommended_movies)

In [None]:
## Evaluate model

In [None]:
# Make Predictions: Based on the similarity scores, you can predict ratings for a user by taking a weighted average of the ratings of similar movies. For example, if you want to predict the rating for a movie that a user hasn't seen yet, you would look at the ratings the user has given to other movies and their cosine similarity to the target movie.

# Create a Test Set: You need a test set of known ratings (ground truth) to evaluate the predictions. This could be a subset of your data where you know the actual ratings.

# Calculate Accuracy Metrics: Since cosine similarity is often used in a recommendation context, you may want to calculate metrics such as:

# Mean Absolute Error (MAE): Measures the average magnitude of the errors in a set of predictions, without considering their direction.
# from sklearn.metrics import mean_absolute_error
# mae = mean_absolute_error(actual_ratings, predicted_ratings)
# Root Mean Squared Error (RMSE): Measures the square root of the average of squared differences between predicted and actual ratings.
# from sklearn.metrics import mean_squared_error
# rmse = mean_squared_error(actual_ratings, predicted_ratings, squared=False)
# R-squared (R²): Indicates how well the predicted ratings approximate the actual ratings.
# from sklearn.metrics import r2_score
# r2 = r2_score(actual_ratings, predicted_ratings)
# Evaluate the Results: Compare the calculated metrics to determine how well your cosine similarity model is performing. Lower MAE and RMSE values indicate better accuracy.

In [None]:
# potential optimization ideas:
# 1. cut off movies with only 1 (or other low number) rating (amt rated not score itself) --> drop or bin into 'less than x amt reviews' \
# will cause issues if only like niche movies...? might accidentally drop a movie if not enough ratings?; could be future advancement...?
# 2. cut off point for low rated reviews?? (but what if user likes their bad taste!) --> make less than 2 stars bin (like chlg 21)
# 3. clean up genres so that instead of unique combos, it is orgz'd by core genres

In [91]:
released_counts = merged_movie_ratings['year_released'].value_counts()

for years, count in released_counts.items():
    if count > 1:
        print(f"{years}: {count}")

1995: 6144
1994: 5296
1999: 4536
1996: 4509
2000: 4268
2001: 3914
1993: 3741
1997: 3643
2002: 3642
1998: 3557
2004: 3390
2003: 3145
2006: 2584
2005: 2498
2007: 2318
2008: 2151
1992: 2013
1990: 1926
1989: 1895
2009: 1856
1991: 1722
2010: 1715
1986: 1581
1988: 1551
1987: 1540
1984: 1441
2011: 1440
2012: 1386
2014: 1320
1985: 1237
2013: 1201
2015: 1088
1982: 1053
1980: 944
1983: 878
1981: 792
1979: 786
2016: 785
1975: 637
1977: 567
1971: 533
1978: 490
1973: 483
1974: 466
2017: 461
1968: 434
1976: 423
1967: 381
1964: 376
1972: 366
1963: 287
1954: 272
1962: 262
1960: 251
1970: 244
1959: 243
1965: 234
1961: 225
1940: 223
1969: 218
1957: 215
1939: 201
1941: 197
1966: 190
1942: 185
1955: 182
1951: 182
1958: 165
1950: 161
1953: 153
1946: 138
1956: 115
1937: 115
1952: 96
1944: 92
2018: 91
1948: 80
1931: 76
1949: 72
1933: 65
1938: 56
1936: 53
1947: 52
1935: 46
1945: 42
1934: 34
1927: 29
1932: 24
1943: 20
1925: 19
1930: 17
1922: 16
1926: 13
1928: 13
1929: 9
1920: 8
1923: 7
1924: 6
1916: 5
1902: 5


In [89]:
# merged_movie_ratings['genres'].dtypes
genre_counts = merged_movie_ratings['genres'].value_counts()
print(genre_counts)

genres
Comedy                             7196
Drama                              6291
Comedy|Romance                     3967
Comedy|Drama|Romance               3000
Comedy|Drama                       2851
                                   ... 
Crime|Drama|Film-Noir|Romance         1
Crime|Romance                         1
Adventure|Documentary|Western         1
Action|Fantasy|Western                1
Action|Animation|Comedy|Fantasy       1
Name: count, Length: 951, dtype: int64


In [93]:
for genres, count in genre_counts.items():
    if int(count) > 1:  # Convert count to an integer
        print(f"{genres}: {count}")

Comedy: 7196
Drama: 6291
Comedy|Romance: 3967
Comedy|Drama|Romance: 3000
Comedy|Drama: 2851
Drama|Romance: 2838
Action|Adventure|Sci-Fi: 2361
Crime|Drama: 2315
Action|Crime|Thriller: 1554
Action|Adventure|Thriller: 1455
Action|Adventure|Sci-Fi|Thriller: 1446
Drama|Thriller: 1365
Action|Sci-Fi|Thriller: 1195
Comedy|Crime: 1171
Crime|Drama|Thriller: 1119
Drama|War: 1044
Action|Drama|War: 1034
Action|Crime|Drama|Thriller: 1007
Documentary: 891
Drama|Mystery|Thriller: 712
Action|Sci-Fi: 689
Action|Comedy: 664
Action|Adventure|Sci-Fi|IMAX: 650
Action|Thriller: 650
Horror: 640
Horror|Thriller: 630
Children|Comedy: 629
Thriller: 628
Action|Adventure|Fantasy: 615
Mystery|Thriller: 598
Adventure|Fantasy: 584
Adventure|Animation|Children|Comedy|Fantasy: 574
Comedy|Crime|Drama|Thriller: 563
Action|Adventure: 555
Adventure|Comedy|Sci-Fi: 532
Action|Comedy|Sci-Fi: 515
Adventure|Animation|Children|Comedy: 514
Crime|Mystery|Thriller: 494
Adventure|Comedy: 485
Action|Adventure|Drama: 479
Action|Drama|

In [97]:
year_rated_counts = merged_movie_ratings['year_rated'].value_counts()
print(year_rated_counts)

year_rated
2000    10104
2017     7972
2007     7306
2016     6776
2018     6725
2015     6538
1996     5917
2005     5798
2012     4611
2008     4284
2009     4180
2003     4010
2006     3806
2002     3758
2001     3639
2004     3275
1999     2395
2010     2350
1997     2035
2011     1752
2013     1696
2014     1398
1998      511
Name: count, dtype: int64


In [99]:
rated_counts = merged_movie_ratings['rating'].value_counts()
print(rated_counts)
# potential 'less than 1' bin: 
# 1.0     2811
# 1.5     1791
# 0.5     1370

rating
4.0    26818
3.0    20047
5.0    13211
3.5    13136
4.5     8551
2.0     7551
2.5     5550
1.0     2811
1.5     1791
0.5     1370
Name: count, dtype: int64


In [None]:
### ALT model idea
# [#]. [BRENDAN STARTED THIS PART. LOOK AT TIMESTAMP JUPYTER NB] Explore user info from unique user ids (can I build user profiles based \
# on this to do collaborative with later for ML?)
# a.	Compile all data based on user ids (group by?) + clean/rmv if no user data
# b.	[]
# c.	[eventually] For each user, cluster on chosen features
# d.	[eventually] Predict our new input based on prev clustering of user profiles that user most sim to?