# Introduction
This project is a movie suggestion AI. The user can input a movie title, for example, 'The Incredibles', and the AI will suggest movies that are similar based on the movie's genre.

Recommender, Natural Language Processing

# TODO
1. Go back through the guide and write more documentation

## Useful Links
https://www.kaggle.com/code/ibtesama/getting-started-with-a-movie-recommendation-system

https://www.kaggle.com/code/rounakbanik/movie-recommender-systems

https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset/data

In [4]:
import pandas as pd
import numpy as np
from ast import literal_eval
from nltk.stem.snowball import SnowballStemmer

In [5]:
credits_data = pd.read_csv("credits.csv")
movies_data = pd.read_csv("movies_metadata.csv")
keywords_data = pd.read_csv("keywords.csv")
links_data = pd.read_csv("links.csv")
ratings_data = pd.read_csv("ratings.csv")

  movies_data = pd.read_csv("movies_metadata.csv")


In [6]:
movies_data.head(5)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


# Simple Recommender

In [7]:
C = movies_data['vote_average'].mean()
C

5.618207215134185

In [8]:
m = movies_data['vote_count'].quantile(0.9)
m

160.0

In [9]:
# makes the genres column a list of strings instead of a bunch of other information that isn't useful at the moment
movies_data['genres'] = movies_data['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [10]:
qualified_movies = movies_data.copy().loc[movies_data['vote_count'] >= m]
qualified_movies.shape

(4555, 24)

In [11]:
def weighted_rating(x, m = m, C = C):
    v = x['vote_count']
    R = x['vote_average']
    # Compute the weighted rating of each movie based on the IMDB formula
    return (v/(v+m) * R) + (m/(m+v) * C)

In [12]:
# Define a new feature 'score' and calculate its value with weighted_rating()
qualified_movies['score'] = qualified_movies.apply(weighted_rating, axis=1)

In [13]:
# Sorting movies based on score
qualified_movies = qualified_movies.sort_values(by='score', ascending=False)

qualified_movies[['title', 'overview', 'imdb_id', 'genres', 'vote_count', 'vote_average','score']].head(25)

Unnamed: 0,title,overview,imdb_id,genres,vote_count,vote_average,score
314,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,tt0111161,"[Drama, Crime]",8358.0,8.5,8.445869
834,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",tt0068646,"[Drama, Crime]",6024.0,8.5,8.425439
10309,Dilwale Dulhania Le Jayenge,"Raj is a rich, carefree, happy-go-lucky second...",tt0112870,"[Comedy, Drama, Romance]",661.0,9.1,8.421453
12481,The Dark Knight,Batman raises the stakes in his war on crime. ...,tt0468569,"[Drama, Action, Crime, Thriller]",12269.0,8.3,8.265477
2843,Fight Club,A ticking-time-bomb insomniac and a slippery s...,tt0137523,[Drama],9678.0,8.3,8.256385
292,Pulp Fiction,"A burger-loving hit man, his philosophical par...",tt0110912,"[Thriller, Crime]",8670.0,8.3,8.251406
522,Schindler's List,The true story of how businessman Oskar Schind...,tt0108052,"[Drama, History, War]",4436.0,8.3,8.206639
23673,Whiplash,"Under the direction of a ruthless instructor, ...",tt2582802,[Drama],4376.0,8.3,8.205404
5481,Spirited Away,A ten year old girl who wanders away from her ...,tt0245429,"[Fantasy, Adventure, Animation, Family]",3968.0,8.3,8.196055
2211,Life Is Beautiful,A touching story of an Italian book seller of ...,tt0118799,"[Comedy, Drama]",3643.0,8.3,8.187171


### Search by Genre

In [14]:
reset_genre = movies_data.apply(lambda x: pd.Series(x['genres']), axis=1).stack().reset_index(level=1, drop=True)
reset_genre.name = 'genre'
genres_movies_data = movies_data.drop('genres', axis=1).join(reset_genre)

In [15]:
def build_chart(genre, percentile=0.85):
    df = genres_movies_data[genres_movies_data['genre'] == genre]
    vote_counts = df[df['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = df[df['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(percentile)

    qualified = df[(df['vote_count'] >= m) & (df['vote_count'].notnull()) & (df['vote_average'].notnull())][['title', 'overview', 'vote_count', 'vote_average', 'popularity']]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')

    qualified['score'] = qualified.apply(lambda x: (x['vote_count']/(x['vote_count']+m) * x['vote_average']) + (m/(m+x['vote_count']) * C), axis=1)
    qualified = qualified.sort_values('score', ascending=False).head(250)

    return qualified

In [16]:
build_chart('Romance').head(10)

Unnamed: 0,title,overview,vote_count,vote_average,popularity,score
10309,Dilwale Dulhania Le Jayenge,"Raj is a rich, carefree, happy-go-lucky second...",661,9,34.457024,8.565285
351,Forrest Gump,A man with a low IQ has accomplished great thi...,8147,8,48.307194,7.971357
876,Vertigo,A retired San Francisco detective suffering fr...,1162,8,18.20822,7.811667
40251,Your Name.,High schoolers Mitsuha and Taki are complete s...,1030,8,34.461252,7.789489
883,Some Like It Hot,Two musicians witness a mob hit and struggle t...,835,8,11.845107,7.745154
1132,Cinema Paradiso,"A filmmaker recalls his childhood, when he fel...",834,8,14.177005,7.744878
19901,Paperman,An urban office worker finds that paper airpla...,734,8,7.198633,7.713951
37863,Sing Street,A boy growing up in Dublin during the 1980s es...,669,8,10.672862,7.689483
882,The Apartment,Bud Baxter is a minor clerk in a huge New York...,498,8,11.994281,7.599317
38718,The Handmaiden,"1930s Korea, in the period of Japanese occupat...",453,8,16.727405,7.566166


In [17]:
build_chart('Action').head(10)

Unnamed: 0,title,overview,vote_count,vote_average,popularity,score
15480,Inception,"Cobb, a skilled thief who commits corporate es...",14075,8,29.108149,7.955099
12481,The Dark Knight,Batman raises the stakes in his war on crime. ...,12269,8,123.167259,7.94861
4863,The Lord of the Rings: The Fellowship of the Ring,"Young hobbit Frodo Baggins, after inheriting a...",8892,8,32.070725,7.929579
7000,The Lord of the Rings: The Return of the King,Aragorn is revealed as the heir to the ancient...,8226,8,29.324358,7.924031
5814,The Lord of the Rings: The Two Towers,Frodo and Sam are trekking to Mordor to destro...,7641,8,29.423537,7.918382
256,Star Wars,Princess Leia is captured and held hostage by ...,6778,8,42.149697,7.908327
1154,The Empire Strikes Back,"The epic saga continues as Luke Skywalker, in ...",5998,8,19.470959,7.896841
4135,Scarface,After getting a green card in exchange for ass...,3017,8,11.299673,7.802046
9430,Oldboy,"With no clue how he came to be imprisoned, dru...",2000,8,10.616859,7.711649
1910,Seven Samurai,A samurai answers a village's request for prot...,892,8,15.01777,7.426145


In [18]:
build_chart('Crime').head(10)

Unnamed: 0,title,overview,vote_count,vote_average,popularity,score
12481,The Dark Knight,Batman raises the stakes in his war on crime. ...,12269,8,123.167259,7.957677
292,Pulp Fiction,"A burger-loving hit man, his philosophical par...",8670,8,140.950236,7.940522
314,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,8358,8,51.645403,7.938355
834,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",6024,8,41.109264,7.915273
46,Se7en,Two homicide detectives are on a desperate hun...,5915,8,18.45743,7.913765
586,The Silence of the Lambs,"FBI trainee, Clarice Starling ventures into a ...",4549,8,4.307222,7.889007
289,Leon: The Professional,"Leon, the top hit man in New York, has earned ...",4293,8,20.477329,7.882696
3030,The Green Mile,A supernatural tale set on death row in a Sout...,4166,8,19.96678,7.879291
1057,Reservoir Dogs,A botched robbery indicates a police informant...,3821,8,12.22034,7.868957
1178,The Godfather: Part II,In the continuing saga of the Corleone crime f...,3418,8,36.629307,7.854398


# Content Based Recommender

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [20]:
links_data = links_data[links_data['tmdbId'].notnull()]['tmdbId'].astype(int)
movies_data['id'] = pd.to_numeric(movies_data['id'], errors='coerce').astype('Int64')
# the above line may be messing with predictions cause it does something with the "1997-10-whatever" values

In [21]:
new_movies_data = movies_data[movies_data['id'].isin(links_data)]
new_movies_data.shape

(45463, 24)

## Description Based Recommender

In [22]:
new_movies_data['tagline'] = new_movies_data['tagline'].fillna('')
new_movies_data['description'] = new_movies_data['overview'] + new_movies_data['tagline']
new_movies_data['description'] = new_movies_data['description'].fillna('')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_movies_data['tagline'] = new_movies_data['tagline'].fillna('')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_movies_data['description'] = new_movies_data['overview'] + new_movies_data['tagline']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_movies_data['description'] = new_movies_data

In [23]:
tfidfvect = TfidfVectorizer(analyzer='word', stop_words='english')
tfidf_matrix = tfidfvect.fit_transform(new_movies_data['description'])
tfidf_matrix.shape

(45463, 77744)

In [24]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
cosine_sim[0]

array([1.        , 0.01408358, 0.        , ..., 0.        , 0.00571404,
       0.        ])

In [25]:
new_movies_data = new_movies_data.reset_index()
titles = new_movies_data['title']

In [26]:
def get_recommendations_description(title):
    index = new_movies_data[new_movies_data['title'] == title].index[0]
    similar_scores = list(enumerate(cosine_sim[index]))
    similar_movies = sorted(similar_scores, key=lambda x: x[1], reverse=True)
    similar_movies = similar_movies[1:31]
    movie_indices = [i[0] for i in similar_movies]
    return titles.iloc[movie_indices]

In [27]:
get_recommendations_description('The Godfather').head(15)

1178               The Godfather: Part II
44027    The Godfather Trilogy: 1972-1990
23125                          Blood Ties
1914              The Godfather: Part III
31971                    Honor Thy Father
11297                    Household Saints
33459             The Most Beautiful Wife
34715                   Start Liquidation
38027            A Mother Should Be Loved
10821                            Election
4324                                 Made
17729                   Short Sharp Shock
30785                   The Sign of Venus
5433                   Johnny Dangerously
8653                         Violent City
Name: title, dtype: object

In [28]:
get_recommendations_description('The Dark Knight Rises').head(15)

12481                                      The Dark Knight
150                                         Batman Forever
1328                                        Batman Returns
3095                          Batman: Mask of the Phantasm
15511                           Batman: Under the Red Hood
21193    Batman Unmasked: The Psychology of the Dark Kn...
20231              Batman: The Dark Knight Returns, Part 2
585                                                 Batman
21399                      Batman: Mystery of the Batwoman
18035                                     Batman: Year One
9230                    Batman Beyond: Return of the Joker
25266                                    Batman vs Dracula
41973                                The Lego Batman Movie
10122                                        Batman Begins
19791              Batman: The Dark Knight Returns, Part 1
Name: title, dtype: object

In [29]:
get_recommendations_description('Toy Story').head(15)

15348                                     Toy Story 3
2997                                      Toy Story 2
24522                                       Small Fry
10301                          The 40 Year Old Virgin
23842                     Andy Hardy's Blonde Trouble
3057                                  Man on the Moon
29201                                      Hot Splash
43424                Andy Kaufman Plays Carnegie Hall
38473    Superstar: The Life and Times of Andy Warhol
6435                           What's Up, Tiger Lily?
42718    Andy Peters: Exclamation Mark Question Point
8327                                        The Champ
1071                            Rebel Without a Cause
36091                            Welcome to Happiness
1199                                        Manhattan
Name: title, dtype: object

## Metadata Based Recommender

In [30]:
keywords_data['id'] = keywords_data['id'].astype('int')
credits_data['id'] = credits_data['id'].astype('int')
movies_data['id'] = movies_data['id'].fillna(0).astype('int')

In [31]:
movies_data = movies_data.merge(credits_data, on='id')
movies_data = movies_data.merge(keywords_data, on='id')

In [32]:
new_movies_data = movies_data[movies_data['id'].isin(links_data)]

In [33]:
new_movies_data['cast'] = new_movies_data['cast'].apply(literal_eval)
new_movies_data['crew'] = new_movies_data['crew'].apply(literal_eval)
new_movies_data['keywords'] = new_movies_data['keywords'].apply(literal_eval)

In [34]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [35]:
new_movies_data['director'] = new_movies_data['crew'].apply(get_director)

In [36]:
new_movies_data['cast'] = new_movies_data['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
new_movies_data['cast'] = new_movies_data['cast'].apply(lambda x: x[:3] if len(x) >=3 else x)

In [37]:
new_movies_data['keywords'] = new_movies_data['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [38]:
new_movies_data['cast'] = new_movies_data['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])
new_movies_data['director'] = new_movies_data['director'].astype('str').apply(lambda x: [str.lower(x.replace(" ", ""))])
new_movies_data['director'] = new_movies_data['director'].apply(lambda x: [x,x,x])

In [39]:
key = new_movies_data.apply(lambda x: pd.Series(x['keywords']), axis=1).stack().reset_index(level=1, drop=True)
key.name = 'keyword'

In [40]:
key = key.value_counts()
key[:5]

keyword
woman director      3128
independent film    1942
murder              1314
based on novel       841
musical              734
Name: count, dtype: int64

No use for singlular keywords

In [41]:
key = key[key > 1]
stemmer = SnowballStemmer('english')

In [42]:
def filter_keywords(x):
    words = []
    for i in x:
        if i in key:
            words.append(i)
    return words

In [43]:
new_movies_data['keywords'] = new_movies_data['keywords'].apply(filter_keywords)
new_movies_data['keywords'] = new_movies_data['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
new_movies_data['keywords'] = new_movies_data['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [44]:
new_movies_data['dump'] = new_movies_data['keywords'] + new_movies_data['cast'] + new_movies_data['director'] + new_movies_data['genres']
new_movies_data['dump'] = new_movies_data['dump'].apply(lambda x: ' '.join([str(item) if isinstance(item, list) else item for item in x]))
# ^^^ Gave a list instead of a string
new_movies_data['dump'] = new_movies_data['dump'].apply(lambda x: ' '.join(x) if isinstance(x, list) else str(x))

In [45]:
count = CountVectorizer(analyzer='word', stop_words='english')
count_matrix = count.fit_transform(new_movies_data['dump'])

In [46]:
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [47]:
new_movies_data = new_movies_data.reset_index()
titles = new_movies_data['title']

Reusing the same function from previous recommender

In [48]:
get_recommendations_description('The Godfather').head(15)

15534                  The Rain People
4450     Tucker: The Man and His Dream
1187            The Godfather: Part II
1926           The Godfather: Part III
1606                     The Rainmaker
4582                   The Cotton Club
4000                  Gardens of Stone
3623                  The Conversation
8867                       Rumble Fish
6004                One from the Heart
2017                     The Outsiders
2368             Peggy Sue Got Married
7710              You're a Big Boy Now
23085                       Captain EO
750                               Jack
Name: title, dtype: object

In [49]:
get_recommendations_description('The Dark Knight Rises').head(15)

12541    The Dark Knight
10170      Batman Begins
2478           Following
11411       The Prestige
26060          Doodlebug
26061          Doodlebug
45831            Dunkirk
5278            Insomnia
15576          Inception
4114             Memento
23018       Interstellar
11472        Harsh Times
1503      Batman & Robin
8994      State of Grace
15004        Harry Brown
Name: title, dtype: object

In [50]:
get_recommendations_description('Toy Story').head(15)

10710                      Luxo Jr.
19227                       Tin Toy
19281                   Red's Dream
3012                    Toy Story 2
19331                   Knick Knack
17477                        Cars 2
22992      Mater and the Ghostlight
11030                          Cars
2254                   A Bug's Life
22064          Toy Story of Terror!
15444                   Toy Story 3
25951    Toy Story That Time Forgot
16265          Crazy on the Outside
25949               Partysaurus Rex
4777                 Monsters, Inc.
Name: title, dtype: object

### Popularity and Ratings

In [51]:
def improved_recommendations(title):
    index = new_movies_data[new_movies_data['title'] == title].index[0]
    similar_scores = list(enumerate(cosine_sim[index]))
    similar_scores = sorted(similar_scores, key=lambda x: x[1], reverse=True)
    similar_scores = similar_scores[1:31]
    movie_indices = [i[0] for i in similar_scores]
    
    movies = new_movies_data.iloc[movie_indices][['title', 'vote_count', 'vote_average']]
    vote_counts = movies[movies['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = movies[movies['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(0.60)
    qualified = movies[(movies['vote_count'] >= m) & (movies['vote_count'].notnull()) & (movies['vote_average'].notnull())]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    qualified['score'] = qualified.apply(weighted_rating, axis=1)
    qualified = qualified.sort_values('score', ascending=False).head(10)

    return qualified


In [52]:
improved_recommendations('The Godfather')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qualified['vote_count'] = qualified['vote_count'].astype('int')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qualified['vote_average'] = qualified['vote_average'].astype('int')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qualified['score'] = qualified.apply(weighted_rating, axis=1)


Unnamed: 0,title,vote_count,vote_average,score
1187,The Godfather: Part II,3418,8,7.893492
1174,Apocalypse Now,2112,8,7.832268
1926,The Godfather: Part III,1589,7,6.873592
1300,Dracula,1087,7,6.822705
3623,The Conversation,377,7,6.588293
11681,The Consequences of Love,125,7,6.224257
3315,...And Justice for All,118,7,6.204724
2017,The Outsiders,293,6,5.86515
1606,The Rainmaker,239,6,5.8469
8867,Rumble Fish,141,6,5.797054


In [53]:
improved_recommendations('The Dark Knight Rises')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qualified['vote_count'] = qualified['vote_count'].astype('int')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qualified['vote_average'] = qualified['vote_average'].astype('int')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qualified['score'] = qualified.apply(weighted_rating, axis=1)


Unnamed: 0,title,vote_count,vote_average,score
15576,Inception,14075,8,7.973229
12541,The Dark Knight,12269,8,7.969339
23018,Interstellar,11187,8,7.966415
11411,The Prestige,4510,8,7.918397
4114,Memento,4168,8,7.911949
10170,Batman Begins,7511,7,6.971179
45831,Dunkirk,2712,7,6.92302
2478,Following,363,7,6.577272
1337,Batman Returns,1706,6,5.967263
5278,Insomnia,1181,6,5.954447


In [54]:
improved_recommendations('Toy Story')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qualified['vote_count'] = qualified['vote_count'].astype('int')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qualified['vote_average'] = qualified['vote_average'].astype('int')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qualified['score'] = qualified.apply(weighted_rating, axis=1)


Unnamed: 0,title,vote_count,vote_average,score
4777,"Monsters, Inc.",6150,7,6.964962
15444,Toy Story 3,4710,7,6.954602
3012,Toy Story 2,3914,7,6.945732
22857,The Lego Movie,3127,7,6.932739
11030,Cars,3991,6,5.985284
2254,A Bug's Life,2379,6,5.975941
28963,Home,1539,6,5.964045
11165,Monster House,912,6,5.943016
19106,The Pirates! In an Adventure with Scientists!,379,6,5.886666
350,The Flintstones,559,5,5.13757


# Collaborative Filtering

In [55]:
from surprise import Reader, Dataset, SVD

reader = Reader()

In [56]:
ratings_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556


In [57]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=True, random_state=27)
for train_index, test_index in kf.split(ratings_data):
    train_set = ratings_data.iloc[train_index]
    test_set = ratings_data.iloc[test_index]

In [58]:
# Change svd parameters to possibly get better predictions -> n_factors=100, n_epochs=20, lr_all=0.005, reg_all=0.02
model = SVD(n_factors=100, n_epochs=20, lr_all=0.005, reg_all=0.02)  # or any other algorithm you want to use
data = Dataset.load_from_df(ratings_data[['userId', 'movieId', 'rating']], reader)

In [None]:
from surprise.model_selection import cross_validate

cross_validate(model, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7fb9bb6aa0c0>>
Traceback (most recent call last):
  File "/home/bryce/.local/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 


RMSE score: ~80%
This takes hours to cross validate.

In [59]:
trainset = data.build_full_trainset()
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f2539246cc0>

In [60]:
ratings_data[ratings_data['userId'] == 1]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556
5,1,1968,4.0,1425942148
6,1,2762,4.5,1425941300
7,1,2918,5.0,1425941593
8,1,2959,4.0,1425941601
9,1,4226,4.0,1425942228


In [61]:
model.predict(1, 302, 3)

Prediction(uid=1, iid=302, r_ui=3, est=4.519698628541905, details={'was_impossible': False})

# Hybrid Recommender

In [62]:
tmdbID_map = pd.read_csv('links.csv')[['movieId', 'tmdbId']]
tmdbID_map['tmdbId'] = tmdbID_map['tmdbId'].fillna(-1).astype('int')
tmdbID_map.columns = ['movieId', 'tmdbId']
tmdbID_map = tmdbID_map.merge(new_movies_data[['title', 'id']], left_on='tmdbId', right_on='id').set_index('title')
tmdbID_map.drop(columns=['tmdbId'], inplace=True)

In [63]:
tmdbID_map.head()

Unnamed: 0_level_0,movieId,id
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Toy Story,1,862
Jumanji,2,8844
Grumpier Old Men,3,15602
Waiting to Exhale,4,31357
Father of the Bride Part II,5,11862


In [64]:
indices_map = tmdbID_map.set_index('id')

In [65]:
def hybrid_recommender(userId, title):
    index = new_movies_data[new_movies_data['title'] == title].index[0]
    tmdbId = tmdbID_map.loc[title]['id']
    movie_id = tmdbID_map.loc[title]['movieId']

    similar_scores = list(enumerate(cosine_sim[index]))
    similar_scores = sorted(similar_scores, key=lambda x: x[1], reverse=True)
    similar_scores = similar_scores[1:31]
    movie_indices = [i[0] for i in similar_scores]

    movies = new_movies_data.iloc[movie_indices][['title', 'genres', 'vote_average', 'vote_count', 'id']]
    movies['score'] = movies['id'].apply(lambda x: model.predict(userId, indices_map.loc[x]['movieId']).est)
    movies = movies.sort_values('score', ascending=False)
    return movies.head(10)

In [69]:
hybrid_recommender(1, 'Avatar')

Unnamed: 0,title,genres,vote_average,vote_count,id,score
45137,T2 3-D: Battle Across Time,[],7.0,29.0,65595,4.073572
7480,Babylon 5: A Call to Arms,"[Action, Drama, Science Fiction, Adventure]",6.9,36.0,10916,4.057682
18996,Icarus XB 1,[Science Fiction],6.7,16.0,19757,4.057216
26720,Star Wars: The Force Awakens,"[Action, Adventure, Science Fiction, Fantasy]",7.5,7993.0,140607,3.923079
1167,Aliens,"[Horror, Action, Thriller, Science Fiction]",7.7,3282.0,679,3.919049
9701,Aliens of the Deep,"[Action, Documentary, Science Fiction]",6.8,19.0,22559,3.916247
1101,The Abyss,"[Adventure, Action, Thriller, Science Fiction]",7.1,822.0,2756,3.842323
582,Terminator 2: Judgment Day,"[Action, Thriller, Science Fiction]",7.7,4274.0,280,3.841858
1469,The Fifth Element,"[Adventure, Fantasy, Action, Thriller, Science...",7.3,3962.0,18,3.829424
1204,The Terminator,"[Action, Thriller, Science Fiction]",7.4,4208.0,218,3.818856


In [73]:
hybrid_recommender(500, 'Toy Story 2')

Unnamed: 0,title,genres,vote_average,vote_count,id,score
15444,Toy Story 3,"[Animation, Family, Comedy]",7.6,4710.0,10193,5.0
19331,Knick Knack,[Animation],7.1,135.0,13928,4.975998
0,Toy Story,"[Animation, Comedy, Family]",7.7,5415.0,862,4.963822
25492,The Kingdom of Dreams and Madness,"[Animation, Documentary]",7.7,49.0,252511,4.879204
2254,A Bug's Life,"[Adventure, Animation, Comedy, Family]",6.8,2379.0,9487,4.860139
13808,Up,"[Animation, Comedy, Family, Adventure]",7.8,7048.0,14160,4.777556
3979,The Brave Little Toaster,"[Fantasy, Adventure, Animation, Comedy, Family...",6.8,94.0,19933,4.760014
18339,Arthur Christmas,"[Drama, Animation, Family, Comedy]",6.7,340.0,51052,4.739131
11030,Cars,"[Animation, Adventure, Comedy, Family]",6.6,3991.0,920,4.727506
25951,Toy Story That Time Forgot,"[Animation, Family]",6.8,249.0,256835,4.696633
