In [2]:
# import libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import spacy
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# read data 
file_path = '/Users/gaurisarin/cs4100-movie-recommendation-engine/Hydra-Movie-Scrape.csv'
movies_data = pd.read_csv(file_path)
print(movies_data.head(15))

                                                Title  Year  \
0                         Patton Oswalt: Annihilation  2017   
1                                       New York Doll  2005   
2   Mickey's Magical Christmas: Snowed in at the H...  2001   
3                          Mickey's House of Villains  2001   
4                                       And Then I Go  2017   
5                            An Extremely Goofy Movie  2000   
6                                        Peter Rabbit  2018   
7                                          Love Songs  2007   
8                                                  89  2017   
9                                      The Foster Boy  2011   
10                                    Forever My Girl  2018   
11                            Tom Segura: Disgraceful  2018   
12      The Secret Rules of Modern Living: Algorithms  2015   
13                                Secrets in the Fall  2015   
14                                       Silent Night  

In [4]:
# preprocessing data
missing_values = movies_data.isnull().sum()
print(missing_values)


Title               0
Year                0
Summary             5
Short Summary       1
Genres              0
IMDB ID             0
Runtime             0
YouTube Trailer    47
Rating              0
Movie Poster        0
Director            0
Writers            18
Cast               24
dtype: int64


In [5]:
movies_data_cleaned = movies_data.dropna(subset=['Summary']).copy()
print(movies_data_cleaned.head())

                                               Title  Year  \
0                        Patton Oswalt: Annihilation  2017   
1                                      New York Doll  2005   
2  Mickey's Magical Christmas: Snowed in at the H...  2001   
3                         Mickey's House of Villains  2001   
4                                      And Then I Go  2017   

                                             Summary  \
0  Patton Oswald, despite a personal tragedy, pro...   
1  A recovering alcoholic and recently converted ...   
2  After everyone is snowed in at the House of Mo...   
3  The villains from the popular animated Disney ...   
4  In the cruel world of junior high, Edwin suffe...   

                                       Short Summary  \
0  Patton Oswalt, despite a personal tragedy, pro...   
1  A recovering alcoholic and recently converted ...   
2  Mickey and all his friends hold their own Chri...   
3  The villains from the popular animated Disney ...   
4  In the 

In [6]:
movies_data_cleaned.drop(columns=['Movie Poster', 'IMDB ID', 'Short Summary'], inplace=True)
print(movies_data_cleaned.head())

                                               Title  Year  \
0                        Patton Oswalt: Annihilation  2017   
1                                      New York Doll  2005   
2  Mickey's Magical Christmas: Snowed in at the H...  2001   
3                         Mickey's House of Villains  2001   
4                                      And Then I Go  2017   

                                             Summary  \
0  Patton Oswald, despite a personal tragedy, pro...   
1  A recovering alcoholic and recently converted ...   
2  After everyone is snowed in at the House of Mo...   
3  The villains from the popular animated Disney ...   
4  In the cruel world of junior high, Edwin suffe...   

                                      Genres  Runtime YouTube Trailer  Rating  \
0                              Uncategorized       66     4hZi5QaMBFc     7.4   
1                          Documentary|Music       75     jwD04NsnLLg     7.9   
2  Adventure|Animation|Comedy|Family|Fantasy   

In [7]:
def remove_short_words(text):
    return ' '.join([word for word in text.split() if len(word) >= 3])

In [8]:
movies_data_cleaned['Summary'] = movies_data_cleaned['Summary'].str.replace('[^\w\s]', '', regex=True)
movies_data_cleaned['Summary'] = movies_data_cleaned['Summary'].str.lower()
movies_data_cleaned['Summary'] = movies_data_cleaned['Summary'].apply(remove_short_words)
print(movies_data_cleaned['Summary'].head())

0    patton oswald despite personal tragedy produce...
1    recovering alcoholic and recently converted mo...
2    after everyone snowed the house mouse mickey s...
3    the villains from the popular animated disney ...
4    the cruel world junior high edwin suffers stat...
Name: Summary, dtype: object


In [9]:
# if there are multiple genres then seperating the next genres into a different column 
genres_split = movies_data_cleaned['Genres'].str.split('|', expand=True)
for i in range(genres_split.shape[1]):
    movies_data_cleaned[f'Genre_{i+1}'] = genres_split[i]
for col in movies_data_cleaned.columns:
    if 'Genre_' in col:
        movies_data_cleaned[col] = movies_data_cleaned[col].str.lower().str.strip()

movies_data_cleaned.drop(columns=['Genres'], inplace=True)

print(movies_data_cleaned.head())

                                               Title  Year  \
0                        Patton Oswalt: Annihilation  2017   
1                                      New York Doll  2005   
2  Mickey's Magical Christmas: Snowed in at the H...  2001   
3                         Mickey's House of Villains  2001   
4                                      And Then I Go  2017   

                                             Summary  Runtime YouTube Trailer  \
0  patton oswald despite personal tragedy produce...       66     4hZi5QaMBFc   
1  recovering alcoholic and recently converted mo...       75     jwD04NsnLLg   
2  after everyone snowed the house mouse mickey s...       65     uCKwHHftrU4   
3  the villains from the popular animated disney ...        0     JA03ciYt-Ek   
4  the cruel world junior high edwin suffers stat...       99     8CdIiD6-iF0   

   Rating           Director        Writers                            Cast  \
0     7.4  Bobcat Goldthwait  Patton Oswalt                  

In [10]:
# split cast into different 
cast_split = movies_data_cleaned['Cast'].str.split('|', expand=True)
for i in range(cast_split.shape[1]):
    movies_data_cleaned[f'Cast_{i+1}'] = cast_split[i]
for col in movies_data_cleaned.columns:
    if 'Cast_' in col:
        movies_data_cleaned[col] = movies_data_cleaned[col].str.lower().str.strip()

movies_data_cleaned.drop(columns=['Cast'], inplace=True)

print(movies_data_cleaned.head())

                                               Title  Year  \
0                        Patton Oswalt: Annihilation  2017   
1                                      New York Doll  2005   
2  Mickey's Magical Christmas: Snowed in at the H...  2001   
3                         Mickey's House of Villains  2001   
4                                      And Then I Go  2017   

                                             Summary  Runtime YouTube Trailer  \
0  patton oswald despite personal tragedy produce...       66     4hZi5QaMBFc   
1  recovering alcoholic and recently converted mo...       75     jwD04NsnLLg   
2  after everyone snowed the house mouse mickey s...       65     uCKwHHftrU4   
3  the villains from the popular animated disney ...        0     JA03ciYt-Ek   
4  the cruel world junior high edwin suffers stat...       99     8CdIiD6-iF0   

   Rating           Director        Writers        Genre_1    Genre_2 Genre_3  \
0     7.4  Bobcat Goldthwait  Patton Oswalt  uncategorized 

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
tfidf_vectorizer = TfidfVectorizer(min_df=3, max_features=None, 
                                   strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
                                   ngram_range=(1, 3), use_idf=True, smooth_idf=True, sublinear_tf=True,
                                   stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(movies_data_cleaned['Summary'])

In [13]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=10000)
tfidf_matrix = tfidf_vectorizer.fit_transform(movies_data_cleaned['Summary'])

feature_names = tfidf_vectorizer.get_feature_names_out()

def get_top_n_keywords(row, top_n=25):
    sorted_indices = np.argsort(row.toarray()).flatten()[::-1]
    top_n_indices = sorted_indices[:top_n]
    top_n_keywords = [feature_names[i] for i in top_n_indices]
    return top_n_keywords

movies_data_cleaned['Keywords'] = [get_top_n_keywords(row, top_n=5) for row in tfidf_matrix]

print(movies_data_cleaned[['Summary', 'Keywords']].head())


                                             Summary  \
0  patton oswald despite personal tragedy produce...   
1  recovering alcoholic and recently converted mo...   
2  after everyone snowed the house mouse mickey s...   
3  the villains from the popular animated disney ...   
4  the cruel world junior high edwin suffers stat...   

                                            Keywords  
0  [oswald, produces, tribulations, focusing, sta...  
1             [band, dolls, converted, mormon, kane]  
2        [christmas, disney, mickey, everyone, just]  
3         [villains, mickey, mouse, house, gathered]  
4  [edwin, anxiety, unflinching, belonging, misun...  


In [14]:
movies_data_cleaned['Keywords_str'] = movies_data_cleaned['Keywords'].apply(lambda x: ' '.join(x))

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer_keywords = TfidfVectorizer()
tfidf_matrix_keywords = tfidf_vectorizer_keywords.fit_transform(movies_data_cleaned['Keywords_str'])
print("TF-IDF Matrix Keywords shape:", tfidf_matrix_keywords.shape)

TF-IDF Matrix Keywords shape: (3935, 8371)


In [26]:
genre1_dummies = pd.get_dummies(movies_data_cleaned['Genre_1'])

In [27]:
genre2_dummies = pd.get_dummies(movies_data_cleaned['Genre_2'])

In [28]:
genre3_dummies = pd.get_dummies(movies_data_cleaned['Genre_3'])

In [29]:
genre4_dummies = pd.get_dummies(movies_data_cleaned['Genre_4'])

In [30]:
genre5_dummies = pd.get_dummies(movies_data_cleaned['Genre_5'])

In [31]:
director_dummies = pd.get_dummies(movies_data_cleaned['Director'])
print("Director Dummies shape:", director_dummies.shape)

Director Dummies shape: (3935, 2399)


In [32]:
writers_dummies = pd.get_dummies(movies_data_cleaned['Writers'])

In [33]:
cast1_dummies = pd.get_dummies(movies_data_cleaned['Cast_1'])

In [34]:
cast2_dummies = pd.get_dummies(movies_data_cleaned['Cast_2'])

In [35]:
cast3_dummies = pd.get_dummies(movies_data_cleaned['Cast_3'])

In [20]:
from scipy.sparse import hstack

In [36]:
combined_features = hstack([tfidf_matrix_keywords, director_dummies,
                            writers_dummies,cast1_dummies, cast2_dummies, cast3_dummies, 
                            genre1_dummies, genre2_dummies, genre3_dummies, genre4_dummies, genre5_dummies])

In [37]:
from sklearn.metrics.pairwise import cosine_similarity

In [38]:
cosine_sim = cosine_similarity(combined_features, combined_features)

In [39]:
def recommend_movies(title, cosine_sim=cosine_sim):
    idx = movies_data_cleaned.index[movies_data_cleaned['Title'] == title].tolist()[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    movie_indices = [i[0] for i in sim_scores]
    return movies_data_cleaned['Title'].iloc[movie_indices]

In [40]:
recommendations = recommend_movies('New York Doll')
print(recommendations)

205                  Contemporary Color
3466    Eagles of Death Metal: Nos Amis
302     Score: A Film Music Documentary
340                           Hired Gun
359                Whitney: Can I Be Me
452     The Death and Resurrection Show
132                  Tsukiji Wonderland
235      Naledi: A Baby Elephant's Tale
530            Titanoboa: Monster Snake
3534           Storm Children: Book One
Name: Title, dtype: object


In [42]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

In [43]:
tfidf_matrix_summaries = tfidf_vectorizer.fit_transform(movies_data_cleaned['Summary'])

In [44]:
def recommend_movies_based_on_input_plot(input_plot, tfidf_vectorizer=tfidf_vectorizer, tfidf_matrix=tfidf_matrix_summaries):
    input_vec = tfidf_vectorizer.transform([input_plot])
    cosine_sim = cosine_similarity(input_vec, tfidf_matrix)
    sim_scores = list(enumerate(cosine_sim[0]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    movie_indices = [i[0] for i in sim_scores]
    return movies_data_cleaned['Title'].iloc[movie_indices]


In [45]:
input_plot = "A story about the adventures in a magical land far away."
recommended_movies = recommend_movies_based_on_input_plot(input_plot)
print(recommended_movies)

3003                                         Pan
1199           Tinker Bell and the Lost Treasure
313                                 Brigsby Bear
698                                 Nanny McPhee
1476    The Disappearance of Eleanor Rigby: Them
1852                   Oz the Great and Powerful
1671                             Shrek the Third
2085                                       K-PAX
291                           Wish for Christmas
1478     The Disappearance of Eleanor Rigby: Her
Name: Title, dtype: object


In [46]:
def recommend_movie(input_plot, preferred_genre, tfidf_vectorizer=tfidf_vectorizer, tfidf_matrix=tfidf_matrix_summaries, movies_data=movies_data_cleaned):
    input_vec = tfidf_vectorizer.transform([input_plot])
    genre_columns = ['Genre_1', 'Genre_2', 'Genre_3', 'Genre_4', 'Genre_5'] 
    filtered_movies = movies_data[genre_columns].apply(lambda x: x.str.contains(preferred_genre, case=False, na=False)).any(axis=1)
    filtered_movies_data = movies_data[filtered_movies]
    if not filtered_movies_data.empty:
        filtered_tfidf_matrix = tfidf_matrix[filtered_movies_data.index]
        cosine_sim = cosine_similarity(input_vec, filtered_tfidf_matrix)
        best_match_idx = cosine_sim.argmax()
        recommended_movie_title = filtered_movies_data.iloc[best_match_idx]['Title']
        return recommended_movie_title
    else:
        return "No movies found in the preferred genre."
input_plot = "A young woman's journey through love and heartbreak."
preferred_genre = "Romance"
recommended_movie = recommend_movie(input_plot, preferred_genre)
print(f"Recommended Movie: {recommended_movie}")


Recommended Movie: The Disappearance of Eleanor Rigby: Them


In [51]:
def recommend_movie(tfidf_vectorizer=tfidf_vectorizer, tfidf_matrix=tfidf_matrix_summaries, movies_data=movies_data_cleaned):
    input_plot = input("Enter a plot summary: ")
    input_vec = tfidf_vectorizer.transform([input_plot])
    preferred_genre = input("Enter your preferred genre: ")
    genre_columns = ['Genre_1', 'Genre_2', 'Genre_3', 'Genre_4', 'Genre_5']
    filtered_movies = movies_data[genre_columns].apply(lambda x: x.str.contains(preferred_genre, case=False, na=False)).any(axis=1)
    filtered_movies_data = movies_data[filtered_movies]
    if not filtered_movies_data.empty:
        filtered_tfidf_matrix = tfidf_matrix[filtered_movies_data.index]
        cosine_sim = cosine_similarity(input_vec, filtered_tfidf_matrix)
        best_match_idx = cosine_sim.argmax()
        recommended_movie_title = filtered_movies_data.iloc[best_match_idx]['Title']
        return recommended_movie_title
    else:
        return "No movies found in the preferred genre."
recommended_movie = recommend_movie()
print(f"Recommended Movie: {recommended_movie}")


Enter a plot summary: heartbreaking love story 
Enter your preferred genre: Romance
Recommended Movie: Equals
