# Libraries to be used in this project.

In [110]:
import pandas as pd 
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
import ast
from nltk.stem.porter import PorterStemmer
import matplotlib.pyplot as plt
import warnings
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from rake_nltk import Rake
from nltk.corpus import stopwords
import string
import spacy


warnings.filterwarnings('ignore')
nltk.download('vader_lexicon')
nltk.download('stopwords')
!python -m spacy download en_core_web_sm

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\doruk\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\doruk\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-sm==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 12.8/12.8 MB 8.0 MB/s eta 0:00:00
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [111]:
movies_path = "datasets/tmdb5000/tmdb_5000_movies.csv"

In [112]:
movies = pd.read_csv(movies_path)


# Data Peek

In [113]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

In [114]:
movies.describe().round(1)

Unnamed: 0,budget,id,popularity,revenue,runtime,vote_average,vote_count
count,4803.0,4803.0,4803.0,4803.0,4801.0,4803.0,4803.0
mean,29045039.9,57165.5,21.5,82260640.0,106.9,6.1,690.2
std,40722391.3,88694.6,31.8,162857100.0,22.6,1.2,1234.6
min,0.0,5.0,0.0,0.0,0.0,0.0,0.0
25%,790000.0,9014.5,4.7,0.0,94.0,5.6,54.0
50%,15000000.0,14629.0,12.9,19170000.0,103.0,6.2,235.0
75%,40000000.0,58610.5,28.3,92917190.0,118.0,6.8,737.0
max,380000000.0,459488.0,875.6,2787965000.0,338.0,10.0,13752.0


In [115]:
movies.shape

(4803, 20)

In [116]:
movies.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')

### We will remove some of these columns and do some preprocessing on keywords and other features since it is not going to contribute in recommending the movie or we will need them in different forms.

In [117]:
movies.drop('homepage', axis=1, inplace=True)
movies.drop('id', axis=1, inplace=True)
movies.drop('production_countries', axis=1, inplace=True)
movies.drop('release_date', axis=1, inplace=True)
movies.drop('original_title', axis=1, inplace=True)
movies.drop('spoken_languages', axis=1, inplace=True)

In [118]:
# Duplicated data check
movies.duplicated().sum()

0

In [119]:
movies.head(3)

Unnamed: 0,budget,genres,keywords,original_language,overview,popularity,production_companies,revenue,runtime,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",2787965087,162.0,Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",961000000,169.0,Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",880674609,148.0,Released,A Plan No One Escapes,Spectre,6.3,4466


Change from json to list of keywords and genres.

In [120]:
def converter(obj):
    genreList = []
    for i in ast.literal_eval(obj):
        genreList.append(i['name'])
    return genreList

movies['genres'] = movies['genres'].apply(converter)
movies['keywords'] = movies['keywords'].apply(converter)
movies['production_companies'] = movies['production_companies'].apply(converter)

In [121]:
movies.head(3)

Unnamed: 0,budget,genres,keywords,original_language,overview,popularity,production_companies,revenue,runtime,status,tagline,title,vote_average,vote_count
0,237000000,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...",en,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[Ingenious Film Partners, Twentieth Century Fo...",2787965087,162.0,Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...",en,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[Walt Disney Pictures, Jerry Bruckheimer Films...",961000000,169.0,Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...",en,A cryptic message from Bond’s past sends him o...,107.376788,"[Columbia Pictures, Danjaq, B24]",880674609,148.0,Released,A Plan No One Escapes,Spectre,6.3,4466


In [122]:
movies['overview'][0]

'In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization.'

In [123]:
movies['general_overview'] =movies['overview'].astype(str) + movies['keywords'].astype(str) + movies['genres'].astype(str) + movies['title'].astype(str) + movies['tagline'].astype(str)

In [124]:
movies['general_overview'][0]

"In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization.['culture clash', 'future', 'space war', 'space colony', 'society', 'space travel', 'futuristic', 'romance', 'space', 'alien', 'tribe', 'alien planet', 'cgi', 'marine', 'soldier', 'battle', 'love affair', 'anti war', 'power relations', 'mind and soul', '3d']['Action', 'Adventure', 'Fantasy', 'Science Fiction']AvatarEnter the World of Pandora."

In [125]:

text = "This is an example text with stopwords and symbols!@#"

def stopword_punct_remover_nltk(text):
    # Tokenize the text
    tokens = nltk.word_tokenize(text)

    # Remove stopwords and symbols from the list of tokens
    filtered_tokens = [token for token in tokens if token.lower() not in stopwords.words('english') and token not in string.punctuation]

    # Join the filtered tokens to form a new text
    filtered_text = ' '.join(filtered_tokens)

    return filtered_text

In [126]:
movies['general_overview'] = movies['general_overview'].apply(stopword_punct_remover_nltk)

In [127]:
movies['general_overview'] = movies['general_overview'].str.lower()

In [128]:
movies['general_overview'][0]

"22nd century paraplegic marine dispatched moon pandora unique mission becomes torn following orders protecting alien civilization 'culture clash 'future 'space war 'space colony 'society 'space travel 'futuristic 'romance 'space 'alien 'tribe 'alien planet 'cgi 'marine 'soldier 'battle 'love affair 'anti war 'power relations 'mind soul '3d 'action 'adventure 'fantasy 'science fiction avatarenter world pandora"

# How to recommend?

## Define the recommender function based on the demand.

In [131]:
def general_recommender(df, 
budget=300000, 
language='en', 
popularity=31, 
revenue= 800000, 
runtime= 106, 
min_vote_avg = 6, 
max_vote_avg = 10, 
vote_count = 300):

    df_filtered = df[(df['budget'] >= budget) &  (df['original_language'] == language) & (df['popularity'] >= popularity) & (df['revenue'] >= revenue) & (df['runtime'] >= runtime) & (df['vote_average'] >= min_vote_avg) & (df['vote_count'] >= vote_count) & (df['vote_average'] <= max_vote_avg)]
    return df_filtered[['title','genres','vote_average']].head(5)

In [132]:
df_movie_recommended = general_recommender(movies, popularity=50, runtime=130, min_vote_avg= 6, max_vote_avg=7)

In [133]:
df_movie_recommended.head(10)

Unnamed: 0,title,genres,vote_average
1,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]",6.9
2,Spectre,"[Action, Adventure, Crime]",6.3
12,Pirates of the Caribbean: Dead Man's Chest,"[Adventure, Fantasy, Action]",7.0
14,Man of Steel,"[Action, Adventure, Fantasy, Science Fiction]",6.5
15,The Chronicles of Narnia: Prince Caspian,"[Adventure, Family, Fantasy]",6.3


### User input is taken as text.
### Similarities between user input and overview is compared.
### Most matched 10 movies are found recommended to user.

In [134]:
def user_input_recommender(df, 
user_input="", 
budget=100, 
popularity=15, 
revenue= 80000, 
runtime= 20,
min_vote_avg = 4,
vote_count = 100):


    # Create an instance of the CountVectorizer
    vectorizer = CountVectorizer()
    
    # Fit and transform the dataframe column and user input
    texts = df['general_overview'].tolist()
    texts.append(user_input)
    vectors = vectorizer.fit_transform(texts).toarray()

    # Calculate the cosine similarity between user input and each row in the dataframe
    similarities = cosine_similarity(vectors)[-1][:-1]

    df["similarity"] = similarities

    # Filter the dataframe based on the similarity score
    df_filtered = df[(df['budget'] >= budget) & (df['popularity'] >= popularity) & (df['revenue'] >= revenue) & (df['runtime'] >= runtime) & (df['vote_average'] >= min_vote_avg) & (df['vote_count'] >= vote_count)]

    df_sorted = df_filtered.sort_values(by='similarity', ascending = False)


    return df_sorted[['title','genres','similarity','vote_average']].head(5)

In [135]:
user_input_recommender(movies, user_input="Cartoon Comedy", min_vote_avg=6.2).head(10)

Unnamed: 0,title,genres,similarity,vote_average
2018,There's Something About Mary,"[Romance, Comedy]",0.36823,6.5
1940,Carnage,"[Comedy, Drama]",0.297044,7.0
452,Space Jam,"[Animation, Comedy, Drama, Family, Fantasy]",0.285714,6.5
3883,Animal House,[Comedy],0.278543,7.0
1695,Aladdin,"[Animation, Family, Comedy, Adventure, Fantasy...",0.261116,7.4


In [136]:
user_input_recommender(movies, user_input="Virus outbreak pandemic", min_vote_avg=6.2).head(10)

Unnamed: 0,title,genres,similarity,vote_average
720,Contagion,"[Drama, Thriller, Science Fiction]",0.308607,6.2
1395,Resident Evil,"[Horror, Action, Science Fiction]",0.221766,6.4
929,Outbreak,"[Action, Drama, Science Fiction, Thriller]",0.216506,6.3
3240,28 Days Later,"[Horror, Thriller, Science Fiction]",0.210042,7.1
116,I Am Legend,"[Drama, Horror, Action, Thriller, Science Fict...",0.193649,6.9


In [137]:
user_input_recommender(movies, user_input="Love hate and action").head(10)

Unnamed: 0,title,genres,similarity,vote_average
1408,Closer,"[Drama, Romance]",0.307794,6.7
1874,August Rush,[Drama],0.273861,7.1
617,Agora,"[Adventure, Drama, History]",0.272727,6.9
1031,My Best Friend's Wedding,"[Comedy, Romance]",0.248069,6.3
164,Lethal Weapon 4,"[Action, Adventure, Comedy, Crime, Thriller]",0.246183,6.3


In [138]:
user_input_recommender(movies, min_vote_avg = 6,user_input="A scray thriller with killers and victims.").head(10)

Unnamed: 0,title,genres,similarity,vote_average
3042,The Gift,"[Thriller, Mystery]",0.140028,6.7
2091,The Silence of the Lambs,"[Crime, Drama, Thriller]",0.13525,8.1
825,Flightplan,"[Thriller, Drama, Mystery]",0.125988,6.1
3360,House of 1000 Corpses,[Horror],0.124515,6.0
1100,Johnny English Reborn,"[Crime, Adventure, Action, Comedy, Thriller]",0.118958,6.0


In [139]:
user_input_recommender(movies, user_input="matrix").head(10)

Unnamed: 0,title,genres,similarity,vote_average
2996,Commando,"[Action, Adventure, Thriller]",0.341882,6.4
125,The Matrix Reloaded,"[Adventure, Action, Thriller, Science Fiction]",0.287926,6.7
1762,Dark City,"[Mystery, Science Fiction]",0.141421,7.2
634,The Matrix,"[Action, Science Fiction]",0.141421,7.9
123,The Matrix Revolutions,"[Adventure, Action, Thriller, Science Fiction]",0.101015,6.4


In [140]:
user_input_recommender(movies, user_input="I want to watch a movie about two strangers, action and romance", min_vote_avg=7).head(10)

Unnamed: 0,title,genres,similarity,vote_average
2515,"Crouching Tiger, Hidden Dragon","[Adventure, Drama, Action, Romance]",0.173749,7.2
4006,Singin' in the Rain,"[Comedy, Music, Romance]",0.16855,7.8
2935,Brooklyn,"[Drama, Romance]",0.163299,7.2
4017,Before Sunrise,"[Drama, Romance]",0.141421,7.7
2459,The Artist,"[Drama, Comedy, Romance]",0.139686,7.3


In [141]:
user_input_recommender(movies, user_input="disaster, chaos, cyber attack", min_vote_avg=5).head(10)

Unnamed: 0,title,genres,similarity,vote_average
606,Blackhat,"[Crime, Drama, Mystery]",0.19803,5.1
4738,Pi,"[Mystery, Drama, Thriller]",0.164399,7.1
941,13 Hours: The Secret Soldiers of Benghazi,"[Action, Drama, History, Thriller, War]",0.132453,7.0
76,G.I. Joe: The Rise of Cobra,"[Adventure, Action, Thriller, Science Fiction]",0.130189,5.6
1484,Snakes on a Plane,"[Action, Crime, Horror, Thriller]",0.124035,5.1


### run "pip install spacy" then "python -m spacy download en_core_web_sm" in terminal.

In [142]:
# Load the spacy model
nlp = spacy.load("en_core_web_sm")

# Define the sentence
sentence = "This is an example sentence with keywords"

# Process the sentence with spacy
doc = nlp(sentence)

# Extract the keywords
keywords = [token.text for token in doc if not token.is_stop and not token.is_punct]

# Print the keywords
print(keywords)

['example', 'sentence', 'keywords']


In [161]:
def user_input_recommender_keywords(df, 
user_input="", 
budget=100, 
popularity=15, 
revenue= 80000, 
runtime= 20,
min_vote_avg = 4,
vote_count = 100):

    nlp = spacy.load("en_core_web_sm")
    doc = nlp(user_input)
    keywords = [token.text for token in doc if not token.is_stop and not token.is_punct]
    for keyword in keywords:
        user_input += (" "+ keyword)
    # Create an instance of the CountVectorizer
    vectorizer = CountVectorizer()

    # Fit and transform the dataframe column and user input
    texts = df['general_overview'].tolist()
    texts.append(user_input)
    vectors = vectorizer.fit_transform(texts).toarray()


    # Calculate the cosine similarity between user input and each row in the dataframe
    similarities = cosine_similarity(vectors)[-1][:-1]

    df["similarity"] = similarities

    # Filter the dataframe based on the similarity score
    df_filtered = df[(df['budget'] >= budget) &  
    (df['popularity'] >= popularity) & 
    (df['revenue'] >= revenue) & 
    (df['runtime'] >= runtime) & 
    (df['vote_average'] >= min_vote_avg) & 
    (df['vote_count'] >= vote_count)]

    df_sorted = df_filtered.sort_values(by='similarity', ascending = False)


    return df_sorted[['title', 'vote_average', 'similarity', 'genres']]

In [162]:
user_input_recommender_keywords(movies, user_input="I want to watch something about techology, sci-fi, pandemic, crazy romance", min_vote_avg=6, budget=10000000).head(10)

Unnamed: 0,title,vote_average,similarity,genres
1053,Galaxy Quest,6.9,0.170367,"[Comedy, Family, Science Fiction]"
1567,Warm Bodies,6.4,0.143427,"[Horror, Comedy, Romance]"
295,The Tourist,6.0,0.139309,"[Action, Thriller, Romance]"
2172,Endless Love,6.7,0.135225,"[Drama, Romance]"
827,City of Angels,6.4,0.121218,"[Drama, Fantasy, Romance]"
2450,Becoming Jane,6.9,0.112687,"[Drama, Romance]"
925,"Crazy, Stupid, Love.",7.0,0.109362,"[Comedy, Drama, Romance]"
2416,Beastly,6.0,0.102448,"[Drama, Fantasy, Romance]"
1300,The Ugly Truth,6.4,0.098623,"[Comedy, Romance]"
397,It's Complicated,6.2,0.096589,"[Comedy, Romance]"


In [163]:
user_input_recommender_keywords(movies, user_input="Fast and furious cars street race").head(10)

Unnamed: 0,title,vote_average,similarity,genres
629,Need for Speed,6.1,0.253673,"[Action, Crime, Drama, Thriller]"
40,Cars 2,5.8,0.237566,"[Animation, Family, Adventure, Comedy]"
44,Furious 7,7.3,0.227921,[Action]
3697,Fast Times at Ridgemont High,7.0,0.19518,[Comedy]
177,Turbo,6.1,0.174574,"[Animation, Family]"
1186,The Final Destination,5.4,0.17192,"[Horror, Mystery]"
588,Wall Street: Money Never Sleeps,5.8,0.169031,"[Drama, Crime]"
3075,Crash,7.2,0.141598,[Drama]
3893,A Nightmare on Elm Street Part 2: Freddy's Rev...,5.7,0.139573,[Horror]
498,Hidalgo,6.5,0.138013,"[Western, Adventure]"


In [164]:
user_input_recommender_keywords(movies, user_input="Funny show about life amd death", min_vote_avg=5).head(10)

Unnamed: 0,title,vote_average,similarity,genres
1786,Flatliners,6.3,0.356348,"[Drama, Horror, Science Fiction, Thriller]"
1839,Final Destination 3,5.8,0.323381,"[Horror, Mystery]"
980,The Life of David Gale,7.3,0.281091,"[Drama, Thriller, Crime]"
1993,Final Destination,6.4,0.278019,[Horror]
3596,Y Tu Mamá También,7.3,0.256117,"[Drama, Romance]"
697,The Truman Show,7.8,0.248282,"[Comedy, Drama]"
1316,Precious,6.9,0.243108,[Drama]
66,Up,7.7,0.237566,"[Animation, Comedy, Family, Adventure]"
1588,P.S. I Love You,6.9,0.223607,"[Drama, Romance]"
3031,Wrong Turn,6.0,0.220863,"[Horror, Thriller]"


In [165]:
user_input_recommender(movies, user_input="Funny show about life and death", min_vote_avg=5).head(10)

Unnamed: 0,title,genres,similarity,vote_average
1786,Flatliners,"[Drama, Horror, Science Fiction, Thriller]",0.308607,6.3
697,The Truman Show,"[Comedy, Drama]",0.286691,7.8
1839,Final Destination 3,"[Horror, Mystery]",0.280056,5.8
980,The Life of David Gale,"[Drama, Thriller, Crime]",0.243432,7.3
1993,Final Destination,[Horror],0.240772,6.4


In [166]:
general_recommender(movies, min_vote_avg=5).head(10)

Unnamed: 0,title,genres,vote_average
0,Avatar,"[Action, Adventure, Fantasy, Science Fiction]",7.2
1,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]",6.9
2,Spectre,"[Action, Adventure, Crime]",6.3
3,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]",7.6
4,John Carter,"[Action, Adventure, Science Fiction]",6.1


In [167]:
def user_input_recommender_sentiment(df, 
user_input="", 
budget=100, 
popularity=15, 
revenue= 80000, 
runtime= 20,
min_vote_avg = 4,
vote_count = 100):

    user_input = user_input.lower()

    genres_keywords = {
        "action": ["mission", "chase", "explosion"],
        "adventure": ["journey", "discovery", "epic"],
        "animation": ["cartoon","3D", "stop-motion"],
        "biography": ["true story", "real-life", "historical"],
        "comedy": ["humor", "joke", "parody"],
        "crime": ["detective", "investigation", "murder"],
        "documentary": ["factual", "non-fiction", "real-life"],
        "drama": ["emotional", "intense", "serious"],
        "family": ["kids", "children", "parents"],
        "fantasy": ["magic", "mythical", "enchanted"],
        "film-noir": ["crime", "mystery", "darkness"],
        "history": ["historical", "period", "true story"],
        "horror": ["scares", "terror", "suspense"],
        "music": ["songs", "bands", "musicians"],
        "musical": ["songs", "dancing", "choreography"],
        "mystery": ["detective", "investigation", "crime"],
        "romance": ["love", "passion", "heartbreak"],
        "sci-fi": ["science", "technology", "space"],
        "sport": ["competition", "athletes", "teams"],
        "superhero": ["powers", "abilities", "costume"],
        "thriller": ["suspense", "tension", "danger"],
        "mission": ["action", "chase", "explosion"],
        "journey": ["adventure", "discovery", "epic"],
        "cartoon": ["animation", "3D", "stop-motion"],
        "true story": ["biography", "real-life", "historical"],
        "humor": ["comedy", "joke", "parody"],
        "detective": ["mystery", "investigation", "crime"],
        "factual": ["documentary", "non-fiction", "real-life"],
        "emotional": ["drama", "intense", "serious"],
        "kids": ["family", "children", "parents"],
        "magic": ["fantasy", "mythical", "enchanted"],
        "crime": ["film-noir", "mystery", "darkness"],
        "historical": ["history", "period", "true story"],
        "scares": ["horror", "terror", "suspense"],
        "songs": ["musical", "dancing", "choreography"],
        "love": ["romance", "passion", "heartbreak"],
        "science": ["sci-fi", "technology", "space"],
        "competition": ["sport", "athletes", "teams"],
        "powers": ["superhero", "abilities", "costume"],
        "suspense": ["thriller", "tension", "danger"],    
        "war": ["battle", "conflict", "soldiers"],
        "western": ["gunslingers", "outlaws", "cowboys"],
        "post-apocalyptic": ["survival", "end of the world", "disaster"],
        "dystopian": ["totalitarian", "government control", "oppression"],
        "cyberpunk": ["technology", "artificial intelligence", "virtual reality"],
        "space opera": ["space", "aliens", "spaceships"],
        "time travel": ["past", "future", "time machine"],
        "vampire": ["immortality", "blood", "vampire hunter"],
        "zombie": ["undead", "apocalypse", "survival"],
        "ghost": ["paranormal", "haunting", "spirits"],
        "haunted house": ["ghosts", "spirits", "paranormal"],
        "supernatural": ["ghosts", "spirits", "paranormal"],
        "pirate": ["sea", "ships", "treasure"],
        "spy": ["espionage", "secret agent", "secrecy"],
        "historical fiction": ["historical events", "real-life figures", "period piece"],
        "political thriller": ["political", "conspiracy", "power"],
        "psychological thriller": ["mind", "manipulation", "brainwashing"],
        "crime thriller": ["crime", "investigation", "suspense"],
        "coming of age": ["growing up", "adolescence", "self-discovery"],
        "road trip": ["travel", "adventure", "self-discovery"],
        "romantic comedy": ["love", "romance", "dating"],
        "battle": ["war", "conflict", "soldiers"],
        "gunslingers": ["western", "outlaws", "cowboys"],
        "survival": ["post-apocalyptic", "end of the world", "disaster"],
        "totalitarian": ["dystopian", "government control", "oppression"],
        "technology": ["cyberpunk", "artificial intelligence", "virtual reality"],
        "space": ["space opera", "aliens", "spaceships"],
        "past": ["time travel", "future", "time machine"],
        "immortality": ["vampire", "blood", "vampire hunter"],
        "undead": ["zombie", "apocalypse", "survival"],
        "paranormal": ["ghost", "haunting", "spirits"],
        "ghosts": ["supernatural", "spirits", "paranormal"],
        "sea": ["pirate", "ships", "treasure"],
        "espionage": ["spy", "secret agent", "secrecy"],
        "historical events": ["historical fiction","real-life figures","period piece"],
        "political": ["political thriller", "conspiracy", "power"],
        "mind": ["psychological thriller", "manipulation", "brainwashing"],
        "crime": ["crime thriller", "investigation", "suspense"],
        "growing up": ["coming of age", "adolescence", "self-discovery"],
        "travel": ["road trip", "adventure", "self-discovery"],
        "love": ["romantic comedy", "romance", "dating"],
        "black comedy": ["dark humor", "satire", "irony"],
        "stand-up comedy": ["stand-up", "comedy", "humor"],
        "slapstick comedy": ["physical comedy", "humor", "jokes"],
        "action comedy": ["action", "comedy", "fun"],
        "disaster": ["destruction", "catastrophe", "emergency"],
        "political satire": ["politics", "government", "satire"],
        "social satire": ["society", "culture", "satire"],
        "psychological horror": ["mind", "manipulation", "brainwashing"],
        "dark humor": ["black comedy", "satire", "irony"],
        "stand-up": ["stand-up comedy", "comedy", "humor"],
        "physical comedy": ["slapstick comedy", "humor", "jokes"],
        "action": ["action comedy", "comedy", "fun"],
        "destruction": ["disaster", "catastrophe", "emergency"],
        "politics": ["political satire", "government", "satire"],
        "society": ["social satire", "culture", "satire"],
        "mind": ["psychological horror", "manipulation", "brainwashing"]  
    }

    nlp = spacy.load("en_core_web_sm")
    doc = nlp(user_input)
    keywords = [token.text for token in doc if not token.is_stop and not token.is_punct]

    sia = SentimentIntensityAnalyzer()
    sentence = user_input
    sentiment_score = sia.polarity_scores(sentence)
    neg_score = sentiment_score['neg']
    
    
    last_keywords = []
    new_keywords = []
    populated_keywords = []

    # If neg score is bigger than 0.3 then user wants a movie about this category.
    # We populate the search with similar terms.
    if neg_score > 0.3:
        for keyword in keywords:
            if keyword in genres_keywords:
                populated_keywords += genres_keywords[keyword]
        last_keywords += populated_keywords
        user_input = ""

    else:
        # If neg score is smaller than 0.3 that means user do not want anything about
        # extracted keywords so we give them other categories.
        new_keywords += (genres_keywords.keys() - keywords)
        for keyword in new_keywords:
            if keyword in genres_keywords:
                new_keywords += genres_keywords[keyword][0]
                new_keywords += genres_keywords[keyword][1]
                new_keywords += genres_keywords[keyword][2]
        last_keywords += new_keywords[:4]
        user_input = ""

    for keyword in last_keywords:
        user_input += (" "+ keyword)
    
    # Create an instance of the CountVectorizer
    vectorizer = CountVectorizer()

    # Fit and transform the dataframe column and user input
    texts = np.array(df['general_overview'])
    texts = np.append(texts, np.array(user_input))
    vectors = vectorizer.fit_transform(texts).toarray()


    # Calculate the cosine similarity between user input and each row in the dataframe
    similarities = cosine_similarity(vectors)[-1][:-1]

    df["similarity"] = similarities

    # Filter the dataframe based on the similarity score
    df_filtered = df[(df['budget'] >= budget) & (df['popularity'] >= popularity) & (df['revenue'] >= revenue) & (df['runtime'] >= runtime) & (df['vote_average'] >= min_vote_avg) & (df['vote_count'] >= vote_count)]

    df_sorted = df_filtered.sort_values(by='similarity', ascending = False)


    return df_sorted[['title','genres','similarity','vote_average']].head(5)

In [168]:
user_input_recommender_sentiment(movies, user_input="I like action movies.", min_vote_avg=5)

Unnamed: 0,title,genres,similarity,vote_average
466,The Time Machine,"[Science Fiction, Adventure, Action]",0.209529,5.8
148,Ghostbusters,"[Action, Fantasy, Comedy]",0.20702,5.3
187,Puss in Boots,"[Action, Adventure, Animation, Family, Fantasy]",0.179284,6.4
2143,Ghost Ship,"[Horror, Mystery, Thriller]",0.17178,5.3
2348,127 Hours,"[Adventure, Drama, Thriller]",0.157027,7.0


In [169]:
user_input_recommender_sentiment(movies, user_input="I don't like action movies.", min_vote_avg=5)

Unnamed: 0,title,genres,similarity,vote_average
2018,There's Something About Mary,"[Romance, Comedy]",0.425195,6.5
2325,My Big Fat Greek Wedding 2,"[Romance, Comedy, Family]",0.369274,5.5
1940,Carnage,"[Comedy, Drama]",0.342997,7.0
2579,Keanu,"[Action, Comedy]",0.331133,6.0
3883,Animal House,[Comedy],0.321634,7.0


In [170]:
user_input_recommender_keywords(movies, user_input="I like action movies.", min_vote_avg=5).head(5)

Unnamed: 0,title,vote_average,similarity,genres
880,Grindhouse,6.8,0.244949,"[Thriller, Action, Horror]"
403,Last Action Hero,6.1,0.244796,"[Adventure, Fantasy, Action, Comedy, Family]"
2050,The Transporter Refueled,5.2,0.227429,"[Thriller, Action, Crime]"
164,Lethal Weapon 4,6.3,0.213201,"[Action, Adventure, Comedy, Crime, Thriller]"
1505,"Big Mommas: Like Father, Like Son",5.3,0.213201,"[Crime, Comedy, Action]"


In [171]:
user_input_recommender_keywords(movies, user_input="I don't like action movies.", min_vote_avg=5).head(5)

Unnamed: 0,title,vote_average,similarity,genres
880,Grindhouse,6.8,0.235339,"[Thriller, Action, Horror]"
403,Last Action Hero,6.1,0.235192,"[Adventure, Fantasy, Action, Comedy, Family]"
2050,The Transporter Refueled,5.2,0.218507,"[Thriller, Action, Crime]"
164,Lethal Weapon 4,6.3,0.204837,"[Action, Adventure, Comedy, Crime, Thriller]"
1505,"Big Mommas: Like Father, Like Son",5.3,0.204837,"[Crime, Comedy, Action]"


In [172]:
# An example of how we tried to enrich user input.
genres_keywords_example = {
    "action": ["mission", "chase", "explosion"],
    "adventure": ["journey", "discovery", "epic"],
    "animation": ["cartoon","3D", "stop-motion"],
    "biography": ["true story", "real-life", "historical"],
    "comedy": ["humor", "joke", "parody"],
    "crime": ["detective", "investigation", "murder"],
    "documentary": ["factual", "non-fiction", "real-life"],
    "drama": ["emotional", "intense", "serious"],
    "family": ["kids", "children", "parents"],
    "fantasy": ["magic", "mythical", "enchanted"],
    "film-noir": ["crime", "mystery", "darkness"],
}

In [173]:
dictionery_reverse = {}
for key in genres_keywords_example.keys():
    dictionery_reverse[genres_keywords_example[key][0]] = [key] + [genres_keywords_example[key][1], genres_keywords_example[key][2]]

In [174]:
dictionery_reverse

{'mission': ['action', 'chase', 'explosion'],
 'journey': ['adventure', 'discovery', 'epic'],
 'cartoon': ['animation', '3D', 'stop-motion'],
 'true story': ['biography', 'real-life', 'historical'],
 'humor': ['comedy', 'joke', 'parody'],
 'detective': ['crime', 'investigation', 'murder'],
 'factual': ['documentary', 'non-fiction', 'real-life'],
 'emotional': ['drama', 'intense', 'serious'],
 'kids': ['family', 'children', 'parents'],
 'magic': ['fantasy', 'mythical', 'enchanted'],
 'crime': ['film-noir', 'mystery', 'darkness']}

In [175]:
user_input_recommender_sentiment(movies, user_input="I love action movies and fast-paced thrillers.").head(5)

Unnamed: 0,title,genres,similarity,vote_average
466,The Time Machine,"[Science Fiction, Adventure, Action]",0.209529,5.8
148,Ghostbusters,"[Action, Fantasy, Comedy]",0.20702,5.3
187,Puss in Boots,"[Action, Adventure, Animation, Family, Fantasy]",0.179284,6.4
2143,Ghost Ship,"[Horror, Mystery, Thriller]",0.17178,5.3
805,Ghost Rider: Spirit of Vengeance,"[Action, Fantasy, Thriller]",0.167705,4.7


In [178]:
user_input_recommender_sentiment(movies, user_input="I hate action movies and fast-paced thrillers.").head(5)

Unnamed: 0,title,genres,similarity,vote_average
2018,There's Something About Mary,"[Romance, Comedy]",0.425195,6.5
2325,My Big Fat Greek Wedding 2,"[Romance, Comedy, Family]",0.369274,5.5
1940,Carnage,"[Comedy, Drama]",0.342997,7.0
2579,Keanu,"[Action, Comedy]",0.331133,6.0
3883,Animal House,[Comedy],0.321634,7.0
