# Libraries to be used in this project.

In [314]:
import pandas as pd 
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
import ast
from nltk.stem.porter import PorterStemmer
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from rake_nltk import Rake
from nltk.corpus import stopwords
import string
import spacy

nltk.download('vader_lexicon')
nltk.download('stopwords')
!python -m spacy download en_core_web_sm

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\doruk\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\doruk\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-sm==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 12.8/12.8 MB 6.1 MB/s eta 0:00:00
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [315]:
# movies_big = "datasets/extended-dataset/tmdb10000.csv"
movies_big = "datasets/extended-dataset/tmdb7000.csv"

In [316]:
movies_big_df = pd.read_csv(movies_big)
# movies_small_df = pd.read_csv(movies_small)

# Data Peek

## Example of the dataset

In [317]:
movies_big_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7101 entries, 0 to 7100
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   TMDb_Id               7101 non-null   int64  
 1   IMDb_Id               7100 non-null   object 
 2   Title                 7101 non-null   object 
 3   Original_Title        7101 non-null   object 
 4   Overview              7090 non-null   object 
 5   Genres                7096 non-null   object 
 6   Cast                  7095 non-null   object 
 7   Crew                  7101 non-null   object 
 8   Collection            7101 non-null   object 
 9   Release_Date          7101 non-null   object 
 10  Release_Status        7101 non-null   object 
 11  Original_Language     7101 non-null   object 
 12  Languages_Spoken      7098 non-null   object 
 13  Runtime               7099 non-null   float64
 14  Tagline               5832 non-null   object 
 15  Popularity           

## Columns to drop: TMDb_Id, IMDB_Id, Original_Title, Collection, Release_Status, Language_Spoken, Country_of_Origin, Crew

In [318]:
movies_big_df.drop('TMDb_Id', axis=1, inplace=True)
movies_big_df.drop('IMDb_Id', axis=1, inplace=True)
movies_big_df.drop('Original_Title', axis=1, inplace=True)
movies_big_df.drop('Release_Status', axis=1, inplace=True)
movies_big_df.drop('Crew', axis=1, inplace=True)

In [319]:
movies_big_df.columns

Index(['Title', 'Overview', 'Genres', 'Cast', 'Collection', 'Release_Date',
       'Original_Language', 'Languages_Spoken', 'Runtime', 'Tagline',
       'Popularity', 'Rating_average', 'Rating_Count', 'Production_Companies',
       'Country_of_Origin', 'Budget', 'Revenue'],
      dtype='object')

## Columns to merge to create extended overview: Title, Overview, Genres, Cast, Collection, Tagline, Production Companies

In [320]:
movies_big_df['extended_overview'] = movies_big_df['Overview'].astype(str) + movies_big_df['Genres'].astype(str) + movies_big_df['Tagline'].astype(str)

### How does extended overview look like.

In [321]:
movies_big_df['extended_overview'][1]

'Framed in the 1940s for the double murder of his wife and her lover, upstanding banker Andy Dufresne begins a new life at the Shawshank prison, where he puts his accounting skills to work for an amoral warden. During his long stretch in prison, Dufresne comes to be admired by the other inmates -- including an older prisoner named Red -- for his integrity and unquenchable sense of hope.Crime | DramaFear can hold you prisoner. Hope can set you free.'

### We will remove stopwords and symbols out of the text.

In [322]:
punct = string.punctuation
stopwords_list = stopwords.words('english')


In [323]:
movies_big_df['extended_overview'] = movies_big_df['extended_overview'].str.lower()

In [324]:
for p in punct:
    movies_big_df['extended_overview'] = movies_big_df['extended_overview'].str.replace(p, "")

### Data after removing stopwords and symbols and lowercase letter.

In [325]:
movies_big_df['extended_overview'][1]

'framed in the 1940s for the double murder of his wife and her lover upstanding banker andy dufresne begins a new life at the shawshank prison where he puts his accounting skills to work for an amoral warden during his long stretch in prison dufresne comes to be admired by the other inmates  including an older prisoner named red  for his integrity and unquenchable sense of hopecrime  dramafear can hold you prisoner hope can set you free'

# How to recommend?

## Define the recommender function based on the demand.

## Fill NaN values.

In [326]:
movies_big_df['Genres'].fillna(value="general", inplace=True)
movies_big_df['Release_Date'].fillna(value="backfill", inplace=True)
movies_big_df['Tagline'].fillna(value="general", inplace=True)
movies_big_df['Production_Companies'].fillna(value="general", inplace=True)
movies_big_df['Country_of_Origin'].fillna(value="general", inplace=True)
movies_big_df['Budget'].fillna(value=200, inplace=True)
movies_big_df['Revenue'].fillna(value=200, inplace=True)
movies_big_df['Runtime'].fillna(value=90, inplace=True)
movies_big_df['Languages_Spoken'].fillna(value="English", inplace=True)
movies_big_df['Collection'].fillna(value="", inplace=True)
movies_big_df['Cast'].fillna(value="", inplace=True)
movies_big_df['Overview'].fillna(value="", inplace=True)

In [327]:
movies_big_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7101 entries, 0 to 7100
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Title                 7101 non-null   object 
 1   Overview              7101 non-null   object 
 2   Genres                7101 non-null   object 
 3   Cast                  7101 non-null   object 
 4   Collection            7101 non-null   object 
 5   Release_Date          7101 non-null   object 
 6   Original_Language     7101 non-null   object 
 7   Languages_Spoken      7101 non-null   object 
 8   Runtime               7101 non-null   float64
 9   Tagline               7101 non-null   object 
 10  Popularity            7101 non-null   float64
 11  Rating_average        7101 non-null   float64
 12  Rating_Count          7101 non-null   int64  
 13  Production_Companies  7101 non-null   object 
 14  Country_of_Origin     7101 non-null   object 
 15  Budget               

In [342]:
def general_recommender(df, budget=300000, language='en', popularity=31, revenue= 800000, runtime= 106, min_vote_avg = 6, max_vote_avg = 10, vote_count = 300):

    df_filtered = df[(df['Budget'] >= budget) & (df['Original_Language'] == language) & (df['Popularity'] >= popularity) & (df['Revenue'] >= revenue) & (df['Runtime'] >= runtime) & (df['Rating_average'] >= min_vote_avg) & (df['Rating_Count'] >= vote_count) & (df['Rating_average'] <= max_vote_avg)]
    return df_filtered[['Title', 'Rating_average', 'Genres']].head(10).sort_values(by='Rating_average', ascending = False)

In [344]:
def user_input_recommender(df, user_input="", budget=1000, popularity=30, revenue= 8000, runtime= 50, min_vote_avg = 5, vote_count = 1000): 

    user_input = user_input.lower()

    # Create an instance of the CountVectorizer
    vectorizer = CountVectorizer()
    
    # Fit and transform the dataframe column and user input
    texts = df['extended_overview'].tolist()
    texts.append("")
    vectors = vectorizer.fit_transform(texts).toarray()

    # Calculate the cosine similarity between user input and each row in the dataframe
    similarities = cosine_similarity(vectors)[-1][:-1]

    df["similarity"] = similarities

    # Filter the dataframe based on the similarity score
    df_filtered = df[(df['Budget'] >= budget) &  (df['Popularity'] >= popularity) & (df['Revenue'] >= revenue) & (df['Runtime'] >= runtime) & (df['Rating_average'] >= min_vote_avg) & (df['Rating_Count'] >= vote_count)]

    df_sorted = df_filtered.sort_values(by='similarity', ascending = False)


    return df_sorted[['Title', 'Rating_average', 'Genres', 'similarity']]

In [345]:
# Load the spacy model
nlp = spacy.load("en_core_web_sm")

# Define the sentence
sentence = "This is an example sentence with keywords"

# Process the sentence with spacy
doc = nlp(sentence)

# Extract the keywords
keywords = [token.text for token in doc if not token.is_stop and not token.is_punct]

# Print the keywords
print(keywords)

['example', 'sentence', 'keywords']


In [346]:
def user_input_recommender_keywords(df, user_input="", budget=100, popularity=15, revenue= 80000, runtime= 20, min_vote_avg = 4, vote_count = 100):

    user_input = user_input.lower()
    
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(user_input)
    keywords = [token.text for token in doc if not token.is_stop and not token.is_punct]
    for keyword in keywords:
        user_input += (" "+ keyword)
    # Create an instance of the CountVectorizer
    vectorizer = CountVectorizer()

    # Fit and transform the dataframe column and user input
    texts = df['extended_overview'].tolist()
    texts.append("")
    vectors = vectorizer.fit_transform(texts).toarray()


    # Calculate the cosine similarity between user input and each row in the dataframe
    similarities = cosine_similarity(vectors)[-1][:-1]

    df["similarity"] = similarities

    # Filter the dataframe based on the similarity score
    df_filtered = df[(df['Budget'] >= budget) &  (df['Popularity'] >= popularity) & (df['Revenue'] >= revenue) & (df['Runtime'] >= runtime) & (df['Rating_average'] >= min_vote_avg) & (df['Rating_Count'] >= vote_count)]

    df_sorted = df_filtered.sort_values(by='similarity', ascending = False)


    return df_sorted[['Title', 'Rating_average', 'Genres', 'similarity']]

In [347]:
import math

In [348]:
def user_input_recommender_sentiment(df, user_input="", budget=100, popularity=15, revenue= 80000, runtime= 20, min_vote_avg = 4, vote_count = 100):

    user_input = user_input.lower()

    genres_keywords = {
        "action": ["mission", "chase", "explosion"],
        "adventure": ["journey", "discovery", "epic"],
        "animation": ["cartoon","3D", "stop-motion"],
        "biography": ["true story", "real-life", "historical"],
        "comedy": ["humor", "joke", "parody"],
        "crime": ["detective", "investigation", "murder"],
        "documentary": ["factual", "non-fiction", "real-life"],
        "drama": ["emotional", "intense", "serious"],
        "family": ["kids", "children", "parents"],
        "fantasy": ["magic", "mythical", "enchanted"],
        "film-noir": ["crime", "mystery", "darkness"],
        "history": ["historical", "period", "true story"],
        "horror": ["scares", "terror", "suspense"],
        "music": ["songs", "bands", "musicians"],
        "musical": ["songs", "dancing", "choreography"],
        "mystery": ["detective", "investigation", "crime"],
        "romance": ["love", "passion", "heartbreak"],
        "scifi": ["science", "technology", "space"],
        "sport": ["competition", "athletes", "teams"],
        "superhero": ["powers", "abilities", "costume"],
        "thriller": ["suspense", "tension", "danger"],
        "war": ["battle", "conflict", "soldiers"],
        "western": ["gunslingers", "outlaws", "cowboys"],
        "post-apocalyptic": ["survival", "end of the world", "disaster"],
        "dystopian": ["totalitarian", "government control", "oppression"],
        "cyberpunk": ["technology", "artificial intelligence", "virtual reality"],
        "space opera": ["space", "aliens", "spaceships"],
        "time travel": ["past", "future", "time machine"],
        "vampire": ["immortality", "blood", "vampire hunter"],
        "zombie": ["undead", "apocalypse", "survival"],
        "ghost": ["paranormal", "haunting", "spirits"],
        "haunted house": ["ghosts", "spirits", "paranormal"],
        "supernatural": ["ghosts", "spirits", "paranormal"],
        "pirate": ["sea", "ships", "treasure"],
        "spy": ["espionage", "secret agent", "secrecy"],
        "historical": ["events", "real figures", "period piece"],
        "political": ["political", "conspiracy", "power"],
        "psychological": ["mind", "manipulation", "brainwashing"],
        "murder": ["crime", "investigation", "suspense"],
        "road trip": ["travel", "adventure", "self-discovery"],
        "romantic": ["love", "romance", "dating"],
        "detective": ["mystery", "investigation", "crime"],
        "twist":["mystery", "unkown", "reverse"]
    }

    nlp = spacy.load("en_core_web_sm")
    doc = nlp(user_input)
    keywords = [token.text for token in doc if not token.is_stop and not token.is_punct]

    sia = SentimentIntensityAnalyzer()
    sentence = user_input
    sentiment_score = sia.polarity_scores(sentence)
    neg_score = sentiment_score['neg']
    
    last_keywords = []
    new_keywords = []
    populated_keywords = []

    # If neg score is bigger than 0.3 then user wants a movie about this category.
    # We populate the search with similar terms.
    if neg_score < 0.3:
        for keyword in keywords:
            if keyword in genres_keywords:
                populated_keywords += genres_keywords[keyword]
        last_keywords += populated_keywords
        user_input = ""

    else:
        # If neg score is smaller than 0.3 that means user do not want anything about
        # extracted keywords so we give them other categories.
        new_keywords += (genres_keywords.keys() - keywords)
        for keyword in new_keywords:
            if keyword in genres_keywords:
                new_keywords += genres_keywords[keyword][0]
                new_keywords += genres_keywords[keyword][1]
                new_keywords += genres_keywords[keyword][2]
        last_keywords += new_keywords[:4]
        user_input = ""

    for keyword in last_keywords:
        user_input += (" "+ keyword)
    
    # Create an instance of the CountVectorizer
    vectorizer = CountVectorizer()

    # Fit and transform the dataframe column and user input
    texts = np.array(df['extended_overview'])
    texts = np.append(texts, np.array(user_input))
    vectors = vectorizer.fit_transform(texts).toarray()

    # Calculate the cosine similarity between user input and each row in the dataframe
    similarities = cosine_similarity(vectors)[-1][:-1]
    df["similarity"] = similarities

    # Filter the dataframe based on the similarity score
    df_filtered = df[(df['Budget'] >= budget) &  (df['Popularity'] >= popularity) & (df['Revenue'] >= revenue) & (df['Runtime'] >= runtime) & (df['Rating_average'] >= min_vote_avg) & (df['Rating_Count'] >= vote_count)]

    df_sorted = df_filtered.sort_values(by='similarity', ascending = False)


    return df_sorted[['Title', 'Rating_average', 'similarity', 'Genres']].head(10).sort_values(by='Rating_average', ascending = False)

In [349]:
user_input_recommender_sentiment(movies_big_df, user_input="I hate detective spy movies.")

Unnamed: 0,Title,Rating_average,similarity,Genres
132,Dr. Strangelove or: How I Learned to Stop Worr...,8.2,0.149071,Drama | Comedy | War
2061,The Nice Guys,7.1,0.111803,Action | Comedy | Crime
2474,Despicable Me 2,6.9,0.117851,Animation | Comedy | Family
3298,Mr. & Mrs. Smith,6.6,0.109764,Action | Comedy | Drama | Thriller
3519,Teenage Mutant Ninja Turtles,6.6,0.104257,Action | Adventure | Comedy | Science Fiction ...
3653,The Grinch,6.5,0.104257,Animation | Comedy | Fantasy | Family
4169,Despicable Me 3,6.4,0.1066,Action | Adventure | Animation | Comedy | Family
4445,Robots,6.3,0.132453,Animation | Comedy | Science Fiction | Family
5918,Charlie's Angels,5.7,0.102062,Action | Adventure | Comedy | Crime | Thriller
6535,The Emoji Movie,5.3,0.125,Animation | Comedy | Family


In [350]:
user_input_recommender_sentiment(movies_big_df, user_input="I love detective spy movies.")

Unnamed: 0,Title,Rating_average,similarity,Genres
109,Double Indemnity,8.2,0.125988,Crime | Mystery | Thriller
97,Joker,8.2,0.125988,Crime | Drama | Thriller
1226,Wind River,7.4,0.13159,Crime | Drama | Mystery | Thriller
1986,Hereditary,7.1,0.122628,Horror | Mystery | Thriller
2547,The Fate of the Furious,6.9,0.122628,Action | Crime | Thriller
2472,Face/Off,6.9,0.115129,Action | Crime | Science Fiction | Thriller
3033,Sicario: Day of the Soldado,6.7,0.142857,Action | Crime | Drama | Thriller
3734,RED 2,6.5,0.142857,Action | Comedy | Crime | Thriller
4242,Atomic Blonde,6.3,0.181568,Action | Mystery | Thriller
5545,Tracers,5.9,0.133631,Action | Crime | Drama


# An example of how we tried to enrich user input with keywords and generated keywords.

### "action": ["mission", "chase", "explosion"],
### "adventure": ["journey", "discovery", "epic"],

    "animation": ["cartoon","3D", "stop-motion"],
    "biography": ["true story", "real-life", "historical"],
    "comedy": ["humor", "joke", "parody"],
    "crime": ["detective", "investigation", "murder"],
    "documentary": ["factual", "non-fiction", "real-life"],
    "drama": ["emotional", "intense", "serious"],
    "family": ["kids", "children", "parents"],
    "fantasy": ["magic", "mythical", "enchanted"],
    .
    .
    .
    

dictionery_reverse = {}
for key in genres_keywords_example.keys():
    dictionery_reverse[genres_keywords_example[key][0]] = [key] + [genres_keywords_example[key][1], genres_keywords_example[key][2]]



### 'mission': ['action', 'chase', 'explosion'],
### 'journey': ['adventure', 'discovery', 'epic'],
 'cartoon': ['animation', '3D', 'stop-motion'],
 'true story': ['biography', 'real-life', 'historical'],
 'humor': ['comedy', 'joke', 'parody'],
 'detective': ['mystery', 'investigation', 'crime'],
 'factual': ['documentary', 'non-fiction', 'real-life'],
 'emotional': ['drama', 'intense', 'serious'],
 'kids': ['family', 'children', 'parents'],
 'magic': ['fantasy', 'mythical', 'enchanted'],
.
.
.

# Model Comparisons

### 1) General Recommender
### 2) User input recommender
### 3) User input recommender with keywords
### 4) User input recommender with keywords enriched and sentiments

In [351]:
general_recommender(movies_big_df).head(5)

Unnamed: 0,Title,Rating_average,Genres
1,The Shawshank Redemption,8.7,Crime | Drama
2,The Godfather,8.7,Crime | Drama
11,Pulp Fiction,8.5,Crime | Thriller
14,Forrest Gump,8.4,Comedy | Drama | Romance
15,The Lord of the Rings: The Return of the King,8.4,Action | Adventure | Fantasy


In [352]:
user_input_recommender(movies_big_df, user_input="Fun, Action").head(5)

Unnamed: 0,Title,Rating_average,Genres,similarity
1,The Shawshank Redemption,8.7,Crime | Drama,0.0
2006,Midsommar,7.1,Drama | Horror | Mystery,0.0
2029,Maleficent,7.1,Action | Adventure | Fantasy | Romance | Family,0.0
2045,Doctor Sleep,7.1,Drama | Fantasy | Horror | Thriller,0.0
2120,Aladdin,7.1,Adventure | Comedy | Fantasy | Romance | Family,0.0


In [353]:
user_input_recommender_keywords(movies_big_df, user_input="Romance, Love and Drama").head(5)

Unnamed: 0,Title,Rating_average,Genres,similarity
0,Dilwale Dulhania Le Jayenge,8.8,Comedy | Drama | Romance,0.0
3951,Dragonfly,6.4,Drama,0.0
3944,The Number 23,6.4,Drama | Mystery | Thriller,0.0
3939,Over the Hedge,6.4,Animation | Comedy | Family,0.0
3937,Halloween,6.4,Horror | Thriller,0.0


In [354]:
user_input_recommender_sentiment(movies_big_df, user_input="I love action, scifi and adventure.").head(10)

Unnamed: 0,Title,Rating_average,similarity,Genres
889,Gattaca,7.5,0.118678,Mystery | Science Fiction | Thriller | Romance
1046,Rogue One: A Star Wars Story,7.5,0.114332,Action | Adventure | Science Fiction
1134,Treasure Planet,7.4,0.132453,Adventure | Animation | Fantasy | Science Fict...
1436,Contact,7.3,0.117041,Drama | Mystery | Science Fiction
3009,The Good Dinosaur,6.7,0.121716,Adventure | Animation | Family
3029,Armageddon,6.7,0.117851,Action | Thriller | Science Fiction | Adventure
4218,Small Soldiers,6.3,0.123797,Action | Adventure | Comedy | Fantasy | Scienc...
4715,Tomorrowland,6.2,0.131306,Adventure | Mystery | Science Fiction | Family
5303,Criminal,6.0,0.112509,Action | Crime | Science Fiction
5332,Journey 2: The Mysterious Island,6.0,0.108148,Action | Adventure | Science Fiction


In [355]:
user_input_recommender_sentiment(movies_big_df, user_input="I hate action, scifi and adventure.").head(10)

Unnamed: 0,Title,Rating_average,similarity,Genres
132,Dr. Strangelove or: How I Learned to Stop Worr...,8.2,0.149071,Drama | Comedy | War
2061,The Nice Guys,7.1,0.111803,Action | Comedy | Crime
2474,Despicable Me 2,6.9,0.117851,Animation | Comedy | Family
3298,Mr. & Mrs. Smith,6.6,0.109764,Action | Comedy | Drama | Thriller
3519,Teenage Mutant Ninja Turtles,6.6,0.104257,Action | Adventure | Comedy | Science Fiction ...
3653,The Grinch,6.5,0.104257,Animation | Comedy | Fantasy | Family
4169,Despicable Me 3,6.4,0.1066,Action | Adventure | Animation | Comedy | Family
4445,Robots,6.3,0.132453,Animation | Comedy | Science Fiction | Family
5918,Charlie's Angels,5.7,0.102062,Action | Adventure | Comedy | Crime | Thriller
6535,The Emoji Movie,5.3,0.125,Animation | Comedy | Family


In [356]:
user_input_recommender_keywords(movies_big_df, user_input="I hate action, scifi and adventure.").head(10)

Unnamed: 0,Title,Rating_average,Genres,similarity
0,Dilwale Dulhania Le Jayenge,8.8,Comedy | Drama | Romance,0.0
3951,Dragonfly,6.4,Drama,0.0
3944,The Number 23,6.4,Drama | Mystery | Thriller,0.0
3939,Over the Hedge,6.4,Animation | Comedy | Family,0.0
3937,Halloween,6.4,Horror | Thriller,0.0
3933,Project X,6.4,Comedy,0.0
3923,Overboard,6.4,Comedy | Romance,0.0
3913,Silent Hill,6.4,Horror | Mystery,0.0
3903,The Finest Hours,6.4,Action | Drama | History | Thriller,0.0
3894,Wanted,6.4,Action | Crime | Thriller,0.0
