# Movie reviews

In [1]:
# Basic Libraries
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt # we only need pyplot
sb.set() # set the default Seaborn style for graphics

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel


#### general info about the movies

In [2]:
movies = pd.read_csv('rotten_tomatoes_movies.csv')

In [3]:
print("Data type : ", type(movies))
print("Data dims : ", movies.shape)

Data type :  <class 'pandas.core.frame.DataFrame'>
Data dims :  (17712, 22)


#### Movie review by critics

In [6]:
movies.head(5)
#reviews.head(10)

Unnamed: 0,rotten_tomatoes_link,movie_title,movie_info,critics_consensus,content_rating,genres,directors,authors,actors,original_release_date,...,production_company,tomatometer_status,tomatometer_rating,tomatometer_count,audience_status,audience_rating,audience_count,tomatometer_top_critics_count,tomatometer_fresh_critics_count,tomatometer_rotten_critics_count
0,m/0814255,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",Though it may seem like just another Harry Pot...,PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,"Craig Titley, Chris Columbus, Rick Riordan","Logan Lerman, Brandon T. Jackson, Alexandra Da...",2010-02-12,...,20th Century Fox,Rotten,49.0,149.0,Spilled,53.0,254421.0,43,73,76
1,m/0878835,Please Give,Kate (Catherine Keener) and her husband Alex (...,Nicole Holofcener's newest might seem slight i...,R,Comedy,Nicole Holofcener,Nicole Holofcener,"Catherine Keener, Amanda Peet, Oliver Platt, R...",2010-04-30,...,Sony Pictures Classics,Certified-Fresh,87.0,142.0,Upright,64.0,11574.0,44,123,19
2,m/10,10,"A successful, middle-aged Hollywood songwriter...",Blake Edwards' bawdy comedy may not score a pe...,R,"Comedy, Romance",Blake Edwards,Blake Edwards,"Dudley Moore, Bo Derek, Julie Andrews, Robert ...",1979-10-05,...,Waner Bros.,Fresh,67.0,24.0,Spilled,53.0,14684.0,2,16,8
3,m/1000013-12_angry_men,12 Angry Men (Twelve Angry Men),Following the closing arguments in a murder tr...,Sidney Lumet's feature debut is a superbly wri...,NR,"Classics, Drama",Sidney Lumet,Reginald Rose,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",1957-04-13,...,Criterion Collection,Certified-Fresh,100.0,54.0,Upright,97.0,105386.0,6,54,0
4,m/1000079-20000_leagues_under_the_sea,"20,000 Leagues Under The Sea","In 1866, Professor Pierre M. Aronnax (Paul Luk...","One of Disney's finest live-action adventures,...",G,"Action & Adventure, Drama, Kids & Family",Richard Fleischer,Earl Felton,"James Mason, Kirk Douglas, Paul Lukas, Peter L...",1954-01-01,...,Disney,Fresh,89.0,27.0,Upright,74.0,68918.0,5,24,3


Description of the rotten_tomatoes_movies ,from rotten_tomato_1960_2020 dataset found from kaggle

> **rotten_tomatoes_link** : link for movies from the website  
> **movie_title** : Name of the movie  
> **movie_info** : Sypnosis of the movie  
> **critics_consensus** : comments from Rotten Tomatoes  
> **content_rating** : category based on the movie suitability for audience  
> **genres** : movie genres separated by commes, if multiple  
> **directors** : name of director(s)  
> **authors** : name of author(s)  
> **actors** : name of actors  
> **original_release_date** : date in which the movie has been released  
> **streaming_release_date** : date in which the movie has been released for streaming   
> **runtime** : movie runtume (in minutes)  
> **production_company** : name of the production company  
> **tomatometer_status** : tomatometer value of "Rotten" (less than 60% positive reviews), "Fresh" (at least 60% of positive reviews), and "Certified-Fresh" (at least 75% of positive reviews)  
> **tomatometer_rating** : percentage of positive critic ratings  
> **tomatometer_count** : critic ratings counted for the calculation of the tomatomer status  
> **audience_status** : audience value of "Spilled" (less than 60% of users gave a rating of at least 3.5) or "Upright" (at least 60% of users gave a rating of at least 3.5)  
> **audience_rating** : percentage of positive user ratings  
> **audience_count** : user ratings counted for the calculation of the audience status  
> **tomatometer_top_critics_count** : count of top critic ratings  
> **tomatometer_fresh_critics_count** : count of fresh critic ratings  
> **tomatometer_rotten_critics_count** : count of rotten critic ratings  
---


## We aim to categorise movies based on similarities between their sypnosis (movie_info), and then recommend movies similar to those the user likes

# Cleaning the Datasets

#### To see if there are any movies without 'movie_info'

In [7]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17712 entries, 0 to 17711
Data columns (total 22 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   rotten_tomatoes_link              17712 non-null  object 
 1   movie_title                       17712 non-null  object 
 2   movie_info                        17391 non-null  object 
 3   critics_consensus                 9134 non-null   object 
 4   content_rating                    17712 non-null  object 
 5   genres                            17693 non-null  object 
 6   directors                         17518 non-null  object 
 7   authors                           16170 non-null  object 
 8   actors                            17360 non-null  object 
 9   original_release_date             16546 non-null  object 
 10  streaming_release_date            17328 non-null  object 
 11  runtime                           17398 non-null  float64
 12  prod

#### We can see that there are movies without 'movie_info', we will drop these rows from the dataframe

In [8]:
#movies = movies[movies['movie_info'].notnull()]
movies = movies[['movie_title','movie_info','actors','genres','directors','authors']]
movies = movies.dropna(subset=['movie_info'])

In [9]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17391 entries, 0 to 17711
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   movie_title  17391 non-null  object
 1   movie_info   17391 non-null  object
 2   actors       17062 non-null  object
 3   genres       17372 non-null  object
 4   directors    17211 non-null  object
 5   authors      15945 non-null  object
dtypes: object(6)
memory usage: 951.1+ KB


## Recommender based on plot similarities

In [11]:
movies['movie_info'].head()

0    Always trouble-prone, the life of teenager Per...
1    Kate (Catherine Keener) and her husband Alex (...
2    A successful, middle-aged Hollywood songwriter...
3    Following the closing arguments in a murder tr...
4    In 1866, Professor Pierre M. Aronnax (Paul Luk...
Name: movie_info, dtype: object

## Natural Language Processing

#### Step 1: remove stop words such as 'the' and 'an' etc to create Term Frequence - Inverse Document Frequency(TF-IDF) matrix
info: it can be seen that there are 43567 different vocabularies from 45000 movies

In [12]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['movie_info'])
tfidf_matrix.shape


(17391, 43567)

In [13]:
tfidf.get_feature_names_out()[1000:1020]

array(['aella', 'aeon', 'aereon', 'aerial', 'aerialist', 'aerobics',
       'aeronaut', 'aeronautics', 'aerosmith', 'aerospace', 'aesthetic',
       'aether', 'af', 'afar', 'afemo', 'affable', 'affair', 'affairs',
       'affect', 'affectations'], dtype=object)

#### Step 2: Use cosine similarity to calculate a numeric quantity that denotes similarity between 2 movies

info: each movie will be a 1x17712 column vector where each column will be a similarity score with each movie.

In [14]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [15]:
cosine_sim.shape

(17391, 17391)

info: movies have a similarity of 1 to itself

In [16]:
cosine_sim[1]

array([0.        , 1.        , 0.05157674, ..., 0.01456716, 0.        ,
       0.        ])

#### step 3: a function that takes input 'movie_title' and outputs a list of 10 most similar movies

info: reverse map of indices and movie titles

In [17]:
index = pd.Series(movies.index, index=movies['movie_title']).drop_duplicates()

In [18]:
index[:10]

movie_title
Percy Jackson & the Olympians: The Lightning Thief    0
Please Give                                           1
10                                                    2
12 Angry Men (Twelve Angry Men)                       3
20,000 Leagues Under The Sea                          4
10,000 B.C.                                           5
The 39 Steps                                          6
3:10 to Yuma                                          7
Charly (A Heartbeat Away)                             8
Abraham Lincoln                                       9
dtype: int64

info: the recommendation function

In [19]:
def recommendations(title, cosine_sim=cosine_sim):
    idx = index[title]
    similarity = list(enumerate(cosine_sim[idx]))
    similarity = sorted(similarity, key=lambda x: x[1], reverse=True)
    similarity = similarity[1:11]
    # given movie is removed from the list
    movie_index = [i[0] for i in similarity]
    return movies['movie_title'].iloc[movie_index]

In [20]:
recommendations('The Lord of the Rings: The Fellowship of the Ring')
#movie reccomendations based on a movie

2034     71 Fragmente einer Chronologie des Zufalls (71...
14556                                             Bastards
9810                                             Love Liza
7463                        The Happiness of the Katakuris
4242                                 Camille Claudel, 1915
3663                                            Black Pond
16378                                     Tortoise In Love
14186                           Sympathy for Mr. Vengeance
10619                                              mother!
5301                                  Death and the Maiden
Name: movie_title, dtype: object

### In the above section, we only used one factor to provide recommendations. The recommendations provided can be improved if we make better use of the data we have, by utilising more of the data for example, using members of the cast, genres as well as the director. 

#### step 4: identify the parameters the reccomedation can be improved by

In [21]:
features = movies[['movie_title','actors','genres','directors','authors']]
features.head(5)

Unnamed: 0,movie_title,actors,genres,directors,authors
0,Percy Jackson & the Olympians: The Lightning T...,"Logan Lerman, Brandon T. Jackson, Alexandra Da...","Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,"Craig Titley, Chris Columbus, Rick Riordan"
1,Please Give,"Catherine Keener, Amanda Peet, Oliver Platt, R...",Comedy,Nicole Holofcener,Nicole Holofcener
2,10,"Dudley Moore, Bo Derek, Julie Andrews, Robert ...","Comedy, Romance",Blake Edwards,Blake Edwards
3,12 Angry Men (Twelve Angry Men),"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....","Classics, Drama",Sidney Lumet,Reginald Rose
4,"20,000 Leagues Under The Sea","James Mason, Kirk Douglas, Paul Lukas, Peter L...","Action & Adventure, Drama, Kids & Family",Richard Fleischer,Earl Felton


Info: a function to convert all strings to lower case and strip names of spaces

In [22]:
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [38]:
features = ['actors','genres','directors','authors']

for feature in features:
    movies[feature] = movies[feature].apply(clean_data)

features = movies[['movie_title','actors','genres','directors','authors']]    
features.head(5)

Unnamed: 0,movie_title,actors,genres,directors,authors
0,Percy Jackson & the Olympians: The Lightning T...,"loganlerman,brandont.jackson,alexandradaddario...","action&adventure,comedy,drama,sciencefiction&f...",chriscolumbus,"craigtitley,chriscolumbus,rickriordan"
1,Please Give,"catherinekeener,amandapeet,oliverplatt,rebecca...",comedy,nicoleholofcener,nicoleholofcener
2,10,"dudleymoore,boderek,julieandrews,robertwebber,...","comedy,romance",blakeedwards,blakeedwards
3,12 Angry Men (Twelve Angry Men),"martinbalsam,johnfiedler,leej.cobb,e.g.marshal...","classics,drama",sidneylumet,reginaldrose
4,"20,000 Leagues Under The Sea","jamesmason,kirkdouglas,paullukas,peterlorre,ro...","action&adventure,drama,kids&family",richardfleischer,earlfelton


#### Step 5: The create soup function.
Info: The create_soup function will simply join all the required columns by a space. This is the final preprocessing step, and the output of this function will be fed into the word vector model.

In [24]:
def create_soup(x):
    return x['directors'] + ' ' + x['actors'] + ' ' + x['genres'] + ' ' + x['authors']

In [25]:
movies['soup'] = movies.apply(create_soup, axis=1)

In [26]:
movies[['soup']].head(10)

Unnamed: 0,soup
0,"chriscolumbus loganlerman,brandont.jackson,ale..."
1,"nicoleholofcener catherinekeener,amandapeet,ol..."
2,"blakeedwards dudleymoore,boderek,julieandrews,..."
3,"sidneylumet martinbalsam,johnfiedler,leej.cobb..."
4,"richardfleischer jamesmason,kirkdouglas,paullu..."
5,"rolandemmerich stevenstrait,camillabelle,cliff..."
6,"alfredhitchcock robertdonat,madeleinecarroll,g..."
7,"delmerdaves glennford,vanheflin,feliciafarr,le..."
8,"adamthomasanderegg heatherbeers,garyneilson,li..."
9,"d.w.griffith walterhuston,unamerkel,kayhammond..."


In [27]:
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(movies['soup'])

info: it can be seen that there are 43567 different vocabularies from 45000 movies

In [28]:
count_matrix.shape

(17391, 218280)

info: cosine_similarity is used to measure the distance between the embeddings

In [29]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [30]:
movies = movies.reset_index()
indices = pd.Series(movies.index, index=movies['movie_title'])

info: reusing the recommendations using the new cosine_sim2 as your second argument)

In [31]:
recommendations('The Lord of the Rings: The Fellowship of the Ring', cosine_sim2)

4724         Come Undone (Cosa voglio di più)
6898     Gianni e le donne (The Salt of Life)
15370                                The Pool
4365                           Chak de! India
6044                 Every Thing Will Be Fine
9190                 Leap Year (Año bisiesto)
2432                          I Saw the Devil
2204                    A Tale of Two Sisters
13137             Six Sex Scenes and a Murder
13702                                Stranded
Name: movie_title, dtype: object