## Title length correlation to success (measured by ratings)

### Data import and cleaning

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import json
import string
import requests

In [2]:
# Load the pre-processed CMU movies dataset
movies = pd.read_csv('Data/movies_metadata_english_only.csv')
movies.head(2)

Unnamed: 0.1,Unnamed: 0,Wikipedia_movie_ID,Freebase_movie_ID,Movie_name,Movie_release_date,Movie_box_office_revenue,Movie_runtime,Movie_languages,Movie_countries,Movie_genres
0,0,975900,/m/03vyhn,Ghosts of Mars,2001,14010832.0,98.0,['English Language'],['United States of America'],"['Thriller', 'Science Fiction', 'Horror', 'Adv..."
1,1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000,,95.0,['English Language'],['United States of America'],"['Mystery', 'Biographical film', 'Drama', 'Cri..."


In [3]:
# Load the IMdB ratings dataset
ratings_imdb = pd.read_table('Data/ratings_imdb.tsv', sep='\t')
ratings_imdb.head(2)

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2005
1,tt0000002,5.8,269


In [4]:
# Load the IMdB movies dataset
movies_imdb = pd.read_csv('Data/movies_imdb.tsv', sep = '\t')
movies_imdb.head(2)

  movies_imdb = pd.read_csv('Data/movies_imdb.tsv', sep = '\t')


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"


In [5]:
# Load the Rotten tomatoes dataset
movies_rotten = pd.read_csv('Data/movies_rottentomatoes.csv', sep = ',')
movies_rotten.head(2)

Unnamed: 0,rotten_tomatoes_link,movie_title,movie_info,critics_consensus,content_rating,genres,directors,authors,actors,original_release_date,...,production_company,tomatometer_status,tomatometer_rating,tomatometer_count,audience_status,audience_rating,audience_count,tomatometer_top_critics_count,tomatometer_fresh_critics_count,tomatometer_rotten_critics_count
0,m/0814255,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",Though it may seem like just another Harry Pot...,PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,"Craig Titley, Chris Columbus, Rick Riordan","Logan Lerman, Brandon T. Jackson, Alexandra Da...",2010-02-12,...,20th Century Fox,Rotten,49.0,149.0,Spilled,53.0,254421.0,43,73,76
1,m/0878835,Please Give,Kate (Catherine Keener) and her husband Alex (...,Nicole Holofcener's newest might seem slight i...,R,Comedy,Nicole Holofcener,Nicole Holofcener,"Catherine Keener, Amanda Peet, Oliver Platt, R...",2010-04-30,...,Sony Pictures Classics,Certified-Fresh,87.0,142.0,Upright,64.0,11574.0,44,123,19


In [6]:
# Several movies have the same title, some movies with the same title are even released the same year.
# That's why the datasets will be merged on a set of parameters : [Movie title, Release date, Runtime]
duplicates_name_date = movies[movies.duplicated(subset=['Movie_name', 'Movie_release_date'], keep=False)]
duplicates_name_date.head(2)

Unnamed: 0.1,Unnamed: 0,Wikipedia_movie_ID,Freebase_movie_ID,Movie_name,Movie_release_date,Movie_box_office_revenue,Movie_runtime,Movie_languages,Movie_countries,Movie_genres
49,143,7460227,/m/0kv80y,The Bridge,2006,,69.0,['English Language'],['United States of America'],['Drama']
221,669,4953051,/m/0cwvf3,The Bridge,2006,,94.0,['English Language'],"['United States of America', 'United Kingdom']",['Documentary']


In [7]:
duplicates_name_date_runtime = movies[movies.duplicated(subset=['Movie_name', 'Movie_release_date', 'Movie_runtime'], keep=False)]
duplicates_name_date_runtime

Unnamed: 0.1,Unnamed: 0,Wikipedia_movie_ID,Freebase_movie_ID,Movie_name,Movie_release_date,Movie_box_office_revenue,Movie_runtime,Movie_languages,Movie_countries,Movie_genres
10202,31701,31214305,/m/0gjdf_r,Facing the Music,1941,,79.0,['English Language'],['United Kingdom'],"['Musical', 'Romance Film']"
16412,51505,20306762,/m/04_0zf4,"Yours, Mine and Ours",2005,,88.0,['English Language'],['United States of America'],"['Romance Film', 'Family Film', 'Comedy']"
21148,66518,31214392,/m/0gjb_d7,Facing the Music,1941,,79.0,['English Language'],['United Kingdom'],['Comedy film']
25440,80080,3145964,/m/08vczx,"Yours, Mine and Ours",2005,72028752.0,88.0,['English Language'],['United States of America'],"['Romance Film', 'Family Film', 'Comedy']"


In [8]:
# Clean the IMdB movies dataset :
# Make a deep copy of the database
movies_imdb_filtered = movies_imdb.copy(deep=True)

# Remove unuseful columns
drop_col = ['titleType', 'primaryTitle', 'isAdult', 'genres', 'endYear']
movies_imdb_filtered.drop(drop_col, axis =1, inplace=True)

# Rename useful columns
movies_imdb_filtered = movies_imdb_filtered.rename(
    columns={
        "startYear": "Movie_release_date",
        "originalTitle": "Movie_name",
        "runtimeMinutes": "Movie_runtime",
    }
)

movies_imdb_filtered.head(2)

Unnamed: 0,tconst,Movie_name,Movie_release_date,Movie_runtime
0,tt0000001,Carmencita,1894,1
1,tt0000002,Le clown et ses chiens,1892,5


In [None]:
# Remove NaN values in the columns used for the merge

movies_imdb_filtered['Movie_release_date'] = movies_imdb_filtered['Movie_release_date'].replace('\\N', np.nan)
movies_imdb_filtered.dropna(subset=['Movie_release_date'], inplace =True)
movies_imdb_filtered['Movie_release_date'] = movies_imdb_filtered['Movie_release_date'].astype('int') #keep only the year

movies_imdb_filtered['Movie_runtime'] = movies_imdb_filtered['Movie_runtime'].replace('\\N', np.nan)
movies_imdb_filtered.dropna(subset=['Movie_runtime'], inplace =True)
movies_imdb_filtered['Movie_runtime'] = movies_imdb_filtered['Movie_runtime'].astype(int).astype('float64', errors='ignore')

movies_imdb_filtered = movies_imdb_filtered.drop_duplicates( subset=["Movie_name", "Movie_release_date", "Movie_runtime"] )

movies_imdb_filtered.head(2)

In [None]:
# Clean the IMdB ratings dataset :
# Rename useful columns
ratings_imdb = ratings_imdb.rename(
    columns={
        "averageRating": "Audience_imdb_rating",
        "numVotes": "Audience_imdb_count"
    }
)

In [None]:
# Merge IMdB movies and ratings datasets
df_imdb = movies_imdb_filtered.merge(ratings_imdb, on='tconst', how='inner')
print('Size of the dataset with the ratings : ', df_imdb.shape)
df_imdb.head(2)

In [None]:
# Clean the Rotten tomatoes dataset :
# Make a deep copy of the database
movies_rotten_filtered = movies_rotten.copy(deep=True)

# Remove unuseful columns
drop_col = ['critics_consensus', 'content_rating', 'genres', 'streaming_release_date', 'tomatometer_status', 'audience_status', 
            'tomatometer_top_critics_count', 'tomatometer_fresh_critics_count', 'tomatometer_rotten_critics_count']
movies_rotten_filtered.drop(drop_col, axis =1, inplace=True)

# Rename useful columns
movies_rotten_filtered = movies_rotten_filtered.rename(
    columns={
        "original_release_date": "Movie_release_date",
        "movie_title": "Movie_name",
        "runtime": "Movie_runtime",
        "movie_info": "Movie_info_rotten"
        "directors": "Directors"
        "authors": "Authors"
        "actors":"Actors_rotten"
        "tomatometer_rating": "Critics_rotten_rating"
        "tomatometer_count": "Critic_rotten_count"
        "audience_rating": "Audience_rotten_rating"
        "audience_count": "Audience_rotten_count"
    }
)

movies_rotten_filtered.head(2)

In [None]:
# Remove NaN values in the columns used for the merge
movies_rotten_filtered['Movie_release_date'] = movies_rotten['Movie_release_date'].replace('\\N', np.nan)
movies_rotten_filtered.dropna(subset=['Movie_release_date'], inplace =True)
movies_rotten_filtered['Movie_release_date'] = movies_rotten['Movie_release_date'].astype('int')
# TO DO : keep only the year

movies_rotten_filtered['Movie_runtime'] = movies_rotten_filtered['Movie_runtime'].replace('\\N', np.nan)
movies_rotten_filtered.dropna(subset=['Movie_runtime'], inplace =True)
movies_rotten_filtered['Movie_runtime'] = movies_rotten_filtered['Movie_runtime'].astype(int).astype('float64', errors='ignore')

movies_rotten_filtered.head(2)

In [None]:
# Merging IMdB dataset and pre-processed CMU movies dataset on [Movie title, Release date, Runtime]
cmu_imdb = movies.merge(df_imdb, on=['Movie_name', 'Movie_release_date', 'Movie_runtime'], how='inner')
print('Size of the new dataset : ', cmu_imdb.shape)
cmu_imdb.head(2)

In [None]:
# Merging Rotten tomatoes dataset and IMdB-CMU dataset on [Movie title, Release date, Runtime]
movies_ratings = cmu_imdb.merge(movies_rotten_filtered, on=['Movie_name', 'Movie_release_date', 'Movie_runtime'], how='inner')

### Plots of title length vs ratings (IMdB audience, Rotten tomatoes audience, Rotten tomatoes critics)