# IMDB dataset analysis

In [141]:
DATA_FOLDER = './data/'
MOVIES_FOLDER = DATA_FOLDER + 'movies_summaries/'
PLOT_SUMMARY_FOLDER = DATA_FOLDER + 'corenlp_plot_summaries'
IMDB_FOLDER = DATA_FOLDER + 'imdb/'

REPORT_FOLDER = './gen/reports/'
ETHNICITY_FILE = './gen/ethnicities.tsv'

CHARACTERS_FILE = MOVIES_FOLDER + 'character.metadata.tsv'
MOVIES_FILE = MOVIES_FOLDER + 'movie.metadata.tsv'
PLOT_SUMMARIES_FILE = MOVIES_FOLDER + 'plot_summaries.txt'
TROPES_FILE = MOVIES_FOLDER + 'tvtropes.clusters.txt'

IMDB_FILE = IMDB_FOLDER + 'movies_metadata.csv'
EUROPEAN_COUNTRIES_FILE = DATA_FOLDER + 'european_countries.txt'

In [142]:
import pandas as pd
import numpy as np
%matplotlib inline

from data_wrangling_tools import *

In [143]:
# import imdb dataset

def load_imdb(imdb_file, columns=['original_title', 'revenue', 'budget', 'vote_average', 'vote_count']):
    imdb = pd.read_csv(imdb_file, usecols=columns)

    # remove wrongly formatted rows (only 3)
    imdb = imdb.drop(imdb[imdb['budget'].str.contains('.jpg')].index)

    # convert numerical columns to float
    imdb['revenue'] = imdb['revenue'].astype(float).apply(lambda x: np.nan if x == 0.0 else x)
    imdb['budget'] = imdb['budget'].astype(float).apply(lambda x: np.nan if x == 0.0 else x)

    return imdb

imdb = load_imdb(IMDB_FILE)
print(imdb.shape)
imdb.head()

(45463, 5)


Unnamed: 0,budget,original_title,revenue,vote_average,vote_count
0,30000000.0,Toy Story,373554033.0,7.7,5415.0
1,65000000.0,Jumanji,262797249.0,6.9,2413.0
2,,Grumpier Old Men,,6.5,92.0
3,16000000.0,Waiting to Exhale,81452156.0,6.1,34.0
4,,Father of the Bride Part II,76578911.0,5.7,173.0


In [144]:
# load movies
movies = load_movies(MOVIES_FILE)
movies = clean_unknowns(movies)
movies = clean_jsons(movies)
print(movies.shape)
movies.head()

(81741, 9)


Unnamed: 0,wiki_movie_id,freebase_movie_id,name,release_date,box_office_revenue,runtime,languages,countries,genres
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,[English Language],[United States of America],"[Thriller, Science Fiction, Horror, Adventure,..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,[English Language],[United States of America],"[Mystery, Biographical film, Drama, Crime Drama]"
2,28463795,/m/0crgdbh,Brun bitter,1988-01-01,,83.0,[Norwegian Language],[Norway],"[Crime Fiction, Drama]"
3,9363483,/m/0285_cd,White Of The Eye,1987-01-01,,110.0,[English Language],[United Kingdom],"[Thriller, Erotic thriller, Psychological thri..."
4,261236,/m/01mrr1,A Woman in Flames,1983-01-01,,106.0,[German Language],[Germany],[Drama]


In [145]:
# merge movies and imdb movies

def merge_movies_imdb(movies, imdb):
    df = pd.merge(movies, imdb, left_on='name', right_on='original_title', how='left')

    # drop movies that have been duplicated during the merge TODO see it
    df = df.drop_duplicates(subset=['name', 'vote_count', 'vote_average'])

    # fill the box_office revenue with the imdb revenue if it's missing
    df['box_office_revenue'] = df['box_office_revenue'].fillna(df['revenue'].copy())
    df = df.drop(columns=['revenue', 'original_title'])

    return df

df = merge_movies_imdb(movies, imdb)

print(df.shape)
df.head()

(77380, 12)


Unnamed: 0,wiki_movie_id,freebase_movie_id,name,release_date,box_office_revenue,runtime,languages,countries,genres,budget,vote_average,vote_count
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,[English Language],[United States of America],"[Thriller, Science Fiction, Horror, Adventure,...",28000000.0,4.8,299.0
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,[English Language],[United States of America],"[Mystery, Biographical film, Drama, Crime Drama]",,,
2,28463795,/m/0crgdbh,Brun bitter,1988-01-01,,83.0,[Norwegian Language],[Norway],"[Crime Fiction, Drama]",,,
3,9363483,/m/0285_cd,White Of The Eye,1987-01-01,,110.0,[English Language],[United Kingdom],"[Thriller, Erotic thriller, Psychological thri...",,,
4,261236,/m/01mrr1,A Woman in Flames,1983-01-01,,106.0,[German Language],[Germany],[Drama],,,


# Gen the main dataset

In [146]:
# load characters
characters = load_characters(CHARACTERS_FILE)
ethnicities = load_ethnicities(ETHNICITY_FILE)
characters = add_characters_ethnicities(characters, ethnicities)

In [147]:
# merge characters and movies
merged_df = merge_characters_movies(characters, df)
merged_df

Unnamed: 0,wiki_movie_id,freebase_movie_id,movie_name,release_date,box_office_revenue,runtime,genres,languages,countries,char_name,a_name,a_gender,a_ethnicity,a_dob,a_age_at_release,a_height,freebase_char/a_map,freebase_char_id,freebase_a_id,a_ethnicity_freebase_id
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"[Thriller, Science Fiction, Horror, Adventure,...",[English Language],[United States of America],Akooshay,Wanda De Jesus,F,,1958-08-26,42.0,1.620,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7,
1,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"[Thriller, Science Fiction, Horror, Adventure,...",[English Language],[United States of America],Lieutenant Melanie Ballard,Natasha Henstridge,F,,1974-08-15,27.0,1.780,/m/0jys3m,/m/0bgchn4,/m/0346l4,/m/044038p
2,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"[Thriller, Science Fiction, Horror, Adventure,...",[English Language],[United States of America],Desolation Williams,Ice Cube,M,African Americans,1969-06-15,32.0,1.727,/m/0jys3g,/m/0bgchn_,/m/01vw26l,/m/0x67
3,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"[Thriller, Science Fiction, Horror, Adventure,...",[English Language],[United States of America],Sgt Jericho Butler,Jason Statham,M,,1967-09-12,33.0,1.750,/m/02vchl6,/m/0bgchnq,/m/034hyc,
4,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"[Thriller, Science Fiction, Horror, Adventure,...",[English Language],[United States of America],Bashira Kincaid,Clea DuVall,F,,1977-09-25,23.0,1.650,/m/02vbb3r,/m/0bgchp9,/m/01y9xg,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
467184,913762,/m/03pcrp,The Super Dimension Fortress Macross II: Lover...,1992-05-21,,150.0,"[Science Fiction, Japanese Movies, Adventure, ...",[Japanese Language],[Japan],Elensh,Dorothy Elias-Fahn,F,,1970-05,,,/m/0kr406c,/m/0kr406h,/m/0b_vcv,
467185,913762,/m/03pcrp,The Super Dimension Fortress Macross II: Lover...,1992-05-21,,150.0,"[Science Fiction, Japanese Movies, Adventure, ...",[Japanese Language],[Japan],Hibiki,Jonathan Fahn,M,,1965-04-12,27.0,,/m/0kr405_,/m/0kr4090,/m/0bx7_j,
467186,28308153,/m/0cp05t9,Five Clues to Fortune,1957-01-01,,129.0,[Crime Fiction],[English Language],[United Kingdom],,David Hemmings,M,English people,1941-11-18,15.0,1.730,/m/0g8ngmc,,/m/022g44,/m/02w7gg
467187,28308153,/m/0cp05t9,Five Clues to Fortune,1957-01-01,,129.0,[Crime Fiction],[English Language],[United Kingdom],,Roberta Paterson,,,,,,/m/0g8ngmj,,/m/0g8ngmm,


In [148]:
european_countries = pd.read_csv(EUROPEAN_COUNTRIES_FILE, header=None, skiprows=1, names=['country'])['country'].to_list()