In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import importlib
import data_cleaning as dc
%matplotlib inline


Tables of interest:
* imdb.title.basics  (for genres)
* imdb.title.ratings (for average ratings)
* bom.movie_gross (for studios)
* tn.movie_budgets (for budgets and worldwide gross)

## Import tables of interest

The function `dc.df_from_movie_csv` loads a `csv.gz` file by table name into a DataFrame, setting the index, converting date fields to datetime, and converting dollar fields to numeric.

In [2]:
# Names of the source tables this analysis needs.
import_tables = [
    'imdb.title.basics',
    'imdb.title.ratings',
    'bom.movie_gross',
    'tn.movie_budgets',
]

# Map each table name to the DataFrame returned by the project loader.
dfs = {name: dc.df_from_movie_csv(name) for name in import_tables}

## Join tables

Join the two `imdb` dataframes. They join directly on index, so no error-checking is required.

In [5]:
# Index-on-index join of the IMDb basics table with the IMDb ratings table.
# `how='left'` is DataFrame.join's default, spelled out here: every basics row
# is kept, and titles without a ratings row get NaN rating columns.
imdb_title_ratings_df = dfs['imdb.title.basics'].join(
    dfs['imdb.title.ratings'],
    how='left',
)

Join the `imdb` dataframe with the `bom` dataframe on (English-language) movie title.

In [25]:
# Re-import data_cleaning so that edits to the helper module are picked up
# without restarting the kernel.
importlib.reload(dc)

# Keep only Box Office Mojo rows whose `year` lies strictly between 2009 and
# 2019 (i.e. 2010-2018 for integer years).
bom_df = dfs['bom.movie_gross'].query('2009 < year < 2019')

# Join the combined IMDb frame to the filtered BOM frame on movie title.
# The left frame must be `imdb_title_ratings_df` (the name bound by the IMDb
# join cell); the previous `imdb_title_ratings` is never defined in this
# notebook and raises NameError on a fresh kernel.
imdb_bom_df = dc.join_dfs_on_key_col(
    imdb_title_ratings_df,
    bom_df,
    left_on='primary_title',
    right_on='title',
)


In [19]:
# Plain-pandas inner merge of the IMDb frame with the BOM frame on title,
# re-indexed by `tconst`. Presumably a cross-check of
# dc.join_dfs_on_key_col -- TODO confirm.
#
# Uses `imdb_title_ratings_df` (the name bound by the IMDb join cell); the
# previous `imdb_title_ratings` is undefined on a fresh kernel.
#
# NOTE(review): matching on title alone pairs one BOM row with every IMDb entry
# sharing that title (the output below shows "On the Road" matched six times);
# consider also matching IMDb `start_year` against BOM `year`.
(
    imdb_title_ratings_df
    .reset_index()
    .merge(
        bom_df.rename(columns={'title': 'primary_title'}),
        how='inner',
        on='primary_title',
    )
    .set_index('tconst')
)

Unnamed: 0_level_0,primary_title,original_title,start_year,runtime_minutes,genres,averagerating,numvotes,studio,domestic_gross,foreign_gross,year
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
tt0315642,Wazir,Wazir,2016,103.0,"Action,Crime,Drama",7.1,15378.0,Relbig.,1100000.0,,2016
tt0337692,On the Road,On the Road,2012,124.0,"Adventure,Drama,Romance",6.1,37886.0,IFC,744000.0,8000000,2012
tt2404548,On the Road,On the Road,2011,90.0,Drama,,,IFC,744000.0,8000000,2012
tt3872966,On the Road,On the Road,2013,87.0,Documentary,,,IFC,744000.0,8000000,2012
tt4339118,On the Road,On the Road,2014,89.0,Drama,6.0,6.0,IFC,744000.0,8000000,2012
tt5389486,On the Road,On the Road,2015,39.0,Documentary,,,IFC,744000.0,8000000,2012
tt5647250,On the Road,On the Road,2016,121.0,Drama,5.7,127.0,IFC,744000.0,8000000,2012
tt0359950,The Secret Life of Walter Mitty,The Secret Life of Walter Mitty,2013,114.0,"Adventure,Comedy,Drama",7.3,275300.0,Fox,58200000.0,129900000,2013
tt0365907,A Walk Among the Tombstones,A Walk Among the Tombstones,2014,114.0,"Action,Crime,Drama",6.5,105116.0,Uni.,26300000.0,26900000,2014
tt0369610,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi",7.0,539338.0,Uni.,652300000.0,1019.4,2015


In [26]:
# Display the IMDb/BOM frame returned by dc.join_dfs_on_key_col.
imdb_bom_df

Unnamed: 0_level_0,primary_title,original_title,start_year,runtime_minutes,genres,averagerating,numvotes,studio,domestic_gross,foreign_gross,year
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
tt0315642,Wazir,Wazir,2016,103.0,"Action,Crime,Drama",7.1,15378.0,Relbig.,1100000.0,,2016
tt0337692,On the Road,On the Road,2012,124.0,"Adventure,Drama,Romance",6.1,37886.0,IFC,744000.0,8000000,2012
tt2404548,On the Road,On the Road,2011,90.0,Drama,,,IFC,744000.0,8000000,2012
tt3872966,On the Road,On the Road,2013,87.0,Documentary,,,IFC,744000.0,8000000,2012
tt4339118,On the Road,On the Road,2014,89.0,Drama,6.0,6.0,IFC,744000.0,8000000,2012
tt5389486,On the Road,On the Road,2015,39.0,Documentary,,,IFC,744000.0,8000000,2012
tt5647250,On the Road,On the Road,2016,121.0,Drama,5.7,127.0,IFC,744000.0,8000000,2012
tt0359950,The Secret Life of Walter Mitty,The Secret Life of Walter Mitty,2013,114.0,"Adventure,Comedy,Drama",7.3,275300.0,Fox,58200000.0,129900000,2013
tt0365907,A Walk Among the Tombstones,A Walk Among the Tombstones,2014,114.0,"Action,Crime,Drama",6.5,105116.0,Uni.,26300000.0,26900000,2014
tt0369610,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi",7.0,539338.0,Uni.,652300000.0,1019.4,2015
