In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from glob import glob

In [3]:
#loads the data into a dictionary of dataframes
csv_files = glob("./zippedData/*.csv.gz")
csv_files_dict = {}
for filename in csv_files:
    filename_cleaned = os.path.basename(filename).replace(".csv", "").replace(".", "_") # cleaning the filenames
    filename_df = pd.read_csv(filename, index_col=0)
    csv_files_dict[filename_cleaned] = filename_df

#### Cleaning and Linking for the exploration of our questions

Since we will be working with Pandas we need to organize our DFs. Let's start by renaming all of them and adding them to a new dictionary. 

By creating a new dictionary of DFs, we can manipulate the data without messing up the originals in csv_files_dict. 

In [4]:
# by creating a dictionary, we can manipulate the data without messing up the originals in csv_files_dict
df_repository = {} 
for filename in csv_files:
    filename_cleaned = os.path.basename(filename).replace(".csv", "").replace(".", "_") # cleaning the filenames    
    exec("""df_repository[""" + """'""" + filename_cleaned + """']""" + """ = csv_files_dict['""" + filename_cleaned + """']""" )

#### First step is to clean the data. 

An easy way to start is to remove all duplicates


In [5]:
def clean_duplicates(dataframe): #returns a dataframe without duplicates
    cleaned_filter = dataframe.duplicated()
    index_to_drop = dataframe.loc[cleaned_filter].index
    return dataframe.drop(index=index_to_drop)

In [6]:
for db in df_repository.keys(): #loop to parse dataframes through the clean_duplicates() function we wrote above
    df_repository[db] = clean_duplicates(df_repository[db])

In [7]:
for db in df_repository.keys():
    print(db, 'contains ', df_repository[db].duplicated().sum(), ' duplicates') #checking if it worked

bom_movie_gross_gz contains  0  duplicates
imdb_name_basics_gz contains  0  duplicates
imdb_title_akas_gz contains  0  duplicates
imdb_title_basics_gz contains  0  duplicates
imdb_title_crew_gz contains  0  duplicates
imdb_title_principals_gz contains  0  duplicates
imdb_title_ratings_gz contains  0  duplicates
tmdb_movies_gz contains  0  duplicates
tn_movie_budgets_gz contains  0  duplicates


#### Now we move on to the tables we are interested in

We want to explore the relationships between the Genre and Profits, Director and Profits, etc

 

#### The 'tn_movie_budgets' is a good place to start

It shows budget and revenue for a given movie


     

In [8]:
df_repository['tn_movie_budgets_gz'].head()

Unnamed: 0_level_0,release_date,movie,production_budget,domestic_gross,worldwide_gross
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279"
2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875"
3,"Jun 7, 2019",Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350"
4,"May 1, 2015",Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963"
5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,"$317,000,000","$620,181,382","$1,316,721,747"


In [10]:
def string_to_float(string):
    if string == None:
        new_string = 0
    else:
        if (type(string) != int) and (type(string) != float):
            new_string = string.replace('$', '')
            new_string = new_string.replace(',', '')
        else:
            new_string = string
    return float(new_string)


def series_string_to_float(dataseries):
    new_dataseries = dataseries.apply(string_to_float)
    return new_dataseries


In [11]:
#cleaning data 
# from string to float (tn_movie_budgets_gz) for relevant columns
# string to datetime format
df_repository['tn_movie_budgets_gz'].production_budget = series_string_to_float(df_repository['tn_movie_budgets_gz'].production_budget)
df_repository['tn_movie_budgets_gz'].domestic_gross = series_string_to_float(df_repository['tn_movie_budgets_gz'].domestic_gross)
df_repository['tn_movie_budgets_gz'].worldwide_gross = series_string_to_float(df_repository['tn_movie_budgets_gz'].worldwide_gross)
df_repository['tn_movie_budgets_gz'].release_date = pd.to_datetime(df_repository['tn_movie_budgets_gz'].release_date)


#### We cleaned the tn_movie_budgets_gz database

The next step is to connect this data to the other parameters we want to investigate (ex: director of the movie)

##### imdb_title_akas_gz might be helpful, 
It contains several title versions and links them to a specific identifier ('title_id'), which will be useful when merging data from imdb

In [14]:
df_repository['imdb_title_akas_gz'].head(3)

Unnamed: 0_level_0,ordering,title,region,language,types,attributes,is_original_title
title_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
tt0369610,10,Джурасик свят,BG,bg,,,0.0
tt0369610,11,Jurashikku warudo,JP,,imdbDisplay,,0.0
tt0369610,12,Jurassic World: O Mundo dos Dinossauros,BR,,imdbDisplay,,0.0


In [15]:
df_repository['imdb_title_akas_gz'].rename(columns={'title' : 'movie'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(**kwargs)


In [16]:
# needs more readability!!!!!!!!
df_imdb_financials = pd.merge(df_repository['imdb_title_akas_gz'].reset_index(), 
                              df_repository['tn_movie_budgets_gz'], on='movie')

In [17]:
df_imdb_financials.drop(columns=['ordering', 'region', 'language', 'types', 'attributes'], inplace=True)

In [18]:
filt = df_imdb_financials.is_original_title == 1
index_to_drop = df_imdb_financials.loc[~filt].index
df_imdb_financials.drop(index=index_to_drop, inplace=True)

Now we have a dataframe with movies, costs and revenues. We need now to calculate the profits

In [19]:
df_imdb_financials['total_profit'] = df_imdb_financials.worldwide_gross - df_imdb_financials.production_budget
df_imdb_financials.head()

Unnamed: 0,title_id,movie,is_original_title,release_date,production_budget,domestic_gross,worldwide_gross,total_profit
10,tt0369610,Jurassic World,1.0,2015-06-12,215000000.0,652270625.0,1648855000.0,1433855000.0
25,tt0401729,John Carter,1.0,2012-03-09,275000000.0,73058679.0,282778100.0,7778100.0
31,tt1014759,Alice in Wonderland,1.0,2010-03-05,200000000.0,334191110.0,1025491000.0,825491100.0
32,tt1014759,Alice in Wonderland,1.0,1951-07-28,3000000.0,0.0,0.0,-3000000.0
35,tt1194173,The Bourne Legacy,1.0,2012-08-10,125000000.0,113203870.0,280355900.0,155355900.0


#### Directors vs Financials
Now lets include the directors into our list, by merging diferent datasets

df_repository['imdb_title_crew_gz'] contains the information we need, and it already uses the imdb identifier 

In [20]:
df_repository['imdb_title_crew_gz'].head()

Unnamed: 0_level_0,directors,writers
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1
tt0285252,nm0899854,nm0899854
tt0438973,,"nm0175726,nm1802864"
tt0462036,nm1940585,nm1940585
tt0835418,nm0151540,"nm0310087,nm0841532"
tt0878654,"nm0089502,nm2291498,nm2292011",nm0284943


In [21]:
# to facilitate marging with 'tconst', let's transform the index into a column

df_repository['imdb_title_crew_gz'].reset_index()

Unnamed: 0,tconst,directors,writers
0,tt0285252,nm0899854,nm0899854
1,tt0438973,,"nm0175726,nm1802864"
2,tt0462036,nm1940585,nm1940585
3,tt0835418,nm0151540,"nm0310087,nm0841532"
4,tt0878654,"nm0089502,nm2291498,nm2292011",nm0284943
...,...,...,...
123224,tt8999974,nm10122357,nm10122357
123225,tt9001390,nm6711477,nm6711477
123226,tt9001494,"nm10123242,nm10123248",
123227,tt9004986,nm4993825,nm4993825


Even though we have here the directors codes, we want to select just the first one

In [23]:
# function that breaks a series' values into a list of values; selects the first one to return a new series with the values:

def split_series(series, split_characters):
    broken_series = series.str.split(split_characters)
    new_list = []
    
    for lst in broken_series:
        if type(lst) == list:
            new_list.append(lst[0])
        else:
            new_list.append('N/A')
            
    new_series_dict = {'new_column': new_list}
    return pd.DataFrame(new_series_dict)
    

In [None]:
# function that breaks a series' values into a list of values; selects the first one to return a new series with the values:

def split_series(series, split_characters):
    broken_series = series.str.split(split_characters)
    new_list = []
    
    for lst in broken_series:
        if type(lst) == list:
            new_list.append([lst[i] for i in list(range(len(lst)))
        else:
            new_list.append('N/A')
            
    new_series_dict = {'new_column': new_list}
    return pd.DataFrame(new_series_dict)

In [24]:
df_imdb_financials['director_code'] = split_series(df_repository['imdb_title_crew_gz'].directors, ',')
df_imdb_financials.head()

Unnamed: 0,title_id,movie,is_original_title,release_date,production_budget,domestic_gross,worldwide_gross,total_profit,director_code
10,tt0369610,Jurassic World,1.0,2015-06-12,215000000.0,652270625.0,1648855000.0,1433855000.0,nm9232888
25,tt0401729,John Carter,1.0,2012-03-09,275000000.0,73058679.0,282778100.0,7778100.0,nm7717515
31,tt1014759,Alice in Wonderland,1.0,2010-03-05,200000000.0,334191110.0,1025491000.0,825491100.0,nm1583111
32,tt1014759,Alice in Wonderland,1.0,1951-07-28,3000000.0,0.0,0.0,-3000000.0,nm10586624
35,tt1194173,The Bourne Legacy,1.0,2012-08-10,125000000.0,113203870.0,280355900.0,155355900.0,nm10384892


##### REDUNDANT NOW. DELETE AFTER MAKING SURE IT IS NOT NECESSARY


##### split the column 'directors' into list of multiple strings
df_repository['imdb_title_crew_gz'].directors = df_repository['imdb_title_crew_gz'].directors.str.split(',') 

##### transform the resulting column into a string, so we can treat it
code_list = df_repository['imdb_title_crew_gz'].directors.tolist()

##### for loop that selects the first director of the list 
dir_list = []
for lst in code_list:
    if type(lst) == list:
        dir_list.append(lst[0])
    else:
        dir_list.append('N/A') # if not a list, returns 'N/A'

dir_dict = {'director_code': dir_list} # creates dictionary to facilitate creation of a data series
df_imdb_financials['director_code'] = pd.DataFrame(dir_dict)['director_code'] #creates and populates a new column
df_imdb_financials.head()


#df_repository['imdb_title_crew_gz'][['dir1', 'dir2', 'dir3']] = pd.DataFrame(df_repository['imdb_title_crew_gz'].directors.tolist(), index= df_repository['imdb_title_crew_gz'].index)

#### Now we need to include the directors names

the database imdb_name_basics_gz will be useful. Is uses the identifing code we have to list people's names

In [25]:
df_repository['imdb_name_basics_gz'].head(3)

Unnamed: 0_level_0,primary_name,birth_year,death_year,primary_profession,known_for_titles
nconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
nm0061671,Mary Ellen Bauder,,,"miscellaneous,production_manager,producer","tt0837562,tt2398241,tt0844471,tt0118553"
nm0061865,Joseph Bauer,,,"composer,music_department,sound_department","tt0896534,tt6791238,tt0287072,tt1682940"
nm0062070,Bruce Baum,,,"miscellaneous,actor,writer","tt1470654,tt0363631,tt0104030,tt0102898"


In [26]:
df_repository['imdb_name_basics_gz'].reset_index(inplace=True) 

In [27]:
df_codes_names = df_repository['imdb_name_basics_gz'][['nconst', 'primary_name']]

df_imdb_financials_directors = pd.merge(df_imdb_financials, df_codes_names, how='left', 
                                        left_on='director_code', right_on='nconst')

In [28]:
df_imdb_financials_directors.drop(columns=['is_original_title', 'nconst'], inplace=True)

 #### Next step is to include genres

the information we need is in imdb_title_basics_gz

In [30]:
# joinning our recently created df_imdb_financials_directors with the genres from the imdb_title_basics_gz database
df_imdb_financials_directors_genre = pd.merge(df_imdb_financials_directors, 
                                              df_repository['imdb_title_basics_gz']['genres'].reset_index(), 
                                              left_on='title_id', right_on='tconst', how='left')

In [31]:
# additional cleaning
df_imdb_financials_directors_genre.drop(columns='tconst', inplace=True)
df_imdb_financials_directors_genre.head()

Unnamed: 0,title_id,movie,release_date,production_budget,domestic_gross,worldwide_gross,total_profit,director_code,primary_name,genres
0,tt0369610,Jurassic World,2015-06-12,215000000.0,652270625.0,1648855000.0,1433855000.0,nm9232888,Michael Anthony Giudicissi,"Action,Adventure,Sci-Fi"
1,tt0401729,John Carter,2012-03-09,275000000.0,73058679.0,282778100.0,7778100.0,nm7717515,Jordan Walker,"Action,Adventure,Sci-Fi"
2,tt1014759,Alice in Wonderland,2010-03-05,200000000.0,334191110.0,1025491000.0,825491100.0,nm1583111,Serena Davies,"Adventure,Family,Fantasy"
3,tt1014759,Alice in Wonderland,1951-07-28,3000000.0,0.0,0.0,-3000000.0,nm10586624,Andrea Hammontree,"Adventure,Family,Fantasy"
4,tt1194173,The Bourne Legacy,2012-08-10,125000000.0,113203870.0,280355900.0,155355900.0,nm10384892,Mal Williamson,"Action,Adventure,Thriller"


In [54]:
df_imdb_financials_directors_genre['profit_margin'] = df_imdb_financials_directors_genre['total_profit'] / df_imdb_financials_directors_genre['production_budget']
df_imdb_financials_directors_genre.head()

Unnamed: 0,title_id,title,release_date,production_budget,domestic_gross,worldwide_gross,total_profit,director_code,director_name,genres,profit_margin
0,tt0369610,Jurassic World,2015-06-12,215000000.0,652270625.0,1648855000.0,1433855000.0,nm9232888,Michael Anthony Giudicissi,Action,6.669092
1,tt0401729,John Carter,2012-03-09,275000000.0,73058679.0,282778100.0,7778100.0,nm7717515,Jordan Walker,Action,0.028284
2,tt1014759,Alice in Wonderland,2010-03-05,200000000.0,334191110.0,1025491000.0,825491100.0,nm1583111,Serena Davies,Adventure,4.127456
3,tt1014759,Alice in Wonderland,1951-07-28,3000000.0,0.0,0.0,-3000000.0,nm10586624,Andrea Hammontree,Adventure,-1.0
4,tt1194173,The Bourne Legacy,2012-08-10,125000000.0,113203870.0,280355900.0,155355900.0,nm10384892,Mal Williamson,Action,1.242847


#### We can now clean the Genres column by selecting the first genre listed

We will use the custom function split_series for that and then update the values into our growing dataframe

In [32]:
df_imdb_financials_directors_genre['genres'] = split_series(df_imdb_financials_directors_genre.genres, ',')

In [33]:
df_imdb_financials_directors_genre.head()

Unnamed: 0,title_id,movie,release_date,production_budget,domestic_gross,worldwide_gross,total_profit,director_code,primary_name,genres
0,tt0369610,Jurassic World,2015-06-12,215000000.0,652270625.0,1648855000.0,1433855000.0,nm9232888,Michael Anthony Giudicissi,Action
1,tt0401729,John Carter,2012-03-09,275000000.0,73058679.0,282778100.0,7778100.0,nm7717515,Jordan Walker,Action
2,tt1014759,Alice in Wonderland,2010-03-05,200000000.0,334191110.0,1025491000.0,825491100.0,nm1583111,Serena Davies,Adventure
3,tt1014759,Alice in Wonderland,1951-07-28,3000000.0,0.0,0.0,-3000000.0,nm10586624,Andrea Hammontree,Adventure
4,tt1194173,The Bourne Legacy,2012-08-10,125000000.0,113203870.0,280355900.0,155355900.0,nm10384892,Mal Williamson,Action


In [34]:
# last bit of cleaning
df_imdb_financials_directors_genre.rename(columns={'movie': 'title', 'primary_name': 'director_name'}, inplace=True)

#### Let's now vizualize the data we treated and see what conclusions we can draw

 

In [71]:
df_imdb_financials_directors_genre.groupby('genres').median()

Unnamed: 0_level_0,production_budget,domestic_gross,worldwide_gross,total_profit,profit_margin
genres,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Action,45000000.0,37286531.5,88881594.5,39770337.5,0.936479
Adventure,50000000.0,55617125.0,118410673.5,52498492.0,1.412102
Animation,39000000.0,35287788.0,81150788.0,42150788.0,1.080789
Biography,18550000.0,15017106.5,25724122.0,9640023.5,0.686798
Comedy,13400000.0,18007317.0,26853810.0,12838945.0,0.941235
Crime,16000000.0,10169202.0,19054534.0,545420.0,0.168269
Documentary,5000000.0,3484331.0,6000000.0,33588.0,0.009125
Drama,10000000.0,5580479.0,14189810.0,1927779.0,0.38021
Family,3500000.0,0.0,0.0,-300000.0,-1.0
Fantasy,25000000.0,24046682.0,39126427.0,14126427.0,0.565057
