In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import numpy as np
%matplotlib inline

In [2]:
from glob import glob

In [3]:
#loads the data into a dictionary of dataframes
csv_files = glob("./zippedData/*.csv.gz")
csv_files_dict = {}
for filename in csv_files:
    filename_cleaned = os.path.basename(filename).replace(".csv", "").replace(".", "_") # cleaning the filenames
    filename_df = pd.read_csv(filename, index_col=0)
    csv_files_dict[filename_cleaned] = filename_df

#### Cleaning and Linking for the exploration of our questions

Since we will be working with Pandas we need to organize our DFs. Let's start by renaming all of them and adding them to a new dictionary. 

By creating a new dictionary of DFs, we can manipulate the data without messing up the originals in csv_files_dict. 

In [4]:
# by creating a dictionary, we can manipulate the data without messing up the originals in csv_files_dict
df_repository = {} 
for filename in csv_files:
    filename_cleaned = os.path.basename(filename).replace(".csv", "").replace(".", "_") # cleaning the filenames    
    exec("""df_repository[""" + """'""" + filename_cleaned + """']""" + """ = csv_files_dict['""" + filename_cleaned + """']""" )

##### Below are a few functions that we will use later on


In [5]:
def string_to_float(string):
    if string == None:
        new_string = 0
    else:
        if (type(string) != int) and (type(string) != float):
            new_string = string.replace('$', '')
            new_string = new_string.replace(',', '')
        else:
            new_string = string
    return float(new_string)


def series_string_to_float(dataseries):
    new_dataseries = dataseries.apply(string_to_float)
    return new_dataseries


In [6]:
def clean_duplicates(dataframe): #returns a dataframe without duplicates
    cleaned_filter = dataframe.duplicated()
    index_to_drop = dataframe.loc[cleaned_filter].index
    return dataframe.drop(index=index_to_drop)

In [7]:
# returns a Data Frame with 0 or 1, depending on the professions listed for each row (actor/actress)

def is_actor(series):
    
    actor_list = []
    
    for lst in series:        
        if ('actor' in lst) or ('actress' in lst):
            actor_list.append(1)
        else:
            actor_list.append(0)
    
    actor_dict = {'is_actor': actor_list}
    return pd.DataFrame(actor_dict)
            
            

In [8]:

def is_prof(series, profession1, profession2=None):
    
    prof_list = []
    
    for lst in series:        
        if (profession1 in lst):
            prof_list.append(1)
        else:
            if (profession2) and (profession2 in lst):
                prof_list.append(1)
            else:
                prof_list.append(0)
    
    prof_dict = {('is_' + str(profession1)): prof_list}
    return pd.DataFrame(prof_dict)
            

In [9]:
def split_series(series, split_characters):
    broken_series = series.str.split(split_characters)
    new_list1 = []
    new_list2 = []
    new_list3 = []
    
    for lst in broken_series:
        if type(lst) == list:
            new_list1.append(lst[0])
            
            if len(lst) > 1:
                new_list2.append(lst[1])
                
                if len(lst) > 2:
                    new_list3.append(lst[2])
                else:
                    new_list3.append('N/A')
            
            
            else: 
                new_list2.append('N/A')
                new_list3.append('N/A')
            
        else:
            new_list1.append('N/A')
            new_list2.append('N/A')
            new_list3.append('N/A')
            
    new_series_dict = {'col_1': new_list1, 'col_2': new_list2, 'col_3': new_list3 }
    return pd.DataFrame(new_series_dict)

#### First step is to clean the data. 

An easy way to start is to remove all duplicates


In [10]:
for db in df_repository.keys(): #loop to parse dataframes through the clean_duplicates() function we wrote above
    df_repository[db] = clean_duplicates(df_repository[db])

In [11]:
for db in df_repository.keys():
    print(db, 'contains ', df_repository[db].duplicated().sum(), ' duplicates') #checking if it worked

bom_movie_gross_gz contains  0  duplicates
imdb_name_basics_gz contains  0  duplicates
imdb_title_akas_gz contains  0  duplicates
imdb_title_basics_gz contains  0  duplicates
imdb_title_crew_gz contains  0  duplicates
imdb_title_principals_gz contains  0  duplicates
imdb_title_ratings_gz contains  0  duplicates
tmdb_movies_gz contains  0  duplicates
tn_movie_budgets_gz contains  0  duplicates


#### Now we move on to the tables we are interested in

We want to explore the relationships between  genre, actors, directors and financial results.

 

#### The 'tn_movie_budgets' is a good place to start

It shows the financial results for a given movie


     

In [12]:
df_repository['tn_movie_budgets_gz'].head()

Unnamed: 0_level_0,release_date,movie,production_budget,domestic_gross,worldwide_gross
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279"
2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875"
3,"Jun 7, 2019",Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350"
4,"May 1, 2015",Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963"
5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,"$317,000,000","$620,181,382","$1,316,721,747"


In [13]:
#cleaning data 

# from string to float (tn_movie_budgets_gz) for relevant columns
df_repository['tn_movie_budgets_gz'].production_budget = series_string_to_float(df_repository['tn_movie_budgets_gz'].production_budget)
df_repository['tn_movie_budgets_gz'].domestic_gross = series_string_to_float(df_repository['tn_movie_budgets_gz'].domestic_gross)
df_repository['tn_movie_budgets_gz'].worldwide_gross = series_string_to_float(df_repository['tn_movie_budgets_gz'].worldwide_gross)

# string to datetime format
df_repository['tn_movie_budgets_gz'].release_date = pd.to_datetime(df_repository['tn_movie_budgets_gz'].release_date)


#### We cleaned the tn_movie_budgets_gz database

The next step is to connect this data to the other parameters we want to investigate (ex: actors of a movie)

##### imdb_title_akas_gz might be helpful, 
It contains several title versions and links them to a specific identifier ('title_id'), which will be useful when merging data from imdb

In [14]:
df_repository['imdb_title_akas_gz'].rename(columns={'title' : 'movie'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(**kwargs)


In [15]:
# merging the DFs

df_title_akas_tn_movie_budgets_gz = pd.merge(df_repository['imdb_title_akas_gz'].reset_index(), 
                              df_repository['tn_movie_budgets_gz'], on='movie', how='inner')

In [16]:
# selecting columns and assining it to a new variable

df_imdb_financials = df_title_akas_tn_movie_budgets_gz[['title_id','release_date', 'production_budget', 'domestic_gross', 'worldwide_gross']]

# cleaning duplicates
filt = df_imdb_financials.duplicated()
index_to_drop = df_imdb_financials.loc[filt].index
df_imdb_financials.drop(index=index_to_drop, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [17]:
df_imdb_financials.head()

Unnamed: 0,title_id,release_date,production_budget,domestic_gross,worldwide_gross
0,tt0369610,2015-06-12,215000000.0,652270625.0,1648855000.0
14,tt0401729,2012-03-09,275000000.0,73058679.0,282778100.0
27,tt1014759,2010-03-05,200000000.0,334191110.0,1025491000.0
28,tt1014759,1951-07-28,3000000.0,0.0,0.0
33,tt1926979,2010-03-05,200000000.0,334191110.0,1025491000.0


In [18]:
#  let's calculate the profits and add it as a feature

df_imdb_financials['total_profit'] = df_imdb_financials.worldwide_gross - df_imdb_financials.production_budget
df_imdb_financials.set_index('title_id', inplace=True)

#  and the profit margin

df_imdb_financials['profit_margin'] = (df_imdb_financials['total_profit']/df_imdb_financials.worldwide_gross)

filt = (df_imdb_financials.worldwide_gross == 0)
index_for_change = filt.index
df_imdb_financials['profit_margin'].loc[filt] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs

#### Here we have a dataframe with title_id (linkable with IMDB data) and financial results

It should be easy to explore imdb data with a financial perspective.



In [19]:
df_imdb_financials.head()

Unnamed: 0_level_0,release_date,production_budget,domestic_gross,worldwide_gross,total_profit,profit_margin
title_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
tt0369610,2015-06-12,215000000.0,652270625.0,1648855000.0,1433855000.0,0.869606
tt0401729,2012-03-09,275000000.0,73058679.0,282778100.0,7778100.0,0.027506
tt1014759,2010-03-05,200000000.0,334191110.0,1025491000.0,825491100.0,0.804971
tt1014759,1951-07-28,3000000.0,0.0,0.0,-3000000.0,0.0
tt1926979,2010-03-05,200000000.0,334191110.0,1025491000.0,825491100.0,0.804971


We can now move on to exploring imdb_name_basics_gz


In [20]:
df_repository['imdb_name_basics_gz'].head()

Unnamed: 0_level_0,primary_name,birth_year,death_year,primary_profession,known_for_titles
nconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
nm0061671,Mary Ellen Bauder,,,"miscellaneous,production_manager,producer","tt0837562,tt2398241,tt0844471,tt0118553"
nm0061865,Joseph Bauer,,,"composer,music_department,sound_department","tt0896534,tt6791238,tt0287072,tt1682940"
nm0062070,Bruce Baum,,,"miscellaneous,actor,writer","tt1470654,tt0363631,tt0104030,tt0102898"
nm0062195,Axel Baumann,,,"camera_department,cinematographer,art_department","tt0114371,tt2004304,tt1618448,tt1224387"
nm0062798,Pete Baxter,,,"production_designer,art_department,set_decorator","tt0452644,tt0452692,tt3458030,tt2178256"


In [21]:
# data cleaning first

# here we split the string values in primary_profession and known_for_titles into a list of strings. 

df_repository['imdb_name_basics_gz']['primary_profession'] = df_repository['imdb_name_basics_gz']['primary_profession'].str.split(',')
df_repository['imdb_name_basics_gz']['known_for_titles'] = df_repository['imdb_name_basics_gz']['known_for_titles'].str.split(',')
df_repository['imdb_name_basics_gz'].head()

Unnamed: 0_level_0,primary_name,birth_year,death_year,primary_profession,known_for_titles
nconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
nm0061671,Mary Ellen Bauder,,,"[miscellaneous, production_manager, producer]","[tt0837562, tt2398241, tt0844471, tt0118553]"
nm0061865,Joseph Bauer,,,"[composer, music_department, sound_department]","[tt0896534, tt6791238, tt0287072, tt1682940]"
nm0062070,Bruce Baum,,,"[miscellaneous, actor, writer]","[tt1470654, tt0363631, tt0104030, tt0102898]"
nm0062195,Axel Baumann,,,"[camera_department, cinematographer, art_depar...","[tt0114371, tt2004304, tt1618448, tt1224387]"
nm0062798,Pete Baxter,,,"[production_designer, art_department, set_deco...","[tt0452644, tt0452692, tt3458030, tt2178256]"


In [22]:
# more data cleaning


# deletes rows with NaN either on known_for_titles column or on primary_profession

filt1 = df_repository['imdb_name_basics_gz'].known_for_titles.isna()
filt2 = df_repository['imdb_name_basics_gz'].primary_profession.isna()


index_to_drop = set( list( df_repository['imdb_name_basics_gz'].loc[filt1].index ) 
                    + list( df_repository['imdb_name_basics_gz'].loc[filt2].index ))

df_repository['imdb_name_basics_gz'].drop(index=index_to_drop, inplace=True)



#### We need to find a better way to see the professions of each person

Instead of listing one's professions, we should seek to have columns that indicate whether a person is or is not a actor, for example. 



In [23]:
# creates dataframes that show if a person is:

    # an actor/actress
new_column_actor = is_prof(df_repository['imdb_name_basics_gz'].primary_profession, 'actor', profession2='actress')
    # a producer 
new_column_producer = is_prof(df_repository['imdb_name_basics_gz'].primary_profession, 'producer')
    # a director 
new_column_director = is_prof(df_repository['imdb_name_basics_gz'].primary_profession, 'director')


In [24]:
# Adding the returned DFs to imdb_name_basics_gz

df_repository['imdb_name_basics_gz'].reset_index(inplace=True) 
df_repository['imdb_name_basics_gz']['is_actor'] = new_column_actor.reset_index().is_actor
df_repository['imdb_name_basics_gz']['is_producer'] = new_column_producer.reset_index().is_producer
df_repository['imdb_name_basics_gz']['is_director'] = new_column_director.reset_index().is_director

df_repository['imdb_name_basics_gz'].head()

Unnamed: 0,nconst,primary_name,birth_year,death_year,primary_profession,known_for_titles,is_actor,is_producer,is_director
0,nm0061671,Mary Ellen Bauder,,,"[miscellaneous, production_manager, producer]","[tt0837562, tt2398241, tt0844471, tt0118553]",0,1,0
1,nm0061865,Joseph Bauer,,,"[composer, music_department, sound_department]","[tt0896534, tt6791238, tt0287072, tt1682940]",0,0,0
2,nm0062070,Bruce Baum,,,"[miscellaneous, actor, writer]","[tt1470654, tt0363631, tt0104030, tt0102898]",1,0,0
3,nm0062195,Axel Baumann,,,"[camera_department, cinematographer, art_depar...","[tt0114371, tt2004304, tt1618448, tt1224387]",0,0,0
4,nm0062798,Pete Baxter,,,"[production_designer, art_department, set_deco...","[tt0452644, tt0452692, tt3458030, tt2178256]",0,0,0


In [25]:
# cleaning away rows of people who are neither a director, an actor nor a producer 

filt = (df_repository['imdb_name_basics_gz'].is_actor + df_repository['imdb_name_basics_gz'].is_director + df_repository['imdb_name_basics_gz'].is_producer) == 0
index_to_drop = df_repository['imdb_name_basics_gz'].loc[filt].index
df_repository['imdb_name_basics_gz'].drop(index=index_to_drop, inplace=True)

#### The dataframe  df_repository['imdb_name_basics_gz'] now shows us in a efficient way whether a person is a producer, a director or an actor

The next step is to see their relationship with the financial results of a movie. 

We can do it by calculating the average grossing of the movies that person is known for

In [26]:
avg_grossing_column = []
total_grossing_column = []

for row in df_repository['imdb_name_basics_gz'].known_for_titles:
    grossing_row = []
    for movie in row:
        if movie in df_imdb_financials.index:
            if isinstance(df_imdb_financials.loc[movie], pd.DataFrame):
                grossing = df_imdb_financials.loc[movie].iloc[-1,-1]
            else:
                grossing = df_imdb_financials.worldwide_gross.loc[movie] #it returns a data series when movie is duplicated in df_imdb_financials.index
            grossing_row.append(grossing)
    if len(grossing_row) > 0:
        avg_grossing = sum(grossing_row)/len(grossing_row)
        avg_grossing_column.append(avg_grossing)
        total_grossing_column.append(sum(grossing_row))
    else:
        avg_grossing_column.append(0)
        total_grossing_column.append(0)

dict_grossings = {'avg_grossing': avg_grossing_column, 'total_grossing': total_grossing_column}
df_professionals_grossings = pd.DataFrame(dict_grossings)

##### Now we need to add a new colum to df_repository['imdb_name_basics_gz'] with the data we found

Also, let's clean the data a little

In [27]:
#resets index in order to create the new columns that matche df_professionals_grossings
df_repository['imdb_name_basics_gz'].reset_index(inplace=True)
df_repository['imdb_name_basics_gz'].drop(columns='index', inplace=True)

#creates the new columns
df_repository['imdb_name_basics_gz']['avg_grossing'] = df_professionals_grossings['avg_grossing']
df_repository['imdb_name_basics_gz']['total_grossing'] = df_professionals_grossings['total_grossing']

In [28]:
# additional cleaning
df_repository['imdb_name_basics_gz'].drop(columns=['birth_year','primary_profession'], inplace=True)

In [29]:
df_repository['imdb_name_basics_gz'].head()

Unnamed: 0,nconst,primary_name,death_year,known_for_titles,is_actor,is_producer,is_director,avg_grossing,total_grossing
0,nm0061671,Mary Ellen Bauder,,"[tt0837562, tt2398241, tt0844471, tt0118553]",0,1,0,288042199.0,576084398.0
1,nm0062070,Bruce Baum,,"[tt1470654, tt0363631, tt0104030, tt0102898]",1,0,0,0.0,0.0
2,nm0062879,Ruel S. Bayani,,"[tt2590280, tt0352080, tt0216559, tt2057445]",0,0,1,0.0,0.0
3,nm0063198,Bayou,,"[tt6579724, tt0093116]",1,0,0,0.0,0.0
4,nm0063750,Lindsay Beamish,,"[tt0404826, tt0111756, tt0367027, tt1492842]",1,0,0,0.0,0.0


### We can now get started on the Visualization of our questions

#### Worlwide Grossing vs Directors

Considering the main movies a director is known for, we computed the total and average worlwide grossing for each person.  

#### Worlwide Grossing vs Producers

Considering the main movies a producer is known for, we computed the total and average worlwide grossing for each person.  

#### Worlwide Grossing vs Actors

Considering the main movies an actor is known for, we computed the total and average worlwide grossing for each person.  