In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Merge datasets
df1 = pd.read_csv("data/tngross.csv")
df2 = pd.read_csv("data/tnproduction.csv")

df = df1.merge(df2, on="Unnamed: 0")

In [3]:
df

Unnamed: 0.1,Unnamed: 0,release_date,movie,production_budget,domestic_gross,worldwide_gross,runtime_minutes,genres,production_company,production_country
0,0,"Apr 23, 2019",Avengers: Endgame,"$400,000,000","$858,373,000","$2,797,800,564",181 minutes,Action,Marvel Studios,United States
1,1,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$379,000,000","$241,071,802","$1,045,713,802",136 minutes,Adventure,Walt Disney Pictures,United States
2,2,"Apr 22, 2015",Avengers: Age of Ultron,"$365,000,000","$459,005,868","$1,395,316,979",141 minutes,Action,Marvel Studios,United States
3,3,"Dec 16, 2015",Star Wars Ep. VII: The Force Awakens,"$306,000,000","$936,662,225","$2,064,615,817",136 minutes,Adventure,"Lucasfilm, Bad Robot",United States
4,4,"Apr 25, 2018",Avengers: Infinity War,"$300,000,000","$678,815,482","$2,044,540,523",156 minutes,Action,Marvel Studios,United States
...,...,...,...,...,...,...,...,...,...,...
6095,6095,"Mar 17, 2015",Closure,"$100,000",$0,$0,90 minutes,Drama,,United States
6096,6096,"Aug 29, 2015",Lunch Time Heroes,"$100,000",$0,$0,88 minutes,Adventure,Phebean Films,Nigeria
6097,6097,"Mar 25, 2015",Open Secret,"$100,000",$0,$0,,Documentary,,United States
6098,6098,"Nov 10, 2015",The Night Visitor,"$100,000",$0,$0,,Horror,,United States


In [4]:
# Clean up columns
df = df.drop("Unnamed: 0", axis=1)

df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce', format='%b %d, %Y')

def dollar_to_int(column):
    return column.str.replace('$', '').str.replace(',', '').map(int)

df[['production_budget', 'domestic_gross', 'worldwide_gross']] = df[['production_budget', 'domestic_gross', 'worldwide_gross']].apply(dollar_to_int)

df['runtime_minutes'] = df['runtime_minutes'].map(lambda x: x.split(' ')[0].strip(' '))

In [5]:
# Create new columns
df['total_profit'] = df['worldwide_gross'] - df['production_budget']

df['profit_to_budget'] = df['worldwide_gross'] / df['production_budget']

df['profit_to_budget_perc'] = df['profit_to_budget'] * 100

df['day'] = df['release_date'].dt.day_name()

df['month'] = pd.DatetimeIndex(df['release_date']).month

df['year'] = pd.DatetimeIndex(df['release_date']).year

In [6]:
df['production_company'].value_counts()

None                                                                                                                       2266
Warner Bros.                                                                                                                 29
Universal Pictures                                                                                                           21
Columbia Pictures                                                                                                            21
Walt Disney Pictures                                                                                                         19
                                                                                                                           ... 
20th Century Fox, Chernin Entertainment, TSG Entertainment                                                                    1
Columbia Pictures, Metro-Goldwyn-Mayer Pictures, LStar Capital, MRC, Original Film, Cannell Studios     

In [7]:
df['production_country'].value_counts()

United States                                         4142
None                                                   483
United Kingdom                                         198
United Kingdom, United States                          173
France                                                  92
                                                      ... 
China, Hong Kong, Japan, Taiwan, Province of China       1
France, Germany, Israel                                  1
Denmark, Poland                                          1
France, Netherlands, United Kingdom, United States       1
Sweden, United Kingdom, United States                    1
Name: production_country, Length: 336, dtype: int64

In [8]:
x = df.groupby(['production_country']).production_country.agg(len)

In [9]:
x.sort_values(ascending=False)

production_country
United States                                                         4142
None                                                                   483
United Kingdom                                                         198
United Kingdom, United States                                          173
France                                                                  92
                                                                      ... 
France, Ireland, United Kingdom                                          1
France, Iceland, Ireland, United States                                  1
France, Iceland                                                          1
France, Hungary, Italy, Spain                                            1
Afghanistan, Islamic Republic of Iran, Ireland, Japan, Netherlands       1
Name: production_country, Length: 336, dtype: int64

In [10]:
y = df.groupby(['production_company']).production_company.agg(len)

In [11]:
y.sort_values(ascending=False)

production_company
None                                                           2266
Warner Bros.                                                     29
Universal Pictures                                               21
Columbia Pictures                                                21
Walt Disney Pictures                                             19
                                                               ... 
Plan B Entertainment, Amazon Studios                              1
Plan B Entertainment                                              1
Pinoy Pictures                                                    1
Pine District, Scott Rudin Productions                            1
108 Media, Rebel Films, Bigel Entertainment, Contento Films       1
Name: production_company, Length: 3186, dtype: int64

### Clean up production_company first

In [14]:
company_df = df[df['production_company'] != 'None']

In [16]:
company_df['production_company'] = company_df['production_company'].map(lambda x: x.split(','))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  company_df['production_company'] = company_df['production_company'].map(lambda x: x.split(','))


In [28]:
company_df = company_df.explode('production_company')

In [37]:
company_df['production_company'].value_counts().sort_values(ascending=False)

Columbia Pictures           163
Universal Pictures          135
Warner Bros.                132
Paramount Pictures          107
Walt Disney Pictures         83
                           ... 
Over Under Media              1
 75 Year Plan                 1
Gil Friesen                   1
 SF Norge AS                  1
 Initial A Entertainment      1
Name: production_company, Length: 4542, dtype: int64

### Clean up production_country nexy

In [38]:
country_df = df[df['production_country'] != 'None']

In [41]:
country_df['production_country'] = country_df['production_country'].map(lambda x: x.split(','))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  country_df['production_country'] = country_df['production_country'].map(lambda x: x.split(','))


In [43]:
country_df = country_df.explode('production_country')

In [44]:
country_df['production_country'].value_counts().sort_values(ascending=False)

United States                                 4143
 United States                                 606
United Kingdom                                 371
France                                         218
 United Kingdom                                181
                                              ... 
 Czech Republic                                  1
 The Former Yugoslav Republic of Macedonia       1
Cambodia                                         1
 Egypt                                           1
German Democratic Republic                       1
Name: production_country, Length: 143, dtype: int64