# Phase 1 Project WIP

EDA of Box Office Mojo Data

In [1]:
#Import 
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 

In [2]:
df = pd.read_csv('zippedData/bom.movie_gross.csv')

In [3]:
df.head()

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
0,Toy Story 3,BV,415000000.0,652000000,2010
1,Alice in Wonderland (2010),BV,334200000.0,691300000,2010
2,Harry Potter and the Deathly Hallows Part 1,WB,296000000.0,664300000,2010
3,Inception,WB,292600000.0,535700000,2010
4,Shrek Forever After,P/DW,238700000.0,513900000,2010


In [4]:
df.index

RangeIndex(start=0, stop=3387, step=1)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3387 entries, 0 to 3386
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           3387 non-null   object 
 1   studio          3382 non-null   object 
 2   domestic_gross  3359 non-null   float64
 3   foreign_gross   2037 non-null   object 
 4   year            3387 non-null   int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 132.4+ KB


In [6]:
df.isna().sum()

title                0
studio               5
domestic_gross      28
foreign_gross     1350
year                 0
dtype: int64

In [7]:
#look at rows with null values
#i'm thinking it could be interesting to look at foreign $
#but that's the column missing the most info

df_na = df[df.isnull().any(axis=1)]
df_na.head()

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
210,Outside the Law (Hors-la-loi),,96900.0,3300000.0,2010
222,Flipped,WB,1800000.0,,2010
230,It's a Wonderful Afterlife,UTV,,1300000.0,2010
254,The Polar Express (IMAX re-issue 2010),WB,673000.0,,2010
267,Tiny Furniture,IFC,392000.0,,2010


In [8]:
#check and remove duplicates

df.drop_duplicates(inplace = True)
df.index

Int64Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
            ...
            3377, 3378, 3379, 3380, 3381, 3382, 3383, 3384, 3385, 3386],
           dtype='int64', length=3387)

In [12]:
#checkout number of studios in the list
df['studio'].nunique()

257

In [13]:
#get a list of unique studio names

studio_list = []
for studio in df['studio']:
    studio_list.append(studio)


In [14]:
unique_studios = list(set(studio_list))

In [15]:
df['domestic_gross'].values

array([4.150e+08, 3.342e+08, 2.960e+08, ..., 2.500e+03, 2.400e+03,
       1.700e+03])

In [16]:
#get rid of nulls
df.dropna(inplace = True)

In [18]:
#remove commas from foreign_gross
df['foreign_gross'] = df['foreign_gross'].str.replace(',', '')

In [44]:
#change type to float from object
df['foreign_gross'] = df['foreign_gross'].astype(float)

In [45]:
#check to see if dropna worked
df.isna().sum()

title             0
studio            0
domestic_gross    0
foreign_gross     0
year              0
foreign_gross2    0
dtype: int64

In [46]:
#check to see if astype worked
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2007 entries, 0 to 3353
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           2007 non-null   object 
 1   studio          2007 non-null   object 
 2   domestic_gross  2007 non-null   float64
 3   foreign_gross   2007 non-null   float64
 4   year            2007 non-null   int64  
 5   foreign_gross2  2007 non-null   float64
dtypes: float64(3), int64(1), object(2)
memory usage: 109.8+ KB


In [75]:
#look at mean and median gross 

print(df.foreign_gross.mean())
print(df.foreign_gross.median())

75790384.84130543
19400000.0


In [76]:
#look at mean and median gross - domestic

print(df.domestic_gross.mean())
print(df.domestic_gross.median())

47019840.20179372
16700000.0


In [72]:
#look at top domestically grossing

top_domestic = df.nlargest(20,'domestic_gross')
top_domestic

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year,foreign_gross2
1872,Star Wars: The Force Awakens,BV,936700000.0,1131.6,2015,1131.6
3080,Black Panther,BV,700100000.0,646900000.0,2018,646900000.0
3079,Avengers: Infinity War,BV,678800000.0,1369.5,2018,1369.5
1873,Jurassic World,Uni.,652300000.0,1019.4,2015,1019.4
727,Marvel's The Avengers,BV,623400000.0,895500000.0,2012,895500000.0
2758,Star Wars: The Last Jedi,BV,620200000.0,712400000.0,2017,712400000.0
3082,Incredibles 2,BV,608600000.0,634200000.0,2018,634200000.0
2323,Rogue One: A Star Wars Story,BV,532200000.0,523900000.0,2016,523900000.0
2759,Beauty and the Beast (2017),BV,504000000.0,759500000.0,2017,759500000.0
2324,Finding Dory,BV,486300000.0,542300000.0,2016,542300000.0


In [73]:
#look at top foreign grossing

top_foreign = df.nlargest(50,'foreign_gross')
top_foreign

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year,foreign_gross2
328,Harry Potter and the Deathly Hallows Part 2,WB,381000000.0,960500000.0,2011,960500000.0
1875,Avengers: Age of Ultron,BV,459000000.0,946400000.0,2015,946400000.0
727,Marvel's The Avengers,BV,623400000.0,895500000.0,2012,895500000.0
3081,Jurassic World: Fallen Kingdom,Uni.,417700000.0,891800000.0,2018,891800000.0
1127,Frozen,BV,400700000.0,875700000.0,2013,875700000.0
2764,Wolf Warrior 2,HC,2700000.0,867600000.0,2017,867600000.0
1477,Transformers: Age of Extinction,Par.,245400000.0,858600000.0,2014,858600000.0
1876,Minions,Uni.,336000000.0,823400000.0,2015,823400000.0
3083,Aquaman,WB,335100000.0,812700000.0,2018,812700000.0
1128,Iron Man 3,BV,409000000.0,805800000.0,2013,805800000.0


In [74]:
#look at films that are in both top lists, if any

top_films = []

for film in list(top_domestic['title'].values): 
    if film in list(top_foreign['title'].values):
        top_films.append(film)
top_films

['Black Panther',
 "Marvel's The Avengers",
 'Star Wars: The Last Jedi',
 'Incredibles 2',
 'Rogue One: A Star Wars Story',
 'Beauty and the Beast (2017)',
 'Finding Dory',
 'Avengers: Age of Ultron',
 'The Dark Knight Rises',
 'Jurassic World: Fallen Kingdom',
 'Toy Story 3',
 'Iron Man 3',
 'Captain America: Civil War',
 'Jumanji: Welcome to the Jungle']

# Thoughts so far (9/27)

Finding genres and movie/ tv show types that perfrom well domestically and internationally seems feasible. 

Releasing a tv show or movie both domestically and internationally could have the benefit of revenue from multiple sources. 

However, what are the additional costs of releasing internationally? Would it be an option to only release on highest-earning platforms? (ie, instead of releasing everywhere - including lower-earning platforms) If so, would that save enough to offset the cost of releasing internationally?

Which countries should be focused on? Probably English-speaking countries? Where is American made content most popular? 

In [48]:
#no longer need - was using when I couldn't get the dtype to change

fgross_list = []

for gross in df.foreign_gross.values:
    fgross_list.append(gross)

In [49]:
forgrossint = []
for gross in fgross_list: 
    float(gross)
    forgrossint.append(gross)

In [41]:
df['foreign_gross2'] = forgrossint
df['foreign_gross2'] = df['foreign_gross2'].astype(float)