        ## looking at 3 csv files: movie_gross.csv, movie_budgets.csv, movies.csv

### Looking at Movie Gross

In [2]:
import pandas as pd
import numpy as np

In [3]:
gross = pd.read_csv('zippedData/bom.movie_gross.csv.gz')
gross.head(3)

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
0,Toy Story 3,BV,415000000.0,652000000,2010
1,Alice in Wonderland (2010),BV,334200000.0,691300000,2010
2,Harry Potter and the Deathly Hallows Part 1,WB,296000000.0,664300000,2010


In [4]:
gross.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3387 entries, 0 to 3386
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           3387 non-null   object 
 1   studio          3382 non-null   object 
 2   domestic_gross  3359 non-null   float64
 3   foreign_gross   2037 non-null   object 
 4   year            3387 non-null   int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 132.4+ KB


In [5]:
#making foreign gross a series
gross_revenue = pd.Series(gross['foreign_gross'])

In [6]:
#changing to a datatype that can do calculations
pd.to_numeric(gross_revenue, errors='ignore')

0       652000000
1       691300000
2       664300000
3       535700000
4       513900000
          ...    
3382          NaN
3383          NaN
3384          NaN
3385          NaN
3386          NaN
Name: foreign_gross, Length: 3387, dtype: object

## Which Genres Make the Most Money?

In [7]:
#join movie_gross with title_basics, which contains a genre column

In [8]:
#look at dataframes
title_basics = pd.read_csv('zippedData/imdb.title.basics.csv.gz')
title_basics.head(3)

Unnamed: 0,tconst,primary_title,original_title,start_year,runtime_minutes,genres
0,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama"
1,tt0066787,One Day Before the Rainy Season,Ashad Ka Ek Din,2019,114.0,"Biography,Drama"
2,tt0069049,The Other Side of the Wind,The Other Side of the Wind,2018,122.0,Drama


In [9]:
gross.head(3)

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
0,Toy Story 3,BV,415000000.0,652000000,2010
1,Alice in Wonderland (2010),BV,334200000.0,691300000,2010
2,Harry Potter and the Deathly Hallows Part 1,WB,296000000.0,664300000,2010


In [10]:
#check datatypes
gross.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3387 entries, 0 to 3386
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           3387 non-null   object 
 1   studio          3382 non-null   object 
 2   domestic_gross  3359 non-null   float64
 3   foreign_gross   2037 non-null   object 
 4   year            3387 non-null   int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 132.4+ KB


In [11]:
#change foreign gross to a float
gross['foreign_gross'] = pd.to_numeric(gross['foreign_gross'], errors='coerce', downcast='float')

gross.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3387 entries, 0 to 3386
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           3387 non-null   object 
 1   studio          3382 non-null   object 
 2   domestic_gross  3359 non-null   float64
 3   foreign_gross   2032 non-null   float32
 4   year            3387 non-null   int64  
dtypes: float32(1), float64(1), int64(1), object(2)
memory usage: 119.2+ KB


In [30]:
#join title_basics and gross, drop non-relevant columns
finance = title_basics.join(gross, how = 'outer')
finance.drop(columns = ['runtime_minutes', 'studio', 'start_year', 'original_title', 'tconst', 'primary_title'], inplace=True)
total_gross = gross['domestic_gross'] + gross['foreign_gross']

#make a new column for total gross revenue, sum of domestic & foreign
finance['total gross'] = total_gross

#change foreign gross to a float
gross['foreign_gross'] = pd.to_numeric(gross['foreign_gross'], errors='coerce', downcast='float')


#total gross is very big so dividing by $1M
finance['total gross in millions'] = (finance['total gross']/1000000)

finance.head(3)

Unnamed: 0,genres,title,domestic_gross,foreign_gross,year,total gross,total gross in millions
0,"Action,Crime,Drama",Toy Story 3,415000000.0,652000000.0,2010.0,1067000000.0,1067.0
1,"Biography,Drama",Alice in Wonderland (2010),334200000.0,691299968.0,2010.0,1025499968.0,1025.5
2,Drama,Harry Potter and the Deathly Hallows Part 1,296000000.0,664300032.0,2010.0,960300032.0,960.3


In [13]:
#check data type of total gross in millions column
finance.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 146144 entries, 0 to 146143
Data columns (total 7 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   genres                   140736 non-null  object 
 1   title                    3387 non-null    object 
 2   domestic_gross           3359 non-null    float64
 3   foreign_gross            2032 non-null    float32
 4   year                     3387 non-null    float64
 5   total gross              2004 non-null    float64
 6   total gross in millions  2004 non-null    object 
dtypes: float32(1), float64(3), object(3)
memory usage: 8.4+ MB


In [37]:
#first change object type
finance['total gross in millions'] = pd.to_numeric(finance['total gross in millions'])

#remove scientific notation from total gross in millions column
pd.options.display.float_format = '{:.2f}'.format

finance.head(5)

Unnamed: 0,genres,title,domestic_gross,foreign_gross,year,total gross,total gross in millions
0,"Action,Crime,Drama",Toy Story 3,415000000.0,652000000.0,2010.0,1067000000.0,1067.0
1,"Biography,Drama",Alice in Wonderland (2010),334200000.0,691299968.0,2010.0,1025499968.0,1025.5
2,Drama,Harry Potter and the Deathly Hallows Part 1,296000000.0,664300032.0,2010.0,960300032.0,960.3
3,"Comedy,Drama",Inception,292600000.0,535700000.0,2010.0,828300000.0,828.3
4,"Comedy,Drama,Fantasy",Shrek Forever After,238700000.0,513900000.0,2010.0,752600000.0,752.6


In [15]:
finance.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 146144 entries, 0 to 146143
Data columns (total 7 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   genres                   140736 non-null  object 
 1   title                    3387 non-null    object 
 2   domestic_gross           3359 non-null    float64
 3   foreign_gross            2032 non-null    float32
 4   year                     3387 non-null    float64
 5   total gross              2004 non-null    float64
 6   total gross in millions  2004 non-null    float64
dtypes: float32(1), float64(4), object(2)
memory usage: 8.4+ MB


In [40]:
#split values in genres column 
finance['genres'] = finance['genres'].str.split(pat=',', expand=True)
finance.head(5)

Unnamed: 0,genres,title,domestic_gross,foreign_gross,year,total gross,total gross in millions
0,Action,Toy Story 3,415000000.0,652000000.0,2010.0,1067000000.0,1067.0
1,Biography,Alice in Wonderland (2010),334200000.0,691299968.0,2010.0,1025499968.0,1025.5
2,Drama,Harry Potter and the Deathly Hallows Part 1,296000000.0,664300032.0,2010.0,960300032.0,960.3
3,Comedy,Inception,292600000.0,535700000.0,2010.0,828300000.0,828.3
4,Comedy,Shrek Forever After,238700000.0,513900000.0,2010.0,752600000.0,752.6


In [36]:
#now split genres into separate columns
pd.DataFrame(finance['genres'].to_list(), columns = ['Action', 'Adventure','Animation',\
             'Biography', 'Crime', 'Comedy', 'Drama', 'Documentary', 'Fantasy', \
              'Horror','History', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'Western'])




pd.DataFrame(df2["teams"].to_list(), columns=['team1', 'team2'])

ValueError: Shape of passed values is (146144, 1), indices imply (146144, 16)

In [None]:
#make a bar chart of genres on the x-axis and total gross in millions on the y-axis.

### Looking at Movie Budgets

In [24]:
budgets = pd.read_csv('zippedData/tn.movie_budgets.csv.gz')
budgets

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,1,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279"
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875"
2,3,"Jun 7, 2019",Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350"
3,4,"May 1, 2015",Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963"
4,5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,"$317,000,000","$620,181,382","$1,316,721,747"
...,...,...,...,...,...,...
5777,78,"Dec 31, 2018",Red 11,"$7,000",$0,$0
5778,79,"Apr 2, 1999",Following,"$6,000","$48,482","$240,495"
5779,80,"Jul 13, 2005",Return to the Land of Wonders,"$5,000","$1,338","$1,338"
5780,81,"Sep 29, 2015",A Plague So Pleasant,"$1,400",$0,$0


In [1]:
budgets.info()

NameError: name 'budgets' is not defined

In [31]:
#change production_budget, domestic_gross, worldwide gross to numeric types
p_budget = pd.Series(gross['production_budget'])
pd.to_numeric(p_budget, errors='ignore')


KeyError: 'production_budget'