## Final Project Submission

Please fill out:
* Student name: 
* Student pace: self paced / part time / full time
* Scheduled project review date/time: 
* Instructor name: 
* Blog post URL:


In [1]:
import pandas as pd
import sqlite3

In [2]:
# print big numbers in pd.DataFrame (instead of scientific notation)
pd.set_option('display.precision', 12)
pd.set_option('display.float_format', lambda x: '%.2f' % x)


## Find column names and info in various csv files

### What's in bom.movie_gross.csv.gz ?
- `foreign_gross` has ~1300 missing values, `domestic_gross` and `studio` have small number of missing values
- `domestic_gross` has dtype float64, but `foreign_gross` has dtype object
- Combination of `title` and `year` may serve as key

In [3]:
# Your code here - remember to use markdown cells for comments as well!
# Let's get column names and info first
bom_df = pd.read_csv('../zippedData/bom.movie_gross.csv.gz')
print(bom_df.columns)
print(bom_df.info())
print(bom_df.head())
print(bom_df.tail())

Index(['title', 'studio', 'domestic_gross', 'foreign_gross', 'year'], dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3387 entries, 0 to 3386
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           3387 non-null   object 
 1   studio          3382 non-null   object 
 2   domestic_gross  3359 non-null   float64
 3   foreign_gross   2037 non-null   object 
 4   year            3387 non-null   int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 132.4+ KB
None
                                         title studio  domestic_gross  \
0                                  Toy Story 3     BV    415000000.00   
1                   Alice in Wonderland (2010)     BV    334200000.00   
2  Harry Potter and the Deathly Hallows Part 1     WB    296000000.00   
3                                    Inception     WB    292600000.00   
4                          Shrek Forever After   P/DW    23

### Let's clean bom_df a bit
- `foreign_gross` is currently an 'object' (string), and contains some commas
- Convert `foreign_gross` to float
- Create a `total_gross` column as sum of these domestic and foreign gross

In [4]:
# Cleaning the comma from one of the foreign_gross data points
bom_df['foreign_gross'] = bom_df['foreign_gross'].str.replace(",", "")
# and turn it into a float
bom_df['foreign_gross'] = bom_df['foreign_gross'].astype(float)

### note:
- We can assume that NaN in `foreign_gross` should be 0's because there was no foreign market

In [5]:
# Let's make the total gross column and check it out
# First we need to replace Nan with 0, so we don't lose all of our smaller films
bom_df['foreign_gross'].fillna(0, inplace=True)
# If we didn't replace those, we would lose over 1000 values
bom_df['total_gross'] = bom_df['domestic_gross'] + bom_df['foreign_gross']
bom_df.head()

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year,total_gross
0,Toy Story 3,BV,415000000.0,652000000.0,2010,1067000000.0
1,Alice in Wonderland (2010),BV,334200000.0,691300000.0,2010,1025500000.0
2,Harry Potter and the Deathly Hallows Part 1,WB,296000000.0,664300000.0,2010,960300000.0
3,Inception,WB,292600000.0,535700000.0,2010,828300000.0
4,Shrek Forever After,P/DW,238700000.0,513900000.0,2010,752600000.0


In [6]:
# It's hard to make meaning out of all these 0's, let's make these numbers more readable
# We'll work with the total gross in millions, this will make the smaller observations less readable,
# but we're trying to compete in the big league. 
bom_df['total_gross(mil)'] = bom_df['total_gross'] / 1000000
bom_df.head()

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year,total_gross,total_gross(mil)
0,Toy Story 3,BV,415000000.0,652000000.0,2010,1067000000.0,1067.0
1,Alice in Wonderland (2010),BV,334200000.0,691300000.0,2010,1025500000.0,1025.5
2,Harry Potter and the Deathly Hallows Part 1,WB,296000000.0,664300000.0,2010,960300000.0,960.3
3,Inception,WB,292600000.0,535700000.0,2010,828300000.0,828.3
4,Shrek Forever After,P/DW,238700000.0,513900000.0,2010,752600000.0,752.6


### What's in tmdb.movies.csv.gz ?
- `id` column is integer that might possibly be a key?
- `genre_ids` contains list of multiple genre codes for each movie
- `original_language` may be useful category to group on
- Contains `popularity` estimates

In [7]:
tmdb_df = pd.read_csv('../zippedData/tmdb.movies.csv.gz')
print(tmdb_df.columns)
print(tmdb_df.info())
print(tmdb_df.head())
print(tmdb_df.tail())

Index(['Unnamed: 0', 'genre_ids', 'id', 'original_language', 'original_title',
       'popularity', 'release_date', 'title', 'vote_average', 'vote_count'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26517 entries, 0 to 26516
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         26517 non-null  int64  
 1   genre_ids          26517 non-null  object 
 2   id                 26517 non-null  int64  
 3   original_language  26517 non-null  object 
 4   original_title     26517 non-null  object 
 5   popularity         26517 non-null  float64
 6   release_date       26517 non-null  object 
 7   title              26517 non-null  object 
 8   vote_average       26517 non-null  float64
 9   vote_count         26517 non-null  int64  
dtypes: float64(2), int64(3), object(5)
memory usage: 2.0+ MB
None
   Unnamed: 0            genre_ids     id original_language  \
0      

### What's in rt.movie_info.tsv.gz ?
- Contains movie `rating` column: array(['R', 'NR', 'PG', 'PG-13', nan, 'G', 'NC17']
- `runtime` column contains number of minutes formatted as string including ' minutes'
- `director` names column
- `box_office` values formatted as strings with commas (with tons of missing data)

In [8]:
#rt.movie_info.tsv.gz
rt_movie_info_df = pd.read_csv('../zippedData/rt.movie_info.tsv.gz', delimiter='\t')
print(rt_movie_info_df.columns)
print(rt_movie_info_df.info())
print(rt_movie_info_df.head())
print(rt_movie_info_df.tail())

Index(['id', 'synopsis', 'rating', 'genre', 'director', 'writer',
       'theater_date', 'dvd_date', 'currency', 'box_office', 'runtime',
       'studio'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1560 entries, 0 to 1559
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            1560 non-null   int64 
 1   synopsis      1498 non-null   object
 2   rating        1557 non-null   object
 3   genre         1552 non-null   object
 4   director      1361 non-null   object
 5   writer        1111 non-null   object
 6   theater_date  1201 non-null   object
 7   dvd_date      1201 non-null   object
 8   currency      340 non-null    object
 9   box_office    340 non-null    object
 10  runtime       1530 non-null   object
 11  studio        494 non-null    object
dtypes: int64(1), object(11)
memory usage: 146.4+ KB
None
   id                                           synopsis rating  \
0 

In [9]:
rt_movie_info_df.rating.unique()

array(['R', 'NR', 'PG', 'PG-13', nan, 'G', 'NC17'], dtype=object)

### What's in tn.movie_budgets.csv.gz ?
- Contains `production_budget`, `domestic_gross`, `worldwide_gross` formatted as string with special characters (e.g., $425,000,000)

In [10]:
#tn.movie_budgets.csv.gz
tn_movie_budgets_df = pd.read_csv('../zippedData/tn.movie_budgets.csv.gz')
print(tn_movie_budgets_df.columns)
print(tn_movie_budgets_df.info())
print(tn_movie_budgets_df.head())
print(tn_movie_budgets_df.tail())

Index(['id', 'release_date', 'movie', 'production_budget', 'domestic_gross',
       'worldwide_gross'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5782 entries, 0 to 5781
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 5782 non-null   int64 
 1   release_date       5782 non-null   object
 2   movie              5782 non-null   object
 3   production_budget  5782 non-null   object
 4   domestic_gross     5782 non-null   object
 5   worldwide_gross    5782 non-null   object
dtypes: int64(1), object(5)
memory usage: 271.2+ KB
None
   id  release_date                                        movie  \
0   1  Dec 18, 2009                                       Avatar   
1   2  May 20, 2011  Pirates of the Caribbean: On Stranger Tides   
2   3   Jun 7, 2019                                 Dark Phoenix   
3   4   May 1, 2015                      Avengers: Age of Ultro

### What's in im.db ?

- `movie_id` column primary key
- `movie_basics` table
    - `start_year`, `runtime_minutes`, and `genres`
- `movie_ratings` table
    - `averagerating` and `numvotes`
- `directors` table
    - contains `person_id` for director(s) of each `movie_id`
- `persons` table (using `person_id` as key)
    - `primary_name`, `primary_profession`, and `death_year`

In [11]:
# Create the connect to im.db database
con = sqlite3.connect('../zippedData/im.db')

In [12]:
# Get info about tables and columns
im_schema_df = pd.read_sql("""

SELECT *
FROM sqlite_master

""", con)

In [13]:
im_schema_df

Unnamed: 0,type,name,tbl_name,rootpage,sql
0,table,movie_basics,movie_basics,2,"CREATE TABLE ""movie_basics"" (\n""movie_id"" TEXT..."
1,table,directors,directors,3,"CREATE TABLE ""directors"" (\n""movie_id"" TEXT,\n..."
2,table,known_for,known_for,4,"CREATE TABLE ""known_for"" (\n""person_id"" TEXT,\..."
3,table,movie_akas,movie_akas,5,"CREATE TABLE ""movie_akas"" (\n""movie_id"" TEXT,\..."
4,table,movie_ratings,movie_ratings,6,"CREATE TABLE ""movie_ratings"" (\n""movie_id"" TEX..."
5,table,persons,persons,7,"CREATE TABLE ""persons"" (\n""person_id"" TEXT,\n ..."
6,table,principals,principals,8,"CREATE TABLE ""principals"" (\n""movie_id"" TEXT,\..."
7,table,writers,writers,9,"CREATE TABLE ""writers"" (\n""movie_id"" TEXT,\n ..."


In [14]:
# movie_basics
im_schema_df.iloc[0]['sql']

'CREATE TABLE "movie_basics" (\n"movie_id" TEXT,\n  "primary_title" TEXT,\n  "original_title" TEXT,\n  "start_year" INTEGER,\n  "runtime_minutes" REAL,\n  "genres" TEXT\n)'

In [15]:
# movie_ratings
im_schema_df.iloc[4]['sql']

'CREATE TABLE "movie_ratings" (\n"movie_id" TEXT,\n  "averagerating" REAL,\n  "numvotes" INTEGER\n)'

In [16]:
# directors
im_schema_df.iloc[1]['sql']

'CREATE TABLE "directors" (\n"movie_id" TEXT,\n  "person_id" TEXT\n)'

In [17]:
# persons
im_schema_df.iloc[5]['sql']

'CREATE TABLE "persons" (\n"person_id" TEXT,\n  "primary_name" TEXT,\n  "birth_year" REAL,\n  "death_year" REAL,\n  "primary_profession" TEXT\n)'

### Let's try to get data from im.db `movie_basics` and `movie_ratings`
- `movie_basics` table
    - `genres` is a string that contains multiple genres (separated by commas)
    - `primary_title` has duplicate rows with same value
- `movie_ratings`
    - `averagerating` and `numvotes` are columns of imdb ratings

In [18]:
pd.read_sql("""

SELECT
    *
FROM
    movie_basics
LIMIT 5

""", con)


Unnamed: 0,movie_id,primary_title,original_title,start_year,runtime_minutes,genres
0,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama"
1,tt0066787,One Day Before the Rainy Season,Ashad Ka Ek Din,2019,114.0,"Biography,Drama"
2,tt0069049,The Other Side of the Wind,The Other Side of the Wind,2018,122.0,Drama
3,tt0069204,Sabse Bada Sukh,Sabse Bada Sukh,2018,,"Comedy,Drama"
4,tt0100275,The Wandering Soap Opera,La Telenovela Errante,2017,80.0,"Comedy,Drama,Fantasy"


In [19]:
pd.read_sql("""
SELECT
    COUNT(*) as movie_basics_rows
FROM
    movie_basics
""", con)

Unnamed: 0,movie_basics_rows
0,146144


### note


We don't have ratings for every movie, but we want other values for every movie. Probably want a left join to save all `movie_basics` info even if it doesn't have `movie_ratings`

In [20]:
pd.read_sql("""
SELECT
    *
FROM
    movie_ratings
LIMIT 5
""", con)

Unnamed: 0,movie_id,averagerating,numvotes
0,tt10356526,8.3,31
1,tt10384606,8.9,559
2,tt1042974,6.4,20
3,tt1043726,4.2,50352
4,tt1060240,6.5,21


In [21]:
pd.read_sql("""
SELECT
    COUNT(*) as movie_ratings_rows
FROM
    movie_ratings
""", con)

Unnamed: 0,movie_ratings_rows
0,73856


In [22]:
im_basics_and_ratings_df = pd.read_sql("""

SELECT
    b.movie_id, b.primary_title, b.original_title, b.start_year, b.runtime_minutes, b.genres,
    r.averagerating, r.numvotes
FROM
    movie_basics as b
    LEFT JOIN movie_ratings as r
        ON b.movie_id = r.movie_id

""", con)

In [23]:
im_basics_and_ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146144 entries, 0 to 146143
Data columns (total 8 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   movie_id         146144 non-null  object 
 1   primary_title    146144 non-null  object 
 2   original_title   146123 non-null  object 
 3   start_year       146144 non-null  int64  
 4   runtime_minutes  114405 non-null  float64
 5   genres           140736 non-null  object 
 6   averagerating    73856 non-null   float64
 7   numvotes         73856 non-null   float64
dtypes: float64(3), int64(1), object(4)
memory usage: 8.9+ MB


### Let's try to get data from im.db `movie_basics`  and `directors`
- I'm keeping outputs of this merge separate, because relationship of `movie_id` to `person_id` in `directors` is one-to-many (i.e., duplicate rows)
- There are multiple directors for some movies, and we will need to figure out how to deal with that when determining which directors are best
- Also, there are ~13000 duplicate rows with identical [movie_id,person_id] in directors, so we need to use CTE to get only unique rows!

In [24]:
pd.read_sql("""
SELECT
    *
FROM
    directors
LIMIT 5
""", con)

Unnamed: 0,movie_id,person_id
0,tt0285252,nm0899854
1,tt0462036,nm1940585
2,tt0835418,nm0151540
3,tt0835418,nm0151540
4,tt0878654,nm0089502


In [25]:
pd.read_sql("""
SELECT
    COUNT(*)
FROM
    directors
""", con)

Unnamed: 0,COUNT(*)
0,291174


In [26]:
pd.read_sql("""
WITH distinct_rows AS (
    SELECT
        DISTINCT movie_id, person_id
    FROM
        directors
)
SELECT
    COUNT(*)
FROM
    distinct_rows

""", con)

Unnamed: 0,COUNT(*)
0,163535


In [27]:
imdb_directors = pd.read_sql("""
WITH distinct_rows AS (
    SELECT
        DISTINCT movie_id, person_id
    FROM
        directors
)
SELECT
    b.movie_id, b.primary_title, b.original_title, b.start_year, b.runtime_minutes, b.genres,
    d.person_id
FROM
    movie_basics as b
    LEFT JOIN distinct_rows as d
        ON b.movie_id = d.movie_id

""", con)

### note:
The number of `movie_id` rows in this df is larger than the original number of rows in the `movie_basics` table, because the `movie_id` row is duplicated for additional directors in the movie.

In [28]:
imdb_directors.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169262 entries, 0 to 169261
Data columns (total 7 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   movie_id         169262 non-null  object 
 1   primary_title    169262 non-null  object 
 2   original_title   169240 non-null  object 
 3   start_year       169262 non-null  int64  
 4   runtime_minutes  133324 non-null  float64
 5   genres           163287 non-null  object 
 6   person_id        163535 non-null  object 
dtypes: float64(1), int64(1), object(5)
memory usage: 9.0+ MB


### Let's try to get names of the directors from im.db `movie_basics` , `directors`, and `persons`
- Nest CTEs to:
    1. get distinct [movie_id,person_id] pairs
    2. join info from persons table with [movie_id,person_id] pairs 
        - Use INNER JOIN to include only people with both person_id and movie_id
    3. join all the good persons info with the movie_basics info
        - Use LEFT JOIN to keep movie_basics even when persons info is missing
- Included `death_year` in case we want to avoid hiring dead people

In [29]:
pd.read_sql("""

SELECT
    *
FROM
    persons
LIMIT 5

""", con)

Unnamed: 0,person_id,primary_name,birth_year,death_year,primary_profession
0,nm0061671,Mary Ellen Bauder,,,"miscellaneous,production_manager,producer"
1,nm0061865,Joseph Bauer,,,"composer,music_department,sound_department"
2,nm0062070,Bruce Baum,,,"miscellaneous,actor,writer"
3,nm0062195,Axel Baumann,,,"camera_department,cinematographer,art_department"
4,nm0062798,Pete Baxter,,,"production_designer,art_department,set_decorator"


In [30]:
pd.read_sql("""

WITH distinct_rows AS (
    SELECT
        DISTINCT movie_id, person_id
    FROM
        directors
)
SELECT
    p.person_id, p.primary_name, p.birth_year, p.death_year, p.primary_profession,
    distinct_rows.movie_id
FROM
    persons as p
    INNER JOIN distinct_rows
        ON p.person_id = distinct_rows.person_id


""", con)



Unnamed: 0,person_id,primary_name,birth_year,death_year,primary_profession,movie_id
0,nm0062879,Ruel S. Bayani,,,"director,production_manager,miscellaneous",tt1592569
1,nm0062879,Ruel S. Bayani,,,"director,production_manager,miscellaneous",tt2057445
2,nm0062879,Ruel S. Bayani,,,"director,production_manager,miscellaneous",tt2590280
3,nm0062879,Ruel S. Bayani,,,"director,production_manager,miscellaneous",tt8421806
4,nm0064023,Bryan Beasley,,,"director,producer,writer",tt3501180
...,...,...,...,...,...,...
163528,nm9971456,Zheng Wei,,,director,tt8697720
163529,nm9980896,Rama Narayanan,,,"director,writer",tt8715016
163530,nm9980896,Rama Narayanan,,,"director,writer",tt8919136
163531,nm9981679,Samir Eshra,,,"director,writer,cinematographer",tt8717234


In [31]:
im_basics_and_directors_names_df = pd.read_sql("""
WITH director_names AS (
    WITH distinct_rows AS (
        SELECT
            DISTINCT movie_id, person_id
        FROM
            directors
    )
    SELECT
        p.person_id, p.primary_name, p.birth_year, p.death_year, p.primary_profession,
        distinct_rows.movie_id
    FROM
        persons as p
        INNER JOIN distinct_rows
            ON p.person_id = distinct_rows.person_id
)
SELECT 
    b.movie_id, b.primary_title, b.original_title, b.start_year, b.runtime_minutes, b.genres,
    d.person_id, d.primary_name, d.birth_year, d.death_year, d.primary_profession
FROM
    movie_basics as b
    LEFT JOIN director_names as d
        ON b.movie_id = d.movie_id


""", con)

In [32]:
im_basics_and_directors_names_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169261 entries, 0 to 169260
Data columns (total 11 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   movie_id            169261 non-null  object 
 1   primary_title       169261 non-null  object 
 2   original_title      169239 non-null  object 
 3   start_year          169261 non-null  int64  
 4   runtime_minutes     133324 non-null  float64
 5   genres              163287 non-null  object 
 6   person_id           163533 non-null  object 
 7   primary_name        163533 non-null  object 
 8   birth_year          34268 non-null   float64
 9   death_year          979 non-null     float64
 10  primary_profession  162895 non-null  object 
dtypes: float64(3), int64(1), object(7)
memory usage: 14.2+ MB


In [33]:
# Checking whether there are duplicate primary_title rows (using DISTINCT)
# yes, there are 10073 with duplicate titles
pd.read_sql("""

SELECT
    DISTINCT primary_title
FROM
    movie_basics
--LIMIT 5

""", con)

Unnamed: 0,primary_title
0,Sunghursh
1,One Day Before the Rainy Season
2,The Other Side of the Wind
3,Sabse Bada Sukh
4,The Wandering Soap Opera
...,...
136066,Kuambil Lagi Hatiku
136067,Rodolpho Teóphilo - O Legado de um Pioneiro
136068,Dankyavar Danka
136069,6 Gunn


In [34]:
# Does adding `start_year` help reduce duplicate rows?
# yes, but doesn't alleviate the problem altogether
pd.read_sql("""

SELECT
    DISTINCT primary_title,
    start_year
FROM
    movie_basics
--LIMIT 5

""", con)

Unnamed: 0,primary_title,start_year
0,Sunghursh,2013
1,One Day Before the Rainy Season,2019
2,The Other Side of the Wind,2018
3,Sabse Bada Sukh,2018
4,The Wandering Soap Opera,2017
...,...,...
144067,Kuambil Lagi Hatiku,2019
144068,Rodolpho Teóphilo - O Legado de um Pioneiro,2015
144069,Dankyavar Danka,2013
144070,6 Gunn,2017


In [35]:
pd.read_sql("""

SELECT
    *
FROM
    principals

""", con)

Unnamed: 0,movie_id,ordering,person_id,category,job,characters
0,tt0111414,1,nm0246005,actor,,"[""The Man""]"
1,tt0111414,2,nm0398271,director,,
2,tt0111414,3,nm3739909,producer,producer,
3,tt0323808,10,nm0059247,editor,,
4,tt0323808,1,nm3579312,actress,,"[""Beth Boothby""]"
...,...,...,...,...,...,...
1028181,tt9692684,1,nm0186469,actor,,"[""Ebenezer Scrooge""]"
1028182,tt9692684,2,nm4929530,self,,"[""Herself"",""Regan""]"
1028183,tt9692684,3,nm10441594,director,,
1028184,tt9692684,4,nm6009913,writer,writer,


In [36]:
## Need to parse strings from im.db into python list types
# Specifically `genre` and `primary_profession`
# dataframe[col].str.split(",")

In [37]:

# Here is where I merge the two together (by default, merge how='inner')
directors_gross = tn_movie_budgets_df.merge(im_basics_and_directors_names_df, left_on='movie', right_on='primary_title')

for money_column in ['production_budget','domestic_gross','worldwide_gross']:
    # Here I clean values and turn them into float
    directors_gross[money_column] = directors_gross[money_column].str.replace('$', '').str.replace(',', '').astype(float)

# Here I filter for only the highest overall grossing values
high_grossing = directors_gross[directors_gross['worldwide_gross'] >= 5000000]

high_grossing

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross,movie_id,primary_title,original_title,start_year,runtime_minutes,genres,person_id,primary_name,birth_year,death_year,primary_profession
0,1,"Dec 18, 2009",Avatar,425000000.00,760507625.00,2776345279.00,tt1775309,Avatar,Abatâ,2011,93.00,Horror,nm3786927,Atsushi Wada,,,director
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,410600000.00,241063875.00,1045663875.00,tt1298650,Pirates of the Caribbean: On Stranger Tides,Pirates of the Caribbean: On Stranger Tides,2011,136.00,"Action,Adventure,Fantasy",nm0551128,Rob Marshall,1960.00,,"director,miscellaneous,producer"
2,3,"Jun 7, 2019",Dark Phoenix,350000000.00,42762350.00,149762350.00,tt6565702,Dark Phoenix,Dark Phoenix,2019,113.00,"Action,Adventure,Sci-Fi",nm1334526,Simon Kinberg,1973.00,,"producer,writer,director"
3,4,"May 1, 2015",Avengers: Age of Ultron,330600000.00,459005868.00,1403013963.00,tt2395427,Avengers: Age of Ultron,Avengers: Age of Ultron,2015,141.00,"Action,Adventure,Sci-Fi",nm0923736,Joss Whedon,1964.00,,"writer,producer,director"
4,7,"Apr 27, 2018",Avengers: Infinity War,300000000.00,678815482.00,2048134200.00,tt4154756,Avengers: Infinity War,Avengers: Infinity War,2018,149.00,"Action,Adventure,Sci-Fi",nm0751577,Anthony Russo,1970.00,,"producer,director,writer"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4153,16,"Oct 18, 1996",Swingers,200000.00,4505922.00,6618578.00,tt10297320,Swingers,Swingers,2019,,Comedy,,,,,
4154,16,"Oct 18, 1996",Swingers,200000.00,4505922.00,6618578.00,tt6016012,Swingers,Swingers,2016,85.00,Comedy,nm1044237,Andrejs Ekis,,,"producer,director,writer"
4155,16,"Oct 18, 1996",Swingers,200000.00,4505922.00,6618578.00,tt8630424,Swingers,Swingers,2018,86.00,Comedy,nm1533235,Pamela Tola,1981.00,,"actress,writer,director"
4191,80,"Jul 10, 2015",The Gallows,100000.00,22764410.00,41656474.00,tt2309260,The Gallows,The Gallows,2015,81.00,"Horror,Mystery,Thriller",nm3951039,Chris Lofing,,,"producer,director,writer"


In [38]:
# Here, I wanted to see the highest grossing directors
# Groupby directors and then take the mean of each worldwide_gross number
Director_options = high_grossing.groupby('primary_name')['worldwide_gross'].mean()
# Then I sorted the values from highest to lowest
Director_options.sort_values(ascending=False).head(20)
# The top 4 values are bad data, not sure what happened with Mr. Wada,
# But the next 3 result from merging the wrong 2 "Titanic" movies together

primary_name
Atsushi Wada         2776345279.00
Ravi Punj            2208208395.00
Kevin Lincoln        2208208395.00
Pete Meads           2208208395.00
Colin Trevorrow      1648854864.00
Joe Russo            1300868500.67
Anthony Russo        1300868500.67
Chris Buck           1272469910.00
Chi-kin Kwok         1272469910.00
Jennifer Lee         1272469910.00
Adam Green           1272469910.00
Kyle Balda           1097531961.50
Eric Guillon         1034727750.00
James Fotopoulos     1025491110.00
Fernando J. Scarpa   1025491110.00
Giuseppe Malpasso    1025491110.00
Angus MacLane        1021215193.00
Jared Bush           1019429616.00
Sam Mendes            995073952.00
Peter Jackson         974316014.67
Name: worldwide_gross, dtype: float64

In [39]:
# What movies are these folks responsible for?
high_grossing.loc[high_grossing['primary_name'] == 'Atsushi Wada']['primary_title'] # Avatar
high_grossing.loc[high_grossing['primary_name'] == 'Ravi Punj']['primary_title'] # Titanic
high_grossing.loc[high_grossing['primary_name'] == 'Colin Trevorrow']['primary_title'] # Jurassic World

29    Jurassic World
Name: primary_title, dtype: object

### note
- The top-grossing directors in these lists are not necessarily the primary director or key director
- Also, the top-grossing directors often have only one very high-grossing movie

In [40]:
for one_name in Director_options.sort_values(ascending=False).head(20).index:
    movie = list(high_grossing.loc[high_grossing['primary_name'] == one_name]['primary_title'])
    print(one_name, movie)

Atsushi Wada ['Avatar']
Ravi Punj ['Titanic']
Kevin Lincoln ['Titanic']
Pete Meads ['Titanic']
Colin Trevorrow ['Jurassic World']
Joe Russo ['Avengers: Infinity War', 'Captain America: Civil War', 'Captain America: The Winter Soldier']
Anthony Russo ['Avengers: Infinity War', 'Captain America: Civil War', 'Captain America: The Winter Soldier']
Chris Buck ['Frozen']
Chi-kin Kwok ['Frozen']
Jennifer Lee ['Frozen']
Adam Green ['Frozen']
Kyle Balda ['Despicable Me 3', 'Minions']
Eric Guillon ['Despicable Me 3']
James Fotopoulos ['Alice in Wonderland']
Fernando J. Scarpa ['Alice in Wonderland']
Giuseppe Malpasso ['Alice in Wonderland']
Angus MacLane ['Finding Dory']
Jared Bush ['Zootopia']
Sam Mendes ['Spectre', 'Skyfall']
Peter Jackson ['The Hobbit: An Unexpected Journey', 'The Hobbit: The Desolation of Smaug', 'The Hobbit: The Battle of the Five Armies']


In [41]:
# im_basics_and_ratings_df
im_basics_and_ratings_df['genres'].value_counts().head()

Documentary     32185
Drama           21486
Comedy           9177
Horror           4372
Comedy,Drama     3519
Name: genres, dtype: int64

In [42]:
im_basics_and_ratings_df[im_basics_and_ratings_df['genres'] == 'Documentary'].shape

(32185, 8)

In [43]:
im_basics_and_ratings_df['genres'].str.contains('Documentary', na=False)

0         False
1         False
2         False
3         False
4         False
          ...  
146139    False
146140     True
146141    False
146142    False
146143     True
Name: genres, Length: 146144, dtype: bool

In [44]:
im_basics_and_ratings_df[im_basics_and_ratings_df['genres'].str.contains('Documentary', na=False)].value_counts('genres')

genres
Documentary                      32185
Biography,Documentary             2115
Documentary,Drama                 1554
Documentary,Music                 1365
Documentary,History               1289
                                 ...  
Documentary,Romance,Sport            1
Documentary,Music,Western            1
Documentary,Music,Sci-Fi             1
Comedy,Documentary,Reality-TV        1
Action,Documentary,Western           1
Length: 206, dtype: int64

In [45]:
set_of_genres = set()
for movie_row in im_basics_and_ratings_df['genres'].dropna():
    set_of_genres.update(movie_row.split(','))

list_of_genres = list(set_of_genres)
list_of_genres

['Musical',
 'War',
 'Talk-Show',
 'Romance',
 'Thriller',
 'Music',
 'Adult',
 'Drama',
 'Family',
 'Crime',
 'Game-Show',
 'Action',
 'Fantasy',
 'History',
 'Animation',
 'Documentary',
 'Short',
 'Sport',
 'Reality-TV',
 'Biography',
 'Horror',
 'Western',
 'Sci-Fi',
 'Adventure',
 'Comedy',
 'News',
 'Mystery']

In [46]:
im_basics_and_ratings_df['genres'].shape

(146144,)

In [47]:
im_basics_and_ratings_df['genres'].dropna().shape

(140736,)

In [48]:
tn_movie_budgets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5782 entries, 0 to 5781
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 5782 non-null   int64 
 1   release_date       5782 non-null   object
 2   movie              5782 non-null   object
 3   production_budget  5782 non-null   object
 4   domestic_gross     5782 non-null   object
 5   worldwide_gross    5782 non-null   object
dtypes: int64(1), object(5)
memory usage: 271.2+ KB


In [49]:
# Let's make some helpful functions
def convert_money_string(money_series):
    return money_series.str.replace('$','').str.replace(',','').astype(float)

In [50]:

for col in ['production_budget', 'domestic_gross', 'worldwide_gross']:
    tn_movie_budgets_df[col] = convert_money_string(tn_movie_budgets_df[col])
#    tn_movie_budgets_df[col] = tn_movie_budgets_df[col].str.replace('$','').str.replace(',','').astype(int)
#    tn_movie_budgets_df[col] = tn_movie_budgets_df[col]
#    tn_movie_budgets_df[col] = tn_movie_budgets_df[col]

In [51]:
filt = (tn_movie_budgets_df['domestic_gross'] == tn_movie_budgets_df['worldwide_gross'])
tn_movie_budgets_df[filt]

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
194,95,"Dec 31, 2020",Moonfall,150000000.00,0.00,0.00
341,42,"Jun 14, 2019",Men in Black: International,110000000.00,3100000.00,3100000.00
479,80,"Dec 13, 2017",Bright,90000000.00,0.00,0.00
480,81,"Dec 31, 2019",Army of the Dead,90000000.00,0.00,0.00
505,6,"Nov 22, 2000",102 Dalmatians,85000000.00,66941559.00,66941559.00
...,...,...,...,...,...,...
5776,77,"Dec 31, 2004",The Mongol King,7000.00,900.00,900.00
5777,78,"Dec 31, 2018",Red 11,7000.00,0.00,0.00
5779,80,"Jul 13, 2005",Return to the Land of Wonders,5000.00,1338.00,1338.00
5780,81,"Sep 29, 2015",A Plague So Pleasant,1400.00,0.00,0.00


In [52]:
tn_movie_budgets_df['domestic_net'] = tn_movie_budgets_df['domestic_gross'] - tn_movie_budgets_df['production_budget']
tn_movie_budgets_df['worldwide_net'] = tn_movie_budgets_df['worldwide_gross'] - tn_movie_budgets_df['production_budget']

In [53]:
tn_movie_budgets_df.describe()

Unnamed: 0,id,production_budget,domestic_gross,worldwide_gross,domestic_net,worldwide_net
count,5782.0,5782.0,5782.0,5782.0,5782.0,5782.0
mean,50.37,31587757.1,41873326.87,91487460.91,10285569.77,59899703.81
std,28.82,41812076.83,68240597.36,174719968.78,49921366.46,146088881.08
min,1.0,1100.0,0.0,0.0,-307237650.0,-200237650.0
25%,25.0,5000000.0,1429534.5,4125414.75,-9132757.0,-2189070.75
50%,50.0,17000000.0,17225945.0,27984448.5,-348775.5,8550285.5
75%,75.0,40000000.0,52348661.5,97645836.5,17781444.0,60968501.75
max,100.0,425000000.0,936662225.0,2776345279.0,630662225.0,2351345279.0
