In [1]:
# Modeling
import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV

# Using PolynomialFeatures and make_pipeline for Polynomial Regression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

# Data wrangling
import pandas as pd
import numpy as np
import re

# Time
import time
import datetime
from datetime import datetime as dt
import dateutil.parser

# Plotting
# import matplotlib.pyplot as plt
import seaborn as sns
from seaborn import plt

# URL Grabbing
import requests

# Scraping / Searching
from bs4 import BeautifulSoup
import re

# Misc
import pickle
from pprint import pprint

%matplotlib inline
pd.set_option('display.float_format', lambda x: '%.2f' % x) # reduces the amount of decimals when printing the df's

In [2]:
    with open("movies.pkl", 'rb') as picklefile: 
        df = pickle.load(picklefile)

In [3]:
# just check on the data
df.head().T

Unnamed: 0,0,1,2,3,4
index,The Other Side of the Mountain Part II,Universal Soldier,Newtown,Mommie Dearest,G.B.F.
actors,,"[Jean-Claude Van Damme, Dolph Lundgren]",,,"[Natasha Lyonne, Megan Mullally]"
close_date,NaT,NaT,NaT,NaT,NaT
composers,,[Tim Simonec],,,
days_in_theater,,,14.00,,
directors,,[Roland Emmerich],,,
domestic_gross_adj,42753500.00,75661200.00,13900.00,59219100.00,
foreign_unadj,,,,,
genre,Romance,Sci-Fi Action,Documentary,Drama,Comedy
mpaa_rating,PG,R,Unrated,PG,R


In [33]:
# get all the crew members & related data
df_crew = (df[['index', 'actors', 'composers', 'directors', 'producers',
               'writers', 'domestic_gross_adj', 'release_date', 'production_budget_adj']])

In [34]:
df_crew.head().T

Unnamed: 0,0,1,2,3,4
index,The Other Side of the Mountain Part II,Universal Soldier,Newtown,Mommie Dearest,G.B.F.
actors,,"[Jean-Claude Van Damme, Dolph Lundgren]",,,"[Natasha Lyonne, Megan Mullally]"
composers,,[Tim Simonec],,,
directors,,[Roland Emmerich],,,
producers,,,,,
writers,,[Dean Devlin],,,
domestic_gross_adj,42753500.00,75661200.00,13900.00,59219100.00,
release_date,1978-01-01 00:00:00,1992-07-10 00:00:00,2016-10-07 00:00:00,1981-09-18 00:00:00,2014-01-17 00:00:00
production_budget_adj,,,,,


In [35]:
# checking if there are any multiple crew memebers in a column
def checkLargeCrew(crew):
    if isinstance(crew, list):
        if len(crew) > 1:
            return True
        else:
            return False
    else:
        return False

df[df['actors'].apply(checkLargeCrew)].T

Unnamed: 0,1,4,7,10,11,13,14,18,21,23,...,9935,9938,9939,9942,9945,9954,9955,9959,9960,9961
index,Universal Soldier,G.B.F.,The Jackal,Talladega Nights: The Ballad of Ricky Bobby,Me Before You,Cruising,ABCD (Any Body Can Dance) 2,Alfie,Hot Shots!,Chain Letter,...,She's So Lovely,Ride Along 2,"Big Mommas: Like Father, Like Son",Trust the Man,Boomerang,Shopping,The Talented Mr. Ripley,Logan,Fruitvale Station,Beloved
actors,"[Jean-Claude Van Damme, Dolph Lundgren]","[Natasha Lyonne, Megan Mullally]","[Richard Gere, Bruce Willis]","[Will Ferrell, John C. Reilly, Sacha Baron Coh...","[Emilia Clarke, Sam Claflin]","[Al Pacino, Paul Sorvino, Karen Allen]","[Varun Dhawan, Prabhu Dheva, Shraddha Kapoor]","[Jude Law, Marisa Tomei, Susan Sarandon, Nia L...","[Charlie Sheen, Cary Elwes]","[Nikki Reed, Keith David]",...,"[Sean Penn, John Travolta]","[Kevin Hart, Ice Cube, Benjamin Bratt, Olivia ...","[Martin Lawrence, Brandon T. Jackson, Portia D...","[Julianne Moore, David Duchovny, Billy Crudup,...","[Eddie Murphy, Halle Berry, Martin Lawrence]","[Sadie Frost, Jude Law, Sean Pertwee, Sean Bea...","[Matt Damon, Gwyneth Paltrow, Jude Law, Cate B...","[Hugh Jackman, Patrick Stewart, Richard E. Gra...","[Michael B. Jordan, Octavia Spencer]","[Danny Glover, Thandie Newton]"
close_date,NaT,NaT,NaT,2006-10-15 00:00:00,2016-08-18 00:00:00,NaT,2015-07-16 00:00:00,2005-01-13 00:00:00,NaT,2010-10-07 00:00:00,...,NaT,2016-06-16 00:00:00,2011-05-26 00:00:00,2006-10-08 00:00:00,NaT,NaT,NaT,NaT,2013-10-31 00:00:00,NaT
composers,[Tim Simonec],,[Carter Burwell],[Alex Wurman],,,,[John Powell],,,...,,,,[Clint Mansell],[Marcus Miller],,[Gabriel Yared],[Marco Beltrami],[Ludwig Goransson],[Rachel Portman]
days_in_theater,,,,77.00,77.00,,28.00,70.00,,,...,,154.00,98.00,56.00,,,,41.00,112.00,
directors,[Roland Emmerich],,[Michael Caton-Jones],[Adam McKay],,[William Friedkin],[Remo D'Souza],[Charles Shyer],[Jim Abrahams],,...,[Nick Cassavetes],[Tim Story],[John Whitesell],[Bart Freundlich],,[Paul W.S. Anderson],[Anthony Minghella],[James Mangold],[Ryan Coogler],[Jonathan Demme]
domestic_gross_adj,75661200.00,,103517800.00,195732200.00,55815700.00,63618400.00,935400.00,18664000.00,142730400.00,149900.00,...,13722100.00,91957900.00,41726300.00,2021200.00,146012900.00,6000.00,133151200.00,219343600.00,17757200.00,41695800.00
foreign_unadj,,,104400000.00,14752800.00,151700000.00,,,21750734.00,111628547.00,446574.00,...,,33383467.00,44770652.00,5822583.00,61000000.00,,47500000.00,377548381.00,1284491.00,
genre,Sci-Fi Action,Comedy,Action Thriller,Sports Comedy,Romance,Thriller,Musical,Comedy / Drama,Comedy,Horror Thriller,...,Romantic Comedy,Action Comedy,Action Comedy,Romantic Comedy,Romantic Comedy,,Thriller,Action / Adventure,Drama,Period Drama
mpaa_rating,R,R,R,PG-13,PG-13,R,Unrated,R,PG-13,R,...,R,PG-13,PG-13,R,R,R,R,R,R,R


In [37]:
crewcount = df_crew['composers'].dropna().apply(lambda r: len(r[0])).sum()
crewcount

34669

In [43]:
"""
This cell creates the crew data frame that can be joined with the core data frame.
"""

# preparing the dataframe
crew = (pd.DataFrame(columns=['name','role','movie','domestic_gross_adj','release_date','production_budget_adj']))

def makeRow(row):
    global crew
    
#     print(row)
    
    movie = row[0]
    domestic_gross_adj = row[6]
    release_date = row[7]
    prod_budget = row[8]

    for index, role in enumerate(row[1:6]):
        if index == 0:
            rrole = 'actor'
        elif index == 1:
            rrole = 'composer'
        elif index == 2:
            rrole = 'director'
        elif index == 3:
            rrole = 'producer'
        elif index == 4:
            rrole = 'writer'
        
        if isinstance(role, list):
            for name in role:
                frame = pd.DataFrame(data={'name' : [name],
                                           'role' : [rrole],
                                           'movie' : [movie],
                                           'domestic_gross_adj' : [domestic_gross_adj],
                                           'release_date' : [release_date],
                                           'production_budget_adj' : [prod_budget]},
                                     columns=['name','role','movie','domestic_gross_adj','release_date', 'production_budget_adj'])
            crew = pd.concat([crew, frame])

(df_crew
#  .iloc[:10,:] # used for testing
 .apply(makeRow, axis=1));

In [8]:
# save the data frame! :)
# crew.to_pickle('crew.pkl')

In [44]:
crew.groupby(['role']).count()

Unnamed: 0_level_0,name,movie,domestic_gross_adj,release_date,production_budget_adj
role,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
actor,4648,4648,4493,4614,2101
composer,2552,2552,2523,2549,1492
director,3491,3491,3367,3459,1778
producer,2926,2926,2850,2901,1692
writer,2375,2375,2309,2358,1398


In [9]:
crew.tail().T

Unnamed: 0,0,0.1,0.2,0.3,0.4
name,Thandie Newton,Rachel Portman,Jonathan Demme,Gary Goetzman,Richard LaGravenese
role,actor,composer,director,producer,writer
movie,Beloved,Beloved,Beloved,Beloved,Beloved
domestic_gross_adj,41695800.00,41695800.00,41695800.00,41695800.00,41695800.00
release_date,1998-10-16 00:00:00,1998-10-16 00:00:00,1998-10-16 00:00:00,1998-10-16 00:00:00,1998-10-16 00:00:00
production_budget_adj,80000000.00,80000000.00,80000000.00,80000000.00,80000000.00


In [10]:
crew.domestic_gross_adj.describe()

count         3905.00
mean     138538153.52
std      144175199.24
min          27600.00
25%       44994700.00
50%       90505200.00
75%      177586000.00
max     1234649200.00
Name: domestic_gross_adj, dtype: float64

In [11]:
crew.production_budget_adj.describe()

count        3905.00
mean     72688860.44
std      57530210.71
min       1000000.00
25%      28000000.00
50%      60000000.00
75%     100000000.00
max     300000000.00
Name: production_budget_adj, dtype: float64

In [45]:
# look for films that made N times the amount of the budget
topcrew = (crew[crew[['domestic_gross_adj', 'production_budget_adj']]
            .apply(axis=1, func=lambda r: True if r[0] > r[1]*4 else False)]
            .sort_values('domestic_gross_adj', ascending=False))

In [48]:
# taking a peek into the data
topcrew[topcrew['role'] == 'director'].sort_values('domestic_gross_adj', ascending=False);

Unnamed: 0,name,role,movie,domestic_gross_adj,release_date,production_budget_adj
0,George Lucas,director,Star Wars,1234649200.00,1977-05-25,11000000.00
0,James Cameron,director,Titanic,1110191900.00,1997-12-19,200000000.00
0,Steven Spielberg,director,Jaws,1107881800.00,1975-06-20,7000000.00
0,Steven Spielberg,director,E.T.: The Extra-Terrestrial,1075530900.00,1982-06-11,5000000.00
0,Steven Spielberg,director,Jurassic Park,745680500.00,1993-06-11,63000000.00
0,George Lucas,director,Star Wars: Episode I - The Phantom Menace,733743200.00,1999-05-19,115000000.00
0,Colin Trevorrow,director,Jurassic World,683775600.00,2015-06-12,150000000.00
0,Irvin Kershner,director,The Empire Strikes Back,682158400.00,1980-05-21,18000000.00
0,Robert Zemeckis,director,Forrest Gump,679419600.00,1994-07-06,55000000.00
0,Steven Spielberg,director,Raiders of the Lost Ark,668118400.00,1981-06-12,18000000.00


In [63]:
len(topcrew)

1572

In [73]:
# store the date that they were part of a big movie
made_man = topcrew.groupby(['name'])['release_date'].min().reset_index()
made_man.rename(columns={'release_date': 'made_man'}, inplace=True)

In [76]:
# merge the columns
made_crew = pd.merge(crew, made_man, how='left', on='name')

In [243]:
# just make sure all the values make sense, but notice that there are duplicates
made_crew[made_crew.name.duplicated()].sort_values('name').head()#.tail()

Unnamed: 0,name,role,movie,domestic_gross_adj,release_date,production_budget_adj,made_man
15591,A.R. Rahman,composer,The Hundred-Foot Journey,58003200.0,2014-08-08,22000000.0,NaT
6098,A.R. Rahman,composer,Million Dollar Arm,37914200.0,2014-05-16,25000000.0,NaT
8835,A.R. Rahman,composer,People Like Us,13825300.0,2012-06-29,16000000.0,NaT
15650,Aaron Eckhart,actor,Nurse Betty,40393500.0,2000-09-08,35000000.0,NaT
11432,Aaron Eckhart,actor,The Pledge,30157500.0,2001-01-19,35000000.0,NaT


In [93]:
# drop all NaN values and drop the duplicates with the same name
df_made_crew = made_crew.dropna(subset=['made_man']).drop_duplicates(subset=['name'])

In [95]:
df_made_crew.head()

Unnamed: 0,name,role,movie,domestic_gross_adj,release_date,production_budget_adj,made_man
2,Roland Emmerich,director,Universal Soldier,75661200.0,1992-07-10,,1996-07-03
3,Dean Devlin,writer,Universal Soldier,75661200.0,1992-07-10,,1996-07-03
5,Darren Lynn Bousman,director,Saw IV,79585200.0,2007-10-26,,2005-10-28
6,Leigh Whannell,producer,Saw IV,79585200.0,2007-10-26,,2004-10-29
7,Patrick Melton,writer,Saw IV,79585200.0,2007-10-26,,2008-10-24


In [110]:
df.head().T

Unnamed: 0,0,1,2,3,4
index,The Other Side of the Mountain Part II,Universal Soldier,Newtown,Mommie Dearest,G.B.F.
actors,,"[Jean-Claude Van Damme, Dolph Lundgren]",,,"[Natasha Lyonne, Megan Mullally]"
close_date,NaT,NaT,NaT,NaT,NaT
composers,,[Tim Simonec],,,
days_in_theater,,,14.00,,
directors,,[Roland Emmerich],,,
domestic_gross_adj,42753500.00,75661200.00,13900.00,59219100.00,
foreign_unadj,,,,,
genre,Romance,Sci-Fi Action,Documentary,Drama,Comedy
mpaa_rating,PG,R,Unrated,PG,R


In [231]:
def checkForQualityCrew(row):
    """
    This calculates the number of actors, directors, and crew were in past high grossing films
    """
#     print(row[0])
    title = row[0]
    rel_date = row[1]

    total = 0
    for role in row[2:]:
        if type(role) == list:
            for name in role:
                role_row = df_made_crew['name'].isin(['Roland Emmerich'])
                isrowempty = role_row.empty
                if (not isrowempty and df_made_crew[role_row].made_man.iloc[0] < rel_date):
                    total += 1
    return total
    

    
made_men_movies = (df[['index','release_date','actors','composers','directors','producers','writers']]
#   .iloc[0:4,:] # this is for testing to control the amount of variables passing into the function
  .apply(checkForQualityCrew,axis=1, raw=True))
#   .sort_values(ascending=False))

In [240]:
made_men_movies.name = 'made_men'

In [242]:
new_df = pd.concat([df, made_men_movies], axis=1)

In [None]:
# save the data frame! :)
# crew.to_pickle('crew.pkl')