In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error



from sklearn.feature_selection import RFE
import statsmodels.formula.api as smf

In [2]:
# Load the cleaned dataset that the rest of the notebook works from.
df = pd.read_csv(filepath_or_buffer='cleaned_data.csv')

In [3]:
# Correlation of every numeric/boolean column with cast_total_facebook_likes.
# The frame also holds text columns (e.g. director_name); since pandas 2.0
# DataFrame.corr() no longer drops those silently and raises instead, so the
# numeric columns are selected explicitly (works on older pandas as well).
df.select_dtypes(include=["number", "bool"]).corr()["cast_total_facebook_likes"]

duration                     0.120210
director_facebook_likes      0.118573
actor_3_facebook_likes       0.489800
actor_1_facebook_likes       0.944816
gross                        0.235669
cast_total_facebook_likes    1.000000
facenumber_in_poster         0.080765
budget                       0.028703
title_year                   0.126927
actor_2_facebook_likes       0.643003
aspect_ratio                 0.068860
profit                       0.043866
gross_bin                    0.203394
action                       0.059598
adventure                    0.068826
animation                   -0.003097
biography                    0.020964
comedy                      -0.057746
crime                        0.027423
documentary                 -0.060534
drama                        0.001105
family                       0.001051
fantasy                      0.040610
film-noir                   -0.008936
history                     -0.017975
horror                      -0.066516
music       

We can see from above that `cast_total_facebook_likes` is extremely correlated with the number of Facebook likes of actor 1 (0.944816), and is also clearly correlated with those of actor 2 (0.643003) and actor 3 (0.489800). This should be taken into account when making the model (interactions).

## Which combination of director and actors is most profitable?

In [4]:
# Narrow the frame to the director/actor names, their Facebook-like counts,
# the cast total, and profit.
df_people = df[
    [
        "director_name",
        "director_facebook_likes",
        "actor_1_name",
        "actor_1_facebook_likes",
        "actor_2_name",
        "actor_2_facebook_likes",
        "actor_3_name",
        "actor_3_facebook_likes",
        "cast_total_facebook_likes",
        "profit",
    ]
]

In [5]:
# Mean profit for each (director, actor 1, actor 2, actor 3) line-up,
# ordered from the highest mean profit to the lowest.
yay = (
    df_people
    .groupby(["director_name", "actor_1_name", "actor_2_name", "actor_3_name"])[["profit"]]
    .mean()
    .sort_values(by="profit", ascending=False)
)
yay

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,profit
director_name,actor_1_name,actor_2_name,actor_3_name,Unnamed: 4_level_1
James Cameron,CCH Pounder,Joel David Moore,Wes Studi,5.235058e+08
Colin Trevorrow,Bryce Dallas Howard,Judy Greer,Omar Sy,5.021773e+08
James Cameron,Leonardo DiCaprio,Kate Winslet,Gloria Stuart,4.586723e+08
George Lucas,Harrison Ford,Peter Cushing,Kenny Baker,4.499357e+08
Steven Spielberg,Henry Thomas,Dee Wallace,Peter Coyote,4.244495e+08
...,...,...,...,...
Katsuhiro Ôtomo,William Hootkins,Robin Atkin Downes,Rosalind Ayres,-2.127110e+09
Hayao Miyazaki,Minnie Driver,Jada Pinkett Smith,Billy Crudup,-2.397702e+09
Lajos Koltai,Marcell Nagy,Péter Fancsikai,Bálint Péntek,-2.499804e+09
Chan-wook Park,Min-sik Choi,Yeong-ae Lee,Hye-jeong Kang,-4.199788e+09


In [6]:
# First three rows of the sorted line-up table (the three highest mean profits).
yay.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,profit
director_name,actor_1_name,actor_2_name,actor_3_name,Unnamed: 4_level_1
James Cameron,CCH Pounder,Joel David Moore,Wes Studi,523505847.0
Colin Trevorrow,Bryce Dallas Howard,Judy Greer,Omar Sy,502177271.0
James Cameron,Leonardo DiCaprio,Kate Winslet,Gloria Stuart,458672302.0


In [7]:
# Non-null row count per line-up, to see how often each combination occurs.
df_people.groupby(
    ["director_name", "actor_1_name", "actor_2_name", "actor_3_name"]
).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,director_facebook_likes,actor_1_facebook_likes,actor_2_facebook_likes,actor_3_facebook_likes,cast_total_facebook_likes,profit
director_name,actor_1_name,actor_2_name,actor_3_name,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Aaron Schneider,Bill Murray,Robert Duvall,Bill Cobbs,1,1,1,1,1,1
Aaron Seltzer,Alyson Hannigan,Carmen Electra,Fred Willard,1,1,1,1,1,1
Abel Ferrara,Isabella Rossellini,Vincent Gallo,Gretchen Mol,1,1,1,1,1,1
Adam Goldberg,Judy Greer,Marisa Coughlan,Nicky Katt,1,1,1,1,1,1
Adam Marcus,Kane Hodder,Leslie Jordan,Erin Gray,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...
Zack Snyder,Ty Burrell,Kevin Zegers,Mekhi Phifer,2,2,2,2,2,2
Zak Penn,Zak Penn,Gabriel Beristain,John Bailey,1,1,1,1,1,1
Zal Batmanglij,Alexander Skarsgård,Julia Ormond,Jason Ritter,1,1,1,1,1,1
Álex de la Iglesia,Jim Carter,Leonor Watling,Danny Sapani,1,1,1,1,1,1


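The most profitable combination of director and actors (by mean profit) is James Cameron, CCH Pounder, Joel David Moore, and Wes Studi. Note that most of the combinations shown in the count table above appear only once, so for those the "mean" is simply the profit of a single film.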

In [8]:
# Mean profit per director, highest first, with director_name as a regular column.
director = (
    df_people
    .groupby(["director_name"])[["profit"]]
    .mean()
    .sort_values(by="profit", ascending=False)
    .reset_index()
)
director

Unnamed: 0,director_name,profit
0,Tim Miller,3.050243e+08
1,George Lucas,2.773283e+08
2,Richard Marquand,2.766254e+08
3,Kyle Balda,2.620296e+08
4,Colin Trevorrow,2.527175e+08
...,...,...
1654,Takao Okawara,-9.899626e+08
1655,Chan-wook Park,-1.403635e+09
1656,Katsuhiro Ôtomo,-1.613335e+09
1657,Lajos Koltai,-2.499804e+09


In [9]:
# Mean profit per first-billed actor, highest first, with actor_1_name as a regular column.
actor_1 = (
    df_people
    .groupby(["actor_1_name"])[["profit"]]
    .mean()
    .sort_values(by="profit", ascending=False)
    .reset_index()
)
actor_1

Unnamed: 0,actor_1_name,profit
0,Wayne Knight,2.937840e+08
1,Rupert Everett,2.864710e+08
2,Catherine Dyer,2.269504e+08
3,Henry Thomas,2.113146e+08
4,Kathleen Freeman,2.076520e+08
...,...,...
1423,Mitsuo Iwata,-1.099561e+09
1424,Min-sik Choi,-1.404099e+09
1425,William Hootkins,-2.127110e+09
1426,Marcell Nagy,-2.499804e+09


In [10]:
# Mean profit per second-billed actor, highest first, with actor_2_name as a regular column.
actor_2 = (
    df_people
    .groupby(["actor_2_name"])[["profit"]]
    .mean()
    .sort_values(by="profit", ascending=False)
    .reset_index()
)
actor_2


Unnamed: 0,actor_2_name,profit
0,Peter Cushing,4.499357e+08
1,Robert Downey Jr.,3.385169e+08
2,Ed Skrein,3.050243e+08
3,Leonard Roberts,2.913236e+08
4,Jennifer Saunders,2.864710e+08
...,...,...
2184,Robin Atkin Downes,-1.070511e+09
2185,Takeshi Kusao,-1.099561e+09
2186,Péter Fancsikai,-2.499804e+09
2187,Yeong-ae Lee,-4.199788e+09


In [11]:
# Mean profit per third-billed actor, highest first, with actor_3_name as a regular column.
actor_3 = (
    df_people
    .groupby(["actor_3_name"])[["profit"]]
    .mean()
    .sort_values(by="profit", ascending=False)
    .reset_index()
)
actor_3

Unnamed: 0,actor_3_name,profit
0,Omar Sy,5.021773e+08
1,Gloria Stuart,4.586723e+08
2,Niketa Calame,3.777838e+08
3,Ian McDiarmid,3.595447e+08
4,Anthony Reynolds,3.299993e+08
...,...,...
2583,Tesshô Genda,-1.099561e+09
2584,Hye-jeong Kang,-2.100304e+09
2585,Rosalind Ayres,-2.127110e+09
2586,Bálint Péntek,-2.499804e+09


In [12]:
# Names of directors whose mean profit is strictly positive.
director_high = director.loc[director["profit"] > 0, "director_name"].tolist()
director_high

['Tim Miller',
 'George Lucas',
 'Richard Marquand',
 'Kyle Balda',
 'Colin Trevorrow',
 'Chris Buck',
 'Joss Whedon',
 'Yarrow Cheney',
 'Pierre Coffin',
 'Lee Unkrich',
 'Roger Allers',
 'William Cottrell',
 'James Cameron',
 'Peter Faiman',
 'Pete Docter',
 'Francis Lawrence',
 'Irvin Kershner',
 'Daniel Myrick',
 'Andrew Adamson',
 'Joel Zwick',
 'Sam Taylor-Johnson',
 'George Roy Hill',
 'Phil Lord',
 'Josh Boone',
 'Gary Ross',
 'Peter Farrelly',
 'Jon Favreau',
 'David Silverman',
 'Oren Peli',
 'Victor Fleming',
 'Penelope Spheeris',
 'Robert Wise',
 'Christopher Nolan',
 'Leonard Nimoy',
 'Steven Spielberg',
 'Shane Black',
 'Randal Kleiser',
 'Robert Stevenson',
 'John Lasseter',
 'Michael Sucsy',
 'Rawson Marshall Thurber',
 'John G. Avildsen',
 'John Cornell',
 'Jay Roach',
 'Chris Miller',
 'Paul Tibbitt',
 'Mike Gabriel',
 'Chris Columbus',
 'Todd Phillips',
 'Jeannot Szwarc',
 'Tod Williams',
 'Norman Ferguson',
 'Jeff Tremaine',
 'P.J. Hogan',
 'David Slade',
 'Jim Abra

In [13]:
# Names of directors whose mean profit is zero or negative.
director_low = director.loc[director["profit"] <= 0, "director_name"].tolist()
director_low

['Hunter Richards',
 'Michel Orion Scott',
 'Woody Allen',
 'Marc Levin',
 'Brandon Trost',
 'Jean-François Richet',
 'Cédric Klapisch',
 'George Miller',
 'Joe Swanberg',
 'Bill Plympton',
 'Mike Mills',
 'Xavier Beauvois',
 'Ricky Gervais',
 'Hans Canosa',
 'Clark Gregg',
 'John Sayles',
 'Richard LaGravenese',
 'Matt Piedmont',
 'Tom Putnam',
 'Eric Valette',
 'E.L. Katz',
 'Steve James',
 'Jon Gunn',
 "Eddie O'Flaherty",
 'Ricki Stern',
 'Richard Dutcher',
 'Eric Schaeffer',
 'Henry Alex Rubin',
 'Nadine Labaki',
 'C. Jay Cox',
 'Neema Barnette',
 'Gareth Edwards',
 'Lucio Fulci',
 'Fabián Bielinsky',
 'Bruce Dellis',
 'Jon Amiel',
 'Julian Schnabel',
 'Lucky McKee',
 'Quentin Dupieux',
 'Darren Stein',
 'Nick Tomnay',
 'Rob Letterman',
 'Peter Landesman',
 'Ben Wheatley',
 'Steve Buscemi',
 'Lloyd Kaufman',
 'Alex Smith',
 'David Ayer',
 'Alex Craig Mann',
 'Josef Rusnak',
 'Joshua Oppenheimer',
 'Marielle Heller',
 'François Girard',
 'Finn Taylor',
 'Karen Moncrieff',
 'Tim Robb

In [14]:
# Names of first-billed actors whose mean profit is strictly positive.
actor_1_high = actor_1.loc[actor_1["profit"] > 0, "actor_1_name"].tolist()

In [15]:
# Names of first-billed actors whose mean profit is zero or negative.
# The comparison was written `<- 0`, which Python parses as `< (-0)`, i.e. a
# strict `< 0`; `<= 0` matches how director_low is built and keeps zero-profit
# names from falling outside both the high and the low list.
actor_1_low = actor_1.loc[actor_1["profit"] <= 0, "actor_1_name"].tolist()

In [16]:
# Names of second-billed actors whose mean profit is strictly positive.
actor_2_high = actor_2.loc[actor_2["profit"] > 0, "actor_2_name"].tolist()

In [17]:
# Names of second-billed actors whose mean profit is zero or negative.
# The comparison was written `<- 0`, which Python parses as `< (-0)`, i.e. a
# strict `< 0`; `<= 0` matches how director_low is built and keeps zero-profit
# names from falling outside both the high and the low list.
actor_2_low = actor_2.loc[actor_2["profit"] <= 0, "actor_2_name"].tolist()

In [18]:
# Names of third-billed actors whose mean profit is strictly positive.
actor_3_high = actor_3.loc[actor_3["profit"] > 0, "actor_3_name"].tolist()

In [19]:
# Names of third-billed actors whose mean profit is zero or negative.
# The comparison was written `<- 0`, which Python parses as `< (-0)`, i.e. a
# strict `< 0`; `<= 0` matches how director_low is built and keeps zero-profit
# names from falling outside both the high and the low list.
actor_3_low = actor_3.loc[actor_3["profit"] <= 0, "actor_3_name"].tolist()

In [20]:
# 0/1 flags: is this film's director in the director_high / director_low list?
# Series.isin does the membership test in one vectorized pass instead of
# scanning the whole list with list.count() for every row.
# NOTE(review): the lists are derived from profit over the full dataset; if
# profit (or something computed from it) is the modelling target, these flags
# leak it into the features — confirm before using them in a model.
df["director_high"] = df["director_name"].isin(director_high).astype(int)
df["director_low"] = df["director_name"].isin(director_low).astype(int)


In [21]:
# 0/1 flags: is this film's first-billed actor in actor_1_high / actor_1_low?
# Series.isin does the membership test in one vectorized pass instead of
# scanning the whole list with list.count() for every row.
# NOTE(review): the lists are derived from profit over the full dataset; if
# profit is the modelling target these flags leak it — confirm.
df["actor_1_high"] = df["actor_1_name"].isin(actor_1_high).astype(int)
df["actor_1_low"] = df["actor_1_name"].isin(actor_1_low).astype(int)

In [22]:
# 0/1 flags: is this film's second-billed actor in actor_2_high / actor_2_low?
# Series.isin does the membership test in one vectorized pass instead of
# scanning the whole list with list.count() for every row.
# NOTE(review): the lists are derived from profit over the full dataset; if
# profit is the modelling target these flags leak it — confirm.
df["actor_2_high"] = df["actor_2_name"].isin(actor_2_high).astype(int)
df["actor_2_low"] = df["actor_2_name"].isin(actor_2_low).astype(int)

In [23]:
# 0/1 flags: is this film's third-billed actor in actor_3_high / actor_3_low?
# Series.isin does the membership test in one vectorized pass instead of
# scanning the whole list with list.count() for every row.
# NOTE(review): the lists are derived from profit over the full dataset; if
# profit is the modelling target these flags leak it — confirm.
df["actor_3_high"] = df["actor_3_name"].isin(actor_3_high).astype(int)
df["actor_3_low"] = df["actor_3_name"].isin(actor_3_low).astype(int)

In [24]:
# Independent (deep) copy of df as it stands at this point in the notebook.
df_actors = df.copy(deep=True)

In [27]:
# Read the train and test CSV files into separate frames.
team_train_X, team_test_X = (
    pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv')
)


## Investigating number of names that came up more than once

In [28]:
# For each name column: (number of distinct names that occur in more than one
# row) divided by (number of rows).
# NOTE(review): the denominator is the row count, not the number of distinct
# names — confirm this is the ratio that was intended.


def repeat_ratio(names):
    """Return (# distinct values occurring more than once) / (# rows) of a Series."""
    return (names.value_counts() > 1).sum() / len(names)


df = pd.read_csv('cleaned_data.csv')

# Results go into *_ratio names so the per-person mean-profit tables
# (director, actor_1, actor_2, actor_3) built earlier in the notebook are not
# overwritten with floats, which would break those cells if they are re-run.
director_ratio = repeat_ratio(df["director_name"])
actor_1_ratio = repeat_ratio(df["actor_1_name"])
actor_2_ratio = repeat_ratio(df["actor_2_name"])
actor_3_ratio = repeat_ratio(df["actor_3_name"])

print("Director Ratio:", director_ratio, "Actor 1 Ratio:", actor_1_ratio, "Actor 2 Ratio:", actor_2_ratio, "Actor 3 Ratio:", actor_3_ratio)

Director Ration: 0.19084375831780676 Actor 1 Ratio: 0.1485227575192973 Actor 2 Ratio: 0.19377162629757785 Actor 3 Ratio: 0.18658504125632153


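These ratios are very small (roughly 0.15–0.19) - this could be a problem! Few names appear in more than one film, so a per-name mean profit is often based on a single film, and features built from those means (such as the high/low flags above) may not generalise to unseen films.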