In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error



from sklearn.feature_selection import RFE
import statsmodels.formula.api as smf

In [5]:
#import cleaned
df = pd.read_csv('cleaned_data.csv')

In [6]:
df.corr()["cast_total_facebook_likes"]

num_critic_for_reviews       0.279822
duration                     0.143664
director_facebook_likes      0.145286
actor_3_facebook_likes       0.551699
actor_1_facebook_likes       0.932219
gross                        0.265954
num_voted_users              0.288564
cast_total_facebook_likes    1.000000
facenumber_in_poster         0.084828
num_user_for_reviews         0.215984
budget                       0.249671
title_year                   0.152563
actor_2_facebook_likes       0.706680
imdb_score                   0.129401
aspect_ratio                 0.101115
movie_facebook_likes         0.240749
profit                       0.125373
log_profit                   0.182579
action                       0.033711
adventure                    0.040496
animation                   -0.001954
biography                    0.012309
comedy                      -0.035494
crime                        0.014883
documentary                 -0.040650
drama                       -0.002840
family      

We can see from above that facebook likes is extremely correlated with the number of facebook likes of actor 1 (0.932273), as well as actor 2 (0.706831), and actor 3(0.551871). This should be taken into account when making the model (interactions).

## Which combination of actors are most profitable?

In [7]:
df_people = df[["director_name", "director_facebook_likes",  "actor_1_name", "actor_1_facebook_likes","actor_2_name", "actor_2_facebook_likes", "actor_3_name","actor_3_facebook_likes", "cast_total_facebook_likes",  "profit"]]

In [8]:
yay = df_people.groupby(["director_name","actor_1_name", "actor_2_name", "actor_3_name"])[["profit"]].agg("mean").sort_values(by = "profit", ascending=False)
yay

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,profit
director_name,actor_1_name,actor_2_name,actor_3_name,Unnamed: 4_level_1
James Cameron,CCH Pounder,Joel David Moore,Wes Studi,523505847.0
Colin Trevorrow,Bryce Dallas Howard,Judy Greer,Omar Sy,502177271.0
James Cameron,Leonardo DiCaprio,Kate Winslet,Gloria Stuart,458672302.0
George Lucas,Harrison Ford,Peter Cushing,Kenny Baker,449935665.0
Steven Spielberg,Henry Thomas,Dee Wallace,Peter Coyote,424449459.0
...,...,...,...,...
Luc Besson,Paul Brooke,David Bailie,Rab Affleck,-375868702.0
Chatrichalerm Yukol,Sarunyu Wongkrachang,Chatchai Plengpanich,Mai Charoenpura,-399545745.0
John Woo,Takeshi Kaneshiro,Tony Chiu Wai Leung,Wei Zhao,-553005191.0
Karan Johar,Shah Rukh Khan,John Abraham,Preity Zinta,-696724557.0


In [9]:
yay[:3]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,profit
director_name,actor_1_name,actor_2_name,actor_3_name,Unnamed: 4_level_1
James Cameron,CCH Pounder,Joel David Moore,Wes Studi,523505847.0
Colin Trevorrow,Bryce Dallas Howard,Judy Greer,Omar Sy,502177271.0
James Cameron,Leonardo DiCaprio,Kate Winslet,Gloria Stuart,458672302.0


In [10]:
df_people.groupby(["director_name","actor_1_name", "actor_2_name", "actor_3_name"]).agg("count")

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,director_facebook_likes,actor_1_facebook_likes,actor_2_facebook_likes,actor_3_facebook_likes,cast_total_facebook_likes,profit
director_name,actor_1_name,actor_2_name,actor_3_name,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Aaron Schneider,Bill Murray,Robert Duvall,Bill Cobbs,2,2,2,2,2,2
Aaron Seltzer,Alyson Hannigan,Carmen Electra,Fred Willard,2,2,2,2,2,2
Abel Ferrara,Isabella Rossellini,Vincent Gallo,Gretchen Mol,2,2,2,2,2,2
Adam Goldberg,Judy Greer,Marisa Coughlan,Nicky Katt,2,2,2,2,2,2
Adam Marcus,Kane Hodder,Leslie Jordan,Erin Gray,3,3,3,3,3,3
...,...,...,...,...,...,...,...,...,...
Zack Snyder,Ty Burrell,Kevin Zegers,Mekhi Phifer,6,6,6,6,6,6
Zak Penn,Zak Penn,Gabriel Beristain,John Bailey,3,3,3,3,3,3
Zal Batmanglij,Alexander Skarsgård,Julia Ormond,Jason Ritter,2,2,2,2,2,2
Álex de la Iglesia,Jim Carter,Leonor Watling,Danny Sapani,3,3,3,3,3,3


Most profitable combination of director and actors is James Cameron, CCH Pounder, Joel David Moore, and Wes Studi.

In [21]:
director = df_people.groupby(["director_name"])[["profit"]].agg("mean").sort_values(by = "profit", ascending=False).reset_index()
director

Unnamed: 0,director_name,profit
0,Tim Miller,3.050243e+08
1,Colin Trevorrow,2.883546e+08
2,George Lucas,2.859128e+08
3,Richard Marquand,2.766254e+08
4,Kyle Balda,2.620296e+08
...,...,...
1651,Karan Johar,-1.801671e+08
1652,Prachya Pinkaew,-1.880945e+08
1653,Tony Jaa,-2.998979e+08
1654,Chatrichalerm Yukol,-3.995457e+08


In [12]:
actor_1 = df_people.groupby(["actor_1_name"])[["profit"]].agg("mean").sort_values(by = "profit", ascending=False).reset_index()
actor_1

Unnamed: 0,actor_1_name,profit
0,Wayne Knight,2.937840e+08
1,Rupert Everett,2.864710e+08
2,Henry Thomas,2.823595e+08
3,Catherine Dyer,2.269504e+08
4,Josh Gad,2.130122e+08
...,...,...
1418,Nirut Sirichanya,-2.998979e+08
1419,Takeshi Kaneshiro,-3.467623e+08
1420,Paul Brooke,-3.758687e+08
1421,Sarunyu Wongkrachang,-3.995457e+08


In [13]:
actor_2 = df_people.groupby(["actor_2_name"])[["profit"]].agg("mean").sort_values(by = "profit", ascending=False).reset_index()
actor_2


Unnamed: 0,actor_2_name,profit
0,Peter Cushing,4.499357e+08
1,Robert Downey Jr.,3.385169e+08
2,Ed Skrein,3.050243e+08
3,Leonard Roberts,2.913236e+08
4,Jennifer Saunders,2.864710e+08
...,...,...
2179,Petchtai Wongkamlao,-2.998979e+08
2180,David Bailie,-3.758687e+08
2181,Chatchai Plengpanich,-3.995457e+08
2182,John Abraham,-6.967246e+08


In [14]:
actor_3 = df_people.groupby(["actor_3_name"])[["profit"]].agg("mean").sort_values(by = "profit", ascending=False).reset_index()
actor_3

Unnamed: 0,actor_3_name,profit
0,Omar Sy,502177271.0
1,Gloria Stuart,458672302.0
2,Niketa Calame,377783777.0
3,Ian McDiarmid,359544677.0
4,Anthony Reynolds,329999255.0
...,...,...
2578,Rab Affleck,-375868702.0
2579,Mai Charoenpura,-399545745.0
2580,Wei Zhao,-553005191.0
2581,Preity Zinta,-696724557.0


In [27]:
director_high = list(director.loc[director["profit"] > 0]["director_name"])
director_high

['Tim Miller',
 'Colin Trevorrow',
 'George Lucas',
 'Richard Marquand',
 'Kyle Balda',
 'Chris Buck',
 'Yarrow Cheney',
 'Pierre Coffin',
 'Joss Whedon',
 'Lee Unkrich',
 'Roger Allers',
 'William Cottrell',
 'James Cameron',
 'Peter Faiman',
 'Irvin Kershner',
 'Francis Lawrence',
 'Pete Docter',
 'Andrew Adamson',
 'Daniel Myrick',
 'Sam Taylor-Johnson',
 'Phil Lord',
 'George Roy Hill',
 'David Slade',
 'Robert Wise',
 'Joel Zwick',
 'Josh Boone',
 'Peter Farrelly',
 'Christopher Nolan',
 'David Silverman',
 'Oren Peli',
 'Victor Fleming',
 'Gary Ross',
 'Elizabeth Banks',
 'Jay Roach',
 'Penelope Spheeris',
 'Leonard Nimoy',
 'Shane Black',
 'Randal Kleiser',
 'Jon Favreau',
 'Robert Stevenson',
 'John Lasseter',
 'Michael Sucsy',
 'John Cornell',
 'Steven Spielberg',
 'Rawson Marshall Thurber',
 'John G. Avildsen',
 'Paul Tibbitt',
 'Chris Columbus',
 'Mike Gabriel',
 'Chris Miller',
 'Jeannot Szwarc',
 'Jeff Tremaine',
 'Tod Williams',
 'Norman Ferguson',
 'P.J. Hogan',
 'Jim Ab

In [32]:
director_low = list(director.loc[director["profit"] <= 0]["director_name"])
director_low

['Hunter Richards',
 'Michel Orion Scott',
 'Marc Levin',
 'Brandon Trost',
 'Jean-François Richet',
 'Cédric Klapisch',
 'Joe Swanberg',
 'Bill Plympton',
 'Xavier Beauvois',
 'Ricky Gervais',
 'Hans Canosa',
 'Clark Gregg',
 'Matt Piedmont',
 'Tom Putnam',
 'Eric Valette',
 'E.L. Katz',
 'Tony Goldwyn',
 "Eddie O'Flaherty",
 'Ricki Stern',
 'Richard Dutcher',
 'Eric Schaeffer',
 'C. Jay Cox',
 'Henry Alex Rubin',
 'Nadine Labaki',
 'Neema Barnette',
 'Gareth Edwards',
 'Stephen Kay',
 'Lucio Fulci',
 'Fabián Bielinsky',
 'Bruce Dellis',
 'Jon Gunn',
 'Jon Amiel',
 'Julian Schnabel',
 'Lucky McKee',
 'Richard Eyre',
 'Quentin Dupieux',
 'Darren Stein',
 'Stephen Frears',
 'Nick Tomnay',
 'Peter Landesman',
 'Ben Wheatley',
 'Steve Buscemi',
 'Lloyd Kaufman',
 'Woody Allen',
 'Alex Smith',
 'Fernando Meirelles',
 'Alex Craig Mann',
 'Josef Rusnak',
 'Joshua Oppenheimer',
 'Marielle Heller',
 'François Girard',
 'Finn Taylor',
 'Karen Moncrieff',
 'Nick Gomez',
 'Mira Nair',
 'Scott Zie

In [33]:
actor_1_high = list(actor_1.loc[actor_1["profit"] > 0]["actor_1_name"])

In [34]:
actor_1_low = list(actor_1.loc[actor_1["profit"] <- 0]["actor_1_name"])

In [35]:
actor_2_high = list(actor_2.loc[actor_2["profit"] > 0]["actor_2_name"])

In [36]:
actor_2_low = list(actor_2.loc[actor_2["profit"] <- 0]["actor_2_name"])

In [37]:
actor_3_high = list(actor_3.loc[actor_3["profit"] > 0]["actor_3_name"])

In [38]:
actor_3_low = list(actor_3.loc[actor_3["profit"] <- 0]["actor_3_name"])

In [39]:
df["director_high"] = [1 if director_high.count(x) > 0 else 0 for x in df["director_name"]]
df["director_low"] = [1 if director_low.count(x) > 0 else 0 for x in df["director_name"]]


In [40]:
df["actor_1_high"] = [1 if actor_1_high.count(x) > 0 else 0 for x in df["actor_1_name"]]
df["actor_1_low"] = [1 if actor_1_low.count(x) > 0 else 0 for x in df["actor_1_name"]]

In [41]:
df["actor_2_high"] = [1 if actor_2_high.count(x) > 0 else 0 for x in df["actor_2_name"]]
df["actor_2_low"] = [1 if actor_2_low.count(x) > 0 else 0 for x in df["actor_2_name"]]

In [42]:
df["actor_3_high"] = [1 if actor_3_high.count(x) > 0 else 0 for x in df["actor_3_name"]]
df["actor_3_low"] = [1 if actor_3_low.count(x) > 0 else 0 for x in df["actor_3_name"]]

In [43]:
df_actors = df.copy()

In [44]:
train_data, test_data = train_test_split(df_actors, test_size=0.2, random_state=42)

# Save the training and testing data as CSV files
train_data.to_csv('team_train_X.csv', index=False)
test_data.to_csv('team_test_X.csv', index=False)

In [45]:
team_train_X = pd.read_csv('team_train_X.csv')
team_test_X = pd.read_csv('team_test_X.csv')


## Making a regression model by looking at whether the movies have a profitable team

In [46]:
#Using the ols function to create an ols object. 'ols' stands for 'Ordinary least squares'
ols_object = smf.ols(formula = 'profit~director_high+director_low+actor_1_high+actor_1_low+actor_2_high+actor_2_low+actor_3_high+actor_3_low', data = team_train_X)
model = ols_object.fit()
model.summary()

0,1,2,3
Dep. Variable:,profit,R-squared:,0.302
Model:,OLS,Adj. R-squared:,0.302
Method:,Least Squares,F-statistic:,969.2
Date:,"Sun, 05 Mar 2023",Prob (F-statistic):,0.0
Time:,21:55:20,Log-Likelihood:,-171670.0
No. Observations:,8968,AIC:,343400.0
Df Residuals:,8963,BIC:,343400.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.035e+20,1.06e+20,0.974,0.330,-1.05e+20,3.12e+20
director_high,-8.504e+19,1.56e+20,-0.546,0.585,-3.9e+20,2.2e+20
director_low,8.504e+19,1.56e+20,0.546,0.585,-2.2e+20,3.9e+20
actor_1_high,-3.257e+18,3.33e+18,-0.978,0.328,-9.78e+18,3.27e+18
actor_1_low,-3.257e+18,3.33e+18,-0.978,0.328,-9.78e+18,3.27e+18
actor_2_high,-1.583e+20,1.58e+20,-1.000,0.317,-4.69e+20,1.52e+20
actor_2_low,-1.583e+20,1.58e+20,-1.000,0.317,-4.69e+20,1.52e+20
actor_3_high,5.808e+19,7.29e+19,0.797,0.425,-8.47e+19,2.01e+20
actor_3_low,5.808e+19,7.29e+19,0.797,0.425,-8.47e+19,2.01e+20

0,1,2,3
Omnibus:,3401.619,Durbin-Watson:,2.017
Prob(Omnibus):,0.0,Jarque-Bera (JB):,294310.281
Skew:,0.902,Prob(JB):,0.0
Kurtosis:,31.007,Cond. No.,2550000000000000.0
