In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error



from sklearn.feature_selection import RFE
import statsmodels.formula.api as smf

In [2]:
#import cleaned
df = pd.read_csv('cleaned_data.csv')

In [3]:
df.corr()["cast_total_facebook_likes"]

num_critic_for_reviews       0.279639
duration                     0.143359
director_facebook_likes      0.145200
actor_3_facebook_likes       0.551871
actor_1_facebook_likes       0.932273
gross                        0.266763
num_voted_users              0.288863
cast_total_facebook_likes    1.000000
facenumber_in_poster         0.085601
num_user_for_reviews         0.216289
budget                       0.027778
title_year                   0.152753
actor_2_facebook_likes       0.706831
imdb_score                   0.128256
aspect_ratio                 0.101322
movie_facebook_likes         0.241069
profit                       0.049964
action                       0.033454
adventure                    0.040156
animation                   -0.003015
biography                    0.012525
comedy                      -0.035042
crime                        0.015020
documentary                 -0.040534
drama                       -0.002791
family                      -0.000840
fantasy     

We can see from above that facebook likes is extremely correlated with the number of facebook likes of actor 1 (0.932273), as well as actor 2 (0.706831), and actor 3(0.551871). This should be taken into account when making the model (interactions).

## Which combination of actors are most profitable?

In [4]:
df_people = df[["director_name", "director_facebook_likes",  "actor_1_name", "actor_1_facebook_likes","actor_2_name", "actor_2_facebook_likes", "actor_3_name","actor_3_facebook_likes", "cast_total_facebook_likes",  "profit"]]

In [26]:
yay = df_people.groupby(["director_name","actor_1_name", "actor_2_name", "actor_3_name"])[["profit"]].agg("mean").sort_values(by = "profit", ascending=False)
yay

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,profit
director_name,actor_1_name,actor_2_name,actor_3_name,Unnamed: 4_level_1
James Cameron,CCH Pounder,Joel David Moore,Wes Studi,5.235058e+08
Colin Trevorrow,Bryce Dallas Howard,Judy Greer,Omar Sy,5.021773e+08
James Cameron,Leonardo DiCaprio,Kate Winslet,Gloria Stuart,4.586723e+08
George Lucas,Harrison Ford,Peter Cushing,Kenny Baker,4.499357e+08
Steven Spielberg,Henry Thomas,Dee Wallace,Peter Coyote,4.244495e+08
...,...,...,...,...
Katsuhiro Ôtomo,William Hootkins,Robin Atkin Downes,Rosalind Ayres,-2.127110e+09
Hayao Miyazaki,Minnie Driver,Jada Pinkett Smith,Billy Crudup,-2.397702e+09
Lajos Koltai,Marcell Nagy,Péter Fancsikai,Bálint Péntek,-2.499804e+09
Chan-wook Park,Min-sik Choi,Yeong-ae Lee,Hye-jeong Kang,-4.199788e+09


In [6]:
yay[:3]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,profit
director_name,actor_1_name,actor_2_name,actor_3_name,Unnamed: 4_level_1
James Cameron,CCH Pounder,Joel David Moore,Wes Studi,523505847.0
Colin Trevorrow,Bryce Dallas Howard,Judy Greer,Omar Sy,502177271.0
James Cameron,Leonardo DiCaprio,Kate Winslet,Gloria Stuart,458672302.0


In [7]:
df_people.groupby(["director_name","actor_1_name", "actor_2_name", "actor_3_name"]).agg("count")

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,director_facebook_likes,actor_1_facebook_likes,actor_2_facebook_likes,actor_3_facebook_likes,cast_total_facebook_likes,profit
director_name,actor_1_name,actor_2_name,actor_3_name,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Aaron Schneider,Bill Murray,Robert Duvall,Bill Cobbs,2,2,2,2,2,2
Aaron Seltzer,Alyson Hannigan,Carmen Electra,Fred Willard,2,2,2,2,2,2
Abel Ferrara,Isabella Rossellini,Vincent Gallo,Gretchen Mol,2,2,2,2,2,2
Adam Goldberg,Judy Greer,Marisa Coughlan,Nicky Katt,2,2,2,2,2,2
Adam Marcus,Kane Hodder,Leslie Jordan,Erin Gray,3,3,3,3,3,3
...,...,...,...,...,...,...,...,...,...
Zack Snyder,Ty Burrell,Kevin Zegers,Mekhi Phifer,6,6,6,6,6,6
Zak Penn,Zak Penn,Gabriel Beristain,John Bailey,3,3,3,3,3,3
Zal Batmanglij,Alexander Skarsgård,Julia Ormond,Jason Ritter,2,2,2,2,2,2
Álex de la Iglesia,Jim Carter,Leonor Watling,Danny Sapani,3,3,3,3,3,3


Most profitable combination of director and actors is James Cameron, CCH Pounder, Joel David Moore, and Wes Studi.

In [55]:
director = df_people.groupby(["director_name"])[["profit"]].agg("mean").sort_values(by = "profit", ascending=False).reset_index()
director

Unnamed: 0,director_name,profit
0,Tim Miller,3.050243e+08
1,Colin Trevorrow,2.883546e+08
2,George Lucas,2.859128e+08
3,Richard Marquand,2.766254e+08
4,Kyle Balda,2.620296e+08
...,...,...
1654,Takao Okawara,-9.899626e+08
1655,Chan-wook Park,-1.203233e+09
1656,Katsuhiro Ôtomo,-1.784593e+09
1657,Lajos Koltai,-2.499804e+09


In [56]:
actor_1 = df_people.groupby(["actor_1_name"])[["profit"]].agg("mean").sort_values(by = "profit", ascending=False).reset_index()
actor_1

Unnamed: 0,actor_1_name,profit
0,Wayne Knight,2.937840e+08
1,Rupert Everett,2.864710e+08
2,Henry Thomas,2.823595e+08
3,Catherine Dyer,2.269504e+08
4,Josh Gad,2.130122e+08
...,...,...
1423,Min-sik Choi,-1.054638e+09
1424,Mitsuo Iwata,-1.099561e+09
1425,William Hootkins,-2.127110e+09
1426,Marcell Nagy,-2.499804e+09


In [57]:
actor_2 = df_people.groupby(["actor_2_name"])[["profit"]].agg("mean").sort_values(by = "profit", ascending=False).reset_index()
actor_2


Unnamed: 0,actor_2_name,profit
0,Peter Cushing,4.499357e+08
1,Robert Downey Jr.,3.385169e+08
2,Ed Skrein,3.050243e+08
3,Leonard Roberts,2.913236e+08
4,Jennifer Saunders,2.864710e+08
...,...,...
2184,Takeshi Kusao,-1.099561e+09
2185,Robin Atkin Downes,-1.281831e+09
2186,Péter Fancsikai,-2.499804e+09
2187,Yeong-ae Lee,-4.199788e+09


In [58]:
actor_3 = df_people.groupby(["actor_3_name"])[["profit"]].agg("mean").sort_values(by = "profit", ascending=False).reset_index()
actor_3

Unnamed: 0,actor_3_name,profit
0,Omar Sy,5.021773e+08
1,Gloria Stuart,4.586723e+08
2,Niketa Calame,3.777838e+08
3,Ian McDiarmid,3.595447e+08
4,Anthony Reynolds,3.299993e+08
...,...,...
2583,Tesshô Genda,-1.099561e+09
2584,Hye-jeong Kang,-1.680407e+09
2585,Rosalind Ayres,-2.127110e+09
2586,Bálint Péntek,-2.499804e+09


In [64]:
director_high = [x for x in director['director_name'].iloc[:(round(len(director)/2))]]
director_high

['Tim Miller',
 'Colin Trevorrow',
 'George Lucas',
 'Richard Marquand',
 'Kyle Balda',
 'Chris Buck',
 'Yarrow Cheney',
 'Pierre Coffin',
 'Joss Whedon',
 'Lee Unkrich',
 'Roger Allers',
 'William Cottrell',
 'James Cameron',
 'Peter Faiman',
 'Irvin Kershner',
 'Francis Lawrence',
 'Pete Docter',
 'Andrew Adamson',
 'Daniel Myrick',
 'Sam Taylor-Johnson',
 'Phil Lord',
 'George Roy Hill',
 'David Slade',
 'Robert Wise',
 'Joel Zwick',
 'Josh Boone',
 'Peter Farrelly',
 'Christopher Nolan',
 'David Silverman',
 'Oren Peli',
 'Victor Fleming',
 'Gary Ross',
 'Elizabeth Banks',
 'Jay Roach',
 'Penelope Spheeris',
 'Leonard Nimoy',
 'Shane Black',
 'Randal Kleiser',
 'Jon Favreau',
 'Robert Stevenson',
 'John Lasseter',
 'Michael Sucsy',
 'John Cornell',
 'Steven Spielberg',
 'Rawson Marshall Thurber',
 'John G. Avildsen',
 'Paul Tibbitt',
 'Chris Columbus',
 'Mike Gabriel',
 'Chris Miller',
 'Jeannot Szwarc',
 'Jeff Tremaine',
 'Tod Williams',
 'Norman Ferguson',
 'P.J. Hogan',
 'Jim Ab

In [63]:
director_low = [x for x in director['director_name'].iloc[(round(len(director)/2)):len(director)]]

In [65]:
actor_1_high = [x for x in actor_1['actor_1_name'].iloc[:(round(len(actor_1)/2))]]

In [66]:
actor_1_low = [x for x in actor_1['actor_1_name'].iloc[(round(len(actor_1)/2)):len(actor_1)]]

In [81]:
actor_2_high = [x for x in actor_2['actor_2_name'].iloc[:(round(len(actor_2)/2))]]

In [82]:
actor_2_low = [x for x in actor_2['actor_2_name'].iloc[(round(len(actor_2)/2)):len(actor_2)]]

In [83]:
actor_3_high = [x for x in actor_3['actor_3_name'].iloc[:(round(len(actor_3)/2))]]

In [84]:
actor_3_low = [x for x in actor_3['actor_3_name'].iloc[(round(len(actor_3)/2)):len(actor_3)]]

In [78]:
df["director_high"] = [1 if director_high.count(x) > 0 else 0 for x in df["director_name"]]
df["director_low"] = [1 if director_low.count(x) > 0 else 0 for x in df["director_name"]]


In [85]:
df["actor_1_high"] = [1 if actor_1_high.count(x) > 0 else 0 for x in df["actor_1_name"]]
df["actor_1_low"] = [1 if actor_1_low.count(x) > 0 else 0 for x in df["actor_1_name"]]

In [86]:
df["actor_2_high"] = [1 if actor_2_high.count(x) > 0 else 0 for x in df["actor_2_name"]]
df["actor_2_low"] = [1 if actor_2_low.count(x) > 0 else 0 for x in df["actor_2_name"]]

In [87]:
df["actor_3_high"] = [1 if actor_3_high.count(x) > 0 else 0 for x in df["actor_3_name"]]
df["actor_3_low"] = [1 if actor_3_low.count(x) > 0 else 0 for x in df["actor_3_name"]]

In [90]:
df_actors = df.copy()

In [91]:
train_data, test_data = train_test_split(df_actors, test_size=0.2, random_state=42)

# Save the training and testing data as CSV files
train_data.to_csv('team_train_X.csv', index=False)
test_data.to_csv('team_test_X.csv', index=False)

In [92]:
team_train_X = pd.read_csv('team_train_X.csv')
team_test_X = pd.read_csv('team_test_X.csv')


## Making a regression model by looking at whether the movies have a profitable team

In [93]:
#Using the ols function to create an ols object. 'ols' stands for 'Ordinary least squares'
ols_object = smf.ols(formula = 'profit~director_high+director_low+actor_1_high+actor_1_low+actor_2_high+actor_2_low+actor_3_high+actor_3_low', data = team_train_X)
model = ols_object.fit()
model.summary()

0,1,2,3
Dep. Variable:,profit,R-squared:,0.033
Model:,OLS,Adj. R-squared:,0.032
Method:,Least Squares,F-statistic:,43.4
Date:,"Sun, 05 Mar 2023",Prob (F-statistic):,1.08e-60
Time:,15:39:09,Log-Likelihood:,-186490.0
No. Observations:,8989,AIC:,373000.0
Df Residuals:,8981,BIC:,373100.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-1.345e+20,9.05e+20,-0.149,0.882,-1.91e+21,1.64e+21
director_high,-3.875e+18,4.01e+20,-0.010,0.992,-7.9e+20,7.82e+20
director_low,-3.875e+18,4.01e+20,-0.010,0.992,-7.9e+20,7.82e+20
actor_1_high,3.567e+19,1.6e+20,0.222,0.824,-2.79e+20,3.5e+20
actor_1_low,3.567e+19,1.6e+20,0.222,0.824,-2.79e+20,3.5e+20
actor_2_high,1.643e+19,2.88e+20,0.057,0.955,-5.49e+20,5.82e+20
actor_2_low,1.643e+19,2.88e+20,0.057,0.955,-5.49e+20,5.82e+20
actor_3_high,8.631e+19,5.65e+20,0.153,0.879,-1.02e+21,1.19e+21
actor_3_low,8.631e+19,5.65e+20,0.153,0.879,-1.02e+21,1.19e+21

0,1,2,3
Omnibus:,26356.347,Durbin-Watson:,2.001
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1426846360.926
Skew:,-41.133,Prob(JB):,0.0
Kurtosis:,1953.08,Cond. No.,808000000000000.0
