In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error



from sklearn.feature_selection import RFE
import statsmodels.formula.api as smf

In [3]:
#import cleaned
df = pd.read_csv('cleaned_data.csv')

In [4]:
df.corr()["cast_total_facebook_likes"]

num_critic_for_reviews       0.279639
duration                     0.143359
director_facebook_likes      0.145200
actor_3_facebook_likes       0.551871
actor_1_facebook_likes       0.932273
gross                        0.266763
num_voted_users              0.288863
cast_total_facebook_likes    1.000000
facenumber_in_poster         0.085601
num_user_for_reviews         0.216289
budget                       0.027778
title_year                   0.152753
actor_2_facebook_likes       0.706831
imdb_score                   0.128256
aspect_ratio                 0.101322
movie_facebook_likes         0.241069
profit                       0.049964
action                       0.033454
adventure                    0.040156
animation                   -0.003015
biography                    0.012525
comedy                      -0.035042
crime                        0.015020
documentary                 -0.040534
drama                       -0.002791
family                      -0.000840
fantasy     

We can see from above that facebook likes is extremely correlated with the number of facebook likes of actor 1 (0.932273), as well as actor 2 (0.706831), and actor 3(0.551871). This should be taken into account when making the model (interactions).

## Which combination of actors are most profitable?

In [5]:
df_people = df[["director_name", "director_facebook_likes",  "actor_1_name", "actor_1_facebook_likes","actor_2_name", "actor_2_facebook_likes", "actor_3_name","actor_3_facebook_likes", "cast_total_facebook_likes",  "profit"]]

In [6]:
yay = df_people.groupby(["director_name","actor_1_name", "actor_2_name", "actor_3_name"])[["profit"]].agg("mean").sort_values(by = "profit", ascending=False)
yay

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,profit
director_name,actor_1_name,actor_2_name,actor_3_name,Unnamed: 4_level_1
James Cameron,CCH Pounder,Joel David Moore,Wes Studi,5.235058e+08
Colin Trevorrow,Bryce Dallas Howard,Judy Greer,Omar Sy,5.021773e+08
James Cameron,Leonardo DiCaprio,Kate Winslet,Gloria Stuart,4.586723e+08
George Lucas,Harrison Ford,Peter Cushing,Kenny Baker,4.499357e+08
Steven Spielberg,Henry Thomas,Dee Wallace,Peter Coyote,4.244495e+08
...,...,...,...,...
Katsuhiro Ôtomo,William Hootkins,Robin Atkin Downes,Rosalind Ayres,-2.127110e+09
Hayao Miyazaki,Minnie Driver,Jada Pinkett Smith,Billy Crudup,-2.397702e+09
Lajos Koltai,Marcell Nagy,Péter Fancsikai,Bálint Péntek,-2.499804e+09
Chan-wook Park,Min-sik Choi,Yeong-ae Lee,Hye-jeong Kang,-4.199788e+09


In [7]:
yay[:3]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,profit
director_name,actor_1_name,actor_2_name,actor_3_name,Unnamed: 4_level_1
James Cameron,CCH Pounder,Joel David Moore,Wes Studi,523505847.0
Colin Trevorrow,Bryce Dallas Howard,Judy Greer,Omar Sy,502177271.0
James Cameron,Leonardo DiCaprio,Kate Winslet,Gloria Stuart,458672302.0


In [8]:
df_people.groupby(["director_name","actor_1_name", "actor_2_name", "actor_3_name"]).agg("count")

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,director_facebook_likes,actor_1_facebook_likes,actor_2_facebook_likes,actor_3_facebook_likes,cast_total_facebook_likes,profit
director_name,actor_1_name,actor_2_name,actor_3_name,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Aaron Schneider,Bill Murray,Robert Duvall,Bill Cobbs,2,2,2,2,2,2
Aaron Seltzer,Alyson Hannigan,Carmen Electra,Fred Willard,2,2,2,2,2,2
Abel Ferrara,Isabella Rossellini,Vincent Gallo,Gretchen Mol,2,2,2,2,2,2
Adam Goldberg,Judy Greer,Marisa Coughlan,Nicky Katt,2,2,2,2,2,2
Adam Marcus,Kane Hodder,Leslie Jordan,Erin Gray,3,3,3,3,3,3
...,...,...,...,...,...,...,...,...,...
Zack Snyder,Ty Burrell,Kevin Zegers,Mekhi Phifer,6,6,6,6,6,6
Zak Penn,Zak Penn,Gabriel Beristain,John Bailey,3,3,3,3,3,3
Zal Batmanglij,Alexander Skarsgård,Julia Ormond,Jason Ritter,2,2,2,2,2,2
Álex de la Iglesia,Jim Carter,Leonor Watling,Danny Sapani,3,3,3,3,3,3


Most profitable combination of director and actors is James Cameron, CCH Pounder, Joel David Moore, and Wes Studi.

In [9]:
df_people.groupby(["director_name"])[["profit"]].agg("mean").sort_values(by = "profit", ascending=False).head(10)


Unnamed: 0_level_0,profit
director_name,Unnamed: 1_level_1
Tim Miller,305024300.0
Colin Trevorrow,288354600.0
George Lucas,285912800.0
Richard Marquand,276625400.0
Kyle Balda,262029600.0
Chris Buck,250736600.0
Yarrow Cheney,248505500.0
Pierre Coffin,245100500.0
Joss Whedon,229845800.0
Lee Unkrich,214984500.0


In [10]:
df_people.groupby(["actor_1_name"])[["profit"]].agg("mean").sort_values(by = "profit", ascending=False).head(10)


Unnamed: 0_level_0,profit
actor_1_name,Unnamed: 1_level_1
Wayne Knight,293784000.0
Rupert Everett,286471000.0
Henry Thomas,282359500.0
Catherine Dyer,226950400.0
Josh Gad,213012200.0
Kathleen Freeman,207652000.0
Hattie McDaniel,194678300.0
Adriana Caselotti,182925500.0
Olivia Newton-John,175360000.0
Phaldut Sharma,174085000.0


In [11]:
df_people.groupby(["actor_2_name"])[["profit"]].agg("mean").sort_values(by = "profit", ascending=False).head(10)


Unnamed: 0_level_0,profit
actor_2_name,Unnamed: 1_level_1
Peter Cushing,449935700.0
Robert Downey Jr.,338516900.0
Ed Skrein,305024300.0
Leonard Roberts,291323600.0
Jennifer Saunders,286471000.0
Ian McDiarmid,276625400.0
Kenny Baker,272158800.0
Miranda Cosgrove,245100500.0
Joel David Moore,235009200.0
Quinton Aaron,226950400.0


In [12]:
df_people.groupby(["actor_3_name"])[["profit"]].agg("mean").sort_values(by = "profit", ascending=False).head(10)


Unnamed: 0_level_0,profit
actor_3_name,Unnamed: 1_level_1
Omar Sy,502177271.0
Gloria Stuart,458672302.0
Niketa Calame,377783777.0
Ian McDiarmid,359544677.0
Anthony Reynolds,329999255.0
Stefan Kapicic,305024263.0
Bob Peck,293784000.0
Keir O'Donnell,291323553.0
Brad Garrett,286838870.0
Conrad Vernon,286471036.0


In [13]:
#bin profits into 3 bins, with each actor profit in each bin
df_people["profit_bin"] = pd.cut(df_people["profit"], bins=3, labels=["low", "medium", "high"])

df_people.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_people["profit_bin"] = pd.cut(df_people["profit"], bins=3, labels=["low", "medium", "high"])


Unnamed: 0,director_name,director_facebook_likes,actor_1_name,actor_1_facebook_likes,actor_2_name,actor_2_facebook_likes,actor_3_name,actor_3_facebook_likes,cast_total_facebook_likes,profit,profit_bin
0,James Cameron,0.0,CCH Pounder,1000.0,Joel David Moore,936.0,Wes Studi,855.0,4834,523505847.0,high
1,James Cameron,0.0,CCH Pounder,1000.0,Joel David Moore,936.0,Wes Studi,855.0,4834,523505847.0,high
2,James Cameron,0.0,CCH Pounder,1000.0,Joel David Moore,936.0,Wes Studi,855.0,4834,523505847.0,high
3,James Cameron,0.0,CCH Pounder,1000.0,Joel David Moore,936.0,Wes Studi,855.0,4834,523505847.0,high
4,Gore Verbinski,563.0,Johnny Depp,40000.0,Orlando Bloom,5000.0,Jack Davenport,1000.0,48350,9404152.0,high
