In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error



from sklearn.feature_selection import RFE
import statsmodels.formula.api as smf

In [3]:
df = pd.read_csv('cleaned_data.csv')
df.columns

Index(['director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes', 'profit',
       'log_profit', 'action', 'adventure', 'animation', 'biography', 'comedy',
       'crime', 'documentary', 'drama', 'family', 'fantasy', 'history',
       'horror', 'music', 'musical', 'mystery', 'romance', 'sci_fi', 'sport',
       'thriller', 'war', 'western'],
      dtype='object')

In [4]:
language_df = df[["profit", "language", "country"]]
language_df

Unnamed: 0,profit,language,country
0,523505847.0,English,USA
1,523505847.0,English,USA
2,523505847.0,English,USA
3,523505847.0,English,USA
4,9404152.0,English,USA
...,...,...,...
11206,2033920.0,Spanish,USA
11207,2033920.0,Spanish,USA
11208,2033920.0,Spanish,USA
11209,2033920.0,Spanish,USA


In [14]:
language_df["country"].value_counts()

USA               8870
UK                 986
France             309
Germany            255
Canada             182
Australia          127
Spain               60
Hong Kong           46
China               46
New Zealand         38
Japan               35
Italy               25
South Korea         20
Denmark             18
Ireland             18
Mexico              15
Thailand            13
Norway              11
India               11
Russia              11
Czech Republic      10
Brazil              10
Netherlands          9
South Africa         9
Iran                 8
Argentina            7
Israel               6
Peru                 5
Taiwan               5
Romania              5
Hungary              4
Greece               4
West Germany         4
Poland               4
Indonesia            3
Aruba                3
Chile                3
Georgia              3
Belgium              3
Finland              2
New Line             2
Colombia             2
Official site        2
Iceland    

In [20]:
language_df.groupby(["country"])[["profit"]].agg("mean").sort_values(by = "profit", ascending=False).reset_index()

Unnamed: 0,country,profit
0,Taiwan,62086110.0
1,South Africa,34580080.0
2,New Zealand,25394090.0
3,USA,17826650.0
4,Japan,17203170.0
5,Peru,12362580.0
6,Mexico,7954199.0
7,Argentina,7510026.0
8,Australia,6008258.0
9,Official site,5218921.0


In [21]:
language_df.groupby(["language"])[["profit"]].agg("mean").sort_values(by = "profit", ascending=False).reset_index()

Unnamed: 0,language,profit
0,English,14636840.0
1,Maya,10859890.0
2,Persian,3070314.0
3,Indonesian,993179.0
4,Hebrew,783276.0
5,Romanian,595783.0
6,Zulu,-87637.0
7,Arabic,-239409.0
8,Vietnamese,-953049.0
9,,-1398153.0


In [15]:
language_df["language"].value_counts()

English       10792
French           97
Spanish          59
Mandarin         51
German           28
Cantonese        21
Japanese         20
Italian          15
Norwegian        11
Hindi            11
Portuguese       10
Korean           10
Thai             10
Dutch             9
Indonesian        7
Aboriginal        7
Danish            6
Hebrew            6
Persian           5
Mongolian         5
Czech             4
Maya              4
Filipino          3
Kazakh            3
Bosnian           3
Arabic            3
Russian           2
None              2
Zulu              2
Dari              2
Vietnamese        1
Aramaic           1
Romanian          1
Name: language, dtype: int64

## There is an overwhelming amount of english based movies, and they seem more profitable. Will this criteria improve our model?

In [5]:
language_df["is_english"] = [1 if x == "English" else 0 for x in language_df["language"]]
language_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  language_df["is_english"] = [1 if x == "English" else 0 for x in language_df["language"]]


Unnamed: 0,profit,language,country,is_english
0,523505847.0,English,USA,1
1,523505847.0,English,USA,1
2,523505847.0,English,USA,1
3,523505847.0,English,USA,1
4,9404152.0,English,USA,1
...,...,...,...,...
11206,2033920.0,Spanish,USA,0
11207,2033920.0,Spanish,USA,0
11208,2033920.0,Spanish,USA,0
11209,2033920.0,Spanish,USA,0


In [6]:
language_df["is_USA"] = [1 if x == "USA" else 0 for x in language_df["country"]]
language_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  language_df["is_USA"] = [1 if x == "USA" else 0 for x in language_df["country"]]


Unnamed: 0,profit,language,country,is_english,is_USA
0,523505847.0,English,USA,1,1
1,523505847.0,English,USA,1,1
2,523505847.0,English,USA,1,1
3,523505847.0,English,USA,1,1
4,9404152.0,English,USA,1,1
...,...,...,...,...,...
11206,2033920.0,Spanish,USA,0,1
11207,2033920.0,Spanish,USA,0,1
11208,2033920.0,Spanish,USA,0,1
11209,2033920.0,Spanish,USA,0,1


In [7]:
train_data, test_data = train_test_split(language_df, test_size=0.2, random_state=42)

# Save the training and testing data as CSV files
train_data.to_csv('team_train_lang.csv', index=False)
test_data.to_csv('team_test_lang.csv', index=False)

In [8]:
team_train_lang = pd.read_csv('team_train_lang.csv')
team_test_lang = pd.read_csv('team_test_lang.csv')

In [9]:
#Using the ols function to create an ols object. 'ols' stands for 'Ordinary least squares'
ols_object = smf.ols(formula = 'profit~is_english', data = team_train_lang)
model = ols_object.fit()
model.summary()

0,1,2,3
Dep. Variable:,profit,R-squared:,0.019
Model:,OLS,Adj. R-squared:,0.018
Method:,Least Squares,F-statistic:,169.7
Date:,"Sun, 05 Mar 2023",Prob (F-statistic):,1.8799999999999998e-38
Time:,22:05:45,Log-Likelihood:,-173200.0
No. Observations:,8968,AIC:,346400.0
Df Residuals:,8966,BIC:,346400.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-2.787e+07,3.18e+06,-8.761,0.000,-3.41e+07,-2.16e+07
is_english,4.226e+07,3.24e+06,13.028,0.000,3.59e+07,4.86e+07

0,1,2,3
Omnibus:,3453.071,Durbin-Watson:,2.028
Prob(Omnibus):,0.0,Jarque-Bera (JB):,125102.692
Skew:,1.177,Prob(JB):,0.0
Kurtosis:,21.146,Cond. No.,10.1
