## Data Preparation

In [1]:
import pandas as pd

In [2]:
pd.__version__

'1.3.4'

In [2]:
movies_data_path = '../dataset/movies.csv'
finantial_data_path = '../dataset/finantials.csv'
opening_data_path = '../dataset/opening_gross.csv'

In [77]:
fin_data = pd.read_csv(finantial_data_path)
movie_data = pd.read_csv(movies_data_path)
opening_data = pd.read_csv(opening_data_path)

In [78]:
movie_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5043 entries, 0 to 5042
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   title_year                 4935 non-null   float64
 1   aspect_ratio               4714 non-null   float64
 2   duration                   5028 non-null   float64
 3   cast_total_facebook_likes  5043 non-null   int64  
 4   budget                     4551 non-null   float64
 5   imdb_score                 5043 non-null   float64
 6   gross                      4159 non-null   float64
 7   movie_title                5043 non-null   object 
dtypes: float64(6), int64(1), object(1)
memory usage: 315.3+ KB


In [79]:
colnames_numerics = movie_data.select_dtypes('number').columns.tolist()

In [80]:

movie_data = movie_data[colnames_numerics+['movie_title']]

In [81]:
movie_data.head(5)

Unnamed: 0,title_year,aspect_ratio,duration,cast_total_facebook_likes,budget,imdb_score,gross,movie_title
0,2009.0,1.78,178.0,4834,237000000.0,7.9,760505847.0,Avatar
1,2007.0,2.35,169.0,48350,300000000.0,7.1,309404152.0,Pirates of the Caribbean: At World's End
2,2015.0,2.35,148.0,11700,245000000.0,6.8,200074175.0,Spectre
3,2012.0,2.35,164.0,106759,250000000.0,8.5,448130642.0,The Dark Knight Rises
4,,,,143,,7.1,,Star Wars: Episode VII - The Force Awakens ...


In [82]:
fin_data = fin_data[['movie_title','production_budget','worldwide_gross']]

In [83]:
fin_movie_data = pd.merge(fin_data, movie_data, on= 'movie_title', how='left')

In [84]:
full_movie_data = pd.merge( opening_data,fin_movie_data, on = 'movie_title', how='left')

In [87]:
full_movie_data.isnull().sum()

movie_title                   0
opening_gross                25
screens                      83
production_budget             0
worldwide_gross               0
title_year                    0
aspect_ratio                 16
duration                      2
cast_total_facebook_likes     0
budget                       80
imdb_score                    0
gross                        24
dtype: int64

In [88]:
full_movie_data.shape

(2304, 12)

In [92]:
full_movie_data[(full_movie_data.worldwide_gross != 0) & (full_movie_data.worldwide_gross.notnull())].shape

(2304, 12)

In [93]:
full_movie_data = full_movie_data.drop(['movie_title','gross'],axis=1)

In [94]:
full_movie_data.shape

(2304, 10)

In [95]:
full_movie_data.columns

Index(['opening_gross', 'screens', 'production_budget', 'worldwide_gross',
       'title_year', 'aspect_ratio', 'duration', 'cast_total_facebook_likes',
       'budget', 'imdb_score'],
      dtype='object')

## Modeling

In [75]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_validate
import numpy as np

In [96]:
X = full_movie_data.drop(['worldwide_gross'], axis = 1)
y = full_movie_data['worldwide_gross']

In [97]:
pipeline = Pipeline([
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('core_model', GradientBoostingRegressor())
])

In [98]:
results = cross_validate(pipeline ,X,y,return_train_score=True,cv=10)
results

{'fit_time': array([0.88479304, 0.48088694, 0.47488785, 0.47688961, 0.47585297,
        0.47492003, 0.49485278, 0.46890831, 0.46889091, 0.46685529]),
 'score_time': array([0.00699997, 0.00399971, 0.0070014 , 0.00503254, 0.00499868,
        0.00500178, 0.00599837, 0.00498414, 0.00500011, 0.00499916]),
 'test_score': array([0.67401976, 0.85106446, 0.64392674, 0.77896441, 0.78399079,
        0.8645305 , 0.75904184, 0.87472207, 0.6565591 , 0.65745399]),
 'train_score': array([0.91673951, 0.91581777, 0.9228721 , 0.91654412, 0.92172829,
        0.91476722, 0.92151444, 0.91734995, 0.92320705, 0.91766026])}

In [99]:
train_score = np.mean(results['train_score'])
test_score = np.mean(results['test_score'])
assert train_score > 0.7
assert test_score > 0.65
print(f'Train Score: {train_score}')
print(f'Test Score: {test_score}')

Train Score: 0.918820072167686
Test Score: 0.754427366793438


## Hyperparameter tunning

In [100]:
from sklearn.model_selection import GridSearchCV

In [101]:
param_tunning = {'core_model__n_estimators': range(20,501,20)} 

In [102]:
estimator = Pipeline([
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('core_model', GradientBoostingRegressor())
])

In [103]:
grid_search= GridSearchCV(estimator,
                       param_grid = param_tunning,
                       scoring='r2',
                       cv=5) 

In [104]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.35,random_state= 42)

In [105]:
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('imputer', SimpleImputer()),
                                       ('core_model',
                                        GradientBoostingRegressor())]),
             param_grid={'core_model__n_estimators': range(20, 501, 20)},
             scoring='r2')

In [106]:
final_result = cross_validate(grid_search.best_estimator_,X_train,y_train,return_train_score=True,cv=7)

In [107]:
train_score = np.mean(final_result['train_score'])
test_score = np.mean(final_result['test_score'])
assert train_score > 0.7
assert test_score > 0.65
print(f'Train Score: {train_score}')
print(f'Test Score: {test_score}')

Train Score: 0.9716090255265606
Test Score: 0.7627039035608394


In [110]:
grid_search.best_estimator_.get_params()

{'memory': None,
 'steps': [('imputer', SimpleImputer()),
  ('core_model', GradientBoostingRegressor(n_estimators=260))],
 'verbose': False,
 'imputer': SimpleImputer(),
 'core_model': GradientBoostingRegressor(n_estimators=260),
 'imputer__add_indicator': False,
 'imputer__copy': True,
 'imputer__fill_value': None,
 'imputer__missing_values': nan,
 'imputer__strategy': 'mean',
 'imputer__verbose': 0,
 'core_model__alpha': 0.9,
 'core_model__ccp_alpha': 0.0,
 'core_model__criterion': 'friedman_mse',
 'core_model__init': None,
 'core_model__learning_rate': 0.1,
 'core_model__loss': 'squared_error',
 'core_model__max_depth': 3,
 'core_model__max_features': None,
 'core_model__max_leaf_nodes': None,
 'core_model__min_impurity_decrease': 0.0,
 'core_model__min_samples_leaf': 1,
 'core_model__min_samples_split': 2,
 'core_model__min_weight_fraction_leaf': 0.0,
 'core_model__n_estimators': 260,
 'core_model__n_iter_no_change': None,
 'core_model__random_state': None,
 'core_model__subsample'

In [111]:
estimator = Pipeline([
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('core_model', GradientBoostingRegressor(n_estimators=260,
                                             alpha=0.9,
                                             ccp_alpha=0.0,
                                             criterion='friedman_mse',
                                             init=None,
                                             learning_rate=0.1,
                                             loss='squared_error',
                                             max_depth=3,
                                             max_features=None,
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_iter_no_change=None,
                                             random_state=None,
                                             subsample=1.0,
                                             tol=0.0001,
                                             validation_fraction=0.1,
                                             verbose=0,
                                             warm_start=False))
])

In [112]:
estimator.fit(X_train,y_train)

Pipeline(steps=[('imputer', SimpleImputer()),
                ('core_model', GradientBoostingRegressor(n_estimators=260))])

In [114]:
estimator.score(X_test, y_test)

0.7299850823653558

## Saving model

In [115]:
from joblib import dump

In [116]:
dump(estimator, '../model/model.pkl')

['../model/model.pkl']

In [117]:
X_train.columns

Index(['opening_gross', 'screens', 'production_budget', 'title_year',
       'aspect_ratio', 'duration', 'cast_total_facebook_likes', 'budget',
       'imdb_score'],
      dtype='object')