## Data Preparation

In [3]:
!pip install pandas
import pandas as pd

Collecting pandas
  Using cached pandas-2.0.3-cp310-cp310-win_amd64.whl (10.7 MB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2023.3-py2.py3-none-any.whl (502 kB)
Collecting tzdata>=2022.1 (from pandas)
  Using cached tzdata-2023.3-py2.py3-none-any.whl (341 kB)
Collecting numpy>=1.21.0 (from pandas)
  Downloading numpy-1.25.1-cp310-cp310-win_amd64.whl (15.0 MB)
                                              0.0/15.0 MB ? eta -:--:--
                                              0.0/15.0 MB ? eta -:--:--
                                             0.0/15.0 MB 495.5 kB/s eta 0:00:31
                                             0.1/15.0 MB 544.7 kB/s eta 0:00:28
                                             0.1/15.0 MB 438.9 kB/s eta 0:00:35
                                             0.1/15.0 MB 467.6 kB/s eta 0:00:32
                                             0.1/15.0 MB 481.4 kB/s eta 0:00:31
                                             0.2/15.0 MB 459.5 kB/s eta 0:00:3

In [4]:
movies_data_path = '../dataset/movies.csv'
finantial_data_path = '../dataset/finantials.csv'
opening_data_path = '../dataset/opening_gross.csv'

In [5]:
fin_data = pd.read_csv(finantial_data_path)
movie_data = pd.read_csv(movies_data_path)
opening_data = pd.read_csv(opening_data_path)

In [6]:
numeric_columns_mask = (movie_data.dtypes == float) | (movie_data.dtypes == int)
numeric_columns = [column for column in numeric_columns_mask.index if numeric_columns_mask[column]]
movie_data = movie_data[numeric_columns+['movie_title']]

In [7]:
fin_data = fin_data[['movie_title','production_budget','worldwide_gross']]

In [8]:
fin_movie_data = pd.merge(fin_data, movie_data, on= 'movie_title', how='left')

In [10]:
full_movie_data = pd.merge( opening_data,fin_movie_data, on = 'movie_title', how='left')

In [11]:
full_movie_data[(full_movie_data.worldwide_gross != 0) & (full_movie_data.worldwide_gross.notnull())].shape

(2304, 11)

In [12]:
full_movie_data = full_movie_data.drop(['movie_title','gross'],axis=1)

In [13]:
full_movie_data.columns

Index(['opening_gross', 'screens', 'production_budget', 'worldwide_gross',
       'title_year', 'aspect_ratio', 'duration', 'budget', 'imdb_score'],
      dtype='object')

## Modeling

In [15]:
!pip install scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_validate
import numpy as np

Collecting scikit-learn
  Using cached scikit_learn-1.3.0-cp310-cp310-win_amd64.whl (9.2 MB)
Collecting scipy>=1.5.0 (from scikit-learn)
  Using cached scipy-1.11.1-cp310-cp310-win_amd64.whl (44.0 MB)
Collecting joblib>=1.1.1 (from scikit-learn)
  Using cached joblib-1.3.1-py3-none-any.whl (301 kB)
Collecting threadpoolctl>=2.0.0 (from scikit-learn)
  Using cached threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn
Successfully installed joblib-1.3.1 scikit-learn-1.3.0 scipy-1.11.1 threadpoolctl-3.1.0


In [16]:
X = full_movie_data.drop(['worldwide_gross'], axis = 1)
y = full_movie_data['worldwide_gross']

In [17]:
pipeline = Pipeline([
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('core_model', GradientBoostingRegressor())
])

In [18]:
results = cross_validate(pipeline ,X,y,return_train_score=True,cv=10)
results

{'fit_time': array([1.58731246, 2.22308278, 1.48026586, 1.46880007, 1.44086313,
        1.42003059, 1.40931082, 1.40800524, 1.40927339, 1.38363266]),
 'score_time': array([0.0183537 , 0.00678182, 0.00505304, 0.        , 0.        ,
        0.00251651, 0.0037148 , 0.00261211, 0.        , 0.        ]),
 'test_score': array([0.66049965, 0.84786774, 0.64125353, 0.78401314, 0.77510459,
        0.86241128, 0.76320159, 0.85147252, 0.67303404, 0.65020744]),
 'train_score': array([0.91105346, 0.9140962 , 0.9183163 , 0.91613755, 0.91832576,
        0.91492969, 0.91866643, 0.91378583, 0.9201035 , 0.91384172])}

In [19]:
train_score = np.mean(results['train_score'])
test_score = np.mean(results['test_score'])
assert train_score > 0.7
assert test_score > 0.65
print(f'Train Score: {train_score}')
print(f'Test Score: {test_score}')

Train Score: 0.9159256441060745
Test Score: 0.7509065522067779


## Hyperparameter tunning

In [20]:
from sklearn.model_selection import GridSearchCV

In [21]:
param_tunning = {'core_model__n_estimators': range(20,501,20)} 

In [22]:
estimator = Pipeline([
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('core_model', GradientBoostingRegressor())
])

In [23]:
grid_search= GridSearchCV(estimator,
                       param_grid = param_tunning,
                       scoring='r2',
                       cv=5) 

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.35,random_state= 42)

In [25]:
grid_search.fit(X_train, y_train)

In [26]:
final_result = cross_validate(grid_search.best_estimator_,X_train,y_train,return_train_score=True,cv=7)

In [27]:
train_score = np.mean(final_result['train_score'])
test_score = np.mean(final_result['test_score'])
assert train_score > 0.7
assert test_score > 0.65
print(f'Train Score: {train_score}')
print(f'Test Score: {test_score}')

Train Score: 0.9627561411785589
Test Score: 0.7639459819930264


In [28]:
grid_search.best_estimator_.get_params()

{'memory': None,
 'steps': [('imputer', SimpleImputer()),
  ('core_model', GradientBoostingRegressor(n_estimators=200))],
 'verbose': False,
 'imputer': SimpleImputer(),
 'core_model': GradientBoostingRegressor(n_estimators=200),
 'imputer__add_indicator': False,
 'imputer__copy': True,
 'imputer__fill_value': None,
 'imputer__keep_empty_features': False,
 'imputer__missing_values': nan,
 'imputer__strategy': 'mean',
 'core_model__alpha': 0.9,
 'core_model__ccp_alpha': 0.0,
 'core_model__criterion': 'friedman_mse',
 'core_model__init': None,
 'core_model__learning_rate': 0.1,
 'core_model__loss': 'squared_error',
 'core_model__max_depth': 3,
 'core_model__max_features': None,
 'core_model__max_leaf_nodes': None,
 'core_model__min_impurity_decrease': 0.0,
 'core_model__min_samples_leaf': 1,
 'core_model__min_samples_split': 2,
 'core_model__min_weight_fraction_leaf': 0.0,
 'core_model__n_estimators': 200,
 'core_model__n_iter_no_change': None,
 'core_model__random_state': None,
 'core_m

In [29]:
estimator = Pipeline([
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('core_model', GradientBoostingRegressor(n_estimators=220,
                                             alpha=0.9,
                                             ccp_alpha=0.0,
                                             criterion='friedman_mse',
                                             init=None,
                                             learning_rate=0.1,
                                             loss='squared_error',
                                             max_depth=3,
                                             max_features=None,
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_iter_no_change=None,
                                             random_state=None,
                                             subsample=1.0,
                                             tol=0.0001,
                                             validation_fraction=0.1,
                                             verbose=0,
                                             warm_start=False))
])

In [30]:
estimator.fit(X_train,y_train)

In [31]:
estimator.score(X_test, y_test)

0.7406643130832358

## Saving model

In [32]:
from joblib import dump

In [33]:
dump(estimator, '../model/model.pkl')

['../model/model.pkl']

In [34]:
X_train.columns

Index(['opening_gross', 'screens', 'production_budget', 'title_year',
       'aspect_ratio', 'duration', 'budget', 'imdb_score'],
      dtype='object')