<a href="https://colab.research.google.com/github/dynasty-29/carbon_emmision_streamlit_prototype_app/blob/main/predictor_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#importing libraries
import pandas as pd
import numpy as np

from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
# Read the cleaned dataset from the notebook's /content directory.
data = pd.read_csv(filepath_or_buffer='/content/data')

In [3]:
# Preview the first five rows to sanity-check the loaded columns.
data.head(n=5)

Unnamed: 0,Country,ISO 3166-1 alpha-3,Year,Total,Coal,Oil,Gas,Cement,Flaring,Per Capita
0,Afghanistan,AFG,2015,9.791093,2.843264,6.624512,0.282128,0.041189,0.0,0.290076
1,Afghanistan,AFG,2016,9.067598,2.878736,5.794097,0.318639,0.076126,0.0,0.261795
2,Afghanistan,AFG,2017,9.867969,3.447824,6.074912,0.300448,0.044785,0.0,0.276852
3,Afghanistan,AFG,2018,10.818048,3.8472,6.620848,0.29312,0.05688,0.0,0.294876
4,Afghanistan,AFG,2019,11.081621,3.954767,6.842956,0.245569,0.038329,0.0,0.293401


In [4]:
# Drop the identifier columns that are not used as model inputs.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it once the columns are already gone no
# longer raises KeyError. `columns=` already selects the axis, so the
# redundant `axis=1` is omitted.
data = data.drop(columns=['Year', 'Country', 'ISO 3166-1 alpha-3'], errors='ignore')

# Show the variables that remain available for modelling.
print(data.columns)

Index(['Total', 'Coal', 'Oil', 'Gas', 'Cement', 'Flaring', 'Per Capita'], dtype='object')


In [5]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [6]:
# The regression target is the 'Per Capita' column ...
y = data['Per Capita']

# ... and every remaining column is used as an input feature.
X = data.drop(columns=['Per Capita'])

In [7]:
# Hold out 20% of the rows as the final test set.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)
# Split the remaining 80% again: 25% of it (20% of all rows) becomes the
# validation set, leaving 60% of all rows for training.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=0
)

In [8]:
# Base estimator; the hyperparameters to try are listed in the grid below.
xgb1 = XGBRegressor()

# Hyperparameter grid: five settings with three candidates each
# (3**5 = 243 combinations); objective and n_estimators are held fixed.
parameters = dict(
    objective=['reg:squarederror'],
    learning_rate=[0.0001, 0.001, 0.01],
    max_depth=[3, 5, 7],
    min_child_weight=[3, 5, 7],
    subsample=[0.1, 0.5, 1.0],
    colsample_bytree=[0.1, 0.5, 1.0],
    n_estimators=[500],
)

In [9]:
# Exhaustive search over `parameters` with 3-fold cross-validation,
# using all available cores and no progress output.
xgb_grid = GridSearchCV(
    estimator=xgb1,
    param_grid=parameters,
    cv=3,
    n_jobs=-1,
    verbose=0,
)

In [10]:
# Run the grid search on the training split only; the validation and test
# splits are not passed in here.
xgb_grid.fit(X=X_train, y=y_train)

GridSearchCV(cv=3, estimator=XGBRegressor(), n_jobs=-1,
             param_grid={'colsample_bytree': [0.1, 0.5, 1.0],
                         'learning_rate': [0.0001, 0.001, 0.01],
                         'max_depth': [3, 5, 7], 'min_child_weight': [3, 5, 7],
                         'n_estimators': [500],
                         'objective': ['reg:squarederror'],
                         'subsample': [0.1, 0.5, 1.0]})

In [11]:
# Report the best mean cross-validated score and the parameters behind it.
print(xgb_grid.best_score_, xgb_grid.best_params_, sep='\n')

0.7005956734469501
{'colsample_bytree': 0.5, 'learning_rate': 0.01, 'max_depth': 7, 'min_child_weight': 3, 'n_estimators': 500, 'objective': 'reg:squarederror', 'subsample': 1.0}


In [12]:
# Keep a handle on the estimator configured with the best grid-search parameters.
xgb_cv = xgb_grid.best_estimator_

In [13]:
# Datasets monitored during fitting: the training split first, then the
# validation split.
eval_set = [
    (X_train, y_train),
    (X_val, y_val),
]

In [14]:
# Refit the best estimator with early stopping, tracking MAE on `eval_set`.
#
# `eval_metric` and `early_stopping_rounds` are set on the estimator rather
# than passed to fit(): xgboost deprecated them as fit() arguments in 1.6 and
# removed them in 2.0, where the old call raises TypeError.
# NOTE(review): this form requires xgboost >= 1.6.
# NOTE(review): xgboost documents that the last entry of eval_set drives early
# stopping, which here would be the validation split. Confirm for the
# installed version.
xgb_cv.set_params(eval_metric='mae', early_stopping_rounds=50)

fit_model = xgb_cv.fit(
    X_train,
    y_train,
    eval_set=eval_set,
    verbose=False,
)

In [15]:
# Score the fitted model on the validation split. Predictions are computed
# once and reused for all three metrics.
val_pred = fit_model.predict(X_val)
for label, metric in (("MAE:", mean_absolute_error),
                      ("MSE:", mean_squared_error),
                      ("R2:", r2_score)):
    print(label, metric(y_val, val_pred))

MAE: 1.423950374275143
MSE: 5.039964254189755
R2: 0.8265221437772832


In [16]:
# Score the fitted model on the held-out test split. Predictions are computed
# once and reused for all three metrics.
test_pred = fit_model.predict(X_test)
for label, metric in (("MAE:", mean_absolute_error),
                      ("MSE:", mean_squared_error),
                      ("R2:", r2_score)):
    print(label, metric(y_test, test_pred))

MAE: 1.459400357910397
MSE: 4.608591199734939
R2: 0.7933383178990873


In [17]:
# Persist the fitted model to 'xgb_model.json' in the working directory.
fit_model.save_model(fname='xgb_model.json')

In [18]:
# Also save the fitted estimator as a pickle file.
# The `with` block guarantees the file handle is closed even if
# pickle.dump raises part-way through.

import pickle

with open('carbonemissionmodel.pkl', 'wb') as file:
    pickle.dump(fit_model, file)