In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeRegressor
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression

from sklearn.preprocessing import LabelEncoder
#from sklearn.metrics import mean_absolute_percentage_error
from statsmodels.tsa.seasonal import STL


from xgboost import XGBRegressor
import warnings

warnings.filterwarnings("ignore")

---
# Read Data
---

In [2]:
df = pd.read_csv("../data/_all_data.csv")

In [3]:
df = df[~((df["yil"] == 2021) & (df["ay"] == 6))]

---
# EDA
---

In [4]:
def mean_absolute_percentage_error(y_true, y_pred): 
    return np.mean(np.abs((np.array(y_true) - np.array(y_pred)) / y_true)) * 100

In [5]:
df.shape

(203381, 58)

In [6]:
df.columns

Index(['yil', 'ay', 'grup_adi', 'ana_kategori_adi', 'kategori_adi',
       'marka_adi', 'urun_adi', 'en_guncel_kod', 'koli_i̇ci_adet', 'koli',
       'kg', 'tl', 'adet', 'date', 'portfoy', 'satis_var', 'enflasyon_etkisi',
       'yarisma', 'peak', 'fiyat', 'fiyat_gecisi', 'promosyon_tutari',
       'ciro_kull_i̇ade_dus', 'aktivite_tipi', 'indirim__', 'no_of_days',
       'weekdays_n', 'weekdays_ratio', 'weekend_n', 'weekend_ratio',
       'actual_holiday_n', 'actual_holiday_ratio', 'total_holiday_n',
       'total_holiday_ratio', 'school_day_n', 'school_day_ratio',
       'school_day_brdg_n', 'school_day_brdg_ratio', 'ramadan_n',
       'ramadan_ratio', 'pandemic', 'lockdown', 'gozlem_sayisi',
       'toplam_satir', 'oran', 'baslangic_tarih', 'bitis_tarih',
       'son_kac_ay_eksik', 'eksik_repeat_sayisi', 'scope', 'indirim__bins',
       'new_adet', 'season', 'trend', 'residual', 'adet_flag', 'Kanal',
       'scope_type'],
      dtype='object')

In [34]:
df.date = pd.to_datetime(df.date, format='%Y-%m-%d', errors="coerce")

In [35]:
df_reg = df[(df["scope"] == 3) & (df["portfoy"] == 1)]

In [36]:
test2 = df_reg[(df_reg["urun_adi"] == "ULK.ÇİK.GOF. BEYAZ 35Gx36x6") & (df_reg["grup_adi"] == "ŞOK")]
test = df_reg[(df_reg["urun_adi"] == "ULK.ÇİK.GOF. BEYAZ 35Gx36x6") & (df_reg["grup_adi"] == "ŞOK")]

In [37]:
def trend_seasonality_decomp(data):
    df_all = []
    for sku in data["en_guncel_kod"].unique():
        for grup in data["grup_adi"].unique():
            temp_df = data[(data["en_guncel_kod"] == sku) & 
                           (data["grup_adi"] == grup)]

            if len(temp_df) > 2:
                df_ts = temp_df[['new_adet','date']]
                df_ts.set_index('date',inplace=True)

                result = STL(df_ts).fit()
                temp_df['season'] = list(result.seasonal)
                temp_df['trend']  = list(result.trend)
                temp_df['residual']  = list(result.resid)
                df_all.append(temp_df)
            else:
                pass
    df_all = pd.concat(df_all)
    return df_all

In [38]:
for sku in df_reg.urun_adi.unique():
    for grp in df_reg.grup_adi.unique():
        temp_df = df_reg[(df_reg["urun_adi"] == sku) & (df_reg["grup_adi"] == grp)]
        if len(temp_df) > 1:
            if len(temp_df.trend.value_counts()) > 1:
                pass
            else:
                print(sku, grp)
        else:
            pass

In [39]:
test3 = df_reg[(df_reg["urun_adi"] == "ULK.ÇİK.GOF. BEYAZ 35Gx36x6") & (df_reg["grup_adi"] == "ŞOK")]

In [40]:
df_reg[(df_reg["urun_adi"] == "ULK.ÇİK.GOF. BEYAZ 35Gx36x6") & (df_reg["grup_adi"] == "ŞOK")].trend.value_counts()

2.255116e+06    2
2.255116e+06    1
2.255116e+06    1
2.255116e+06    1
2.255116e+06    1
2.255116e+06    1
2.255116e+06    1
2.255116e+06    1
Name: trend, dtype: int64

In [81]:
cols_to_drop = ['en_guncel_kod', 'koli_i̇ci_adet', 'koli',
                'kg', 'tl', 'adet', 'date', 'satis_var', 'promosyon_tutari',
                'ciro_kull_i̇ade_dus', 'weekdays_ratio', 'weekend_ratio',
                'total_holiday_ratio', 'school_day_ratio', 'school_day_brdg_n', 'school_day_brdg_ratio',
                'ramadan_ratio', 'gozlem_sayisi', 'actual_holiday_ratio',
                'toplam_satir', 'oran', 'baslangic_tarih', 'bitis_tarih',
                'son_kac_ay_eksik', 'eksik_repeat_sayisi', 'adet_flag', 
                'scope_type']

In [82]:
df_droped = df[cols_to_drop]

In [83]:
df.drop(columns=cols_to_drop, axis=1, inplace=True)

In [84]:
df.columns

Index(['yil', 'ay', 'grup_adi', 'ana_kategori_adi', 'kategori_adi',
       'marka_adi', 'urun_adi', 'portfoy', 'enflasyon_etkisi', 'yarisma',
       'peak', 'fiyat', 'fiyat_gecisi', 'aktivite_tipi', 'indirim__',
       'no_of_days', 'weekdays_n', 'weekend_n', 'actual_holiday_n',
       'total_holiday_n', 'school_day_n', 'ramadan_n', 'pandemic', 'lockdown',
       'scope', 'indirim__bins', 'new_adet', 'season', 'trend', 'residual',
       'Kanal'],
      dtype='object')

In [85]:
(df.isna().sum() / len(df))*100

yil                 0.0000
ay                  0.0000
grup_adi            0.0000
ana_kategori_adi    0.0000
kategori_adi        0.0000
marka_adi           0.0000
urun_adi            0.0000
portfoy             0.0000
enflasyon_etkisi    0.0000
yarisma             0.0000
peak                0.0000
fiyat               6.7268
fiyat_gecisi        6.7268
aktivite_tipi      98.9950
indirim__           0.0000
no_of_days          0.0000
weekdays_n          0.0000
weekend_n           0.0000
actual_holiday_n    0.0000
total_holiday_n     0.0000
school_day_n        0.0000
ramadan_n           0.0000
pandemic            0.0000
lockdown            0.0000
scope               0.0000
indirim__bins       0.0000
new_adet            0.0000
season             89.6534
trend              89.6534
residual           89.6534
Kanal               0.0000
dtype: float64

In [86]:
df.rename(columns={"Kanal": "kanal"}, inplace=True)

---
# Regresyon
---

In [87]:
df_reg = df[df["scope"] == 3]

## Aktivite Tipi silindi çünkü %97'si missing

In [88]:
df_reg.drop(columns=["aktivite_tipi"], axis=1, inplace=True)

In [89]:
(df_reg.isna().sum() / len(df_reg))*100

yil                0.0000
ay                 0.0000
grup_adi           0.0000
ana_kategori_adi   0.0000
kategori_adi       0.0000
marka_adi          0.0000
urun_adi           0.0000
portfoy            0.0000
enflasyon_etkisi   0.0000
yarisma            0.0000
peak               0.0000
fiyat              6.2919
fiyat_gecisi       6.2919
indirim__          0.0000
no_of_days         0.0000
weekdays_n         0.0000
weekend_n          0.0000
actual_holiday_n   0.0000
total_holiday_n    0.0000
school_day_n       0.0000
ramadan_n          0.0000
pandemic           0.0000
lockdown           0.0000
scope              0.0000
indirim__bins      0.0000
new_adet           0.0000
season             0.0000
trend              0.0000
residual           0.0000
kanal              0.0000
dtype: float64

In [90]:
df_reg["enflasyon_etkisi"].fillna(method="ffill", inplace=True)

In [91]:
df_reg["fiyat"].fillna(method="ffill", inplace=True)
df_reg["fiyat_gecisi"].fillna(value=0, inplace=True)

In [92]:
df_reg = df_reg[df_reg["portfoy"] == 1]

In [93]:
df_reg.drop(columns=["portfoy", "scope", "yarisma"], axis=1, inplace=True)

In [94]:
df_reg.columns

Index(['yil', 'ay', 'grup_adi', 'ana_kategori_adi', 'kategori_adi',
       'marka_adi', 'urun_adi', 'enflasyon_etkisi', 'peak', 'fiyat',
       'fiyat_gecisi', 'indirim__', 'no_of_days', 'weekdays_n', 'weekend_n',
       'actual_holiday_n', 'total_holiday_n', 'school_day_n', 'ramadan_n',
       'pandemic', 'lockdown', 'indirim__bins', 'new_adet', 'season', 'trend',
       'residual', 'kanal'],
      dtype='object')

In [21]:
X_vars = ['yil', 'ay', 'kanal', 'grup_adi', 'ana_kategori_adi', 
          'kategori_adi', 'marka_adi', 'urun_adi', 'enflasyon_etkisi', 
          'peak', 'indirim__bins', 'lockdown', 'season', 'trend']

In [22]:
y_vars = ["new_adet"]

In [23]:
df_reg = df_reg[X_vars+y_vars]

In [24]:
df_reg.urun_adi.unique()

array(['ULK TOZ KAK.YENİ TASARIM 50Gx12x12',
       'LAVİVA DOLG.VE BİSK.ÇİK.35Gx24x6', 'ULK.ÇİK.GOF. BEYAZ 35Gx36x6',
       'CARAMIO K.MELLİ BAT.ÇİK.35Gx24x6',
       'ULK KARE ÇİK.ÜZÜMLÜ FIN.SÜT.65Gx6x6', 'ULK ÇİK.KARE FIN.70Gx6x6',
       'CARAMIO KARE ÇİK.K.MELLİ 60Gx12x6', 'ULK ÇİK.KARE A.FIS.70Gx6x6',
       'ALBENİ KAPL.BAR B.BOY 56Gx24x6', 'YUPO DRAJE DOYPACK 111Gx24',
       'ÇOKONAT KAPL.GOF.5x33Gx24', 'ALBENİ KAPL.BAR 5x40Gx24',
       'COCOSTAR H.CEV.BAR 28Gx24x6 YENİ', 'ULK ÇİK.GOF.5x36Gx24',
       'ÇOKOKREM CAM KAV.700Gx6', 'ALTINBAŞAK Ç.OTLU KİNOA KR.5x40Gx18',
       'ÇOKOKREM SAKLAMA KABI 1000Gx6', 'HOBBY MİNİ İKR.POŞET 250Gx12',
       'METRO KAPL.BAR 5x40Gx18', 'ULK ÇİK.KARE SÜT.70Gx6x6',
       'ULK DAMLA ÇİK.BİT.150Gx12', 'HALLEY KARADUT DOL.7x33,7Gx12',
       'OLALA SUFLE MINI 162Gx12', 'YUPO JELLY PORTAKAL HALKASI 70GRX24',
       'İKRAM KRE.BİSK.ÇİK.3x92Gx12', 'ULK BEBE BİSK.2x500Gx4',
       'ULK PÖTİBÖR BİSK.450Gx10', 'ULK KUVERTÜR %54 BİT.TAB.200Gx6',
    

In [25]:
df_reg[df_reg["urun_adi"] == "ULK.ÇİK.GOF. BEYAZ 35Gx36x6"].grup_adi.value_counts()

ŞOK                  9
Diğer_Pasifik        9
POTANSİYEL MARKET    8
MİGROS               8
BTT                  8
YEREL ZİNCİR         8
BİM                  8
GELENEKSEL KANAL     8
ORTA MARKET          8
Diğer_Horizon        8
Name: grup_adi, dtype: int64

In [69]:
test2 = df_reg[(df_reg["urun_adi"] == "ULK.ÇİK.GOF. BEYAZ 35Gx36x6") & (df_reg["grup_adi"] == "ŞOK")]

In [27]:
test.drop(columns=['kanal', 'grup_adi', 'ana_kategori_adi', 
                   'kategori_adi', 'marka_adi', 'urun_adi'], axis=1, inplace=True)

In [28]:
for idx in test.iloc[-3:].index:
    test.loc[idx, "season"] = test.loc[list(range(idx-1, idx-7, -1)), "season"].mean()
    test.loc[idx, "trend"] = test.loc[list(range(idx-1, idx-7, -1)), "trend"].mean()

In [29]:
X_train, X_test, y_train, y_test = test.iloc[:-3, :-1], test.iloc[-3:, :-1], test.iloc[:-3, -1:], test.iloc[-3:, -1:]

In [30]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((6, 8), (3, 8), (6, 1), (3, 1))

---
# Linear Regression
---

In [31]:
from sklearn.linear_model import LinearRegression

In [32]:
lm = LinearRegression()
lm.fit(X_train, y_train)
yhat_lm = lm.predict(X_test)

In [33]:
print("MAPE:", mean_absolute_percentage_error(np.array(y_test), yhat_lm))

MAPE: 132.62121458449633


In [34]:
yhat_lm

array([[2787319.41927052],
       [3184323.51063323],
       [3519226.95055639]])

In [35]:
y_test

Unnamed: 0,new_adet
4555,2665518.8711
4556,1314006.8711
4557,1002750.8711


---
# KNN
---

In [37]:
#GridSearch to determine best parameters
param_grid = {
    'algorithm': ['auto', 'ball_tree'],
    'leaf_size': [30, 50, 75],
    'weights' : ['uniform', 'distance'],
    'metric' : ['euclidean', 'manhattan'],
    'n_neighbors' : [2,3,4]
}

gsm = GridSearchCV(KNeighborsRegressor(),
                   param_grid=param_grid,
                   cv=5,
                   n_jobs = -1,
                   verbose=2,
                   refit=True,
                   scoring='neg_mean_absolute_error')

gsm.fit(X_train, y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   19.6s
[Parallel(n_jobs=-1)]: Done 312 tasks      | elapsed:   21.7s
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:   21.9s finished


GridSearchCV(cv=5, estimator=KNeighborsRegressor(), n_jobs=-1,
             param_grid={'algorithm': ['auto', 'ball_tree'],
                         'leaf_size': [30, 50, 75],
                         'metric': ['euclidean', 'manhattan'],
                         'n_neighbors': [2, 3, 4],
                         'weights': ['uniform', 'distance']},
             scoring='neg_mean_absolute_error', verbose=2)

In [38]:
best_params_ = gsm.best_params_

In [39]:
yhat_knn = gsm.predict(X_test)

In [40]:
y_test

Unnamed: 0,new_adet
4555,2665518.8711
4556,1314006.8711
4557,1002750.8711


In [41]:
yhat_knn

array([[2787319.41927995],
       [3826958.77390231],
       [4015312.80062742]])

In [42]:
print("MAPE:", mean_absolute_percentage_error(y_test, yhat_knn))

MAPE: new_adet   165.4142
dtype: float64


---
# XGBoost
---

In [43]:
#GridSearch to determine best parameters
param_grid = {'colsample_bytree': [0.4, 0.7],
              'booster': ["gbtree", "gblinear", "dart"],
 'learning_rate': [0.1, 0.001],
 'max_depth': [15, 20],
# 'min_child_weight': [5,6,7],
 'n_estimators': [500],
 'objective': ['reg:squarederror'],
# 'subsample': [0.5, 0.6, 0.7],
             'random_state': [42]}


gsm = GridSearchCV(XGBRegressor(),
                   param_grid=param_grid,
                   cv=5,
                   n_jobs=-1,
                   verbose=2,
                   refit=True,
                   scoring='neg_mean_absolute_error')

gsm.fit(X_train, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    5.3s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:  2.0min finished


Parameters: { "colsample_bytree", "max_depth" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None, gamma=None,
                                    gpu_id=None, importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=...
                                    scale_pos_weight=None, subsample=None,
                                    tree_method=None, validate_parameters=None,
                                    verbosity=None),
             n_jobs=-1,
             param_grid={'booster': ['gbtree', 'gblinear', '

In [44]:
best_params_ = gsm.best_params_
print(gsm.best_params_)

{'booster': 'gblinear', 'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 15, 'n_estimators': 500, 'objective': 'reg:squarederror', 'random_state': 42}


In [45]:
yhat_xgb = gsm.predict(X_test)

In [46]:
yhat_xgb

array([2878027.2, 3303018.2, 3607729. ], dtype=float32)

In [47]:
y_test

Unnamed: 0,new_adet
4555,2665518.8711
4556,1314006.8711
4557,1002750.8711


In [48]:
print("MAPE:", mean_absolute_percentage_error(np.array(y_test), yhat_xgb))

MAPE: 132.0428105277991


---
# Decision Tree
---

In [49]:
#GridSearch to determine best parameters

param_grid = {'criterion':['mse', 'mae'],
              'max_depth': [i for i in range(3, 11)],
              'min_samples_leaf': [i for i in range(3, 11)],
              'min_samples_split': [i for i in range(3, 11)],
              'random_state': [42]}

gsm = GridSearchCV(DecisionTreeRegressor(),
                  param_grid=param_grid,
                  cv=5,
                  n_jobs=-1,
                  verbose=2,
                  refit=True,
                  scoring='neg_mean_absolute_error')

#gsm = DecisionTreeRegressor(random_state=42)

gsm.fit(X_train, y_train)

Fitting 5 folds for each of 1024 candidates, totalling 5120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 360 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done 1172 tasks      | elapsed:   10.5s
[Parallel(n_jobs=-1)]: Done 2304 tasks      | elapsed:   17.9s
[Parallel(n_jobs=-1)]: Done 3764 tasks      | elapsed:   27.3s
[Parallel(n_jobs=-1)]: Done 5120 out of 5120 | elapsed:   35.4s finished


GridSearchCV(cv=5, estimator=DecisionTreeRegressor(), n_jobs=-1,
             param_grid={'criterion': ['mse', 'mae'],
                         'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
                         'min_samples_leaf': [3, 4, 5, 6, 7, 8, 9, 10],
                         'min_samples_split': [3, 4, 5, 6, 7, 8, 9, 10],
                         'random_state': [42]},
             scoring='neg_mean_absolute_error', verbose=2)

In [50]:
best_params_ = gsm.best_params_
print(gsm.best_params_)

{'criterion': 'mse', 'max_depth': 3, 'min_samples_leaf': 3, 'min_samples_split': 3, 'random_state': 42}


In [51]:
yhat_dt = gsm.predict(X_test)

In [52]:
yhat_dt

array([1088214.87109424, 4486423.96744678, 4486423.96744678])

In [53]:
print("MAPE:", mean_absolute_percentage_error(np.array(y_test), yhat_dt))

MAPE: 155.46581163883383


---
# Random Forest Regressor
---

In [54]:
#GridSearch to determine best parameters

param_grid = {'criterion':['mse', 'mae'],
              'max_depth': [i for i in range(3, 7)],
              'min_samples_leaf': [i for i in range(3, 7)],
              'min_samples_split': [i for i in range(3, 7)],
              'random_state': [42]}

gsm = GridSearchCV(RandomForestRegressor(),
                  param_grid=param_grid,
                  cv=5,
                  n_jobs=-1,
                  verbose=2,
                  refit=True,
                  scoring='neg_mean_absolute_error')

#gsm = DecisionTreeRegressor(random_state=42)

gsm.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 128 candidates, totalling 640 fits


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   12.0s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:   24.1s
[Parallel(n_jobs=-1)]: Done 640 out of 640 | elapsed:   39.3s finished


GridSearchCV(cv=5, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'criterion': ['mse', 'mae'], 'max_depth': [3, 4, 5, 6],
                         'min_samples_leaf': [3, 4, 5, 6],
                         'min_samples_split': [3, 4, 5, 6],
                         'random_state': [42]},
             scoring='neg_mean_absolute_error', verbose=2)

In [55]:
best_params_ = gsm.best_params_
print(gsm.best_params_)

{'criterion': 'mse', 'max_depth': 3, 'min_samples_leaf': 3, 'min_samples_split': 3, 'random_state': 42}


In [56]:
yhat_rf = gsm.predict(X_test)

In [57]:
yhat_rf

array([2741371.66475227, 2741371.66475227, 2741371.66475227])

In [58]:
print("MAPE:", mean_absolute_percentage_error(np.array(y_test), yhat_rf))

MAPE: 94.95257116926801


---
# Gradient Boosting Regressor
---

In [59]:
gradient_boosting_parameters = {
    "learning_rate": [0.1, 0.15, 0.2],
    "min_samples_split": np.linspace(0.1, 0.5, 3),
    "min_samples_leaf": np.linspace(0.1, 0.5, 3),
    "max_depth":[3,5],
    "max_features":["log2","sqrt"],
    "criterion": ["friedman_mse",  "mae"],
    "subsample":[0.5, 1.0],
    "n_estimators":[10]
    }

gsm = GridSearchCV(GradientBoostingRegressor(), 
                   param_grid=gradient_boosting_parameters, 
                   cv=5, 
                   n_jobs=-1, 
                   verbose=2, 
                   scoring='neg_mean_absolute_error')

gsm.fit(X_train, y_train)

Fitting 5 folds for each of 432 candidates, totalling 2160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 504 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 1316 tasks      | elapsed:    9.4s
[Parallel(n_jobs=-1)]: Done 2160 out of 2160 | elapsed:   15.2s finished


GridSearchCV(cv=5, estimator=GradientBoostingRegressor(), n_jobs=-1,
             param_grid={'criterion': ['friedman_mse', 'mae'],
                         'learning_rate': [0.1, 0.15, 0.2], 'max_depth': [3, 5],
                         'max_features': ['log2', 'sqrt'],
                         'min_samples_leaf': array([0.1, 0.3, 0.5]),
                         'min_samples_split': array([0.1, 0.3, 0.5]),
                         'n_estimators': [10], 'subsample': [0.5, 1.0]},
             scoring='neg_mean_absolute_error', verbose=2)

In [60]:
best_params_ = gsm.best_params_
print(gsm.best_params_)

{'criterion': 'mae', 'learning_rate': 0.2, 'max_depth': 5, 'max_features': 'log2', 'min_samples_leaf': 0.1, 'min_samples_split': 0.5, 'n_estimators': 10, 'subsample': 1.0}


In [61]:
yhat_gb = gsm.predict(X_test)

In [62]:
yhat_gb

array([3151050.00100916, 4213633.79784968, 4213633.79784968])

In [63]:
print("MAPE:", mean_absolute_percentage_error(np.array(y_test), yhat_gb))

MAPE: 174.46391336563286
