In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeRegressor
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression

from sklearn.preprocessing import LabelEncoder
#from sklearn.metrics import mean_absolute_percentage_error


from xgboost import XGBRegressor
import warnings

warnings.filterwarnings("ignore")
pd.options.display.float_format = '{:,.4f}'.format

---
# Read Data
---

In [2]:
df = pd.read_csv("../data/_all_data.csv")

In [3]:
df = df[~((df["yil"] == 2021) & (df["ay"] == 6))]

---
# EDA
---

In [4]:
def mean_absolute_percentage_error(y_true, y_pred): 
    return np.mean(np.abs((np.array(y_true) - np.array(y_pred)) / y_true)) * 100

In [5]:
df.shape

(203381, 58)

In [6]:
df.columns

Index(['yil', 'ay', 'grup_adi', 'ana_kategori_adi', 'kategori_adi',
       'marka_adi', 'urun_adi', 'en_guncel_kod', 'koli_i̇ci_adet', 'koli',
       'kg', 'tl', 'adet', 'date', 'portfoy', 'satis_var', 'enflasyon_etkisi',
       'yarisma', 'peak', 'fiyat', 'fiyat_gecisi', 'promosyon_tutari',
       'ciro_kull_i̇ade_dus', 'aktivite_tipi', 'indirim__', 'no_of_days',
       'weekdays_n', 'weekdays_ratio', 'weekend_n', 'weekend_ratio',
       'actual_holiday_n', 'actual_holiday_ratio', 'total_holiday_n',
       'total_holiday_ratio', 'school_day_n', 'school_day_ratio',
       'school_day_brdg_n', 'school_day_brdg_ratio', 'ramadan_n',
       'ramadan_ratio', 'pandemic', 'lockdown', 'gozlem_sayisi',
       'toplam_satir', 'oran', 'baslangic_tarih', 'bitis_tarih',
       'son_kac_ay_eksik', 'eksik_repeat_sayisi', 'scope', 'indirim__bins',
       'new_adet', 'season', 'trend', 'residual', 'adet_flag', 'Kanal',
       'scope_type'],
      dtype='object')

In [7]:
cols_to_drop = ['en_guncel_kod', 'koli_i̇ci_adet', 'koli',
                'kg', 'tl', 'adet', 'date', 'satis_var', 'promosyon_tutari',
                'ciro_kull_i̇ade_dus', 'weekdays_ratio', 'weekend_ratio',
                'total_holiday_ratio', 'school_day_ratio', 'school_day_brdg_n', 'school_day_brdg_ratio',
                'ramadan_ratio', 'gozlem_sayisi', 'actual_holiday_ratio',
                'toplam_satir', 'oran', 'baslangic_tarih', 'bitis_tarih',
                'son_kac_ay_eksik', 'eksik_repeat_sayisi', 'adet_flag', 
                'scope_type']

In [8]:
df_droped = df[cols_to_drop]

In [9]:
df.drop(columns=cols_to_drop, axis=1, inplace=True)

In [10]:
df.columns

Index(['yil', 'ay', 'grup_adi', 'ana_kategori_adi', 'kategori_adi',
       'marka_adi', 'urun_adi', 'portfoy', 'enflasyon_etkisi', 'yarisma',
       'peak', 'fiyat', 'fiyat_gecisi', 'aktivite_tipi', 'indirim__',
       'no_of_days', 'weekdays_n', 'weekend_n', 'actual_holiday_n',
       'total_holiday_n', 'school_day_n', 'ramadan_n', 'pandemic', 'lockdown',
       'scope', 'indirim__bins', 'new_adet', 'season', 'trend', 'residual',
       'Kanal'],
      dtype='object')

In [11]:
(df.isna().sum() / len(df))*100

yil                 0.0000
ay                  0.0000
grup_adi            0.0000
ana_kategori_adi    0.0000
kategori_adi        0.0000
marka_adi           0.0000
urun_adi            0.0000
portfoy             0.0000
enflasyon_etkisi    0.0000
yarisma             0.0000
peak                0.0000
fiyat               6.7268
fiyat_gecisi        6.7268
aktivite_tipi      98.9950
indirim__           0.0000
no_of_days          0.0000
weekdays_n          0.0000
weekend_n           0.0000
actual_holiday_n    0.0000
total_holiday_n     0.0000
school_day_n        0.0000
ramadan_n           0.0000
pandemic            0.0000
lockdown            0.0000
scope               0.0000
indirim__bins       0.0000
new_adet            0.0000
season             89.6534
trend              89.6534
residual           89.6534
Kanal               0.0000
dtype: float64

In [12]:
df.rename(columns={"Kanal": "kanal"}, inplace=True)

---
# Regresyon
---

In [13]:
df_reg = df[df["scope"] == 3]

## Aktivite Tipi silindi çünkü %97'si missing

In [14]:
df_reg.drop(columns=["aktivite_tipi"], axis=1, inplace=True)

In [15]:
(df_reg.isna().sum() / len(df_reg))*100

yil                0.0000
ay                 0.0000
grup_adi           0.0000
ana_kategori_adi   0.0000
kategori_adi       0.0000
marka_adi          0.0000
urun_adi           0.0000
portfoy            0.0000
enflasyon_etkisi   0.0000
yarisma            0.0000
peak               0.0000
fiyat              6.2919
fiyat_gecisi       6.2919
indirim__          0.0000
no_of_days         0.0000
weekdays_n         0.0000
weekend_n          0.0000
actual_holiday_n   0.0000
total_holiday_n    0.0000
school_day_n       0.0000
ramadan_n          0.0000
pandemic           0.0000
lockdown           0.0000
scope              0.0000
indirim__bins      0.0000
new_adet           0.0000
season             0.0000
trend              0.0000
residual           0.0000
kanal              0.0000
dtype: float64

In [16]:
df_reg["enflasyon_etkisi"].fillna(method="ffill", inplace=True)

In [17]:
df_reg["fiyat"].fillna(method="ffill", inplace=True)
df_reg["fiyat_gecisi"].fillna(value=0, inplace=True)

In [18]:
df_reg = df_reg[df_reg["portfoy"] == 1]

In [19]:
df_reg.drop(columns=["portfoy", "scope", "yarisma"], axis=1, inplace=True)

In [20]:
df_reg.columns

Index(['yil', 'ay', 'grup_adi', 'ana_kategori_adi', 'kategori_adi',
       'marka_adi', 'urun_adi', 'enflasyon_etkisi', 'peak', 'fiyat',
       'fiyat_gecisi', 'indirim__', 'no_of_days', 'weekdays_n', 'weekend_n',
       'actual_holiday_n', 'total_holiday_n', 'school_day_n', 'ramadan_n',
       'pandemic', 'lockdown', 'indirim__bins', 'new_adet', 'season', 'trend',
       'residual', 'kanal'],
      dtype='object')

In [21]:
X_vars = ['yil', 'ay', 'kanal', 'grup_adi', 'ana_kategori_adi', 
          'kategori_adi', 'marka_adi', 'urun_adi', 'enflasyon_etkisi', 
          'peak', 'indirim__bins', 'lockdown', 'season', 'trend']

In [22]:
y_vars = ["new_adet"]

In [23]:
df_reg = df_reg[X_vars+y_vars]

In [27]:
df_reg.urun_adi.value_counts().sort_values(ascending=True)

YUPO JELLY PORTAKAL HALKASI 70GRX24        6
ULK ÇİK.KARE A.FIS.70Gx6x6                 7
KREMINI MINI TOFFE KARPUZ 7Gx50x12         7
KREMINI MINI TOFFE ÇILEK ARO.7Gx50x12      8
ULK PÖTİBÖR BİSK.450Gx10                  10
                                        ... 
KEKSTRA JÖLEBOL KEK ÇİLEK 35Gx24         106
METRO KAPL.BAR 5x40Gx18                  106
ÇİZİVİÇ PEY.SAND.KR.3x90Gx12             106
DANKEK RULO PASTA ÇİLEK 245Gx8           111
ÇOKOKREM SAKLAMA KABI 1000Gx6            113
Name: urun_adi, Length: 61, dtype: int64

In [29]:
df_reg[df_reg["urun_adi"] == "ULK ÇİK.KARE A.FIS.70Gx6x6"]

Unnamed: 0,yil,ay,kanal,grup_adi,ana_kategori_adi,kategori_adi,marka_adi,urun_adi,enflasyon_etkisi,peak,indirim__bins,lockdown,season,trend,new_adet
12729,2020,11,pasifik,BİM,ATIŞTIRMALIK,ÇİKOLATA,ÜLKER,ULK ÇİK.KARE A.FIS.70Gx6x6,14.03,0,-1,0,-61247.5766,211032.5,149784.9234
12730,2020,12,pasifik,BİM,ATIŞTIRMALIK,ÇİKOLATA,ÜLKER,ULK ÇİK.KARE A.FIS.70Gx6x6,14.6,0,-1,0,-78599.5766,211032.5,132432.9234
12731,2021,1,pasifik,BİM,ATIŞTIRMALIK,ÇİKOLATA,ÜLKER,ULK ÇİK.KARE A.FIS.70Gx6x6,14.97,0,-1,0,-53543.5766,211032.5,157488.9234
12732,2021,2,pasifik,BİM,ATIŞTIRMALIK,ÇİKOLATA,ÜLKER,ULK ÇİK.KARE A.FIS.70Gx6x6,15.61,0,-1,0,-40295.5766,211032.5,170736.9234
12733,2021,3,pasifik,BİM,ATIŞTIRMALIK,ÇİKOLATA,ÜLKER,ULK ÇİK.KARE A.FIS.70Gx6x6,16.19,0,-1,0,-47999.5766,211032.5,163032.9234
12734,2021,4,pasifik,BİM,ATIŞTIRMALIK,ÇİKOLATA,ÜLKER,ULK ÇİK.KARE A.FIS.70Gx6x6,17.14,0,-1,1,-96635.5766,211032.5,114396.9234
12735,2021,5,pasifik,BİM,ATIŞTIRMALIK,ÇİKOLATA,ÜLKER,ULK ÇİK.KARE A.FIS.70Gx6x6,16.59,1,-1,1,341465.0359,211032.5,552497.5359


In [None]:
test = df_reg[(df_reg["kanal"] == "btt") & (df_reg["urun_adi"] == "8KEK FIN.52Gx24")]

In [None]:
test.drop(columns=['kanal', 'grup_adi', 'ana_kategori_adi', 
                   'kategori_adi', 'marka_adi', 'urun_adi'], axis=1, inplace=True)

In [None]:
for idx in test.iloc[-3:].index:
    test.loc[idx, "season"] = test.loc[list(range(idx-1, idx-7, -1)), "season"].mean()
    test.loc[idx, "trend"] = test.loc[list(range(idx-1, idx-7, -1)), "trend"].mean()

In [None]:
X_train, X_test, y_train, y_test = test.iloc[:-3, :-1], test.iloc[-3:, :-1], test.iloc[:-3, -1:], test.iloc[-3:, -1:]

In [52]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((14, 8), (3, 8), (14, 1), (3, 1))

---
# Linear Regression
---

In [53]:
from sklearn.linear_model import LinearRegression

In [54]:
lm = LinearRegression()
lm.fit(X_train, y_train)
yhat_lm = lm.predict(X_test)

In [55]:
print("MAPE:", mean_absolute_percentage_error(np.array(y_test), yhat_lm))

MAPE: 18.547099839878125


In [56]:
yhat_lm

array([[159774.66645665],
       [155987.850411  ],
       [158069.89835775]])

In [57]:
y_test

Unnamed: 0,new_adet
202073,220314.1851
202074,183455.5627
202075,182087.5627


---
# KNN
---

In [58]:
#GridSearch to determine best parameters
param_grid = {
    'algorithm': ['auto', 'ball_tree'],
    'leaf_size': [30, 50, 75],
    'weights' : ['uniform', 'distance'],
    'metric' : ['euclidean', 'manhattan'],
    'n_neighbors' : [5, 7, 10]
}

gsm = GridSearchCV(KNeighborsRegressor(),
                   param_grid=param_grid,
                   cv=5,
                   n_jobs = -1,
                   verbose=2,
                   refit=True,
                   scoring='neg_mean_absolute_error')

gsm.fit(X_train, y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   19.3s
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:   21.7s finished


GridSearchCV(cv=5, estimator=KNeighborsRegressor(), n_jobs=-1,
             param_grid={'algorithm': ['auto', 'ball_tree'],
                         'leaf_size': [30, 50, 75],
                         'metric': ['euclidean', 'manhattan'],
                         'n_neighbors': [5, 7, 10],
                         'weights': ['uniform', 'distance']},
             scoring='neg_mean_absolute_error', verbose=2)

In [59]:
best_params_ = gsm.best_params_

In [60]:
yhat_knn = gsm.predict(X_test)

In [61]:
y_test

Unnamed: 0,new_adet
202073,220314.1851
202074,183455.5627
202075,182087.5627


In [62]:
yhat_knn

array([[157250.29872178],
       [156224.61328736],
       [156886.81876326]])

In [63]:
print("MAPE:", mean_absolute_percentage_error(y_test, yhat_knn))

MAPE: new_adet   19.1026
dtype: float64


---
# XGBoost
---

In [64]:
#GridSearch to determine best parameters
param_grid = {'colsample_bytree': [0.4, 0.7],
              'booster': ["gbtree", "gblinear", "dart"],
 'learning_rate': [0.1, 0.001],
 'max_depth': [15, 20],
# 'min_child_weight': [5,6,7],
 'n_estimators': [500],
 'objective': ['reg:squarederror'],
# 'subsample': [0.5, 0.6, 0.7],
             'random_state': [42]}


gsm = GridSearchCV(XGBRegressor(),
                   param_grid=param_grid,
                   cv=5,
                   n_jobs=-1,
                   verbose=2,
                   refit=True,
                   scoring='neg_mean_absolute_error')

gsm.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    7.2s


Parameters: { "colsample_bytree", "max_depth" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:  1.5min finished


GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None, gamma=None,
                                    gpu_id=None, importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=...
                                    scale_pos_weight=None, subsample=None,
                                    tree_method=None, validate_parameters=None,
                                    verbosity=None),
             n_jobs=-1,
             param_grid={'booster': ['gbtree', 'gblinear', '

In [65]:
best_params_ = gsm.best_params_
print(gsm.best_params_)

{'booster': 'gblinear', 'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 20, 'n_estimators': 500, 'objective': 'reg:squarederror', 'random_state': 42}


In [66]:
yhat_xgb = gsm.predict(X_test)

In [67]:
yhat_xgb

array([162025.34, 156462.55, 157236.02], dtype=float32)

In [68]:
print("MAPE:", mean_absolute_percentage_error(np.array(y_test), yhat_xgb))

MAPE: 18.16625401321893


---
# Decision Tree
---

In [69]:
#GridSearch to determine best parameters

param_grid = {'criterion':['mse', 'mae'],
              'max_depth': [i for i in range(3, 11)],
              'min_samples_leaf': [i for i in range(3, 11)],
              'min_samples_split': [i for i in range(3, 11)],
              'random_state': [42]}

gsm = GridSearchCV(DecisionTreeRegressor(),
                  param_grid=param_grid,
                  cv=5,
                  n_jobs=-1,
                  verbose=2,
                  refit=True,
                  scoring='neg_mean_absolute_error')

#gsm = DecisionTreeRegressor(random_state=42)

gsm.fit(X_train, y_train)

Fitting 5 folds for each of 1024 candidates, totalling 5120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 504 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 1316 tasks      | elapsed:    9.7s
[Parallel(n_jobs=-1)]: Done 2448 tasks      | elapsed:   18.5s
[Parallel(n_jobs=-1)]: Done 3908 tasks      | elapsed:   28.2s
[Parallel(n_jobs=-1)]: Done 5105 out of 5120 | elapsed:   35.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done 5120 out of 5120 | elapsed:   35.8s finished


GridSearchCV(cv=5, estimator=DecisionTreeRegressor(), n_jobs=-1,
             param_grid={'criterion': ['mse', 'mae'],
                         'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
                         'min_samples_leaf': [3, 4, 5, 6, 7, 8, 9, 10],
                         'min_samples_split': [3, 4, 5, 6, 7, 8, 9, 10],
                         'random_state': [42]},
             scoring='neg_mean_absolute_error', verbose=2)

In [70]:
best_params_ = gsm.best_params_
print(gsm.best_params_)

{'criterion': 'mse', 'max_depth': 3, 'min_samples_leaf': 3, 'min_samples_split': 8, 'random_state': 42}


In [71]:
yhat_dt = gsm.predict(X_test)

In [72]:
yhat_dt

array([122891.56273052, 122891.56273052, 122891.56273052])

In [73]:
print("MAPE:", mean_absolute_percentage_error(np.array(y_test), yhat_dt))

MAPE: 36.58079741452237


---
# Random Forest Regressor
---

In [74]:
#GridSearch to determine best parameters

param_grid = {'criterion':['mse', 'mae'],
              'max_depth': [i for i in range(3, 7)],
              'min_samples_leaf': [i for i in range(3, 7)],
              'min_samples_split': [i for i in range(3, 7)],
              'random_state': [42]}

gsm = GridSearchCV(RandomForestRegressor(),
                  param_grid=param_grid,
                  cv=5,
                  n_jobs=-1,
                  verbose=2,
                  refit=True,
                  scoring='neg_mean_absolute_error')

#gsm = DecisionTreeRegressor(random_state=42)

gsm.fit(X_train, y_train)

Fitting 5 folds for each of 128 candidates, totalling 640 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   11.1s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:   26.8s
[Parallel(n_jobs=-1)]: Done 640 out of 640 | elapsed:   48.9s finished


GridSearchCV(cv=5, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'criterion': ['mse', 'mae'], 'max_depth': [3, 4, 5, 6],
                         'min_samples_leaf': [3, 4, 5, 6],
                         'min_samples_split': [3, 4, 5, 6],
                         'random_state': [42]},
             scoring='neg_mean_absolute_error', verbose=2)

In [75]:
best_params_ = gsm.best_params_
print(gsm.best_params_)

{'criterion': 'mse', 'max_depth': 3, 'min_samples_leaf': 3, 'min_samples_split': 3, 'random_state': 42}


In [76]:
yhat_rf = gsm.predict(X_test)

In [77]:
yhat_rf

array([120209.47825571, 119089.44625571, 119089.44625571])

In [78]:
print("MAPE:", mean_absolute_percentage_error(np.array(y_test), yhat_rf))

MAPE: 38.350243675664395


---
# Gradient Boosting Regressor
---

In [79]:
gradient_boosting_parameters = {
    "learning_rate": [0.1, 0.15, 0.2],
    "min_samples_split": np.linspace(0.1, 0.5, 3),
    "min_samples_leaf": np.linspace(0.1, 0.5, 3),
    "max_depth":[3,5],
    "max_features":["log2","sqrt"],
    "criterion": ["friedman_mse",  "mae"],
    "subsample":[0.5, 1.0],
    "n_estimators":[10]
    }

gsm = GridSearchCV(GradientBoostingRegressor(), 
                   param_grid=gradient_boosting_parameters, 
                   cv=5, 
                   n_jobs=-1, 
                   verbose=2, 
                   scoring='neg_mean_absolute_error')

gsm.fit(X_train, y_train)

Fitting 5 folds for each of 432 candidates, totalling 2160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 504 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done 1316 tasks      | elapsed:   13.4s
[Parallel(n_jobs=-1)]: Done 2160 out of 2160 | elapsed:   20.8s finished


GridSearchCV(cv=5, estimator=GradientBoostingRegressor(), n_jobs=-1,
             param_grid={'criterion': ['friedman_mse', 'mae'],
                         'learning_rate': [0.1, 0.15, 0.2], 'max_depth': [3, 5],
                         'max_features': ['log2', 'sqrt'],
                         'min_samples_leaf': array([0.1, 0.3, 0.5]),
                         'min_samples_split': array([0.1, 0.3, 0.5]),
                         'n_estimators': [10], 'subsample': [0.5, 1.0]},
             scoring='neg_mean_absolute_error', verbose=2)

In [80]:
best_params_ = gsm.best_params_
print(gsm.best_params_)

{'criterion': 'friedman_mse', 'learning_rate': 0.2, 'max_depth': 5, 'max_features': 'log2', 'min_samples_leaf': 0.1, 'min_samples_split': 0.5, 'n_estimators': 10, 'subsample': 1.0}


In [81]:
yhat_gb = gsm.predict(X_test)

In [82]:
yhat_gb

array([146229.48211061, 124213.84502498, 124213.84502498])

In [83]:
print("MAPE:", mean_absolute_percentage_error(np.array(y_test), yhat_gb))

MAPE: 32.111305834864815
