In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeRegressor
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression

from sklearn.preprocessing import LabelEncoder
#from sklearn.metrics import mean_absolute_percentage_error
from statsmodels.tsa.seasonal import STL

from xgboost import XGBRegressor
import warnings

warnings.filterwarnings("ignore")

In [3]:
pd.set_option("display.max.columns", 100)
pd.set_option("display.max.rows", 100)

---
# Read Data
---

In [4]:
df = pd.read_csv("../data/_all_data.csv")

In [5]:
df = df[~((df["yil"] == 2021) & (df["ay"] == 6))]

---
# EDA
---

In [6]:
def mape(y_true, y_pred): 
    return np.mean(np.abs((np.array(y_true) - np.array(y_pred)) / y_true)) * 100

In [7]:
df.shape

(203435, 59)

In [8]:
df.date = pd.to_datetime(df.date, format='%Y-%m-%d', errors="coerce")

In [9]:
df_reg = df[(df["scope"] == 3) & (df["portfoy"] == 1)]

In [10]:
df.columns

Index(['yil', 'ay', 'grup_adi', 'ana_kategori_adi', 'kategori_adi',
       'marka_adi', 'urun_adi', 'en_guncel_kod', 'koli_i̇ci_adet', 'koli',
       'kg', 'tl', 'adet', 'date', 'portfoy', 'satis_var', 'enflasyon_etkisi',
       'yarisma', 'new_adet', 'peak', 'fiyat', 'fiyat_gecisi',
       'promosyon_tutari', 'ciro_kull_i̇ade_dus', 'aktivite_tipi', 'indirim__',
       'no_of_days', 'weekdays_n', 'weekdays_ratio', 'weekend_n',
       'weekend_ratio', 'actual_holiday_n', 'actual_holiday_ratio',
       'total_holiday_n', 'total_holiday_ratio', 'school_day_n',
       'school_day_ratio', 'school_day_brdg_n', 'school_day_brdg_ratio',
       'ramadan_n', 'ramadan_ratio', 'pandemic', 'lockdown', 'gozlem_sayisi',
       'toplam_satir', 'oran', 'baslangic_tarih', 'bitis_tarih',
       'son_kac_ay_eksik', 'eksik_repeat_sayisi', 'scope', 'indirim__bins',
       'season', 'trend', 'residual', 'adet_flag', 'Kanal', 'scope_type',
       'durum'],
      dtype='object')

In [11]:
cols_to_drop = ['en_guncel_kod', 'koli_i̇ci_adet', 'koli',
                'kg', 'tl', 'adet', 'date', 'satis_var', 'promosyon_tutari',
                'ciro_kull_i̇ade_dus', 'weekdays_ratio', 'weekend_ratio',
                'total_holiday_ratio', 'school_day_ratio', 'school_day_brdg_n', 'school_day_brdg_ratio',
                'ramadan_ratio', 'gozlem_sayisi', 'actual_holiday_ratio',
                'toplam_satir', 'oran', 'baslangic_tarih', 'bitis_tarih',
                'son_kac_ay_eksik', 'eksik_repeat_sayisi', 'adet_flag', 
                'scope_type', 'durum']

In [12]:
df_droped = df[cols_to_drop]

In [13]:
df.drop(columns=cols_to_drop, axis=1, inplace=True)

In [14]:
(df.isna().sum() / len(df))*100

yil                  0.000000
ay                   0.000000
grup_adi             0.000000
ana_kategori_adi     0.000000
kategori_adi         0.000000
marka_adi            0.000000
urun_adi             0.000000
portfoy              0.000000
enflasyon_etkisi     0.000000
yarisma              0.000000
new_adet             0.000000
peak                 0.000000
fiyat                6.718116
fiyat_gecisi         6.718116
aktivite_tipi        0.000000
indirim__            0.000000
no_of_days           0.000000
weekdays_n           0.000000
weekend_n            0.000000
actual_holiday_n     0.000000
total_holiday_n      0.000000
school_day_n         0.000000
ramadan_n            0.000000
pandemic             0.000000
lockdown             0.000000
scope                0.000000
indirim__bins        0.000000
season              88.834271
trend               88.834271
residual            88.834271
Kanal                0.000000
dtype: float64

In [15]:
df.rename(columns={"Kanal": "kanal"}, inplace=True)

## Aktivite Tipi silindi çünkü %97'si missing

In [16]:
(df_reg.isna().sum() / len(df_reg))*100

yil                       0.000000
ay                        0.000000
grup_adi                  0.000000
ana_kategori_adi          0.000000
kategori_adi              0.000000
marka_adi                 0.000000
urun_adi                  0.000000
en_guncel_kod             0.000000
koli_i̇ci_adet            0.000000
koli                      0.000000
kg                        0.000000
tl                        0.000000
adet                      0.000000
date                      0.000000
portfoy                   0.000000
satis_var                 0.000000
enflasyon_etkisi          0.000000
yarisma                   0.000000
new_adet                  0.000000
peak                      0.000000
fiyat                     1.497121
fiyat_gecisi              1.497121
promosyon_tutari         49.635317
ciro_kull_i̇ade_dus      48.061420
aktivite_tipi             0.000000
indirim__                 0.000000
no_of_days                0.000000
weekdays_n                0.000000
weekdays_ratio      

In [17]:
df_reg["enflasyon_etkisi"].fillna(method="ffill", inplace=True)

In [18]:
df_reg["fiyat"].fillna(method="ffill", inplace=True)
df_reg["fiyat_gecisi"].fillna(value=0, inplace=True)

In [19]:
df_reg = df_reg[df_reg["portfoy"] == 1]

In [20]:
df_reg.drop(columns=["portfoy", "scope"], axis=1, inplace=True)

In [21]:
df_reg.columns

Index(['yil', 'ay', 'grup_adi', 'ana_kategori_adi', 'kategori_adi',
       'marka_adi', 'urun_adi', 'en_guncel_kod', 'koli_i̇ci_adet', 'koli',
       'kg', 'tl', 'adet', 'date', 'satis_var', 'enflasyon_etkisi', 'yarisma',
       'new_adet', 'peak', 'fiyat', 'fiyat_gecisi', 'promosyon_tutari',
       'ciro_kull_i̇ade_dus', 'aktivite_tipi', 'indirim__', 'no_of_days',
       'weekdays_n', 'weekdays_ratio', 'weekend_n', 'weekend_ratio',
       'actual_holiday_n', 'actual_holiday_ratio', 'total_holiday_n',
       'total_holiday_ratio', 'school_day_n', 'school_day_ratio',
       'school_day_brdg_n', 'school_day_brdg_ratio', 'ramadan_n',
       'ramadan_ratio', 'pandemic', 'lockdown', 'gozlem_sayisi',
       'toplam_satir', 'oran', 'baslangic_tarih', 'bitis_tarih',
       'son_kac_ay_eksik', 'eksik_repeat_sayisi', 'indirim__bins', 'season',
       'trend', 'residual', 'adet_flag', 'Kanal', 'scope_type', 'durum'],
      dtype='object')

In [22]:
X_vars = ['yil', 'ay', 'Kanal', 'grup_adi', 'ana_kategori_adi', 
          'kategori_adi', 'marka_adi', 'urun_adi', 'enflasyon_etkisi', 
          'peak', 'indirim__bins', 'aktivite_tipi', 'lockdown', 'season', 'trend']

In [23]:
y_vars = ["new_adet"]

In [24]:
df_reg = df_reg[X_vars+y_vars]

In [25]:
df_reg.columns

Index(['yil', 'ay', 'Kanal', 'grup_adi', 'ana_kategori_adi', 'kategori_adi',
       'marka_adi', 'urun_adi', 'enflasyon_etkisi', 'peak', 'indirim__bins',
       'aktivite_tipi', 'lockdown', 'season', 'trend', 'new_adet'],
      dtype='object')

---
# Label Encoding
---

In [26]:
le_kanal = LabelEncoder()
le_grup_adi = LabelEncoder()
le_ana_kategori_adi = LabelEncoder()
le_kategori_adi = LabelEncoder()
le_marka_adi = LabelEncoder()
le_urun_adi = LabelEncoder()
#le_aktivite = LabelEncoder()

df_reg["Kanal"] = le_kanal.fit_transform(df_reg["Kanal"])
df_reg["grup_adi"] = le_grup_adi.fit_transform(df_reg["grup_adi"])
df_reg["ana_kategori_adi"] = le_ana_kategori_adi.fit_transform(df_reg["ana_kategori_adi"])
df_reg["kategori_adi"] = le_kategori_adi.fit_transform(df_reg["kategori_adi"])
df_reg["marka_adi"] = le_marka_adi.fit_transform(df_reg["marka_adi"])
df_reg["urun_adi"] = le_urun_adi.fit_transform(df_reg["urun_adi"])
#df_reg["aktivite_tipi"] = le_aktivite.fit_transform(df_reg["aktivite_tipi"])

---
# Son 3 aya seasonality ve trend eklenmesi
---

In [27]:
df_all = []
for sku in df_reg.urun_adi.unique():
    for grp in df_reg.grup_adi.unique():
        test = df_reg[(df_reg["urun_adi"] == sku) & (df_reg["grup_adi"] == grp)]
        for idx in test.iloc[-3:].index:
            try:
                test.loc[idx, "season"] = test.loc[list(range(idx-1, idx-7, -1)), "season"].mean()
                test.loc[idx, "trend"] = test.loc[list(range(idx-1, idx-7, -1)), "trend"].mean()
                df_all.append(test)
            except KeyError:
                test.loc[idx, "season"] = test.loc[list(range(idx-1, test.index.min(), -1)), "season"].mean()
                test.loc[idx, "trend"] = test.loc[list(range(idx-1, test.index.min(), -1)), "trend"].mean()
                df_all.append(test)
df_all = pd.concat(df_all)
df_all.drop_duplicates(subset=df_all.columns.to_list(), ignore_index=True, inplace=True)

In [28]:
df_reg = df_all.copy()

In [29]:
X_train = []
X_test = []
y_train = []
y_test = []
for sku in df_reg.urun_adi.unique():
    for grp in df_reg.grup_adi.unique():
        temp_df = df_reg[(df_reg["urun_adi"] == sku) & (df_reg["grup_adi"] == grp)]
        temp_df.sort_values(by=["yil", "ay"], inplace=True)
        X_train.append(temp_df.iloc[:-3, :-1])
        X_test.append(temp_df.iloc[-3:, :-1])
        y_train.append(temp_df.iloc[:-3, -1:])
        y_test.append(temp_df.iloc[-3:, -1:])

In [30]:
X_train, X_test, y_train, y_test = pd.concat(X_train), pd.concat(X_test), pd.concat(y_train), pd.concat(y_test)

In [31]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((2308, 15), (297, 15), (2308, 1), (297, 1))

---
# Regresyon

### Başlıyoruz

---
# Linear Regression
---

In [32]:
from sklearn.linear_model import LinearRegression

In [33]:
lm = LinearRegression()
lm.fit(X_train, y_train)
yhat_lm = lm.predict(X_test)

In [34]:
print("MAPE:", mape(np.array(y_test), yhat_lm))

MAPE: 2597.9622546244696


In [35]:
yhat_lm[:20]

array([[  49263.43542983],
       [  25314.31829228],
       [  25914.82354466],
       [  38819.15583731],
       [  38582.49661359],
       [  45811.95288503],
       [  40830.85605397],
       [  16604.9696635 ],
       [  19005.79099208],
       [1251513.19268982],
       [1343128.25775858],
       [1435314.61758602],
       [  65351.38212328],
       [  68759.37034635],
       [  74894.34568941],
       [  95346.63767553],
       [  99181.17707931],
       [ 115091.01954461],
       [ 169889.11165471],
       [ 161299.31776057]])

In [36]:
y_test.iloc[:20, :]

Unnamed: 0,new_adet
32,95049.999
33,3420.0
34,1766.9988
59,23097.9942
60,5458.9941
61,11183.9976
112,49835.99232
113,16575.98976
114,17280.0
158,845856.0


---
# KNN
---

In [37]:
#GridSearch to determine best parameters
param_grid_knn = {
    'algorithm': ['auto', 'ball_tree'],
    'leaf_size': [30, 50, 75, 100],
    'weights' : ['uniform', 'distance'],
    'metric' : ['euclidean', 'manhattan'],
    'n_neighbors' : list(range(20, 51, 5))
}

gsm_knn = GridSearchCV(KNeighborsRegressor(),
                   param_grid=param_grid_knn,
                   cv=5,
                   n_jobs=-1,
                   verbose=2,
                   refit=True,
                   scoring='neg_mean_absolute_error')

gsm_knn.fit(X_train, y_train)

Fitting 5 folds for each of 224 candidates, totalling 1120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   28.1s
[Parallel(n_jobs=-1)]: Done 188 tasks      | elapsed:   30.7s
[Parallel(n_jobs=-1)]: Done 594 tasks      | elapsed:   36.5s
[Parallel(n_jobs=-1)]: Done 1105 out of 1120 | elapsed:   46.4s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done 1120 out of 1120 | elapsed:   46.5s finished


GridSearchCV(cv=5, estimator=KNeighborsRegressor(), n_jobs=-1,
             param_grid={'algorithm': ['auto', 'ball_tree'],
                         'leaf_size': [30, 50, 75, 100],
                         'metric': ['euclidean', 'manhattan'],
                         'n_neighbors': [20, 25, 30, 35, 40, 45, 50],
                         'weights': ['uniform', 'distance']},
             scoring='neg_mean_absolute_error', verbose=2)

In [38]:
best_params_knn = gsm_knn.best_params_
print(best_params_knn)

{'algorithm': 'auto', 'leaf_size': 30, 'metric': 'euclidean', 'n_neighbors': 20, 'weights': 'distance'}


In [39]:
yhat_knn = gsm_knn.predict(X_test)

In [40]:
print("MAPE:", mape(y_test, yhat_knn))

MAPE: new_adet    825.96309
dtype: float64


---
# XGBoost
---

In [41]:
#GridSearch to determine best parameters
param_grid_xgb = {'colsample_bytree': [0.4, 0.7, 0.9],
                  'booster': ["gbtree", "gblinear", "dart"],
                  'learning_rate': [0.1, 0.001],
                  'max_depth': list(range(5, 9)),
                  'min_child_weight': list(range(5, 11)),
                  'objective': ['reg:squarederror'],
                  'subsample': [0.4, 0.5, 0.6, 0.7],
                  'random_state': [42]}


gsm_xgb = GridSearchCV(XGBRegressor(),
                       param_grid=param_grid_xgb,
                       cv=5,
                       n_jobs=-1,
                       verbose=2,
                       refit=True,
                       scoring='neg_mean_absolute_error')

gsm_xgb.fit(X_train, y_train)

Fitting 5 folds for each of 1728 candidates, totalling 8640 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   24.8s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed:  6.9min
[Parallel(n_jobs=-1)]: Done 1969 tasks      | elapsed:  8.8min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed: 10.8min
[Parallel(n_jobs=-1)]: Done 3265 tasks      | elapsed: 11.9min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed: 12.3min
[Parallel(n_jobs=-1)]: Done 4885 tasks      | elapsed: 12.8min
[Parallel(n_jobs=-1)]: Done 5816 tasks      | elapsed: 13.7min
[Parallel(n_jobs=-1)]: Done 6829 tasks      | elapsed: 26.6min
[Parallel(n_jobs=-1)]: Done 7922 tasks      | elapsed: 43.8min
[Parallel(n_jobs=-1)]: Done 8640 out of 8640 | 

Parameters: { "colsample_bytree", "max_depth", "min_child_weight", "subsample" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None, gamma=None,
                                    gpu_id=None, importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=...
                                    tree_method=None, validate_parameters=None,
                                    verbosity=None),
             n_jobs=-1,
             param_grid={'booster': ['gbtree', 'gblinear', 'dart'],
                         'colsample_bytree': [0.4, 0.7, 0.9],
     

In [42]:
best_params_xgb = gsm_xgb.best_params_
print(gsm_xgb.best_params_)

{'booster': 'gblinear', 'colsample_bytree': 0.4, 'learning_rate': 0.1, 'max_depth': 8, 'min_child_weight': 5, 'objective': 'reg:squarederror', 'random_state': 42, 'subsample': 0.5}


In [43]:
yhat_xgb = gsm_xgb.predict(X_test)

In [44]:
yhat_xgb[:5]

array([47887.516, 23888.453, 24168.527, 34886.188, 34001.74 ],
      dtype=float32)

In [45]:
y_test.iloc[:5, :]

Unnamed: 0,new_adet
32,95049.999
33,3420.0
34,1766.9988
59,23097.9942
60,5458.9941


In [46]:
print("MAPE:", mape(np.array(y_test), yhat_xgb))

MAPE: 79360.69128828096


---
# Decision Tree
---

In [47]:
#GridSearch to determine best parameters

param_grid_dt = {'criterion':['mse', 'mae'],
                 'max_depth': [i for i in range(3, 11)],
                 'min_samples_leaf': [i for i in range(3, 11)],
                 'min_samples_split': [i for i in range(3, 11)],
                 'random_state': [42]}

gsm_dt = GridSearchCV(DecisionTreeRegressor(),
                      param_grid=param_grid_dt,
                      cv=5,
                      n_jobs=-1,
                      verbose=2,
                      refit=True,
                      scoring='neg_mean_absolute_error')

gsm_dt.fit(X_train, y_train)

Fitting 5 folds for each of 1024 candidates, totalling 5120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 150 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done 554 tasks      | elapsed:    8.6s
[Parallel(n_jobs=-1)]: Done 1120 tasks      | elapsed:   19.3s
[Parallel(n_jobs=-1)]: Done 1850 tasks      | elapsed:   32.3s
[Parallel(n_jobs=-1)]: Done 2740 tasks      | elapsed:   52.9s
[Parallel(n_jobs=-1)]: Done 3425 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 4032 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 4721 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 5120 out of 5120 | elapsed:  3.4min finished


GridSearchCV(cv=5, estimator=DecisionTreeRegressor(), n_jobs=-1,
             param_grid={'criterion': ['mse', 'mae'],
                         'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
                         'min_samples_leaf': [3, 4, 5, 6, 7, 8, 9, 10],
                         'min_samples_split': [3, 4, 5, 6, 7, 8, 9, 10],
                         'random_state': [42]},
             scoring='neg_mean_absolute_error', verbose=2)

In [48]:
best_params_dt = gsm_dt.best_params_
print(gsm_dt.best_params_)

{'criterion': 'mae', 'max_depth': 9, 'min_samples_leaf': 3, 'min_samples_split': 3, 'random_state': 42}


In [49]:
yhat_dt = gsm_dt.predict(X_test)

In [51]:
print("MAPE:", mape(np.array(y_test), yhat_dt))

MAPE: 79483.05732166424


---
# Random Forest Regressor
---

In [None]:
#GridSearch to determine best parameters

param_grid_rf = {'criterion':['mse', 'mae'],
                 'max_depth': [i for i in range(5, 11)],
                 'min_samples_leaf': [i for i in range(3, 11)],
                 'min_samples_split': [i for i in range(3, 11)],
                 'random_state': [42]}

gsm_rf = GridSearchCV(RandomForestRegressor(),
                      param_grid=param_grid_rf,
                      cv=5,
                      n_jobs=-1,
                      verbose=2,
                      refit=True,
                      scoring='neg_mean_absolute_error')

#gsm = DecisionTreeRegressor(random_state=42)

gsm_rf.fit(X_train, y_train)

Fitting 5 folds for each of 768 candidates, totalling 3840 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    6.3s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   28.6s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:   54.8s
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:  2.3min


In [None]:
best_params_rf = gsm_rf.best_params_
print(gsm_rf.best_params_)

In [None]:
yhat_rf = gsm_rf.predict(X_test)

In [None]:
print("MAPE:", mape(np.array(y_test), yhat_rf))

In [None]:
print("MAPE:", mape(np.array(y_test), yhat_rf))

---
# Gradient Boosting Regressor
---

In [None]:
gradient_boosting_parameters = {"learning_rate": [0.1, 0.15, 0.2],
                                "min_samples_split": np.linspace(0.1, 0.5, 3),
                                "min_samples_leaf": np.linspace(0.1, 0.5, 3),
                                "max_depth":[3,5],
                                "max_features":["log2","sqrt"],
                                "criterion": ["friedman_mse",  "mae"],
                                "subsample":[0.5, 1.0],
                                "n_estimators":[10]}

gsm_gb = GridSearchCV(GradientBoostingRegressor(), 
                      param_grid=gradient_boosting_parameters, 
                      cv=5, 
                      n_jobs=-1, 
                      verbose=2, 
                      scoring='neg_mean_absolute_error')

gsm_gb.fit(X_train, y_train)

In [None]:
best_params_gb = gsm_gb.best_params_
print(gsm_gb.best_params_)

In [None]:
yhat_gb = gsm_gb.predict(X_test)

In [None]:
print("MAPE:", mape(np.array(y_test), yhat_gb))

In [None]:
print("MAPE:", mape(np.array(y_test), yhat_gb))

---
# Regresyon test sonuçlarının çıktılarının alınması
---

In [None]:
test_results = X_test.copy()

In [None]:
test_results["Kanal"] = le_kanal.inverse_transform(test_results["Kanal"])
test_results["grup_adi"] = le_grup_adi.inverse_transform(test_results["grup_adi"])
test_results["ana_kategori_adi"] = le_ana_kategori_adi.inverse_transform(test_results["ana_kategori_adi"])
test_results["kategori_adi"] = le_kategori_adi.inverse_transform(test_results["kategori_adi"])
test_results["marka_adi"] = le_marka_adi.inverse_transform(test_results["marka_adi"])
test_results["urun_adi"] = le_urun_adi.inverse_transform(test_results["urun_adi"])

In [None]:
test_results["linear_regression"] = yhat_lm.reshape(-1)
test_results["knn"] = yhat_knn.reshape(-1)
#test_results["xgboost"] = yhat_xgb.reshape(-1)
test_results["decision_tree"] = yhat_dt.reshape(-1)
#test_results["random_forest"] = yhat_rf.reshape(-1)
test_results["gradient_boosting"] = yhat_gb.reshape(-1)

In [None]:
test_results = pd.concat([test_results, y_test], axis=1)

In [None]:
test_results.to_excel("all_test_results.xlsx", index=False)

In [None]:
kanal_l = []
grup = []
urun_l = []
model_type = []
mape_l = []

for kanal in test_results.Kanal.unique():
    for grp in test_results.grup_adi.unique():
        for urun in test_results.urun_adi.unique():
            for model in test_results.columns.to_list()[15:-1]:
                temp_df = test_results[(test_results["Kanal"] == kanal) & 
                                       (test_results["grup_adi"] == grp) & 
                                       (test_results["urun_adi"] == urun)]
                kanal_l.append(kanal)
                grup.append(grp)
                urun_l.append(urun)
                model_type.append(model)
                model_com = temp_df[[model, "new_adet"]]
                mape_l.append(mape(np.array(model_com["new_adet"]), np.array(model_com[model])))

In [None]:
results_by_sku = pd.DataFrame({"kanal_l": kanal_l,
                               "grup": grup,
                               "urun": urun_l,
                               "model_type": model_type,
                               "mape": mape_l})

results_by_sku.dropna(inplace=True)

In [None]:
results_by_sku.to_excel("regression_test_results.xlsx", index=False)

In [None]:
kanal_results = test_results.groupby(["yil", "ay", "Kanal"]).agg({"linear_regression": "sum",
                                                                  "knn": "sum",
                                                                  "xgboost": "sum",
                                                                  "decision_tree": "sum",
                                                                  "random_forest": "sum",
                                                                  "gradient_boosting": "sum",
                                                                  "new_adet": "sum"}).reset_index()

In [None]:
knl_list = []
mape_list = []
model_type = []
for knl in kanal_results["Kanal"].unique():
    for mdl in kanal_results.columns.to_list()[3:-1]:
        temp_df = kanal_results[kanal_results["Kanal"] == knl]
        knl_list.append(knl)
        model_type.append(mdl)
        mape_list.append(mape(np.array(temp_df["new_adet"]), np.array(temp_df[mdl])))

In [None]:
results_by_kanal = pd.DataFrame({"kanal": knl_list,
                                 "model_type": model_type,
                                 "mape": mape_list})

In [None]:
results_by_kanal.to_excel("results_by_kanal.xlsx", index=False)

In [None]:
results_by_kanal.groupby("kanal").agg({"mape": "min"})

---
# GridSearch parametreleri çıktısı
---

In [None]:
pd.DataFrame({"models": ["KNN", "Gradient Boosting", "XGBoost", "Decision Tree", "Random Forest"],
             "parameters": [best_params_knn, best_params_gb, best_params_xgb, best_params_dt, best_params_rf]}).to_excel("gridsearch_parameters.xlsx", index=False)

---
# Time Series Çıktıları
---

In [None]:
df_ts = pd.read_csv('../data/table_DS_07_07_OUTFOR.csv',delimiter=',')
df_ts = df_ts[["Kanal", "grup_adi", "urun_adi", "date", "ACTUAL", "PREDICT"]]

df_ts.ACTUAL = df_ts.ACTUAL.str.replace('           .', '0')
df_ts.PREDICT = df_ts.PREDICT.str.replace('           .', '0')
df_ts.ACTUAL = df_ts.ACTUAL.str.replace(',', '.')
df_ts.PREDICT = df_ts.PREDICT.str.replace(',', '.')
df_ts.ACTUAL = df_ts.ACTUAL.astype('float')
df_ts.PREDICT = df_ts.PREDICT.astype('float')

In [None]:
date_names = {'JAN':"01-01",
              'FEB':"01-02" ,
              'MAR':"01-03",
              'APR':"01-04",
              'MAY':"01-05",
              'JUN':"01-06",
              'JUL':"01-07",
              'AUG':"01-08",
              'SEP':"01-09",
              'OCT':"01-10",
              'NOV':"01-11",
              'DEC':"01-12"}

In [None]:
maximum, minimum = int((list(pd.Series([i[-4:] for i in df_ts.date.unique()]).drop_duplicates()))[-1]), int((list(pd.Series([i[-4:] for i in df_ts.date.unique()]).drop_duplicates()))[0])

In [None]:
date_dict = {}
for tm in range(minimum, maximum+1):
    for dt in date_names.keys():
        date_dict.update({dt+str(tm):date_names[dt]+"-"+str(tm)})

In [None]:
df_ts["date"] = df_ts["date"].map(date_dict)

In [None]:
df_ts["date"] = pd.to_datetime(df_ts["date"], format="%d-%m-%Y", errors="coerce")

In [None]:
ytrue_all = []
yhat_all = []
sku_all = []
grp_all = []
date_all = []

mape_ts = []
sku_ts = []
grup_adi_ts = []
for sku in df_ts.urun_adi.unique():
    for grp in df_ts.grup_adi.unique():
        temp_df = df_ts[(df_ts["grup_adi"] == grp) & (df_ts["urun_adi"] == sku)]
        if len(temp_df) < 1:
            pass
        else:
            temp_df = temp_df[(temp_df["date"] > datetime(2020, 12, 1)) & (temp_df["ACTUAL"] != 0)]
            ytrue = temp_df.iloc[-6:, -2:-1]
            yhat = temp_df.iloc[-6:, -1:]
            date_ = temp_df.iloc[-6:, -3:-2]
            mape_ts.append(mape(np.array(ytrue), np.array(yhat)))
            sku_ts.append(sku)
            grup_adi_ts.append(grp)
            
            ytrue_all.extend(list(ytrue.ACTUAL))
            yhat_all.extend(list(yhat.PREDICT))
            sku_all.extend([sku]*len(ytrue))
            grp_all.extend([grp]*len(ytrue))
            date_all.extend(list(date_.date))

In [None]:
ts_results = pd.DataFrame({"grup": grup_adi_ts,
                           "urun_adi": sku_ts,
                           "mape": mape_ts})

In [None]:
ts_results.to_excel("time_series_sku_mape_all.xlsx", index=False)

In [None]:
ts_results["mape_bins"] = ts_results["mape"].apply(lambda x: "10'dan küçük" \
                                                   if x<10 else ("10 ile 25 arasında" \
                                                                 if x<25 else ("25 ile 50 arasında" \
                                                                               if x<50 else("50 ile 100 arasında" if x<100 else "100'den büyük"))))

In [None]:
ts_results = ts_results.merge(results_by_sku[["kanal_l", "grup"]].drop_duplicates(subset=["kanal_l", "grup"]), how="left", on="grup")

In [None]:
ts_results_raw_data = pd.DataFrame({"grup": grp_all,
                                   "urun_adi": sku_all,
                                   "date": date_all,
                                   "ytrue": ytrue_all,
                                   "yhat": yhat_all})

In [None]:
ts_results_raw_data[(ts_results_raw_data["grup"] == "ŞOK") & (ts_results_raw_data["urun_adi"] == "ONEO DRAJE ŞİŞE NANE AR.SAK.60Gx12")]

In [None]:
ts_results_raw_data = ts_results_raw_data.merge(results_by_sku[["kanal_l", "grup"]].drop_duplicates(subset=["kanal_l", "grup"]), how="left", on="grup")

In [None]:
ts_results_raw_data.to_excel("time_series_sku_raw_data.xlsx", index=False)

---
## Kanal bazında toplam yukarıya doğru mape hesaplaması

In [None]:
ts_kanal_results = ts_results_raw_data.groupby(["date", "kanal_l"]).agg({"ytrue": "sum",
                                                                         "yhat": "sum"}).reset_index()

In [None]:
ts_kanal_results.to_excel("time_series_results_kanal_raw_data.xlsx", index=False)

In [None]:
ts_kanal = []
ts_mape_kanal = []
for knl in ts_kanal_results.kanal_l.unique():
    temp_df = ts_kanal_results[ts_kanal_results["kanal_l"] == knl]
    ts_kanal.append(knl)
    ts_mape_kanal.append(mape(np.array(temp_df.ytrue), np.array(temp_df.yhat)))
ts_kanal_mape_results = pd.DataFrame({"kanal": ts_kanal, "mape": ts_mape_kanal})
ts_kanal_mape_results

In [None]:
ts_kanal_mape_results.to_excel("time_series_kanal_mape.xlsx", index=False)

---
# Time Series ve Regresyon Sonuçlarının Bir Aradaki Hali
---

# Kanal Bazlı
---

In [None]:
kanal_results["date"] = kanal_results["yil"].astype(str) + "-" + kanal_results["ay"].astype(str) + "-01"

In [None]:
kanal_results["date"] = pd.to_datetime(kanal_results["date"], format="%Y-%m-%d", errors="coerce")

In [None]:
kanal_results.rename(columns={"Kanal": "kanal_l"}, inplace=True)

In [None]:
kanal_results.to_excel("regression_results_all_algorithms.xlsx", index=False)

In [None]:
reg_pas_best_mape = kanal_results[(kanal_results["kanal_l"] == "pasifik")][["date", "kanal_l", "new_adet", "knn"]].rename(columns={"new_adet": "ytrue", 
                                                                                                              "knn": "yhat",
                                                                                                              "linear_regression": "yhat",
                                                                                                              "xgboost": "yhat",
                                                                                                              "decision_tree": "yhat",
                                                                                                              "random_forest": "yhat",
                                                                                                              "gradient_boosting": "yhat"})

reg_hor_best_mape = kanal_results[(kanal_results["kanal_l"] == "horizon")][["date", "kanal_l", "new_adet", "knn"]].rename(columns={"new_adet": "ytrue", 
                                                                                                              "knn": "yhat",
                                                                                                              "linear_regression": "yhat",
                                                                                                              "xgboost": "yhat",
                                                                                                              "decision_tree": "yhat",
                                                                                                              "random_forest": "yhat",
                                                                                                              "gradient_boosting": "yhat"})

reg_btt_best_mape = kanal_results[(kanal_results["kanal_l"] == "btt")][["date", "kanal_l", "new_adet", "xgboost"]].rename(columns={"new_adet": "ytrue", 
                                                                                                              "knn": "yhat",
                                                                                                              "linear_regression": "yhat",
                                                                                                              "xgboost": "yhat",
                                                                                                              "decision_tree": "yhat",
                                                                                                              "random_forest": "yhat",
                                                                                                              "gradient_boosting": "yhat"})

## Regresyon Kanal Bazında Şampiyon Algoritma Sonuçlarının Aggregate Edilmiş Hali

In [None]:
reg_results_kanal_all = pd.concat([reg_pas_best_mape,
                                   reg_hor_best_mape,
                                   reg_btt_best_mape], axis=0, ignore_index=True)

In [None]:
reg_ts_results_kanal_all = pd.concat([reg_results_kanal_all,
                                      ts_kanal_results], axis=0, ignore_index=True)

In [None]:
reg_ts_results_kanal_all

In [None]:
reg_ts_results_kanal_all.to_excel("regression_time_series_combained_results_raw_data.xlsx", index=False)

## Time Series ile Regresyon'da İlgili Kanallardaki Kazanan Modellerin Tek Potada Eritilip MAPE Hesaplaması

In [None]:
reg_ts_results_kanal_all = reg_ts_results_kanal_all.groupby(["kanal_l", "date"]).agg({"ytrue": "sum", "yhat": "sum"}).reset_index()

In [None]:
all_mape = []
all_kanal = []
for knl in reg_ts_results_kanal_all.kanal_l.unique():
    temp_df = reg_ts_results_kanal_all[reg_ts_results_kanal_all["kanal_l"] == knl]
    all_mape.append(mape(np.array(temp_df.ytrue), np.array(temp_df.yhat)))
    all_kanal.append(knl)

all_results_ts_reg = pd.DataFrame({"kanal": all_kanal, "mape": all_mape})

In [None]:
all_results_ts_reg

In [None]:
all_results_ts_reg.to_excel("regression_time_series_combained_mape.xlsx", index=False)

In [None]:
all_results_ts_reg

In [None]:
for knl in reg_ts_results_kanal_all.kanal_l.unique():
    temp_df = reg_ts_results_kanal_all[(reg_ts_results_kanal_all["kanal_l"] == knl) & 
                                      (reg_ts_results_kanal_all["date"] >= datetime(2021, 1, 1)) & 
                                      (reg_ts_results_kanal_all["date"] < datetime(2021, 6, 1))].reset_index(drop=True)
    son_ay_true = temp_df.loc[temp_df.index, "ytrue"]
    son_ay_yhat = temp_df.loc[temp_df.index, "yhat"]
    date_ = temp_df.loc[temp_df.index, "date"]
    mape_son_ay = mape(son_ay_true, son_ay_yhat)
    plt.figure(figsize=(20,8))
    plt.plot(date_, son_ay_true, marker="o")
    plt.plot(date_, son_ay_yhat, marker="o", color="r")
    plt.title(knl.upper())
    plt.xticks(rotation=90)
    plt.legend(["Actual", "Predicted"])
    plt.grid()
    plt.savefig(knl+"satis_grafigi.jpeg")
    plt.show()

In [None]:
for knl in reg_ts_results_kanal_all.kanal_l.unique():
    temp_df = reg_ts_results_kanal_all[(reg_ts_results_kanal_all["kanal_l"] == knl) & 
                                      (reg_ts_results_kanal_all["date"] >= datetime(2021, 1, 1)) & 
                                      (reg_ts_results_kanal_all["date"] < datetime(2021, 2, 1))].reset_index(drop=True)
    son_ay_true = temp_df.loc[temp_df.index.max(), "ytrue"]
    son_ay_yhat = temp_df.loc[temp_df.index.max(), "yhat"]
    date_ = temp_df.loc[temp_df.index.max(), "date"]
    mape_son_ay = mape(son_ay_true, son_ay_yhat)
    print()
    print("="*50)
    print(f"Kanal: {knl.upper()}")
    print(f"MAPE: {mape_son_ay:.2f}")
    print("-"*50)
    print()
    print("Yıl:", date_.year)
    print("Ay:", date_.month)
    print()
    print(f"Gerçekleşen Satış: {son_ay_true:.2f}")
    print(f"Tahminlenen Satış: {son_ay_yhat:.2f}")

In [None]:
for knl in mape_dagilimi.kanal.unique():
    tmp_df = mape_dagilimi[mape_dagilimi["kanal"] == knl]
    plt.figure(figsize=(20, 8))
    sns.lineplot(x="date", y="mape", data=mape_dagilimi)

---
# Check Results
---

In [None]:
df_reg2 = df_reg.copy()

In [None]:
df_reg2["Kanal"] = le_kanal.inverse_transform(df_reg2["Kanal"])
df_reg2["grup_adi"] = le_grup_adi.inverse_transform(df_reg2["grup_adi"])
df_reg2["ana_kategori_adi"] = le_ana_kategori_adi.inverse_transform(df_reg2["ana_kategori_adi"])
df_reg2["kategori_adi"] = le_kategori_adi.inverse_transform(df_reg2["kategori_adi"])
df_reg2["marka_adi"] = le_marka_adi.inverse_transform(df_reg2["marka_adi"])
df_reg2["urun_adi"] = le_urun_adi.inverse_transform(df_reg2["urun_adi"])

---
# MAPE Visualization
## SKU
---

# Regression MAPE Bins
---

In [None]:
best_results_sku = results_by_sku.groupby(["kanal_l", "grup", "urun"]).agg({"mape": "min"}).reset_index()
best_results_sku.dropna(inplace=True, axis=0)

best_results_sku["mape_bins"] = best_results_sku["mape"].apply(lambda x: "10'dan küçük" \
                                                               if x<10 else ("10 ile 25 arasında" \
                                                                             if x<25 else ("25 ile 50 arasında" \
                                                                                           if x<50 else("50 ile 100 arasında" if x<100 else "100'den büyük"))))

mape_dist = (best_results_sku.mape_bins.value_counts() / len(best_results_sku))*100
reorderlist = ["10'dan küçük", "10 ile 25 arasında", "25 ile 50 arasında", "50 ile 100 arasında", "100'den büyük"]
mape_dist = mape_dist.reindex(reorderlist)

In [None]:
btt_best_results_sku = best_results_sku[best_results_sku["kanal_l"] == "btt"]
pas_best_results_sku = best_results_sku[best_results_sku["kanal_l"] == "pasifik"]
hor_best_results_sku = best_results_sku[best_results_sku["kanal_l"] == "horizon"]

In [None]:
btt_mape_dist = (btt_best_results_sku.mape_bins.value_counts() / len(btt_best_results_sku))*100
reorderlist = ["10'dan küçük", "10 ile 25 arasında", "25 ile 50 arasında", "50 ile 100 arasında", "100'den büyük"]
btt_mape_dist = btt_mape_dist.reindex(reorderlist)

hor_mape_dist = (hor_best_results_sku.mape_bins.value_counts() / len(hor_best_results_sku))*100
reorderlist = ["10'dan küçük", "10 ile 25 arasında", "25 ile 50 arasında", "50 ile 100 arasında", "100'den büyük"]
hor_mape_dist = hor_mape_dist.reindex(reorderlist)

pas_mape_dist = (pas_best_results_sku.mape_bins.value_counts() / len(pas_best_results_sku))*100
reorderlist = ["10'dan küçük", "10 ile 25 arasında", "25 ile 50 arasında", "50 ile 100 arasında", "100'den büyük"]
pas_mape_dist = pas_mape_dist.reindex(reorderlist)

In [None]:
btt_mape_dist.fillna(0, inplace=True)

In [None]:
#Using matplotlib
plt.figure(figsize=(10,10))
plt.pie(x=btt_mape_dist, autopct="%.1f%%", explode=[0.05]*(len(mape_dist)), labels=btt_mape_dist.index, pctdistance=0.5)
plt.title("BTT SKU Bazında MAPE Dağılımı (Regresyon)", fontsize=14)
plt.savefig("btt_sku_bazli_mape_dagilimi.jpg")

In [None]:
hor_mape_dist.fillna(0, inplace=True)

In [None]:
#Using matplotlib
plt.figure(figsize=(10,10))
plt.pie(x=hor_mape_dist, autopct="%.1f%%", explode=[0.05]*(len(mape_dist)), labels=hor_mape_dist.index, pctdistance=0.5)
plt.title("Horizon SKU Bazında MAPE Dağılımı (Regresyon)", fontsize=14)
plt.savefig("hor_sku_bazli_mape_dagilimi.jpg")

In [None]:
#Using matplotlib
plt.figure(figsize=(10,10))
plt.pie(x=pas_mape_dist, autopct="%.1f%%", explode=[0.05]*(len(mape_dist)), labels=pas_mape_dist.index, pctdistance=0.5)
plt.title("Pasifik SKU Bazında MAPE Dağılımı (Regresyon)", fontsize=14)
plt.savefig("pas_sku_bazli_mape_dagilimi.jpg")

In [None]:
#Using matplotlib
plt.figure(figsize=(10,10))
plt.pie(x=mape_dist, autopct="%.1f%%", explode=[0.05]*(len(mape_dist)), labels=mape_dist.index, pctdistance=0.5)
plt.title("SKU Bazında MAPE Dağılımı (Regresyon)", fontsize=14)
plt.savefig("sku_bazli_mape_dagilimi.jpg")

---
# Time Series MAPE Bins
---

In [None]:
ts_results.dropna(inplace=True, axis=0)

In [None]:
ts_mape_dist = (ts_results.mape_bins.value_counts() / len(ts_results))*100
reorderlist = ["10'dan küçük", "10 ile 25 arasında", "25 ile 50 arasında", "50 ile 100 arasında", "100'den büyük"]
ts_mape_dist = ts_mape_dist.reindex(reorderlist)

In [None]:
btt_ts_results_sku = ts_results[ts_results["kanal_l"] == "btt"]
pas_ts_results_sku = ts_results[ts_results["kanal_l"] == "pasifik"]
hor_ts_results_sku = ts_results[ts_results["kanal_l"] == "horizon"]

In [None]:
btt_ts_mape_dist = (btt_ts_results_sku.mape_bins.value_counts() / len(btt_ts_results_sku))*100
reorderlist = ["10'dan küçük", "10 ile 25 arasında", "25 ile 50 arasında", "50 ile 100 arasında", "100'den büyük"]
btt_ts_mape_dist = btt_ts_mape_dist.reindex(reorderlist)

hor_ts_mape_dist = (hor_ts_results_sku.mape_bins.value_counts() / len(hor_ts_results_sku))*100
reorderlist = ["10'dan küçük", "10 ile 25 arasında", "25 ile 50 arasında", "50 ile 100 arasında", "100'den büyük"]
hor_ts_mape_dist = hor_ts_mape_dist.reindex(reorderlist)

pas_ts_mape_dist = (pas_ts_results_sku.mape_bins.value_counts() / len(pas_ts_results_sku))*100
reorderlist = ["10'dan küçük", "10 ile 25 arasında", "25 ile 50 arasında", "50 ile 100 arasında", "100'den büyük"]
pas_ts_mape_dist = pas_ts_mape_dist.reindex(reorderlist)

In [None]:
btt_ts_mape_dist.fillna(0, inplace=True)

In [None]:
#Using matplotlib
plt.figure(figsize=(10,10))
plt.pie(x=btt_ts_mape_dist, autopct="%.1f%%", explode=[0.05]*(len(btt_ts_mape_dist)), labels=btt_ts_mape_dist.index, pctdistance=0.5)
plt.title("BTT SKU Bazında MAPE Dağılımı (Time Series)", fontsize=14)
plt.savefig("ts_btt_sku_bazli_mape_dagilimi.jpg")

In [None]:
hor_mape_dist.fillna(0, inplace=True)

In [None]:
#Using matplotlib
plt.figure(figsize=(10,10))
plt.pie(x=hor_ts_mape_dist, autopct="%.1f%%", explode=[0.05]*(len(hor_ts_mape_dist)), labels=hor_ts_mape_dist.index, pctdistance=0.5)
plt.title("Horizon SKU Bazında MAPE Dağılımı (Time Series)", fontsize=14)
plt.savefig("ts_hor_sku_bazli_mape_dagilimi.jpg")

In [None]:
#Using matplotlib
plt.figure(figsize=(10,10))
plt.pie(x=pas_ts_mape_dist, autopct="%.1f%%", explode=[0.05]*(len(pas_ts_mape_dist)), labels=pas_ts_mape_dist.index, pctdistance=0.5)
plt.title("Pasifik SKU Bazında MAPE Dağılımı (Time Series)", fontsize=14)
plt.savefig("ts_pas_sku_bazli_mape_dagilimi.jpg")

In [None]:
#Using matplotlib
plt.figure(figsize=(10,10))
plt.pie(x=ts_mape_dist, autopct="%.1f%%", explode=[0.05]*(len(ts_mape_dist)), labels=ts_mape_dist.index, pctdistance=0.5)
plt.title("SKU Bazında MAPE Dağılımı (Time Series)", fontsize=14)
plt.savefig("ts_sku_bazli_mape_dagilimi.jpg")

---
# Regression + Time Series MAPE Bins
---

In [None]:
best_results_sku2 = best_results_sku.copy()
best_results_sku2.rename(columns={"urun": "urun_adi"}, inplace=True)
ts_results = ts_results[best_results_sku2.columns.to_list()]

In [None]:
reg_ts_results_sku = pd.concat([ts_results, best_results_sku2], axis=0, ignore_index=True)

In [None]:
ts_reg_mape_dist = (reg_ts_results_sku.mape_bins.value_counts() / len(reg_ts_results_sku))*100
reorderlist = ["10'dan küçük", "10 ile 25 arasında", "25 ile 50 arasında", "50 ile 100 arasında", "100'den büyük"]
ts_reg_mape_dist = ts_reg_mape_dist.reindex(reorderlist)

In [None]:
btt_reg_ts_results_sku = reg_ts_results_sku[reg_ts_results_sku["kanal_l"] == "btt"]
pas_reg_ts_results_sku = reg_ts_results_sku[reg_ts_results_sku["kanal_l"] == "pasifik"]
hor_reg_ts_results_sku = reg_ts_results_sku[reg_ts_results_sku["kanal_l"] == "horizon"]

In [None]:
btt_reg_ts_mape_dist = (btt_reg_ts_results_sku.mape_bins.value_counts() / len(btt_reg_ts_results_sku))*100
reorderlist = ["10'dan küçük", "10 ile 25 arasında", "25 ile 50 arasında", "50 ile 100 arasında", "100'den büyük"]
btt_reg_ts_mape_dist = btt_reg_ts_mape_dist.reindex(reorderlist)

hor_reg_ts_mape_dist = (hor_reg_ts_results_sku.mape_bins.value_counts() / len(hor_reg_ts_results_sku))*100
reorderlist = ["10'dan küçük", "10 ile 25 arasında", "25 ile 50 arasında", "50 ile 100 arasında", "100'den büyük"]
hor_reg_ts_mape_dist = hor_reg_ts_mape_dist.reindex(reorderlist)

pas_reg_ts_mape_dist = (pas_reg_ts_results_sku.mape_bins.value_counts() / len(pas_reg_ts_results_sku))*100
reorderlist = ["10'dan küçük", "10 ile 25 arasında", "25 ile 50 arasında", "50 ile 100 arasında", "100'den büyük"]
pas_reg_ts_mape_dist = pas_reg_ts_mape_dist.reindex(reorderlist)

In [None]:
btt_reg_ts_mape_dist.fillna(0, inplace=True)

In [None]:
#Using matplotlib
plt.figure(figsize=(10,10))
plt.pie(x=btt_reg_ts_mape_dist, autopct="%.1f%%", explode=[0.05]*(len(btt_reg_ts_mape_dist)), labels=btt_reg_ts_mape_dist.index, pctdistance=0.5)
plt.title("BTT SKU Bazında MAPE Dağılımı (Time Series + Regresyon)", fontsize=14)
plt.savefig("reg_ts_btt_sku_bazli_mape_dagilimi.jpg")

In [None]:
#Using matplotlib
plt.figure(figsize=(10,10))
plt.pie(x=hor_reg_ts_mape_dist, autopct="%.1f%%", explode=[0.05]*(len(hor_reg_ts_mape_dist)), labels=hor_reg_ts_mape_dist.index, pctdistance=0.5)
plt.title("Horizon SKU Bazında MAPE Dağılımı (Time Series + Regresyon)", fontsize=14)
plt.savefig("reg_ts_hor_sku_bazli_mape_dagilimi.jpg")

In [None]:
#Using matplotlib
plt.figure(figsize=(10,10))
plt.pie(x=pas_reg_ts_mape_dist, autopct="%.1f%%", explode=[0.05]*(len(pas_reg_ts_mape_dist)), labels=pas_reg_ts_mape_dist.index, pctdistance=0.5)
plt.title("Pasifik SKU Bazında MAPE Dağılımı (Time Series + Regresyon)", fontsize=14)
plt.savefig("reg_ts_pas_sku_bazli_mape_dagilimi.jpg")

In [None]:
reg_ts_mape_dist = (reg_ts_results_sku.mape_bins.value_counts() / len(reg_ts_results_sku))*100
reorderlist = ["10'dan küçük", "10 ile 25 arasında", "25 ile 50 arasında", "50 ile 100 arasında", "100'den büyük"]
reg_ts_mape_dist = reg_ts_mape_dist.reindex(reorderlist)

In [None]:
#Using matplotlib
plt.figure(figsize=(10,10))
plt.pie(x=reg_ts_mape_dist, autopct="%.1f%%", explode=[0.05]*(len(reg_ts_mape_dist)), labels=reg_ts_mape_dist.index, pctdistance=0.5)
plt.title("SKU Bazında MAPE Dağılımı (Time Series + Regresyon)", fontsize=14)
plt.savefig("reg_ts_sku_bazli_mape_dagilimi.jpg")

In [None]:
X_train2 = X_train.copy()

In [None]:
X_train2["Kanal"] = le_kanal.inverse_transform(X_train2["Kanal"])
X_train2["grup_adi"] = le_grup_adi.inverse_transform(X_train2["grup_adi"])
X_train2["ana_kategori_adi"] = le_ana_kategori_adi.inverse_transform(X_train2["ana_kategori_adi"])
X_train2["kategori_adi"] = le_kategori_adi.inverse_transform(X_train2["kategori_adi"])
X_train2["marka_adi"] = le_marka_adi.inverse_transform(X_train2["marka_adi"])
X_train2["urun_adi"] = le_urun_adi.inverse_transform(X_train2["urun_adi"])

In [None]:
training_all = pd.concat([X_train2, y_train], axis=1)

In [None]:
best_results_sku["mape_bins"].unique()

In [None]:
best_results_sku[best_results_sku["mape_bins"] == "10'dan küçük"]

In [None]:
df_all = pd.concat([training_all, test_results], axis=0)
df_all["date"] = df_all["yil"].astype(str) + "-" + df_all["ay"].astype(str) + "-" + "01"
df_all["date"] = pd.to_datetime(df_all["date"], format="%Y-%m-%d", errors="coerce")

In [None]:
def draw_graph(grup, sku):
    mape_sample = results_by_sku[(results_by_sku["grup"] == grup) & (results_by_sku["urun"] == sku)]
    l_mape = mape_sample[mape_sample["model_type"] == "linear_regression"]["mape"].values[0]
    knn_mape = mape_sample[mape_sample["model_type"] == "knn"]["mape"].values[0]
    xgb_mape = mape_sample[mape_sample["model_type"] == "xgboost"]["mape"].values[0]
    dt_mape = mape_sample[mape_sample["model_type"] == "decision_tree"]["mape"].values[0]
    rf_mape = mape_sample[mape_sample["model_type"] == "random_forest"]["mape"].values[0]
    gb_mape = mape_sample[mape_sample["model_type"] == "gradient_boosting"]["mape"].values[0]

    test = df_all[(df_all["grup_adi"] == grup) & (df_all["urun_adi"] == sku)]
    plt.figure(figsize=(20, 8))
    plt.plot(test["date"], test["new_adet"], marker="o", markersize=5)
    plt.plot(test["date"], test["linear_regression"], marker="o", markersize=5)
    plt.plot(test["date"], test["knn"], marker="o", markersize=5)
    plt.plot(test["date"], test["xgboost"], marker="o", markersize=5)
    plt.plot(test["date"], test["decision_tree"], marker="o", markersize=5)
    plt.plot(test["date"], test["random_forest"], marker="o", markersize=5)
    plt.plot(test["date"], test["gradient_boosting"], marker="o", markersize=5)
    plt.legend(["Adet", f"Linear Regression: {l_mape:.2f}", f"KNN: {knn_mape:.2f}", 
                f"XGBoost: {xgb_mape:.2f}", f"Decision Tree: {dt_mape:.2f}", 
                f"Random Forest: {rf_mape:.2f}", f"Gradient Boosting: {gb_mape:.2f}"])
    plt.title(grup+" & "+sku)
    plt.grid()
    plt.show()
    print()
    print(test[["date", "new_adet", "linear_regression", "knn", "xgboost"]])
    print()
    print(test[["date", "new_adet", "decision_tree", "random_forest"]])
    print()
    print(test[["date", "new_adet", "gradient_boosting"]])
    print()

In [None]:
draw_graph("ORTA MARKET", "HANIMELLER LİMONLU 138Gx9")

In [None]:
check_sku = ['ULK TOZ KAK.YENİ TASARIM 50Gx12x12',
             'ULK.ÇİK.GOF. BEYAZ 35Gx36x6',
             'ULK KARE ÇİK.ÜZÜMLÜ FIN.SÜT.65Gx6x6', 'ULK ÇİK.KARE FIN.70Gx6x6',
             'CARAMIO KARE ÇİK.K.MELLİ 60Gx12x6', 'ULK ÇİK.KARE A.FIS.70Gx6x6',
             'ALBENİ KAPL.BAR B.BOY 56Gx24x6', 'YUPO DRAJE DOYPACK 111Gx24',
             'ÇOKONAT KAPL.GOF.5x33Gx24', 'ALBENİ KAPL.BAR 5x40Gx24',
             'COCOSTAR H.CEV.BAR 28Gx24x6 YENİ', 'ULK ÇİK.GOF.5x36Gx24',
             'ÇOKOKREM CAM KAV.700Gx6', 'ALTINBAŞAK Ç.OTLU KİNOA KR.5x40Gx18',
             'ÇOKOKREM SAKLAMA KABI 1000Gx6', 'HOBBY MİNİ İKR.POŞET 250Gx12',
             'METRO KAPL.BAR 5x40Gx18', 'ULK ÇİK.KARE SÜT.70Gx6x6',
             'ULK DAMLA ÇİK.BİT.150Gx12', 'OLALA SUFLE MINI 162Gx12',
             'YUPO JELLY PORTAKAL HALKASI 70GRX24',
             'İKRAM KRE.BİSK.ÇİK.3x92Gx12', 'ULK BEBE BİSK.2x500Gx4',
             'ULK PÖTİBÖR BİSK.450Gx10', 'ULK KUVERTÜR %54 BİT.TAB.200Gx6',
             'ÇİZİVİÇ PEY.SAND.KR.3x90Gx12', 'ULK KRE.SAND.BİSK.10x61Gx8',
             '9 KAT TAT İNCE İNCE ÇİLEK 114Gx16', 'ÇİZİ KR.4x70Gx12',
             'DANKEK RULO PASTA ÇİLEK 245Gx8', 'ULK KRE.SAND.BİSK.4x61Gx15',
             'ULK GOF.FIN.220Gx12', '9 KAT TAT RULOKAT ÇİK.KRE.230Gx6',
             'OLALA SUFLE KEK 70Gx12', 'HANIMELLER LİMONLU KURABİYE 138Gx9',
             'DANKEK LOKMALIK HİNDİSTAN CEVİZ.160Gx8',
             'DANKEK LOKMALIK HAV.LU TARÇINLI 180Gx8',
             'SMARTT KORNET ÇİK.32Gx24x6', 'ALBENİ ÇİK.KAPL.KEK 40Gx18',
             'KEKSTRA JÖLEBOL KEK ÇİLEK 35Gx24',
             'DANKEK PÖTİ MUFFIN KEK H.CEV.40Gx24']

In [None]:
best_results_sku[(best_results_sku["urun"].isin(check_sku)) & (best_results_sku["kanal_l"] == "pasifik")]["mape_bins"].value_counts()