In [1]:
#This code optimizes hyperparameters for the final model. 



# =============================================================================
# Briefing Summary:
# This script is designed to perform data loading, preprocessing, and hyperparameter tuning 
# for a sales forecasting model using CatBoostRegressor on Kaggle datasets. The code reads 
# various CSV files, converts date columns, reduces memory usage by downcasting data types, 
# processes categorical features, splits the data into training and validation sets, and 
# uses RandomizedSearchCV with time series cross-validation to tune the model's hyperparameters. 
# Finally, it exports the best hyperparameters to a CSV file.
# =============================================================================



import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gc
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/rohlik-dataprep/__results__.html
/kaggle/input/rohlik-dataprep/__notebook__.ipynb
/kaggle/input/rohlik-dataprep/train_merged_corrected.csv
/kaggle/input/rohlik-dataprep/__output__.json
/kaggle/input/rohlik-dataprep/custom.css
/kaggle/input/rohlik-sales-forecasting-challenge-v2/calendar.csv
/kaggle/input/rohlik-sales-forecasting-challenge-v2/test_weights.csv
/kaggle/input/rohlik-sales-forecasting-challenge-v2/inventory.csv
/kaggle/input/rohlik-sales-forecasting-challenge-v2/sales_train.csv
/kaggle/input/rohlik-sales-forecasting-challenge-v2/sales_test.csv
/kaggle/input/rohlik-sales-forecasting-challenge-v2/solution.csv


In [2]:


# Input files are imported to get training and testing dates of the competition. 

# Load the raw competition tables and parse their 'date' column as datetime.
sales_train = pd.read_csv(
    '/kaggle/input/rohlik-sales-forecasting-challenge-v2/sales_train.csv'
).assign(date=lambda frame: pd.to_datetime(frame['date']))
sales_test = pd.read_csv(
    '/kaggle/input/rohlik-sales-forecasting-challenge-v2/sales_test.csv'
).assign(date=lambda frame: pd.to_datetime(frame['date']))

# Earliest and latest date in the training set
train_min_date, train_max_date = sales_train['date'].min(), sales_train['date'].max()

# Earliest and latest date in the test set
test_min_date, test_max_date = sales_test['date'].min(), sales_test['date'].max()

# Report the date ranges (the printed labels are kept in their original language)
print("Sales Train - En Küçük Tarih:", train_min_date)
print("Sales Train - En Büyük Tarih:", train_max_date)
print("Sales Test - En Küçük Tarih:", test_min_date)
print("Sales Test - En Büyük Tarih:", test_max_date)

Sales Train - En Küçük Tarih: 2020-08-01 00:00:00
Sales Train - En Büyük Tarih: 2024-06-02 00:00:00
Sales Test - En Küçük Tarih: 2024-06-03 00:00:00
Sales Test - En Büyük Tarih: 2024-06-16 00:00:00


In [None]:
# Free the memory held by the raw sales tables, then load the preprocessed training data.

# The raw sales frames are no longer needed: drop both references and run the
# garbage collector so their memory can be reclaimed before the next load.
del sales_train, sales_test
gc.collect()


# Load the preprocessed, merged training table from the data-prep dataset.
train_merged_corrected = pd.read_csv(
    '/kaggle/input/rohlik-dataprep/train_merged_corrected.csv'
)


In [None]:

# Lift the pandas display limits so DataFrames are shown without truncation.
pd.options.display.max_columns = None  # show all columns
pd.options.display.max_rows = None     # show all rows (optional)


# Remove the feature columns that are not used for tuning. `drop` raises a
# KeyError if any listed column is missing from the frame.
train_merged_corrected = train_merged_corrected.drop(
    [
        # calendar features
        'is_weekend', 'day_sin', 'day_cos',
        # overall sales statistics
        'sales_max', 'sales_mean', 'sales_median',
        'sales_std', 'sales_skew', 'sales_zero_ratio',
        # overall total_orders statistics
        'total_orders_max', 'total_orders_mean', 'total_orders_median',
        'total_orders_std', 'total_orders_skew', 'total_orders_zero_ratio',
        # overall sell_price_main statistics
        'sell_price_main_max', 'sell_price_main_mean', 'sell_price_main_median',
        'sell_price_main_std', 'sell_price_main_skew', 'sell_price_main_zero_ratio',
        # overall total_discount statistics
        'total_discount_max', 'total_discount_mean', 'total_discount_median',
        'total_discount_std', 'total_discount_skew', 'total_discount_zero_ratio',
        # yearly statistics
        'sales_yearly_q25', 'sales_yearly_q75',
        'total_orders_yearly_q25', 'total_orders_yearly_q75',
        'total_orders_yearly_zero_ratio', 'total_orders_yearly_cv',
        'sell_price_main_yearly_q25', 'sell_price_main_yearly_q75',
        'sell_price_main_yearly_zero_ratio', 'sell_price_main_yearly_cv',
        'availability_yearly_max',
        'total_discount_yearly_q25', 'total_discount_yearly_q75',
        'total_discount_yearly_zero_ratio',
        'days_to_next_closed', 'total_discount_yearly_min',
    ],
    axis='columns',
)




In [None]:
# Build the list of column names of the merged dataset. The result is not bound
# to a name, and it is not the cell's final expression, so nothing is displayed.
list(train_merged_corrected.columns)

# Identifier and date columns that are removed from the feature matrices (X sets).
columns_to_drop = ['unique_id', 'date']


In [None]:

# Custom function to reduce memory usage by converting columns to smaller data types

def reduce_mem_usage(df):
    """Downcast the numeric columns of ``df`` to smaller dtypes and report the saving.

    Rules, applied per column:

    * NumPy signed-integer columns are cast to the narrowest of int8 / int16 /
      int32 / int64 whose range contains the column's min and max. The bounds
      are inclusive, so e.g. a column spanning -128..127 becomes int8.
    * Every other numeric column (floats, unsigned integers, booleans and
      numeric pandas extension dtypes) is cast to float32. Note that float32
      carries only ~7 significant decimal digits, so large values lose precision.
    * All remaining columns (object, category, datetime, timedelta, string, ...)
      are left untouched instead of being pushed through a numeric cast.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified IN PLACE; the columns of the caller's
        object are replaced.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenient reassignment.

    Side effects
    ------------
    Prints the memory usage (in MB) before and after the conversion.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    # Candidate signed integer types, narrowest first.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)

    for col in df.columns:
        col_type = df[col].dtype

        if isinstance(col_type, np.dtype):
            if col_type.kind == 'i':
                # Signed integer columns: pick the first type whose range fits.
                c_min = df[col].min()
                c_max = df[col].max()
                for candidate in int_candidates:
                    limits = np.iinfo(candidate)
                    if limits.min <= c_min and c_max <= limits.max:
                        df[col] = df[col].astype(candidate)
                        break
                # An empty column has NaN min/max, matches nothing above and
                # therefore keeps its current integer dtype.
            elif col_type.kind in ('u', 'f', 'b'):
                # Float columns, plus unsigned-int and bool columns, which have
                # always been stored as float32 by this helper. Code that selects
                # columns by "number" dtype afterwards depends on that.
                df[col] = df[col].astype(np.float32)
            # Any other NumPy kind (object, datetime64, timedelta64, ...) is skipped.
        elif (not isinstance(col_type, pd.CategoricalDtype)
              and pd.api.types.is_numeric_dtype(col_type)):
            # Numeric pandas extension dtypes (e.g. nullable Int64 / Float64).
            df[col] = df[col].astype(np.float32)

    end_mem = df.memory_usage().sum() / 1024**2
    print(f"Bellek kullanımı: {start_mem:.2f} MB -> {end_mem:.2f} MB")
    return df
# Shrink the merged training frame and rebind the name to the returned frame.
train_merged_corrected = reduce_mem_usage(train_merged_corrected)


In [None]:
import os
import gc
import cupy as cp  # GPU bellek yönetimi için
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit, train_test_split
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt

# ------------------------------
# Memory cleanup helper
# ------------------------------
def clean_gpu_memory():
    """Run Python garbage collection, then free all blocks held by CuPy's default memory pool."""
    gc.collect()
    memory_pool = cp.get_default_memory_pool()
    memory_pool.free_all_blocks()


# Parse 'date' as datetime and order the rows chronologically, so that the
# unshuffled splits made later are time-ordered.
train_merged_corrected['date'] = pd.to_datetime(train_merged_corrected['date'])
train_merged_corrected = train_merged_corrected.sort_values('date')


# Categorical features are all columns (ignoring columns_to_drop) whose dtype is
# neither numeric nor pandas "string", plus 'product_unique_id'.
df_temp = train_merged_corrected.drop(columns=columns_to_drop)
non_num_str_df = df_temp.select_dtypes(exclude=["number", "string"])
del df_temp
categorical_features = non_num_str_df.columns.tolist()
# Only add the product id if it was not already picked up above, so the list
# passed to the model as cat_features never contains a duplicate.
if 'product_unique_id' not in categorical_features:
    categorical_features.append('product_unique_id')

# Convert the categorical columns to plain Python strings, with missing values
# mapped to "-1". The fill has to happen while the column still holds real
# missing values (pandas "string" dtype): calling astype(str) first would turn
# them into ordinary text, after which fillna("-1") has nothing left to fill.
for col in categorical_features:
    train_merged_corrected[col] = (
        train_merged_corrected[col].astype('string').fillna("-1").astype(str)
    )



# Keep only the rows dated inside the training window (both end dates inclusive).
sales_train = train_merged_corrected[
    train_merged_corrected['date'].between(
        pd.Timestamp('2020-08-01'), pd.Timestamp('2024-06-02')
    )
]

del non_num_str_df
clean_gpu_memory()


# Unshuffled 90/10 split: row order is preserved, so the validation part is the
# final 10% of the rows of sales_train.
X_train, X_val, y_train, y_val = train_test_split(
    sales_train.drop(columns=[*columns_to_drop, 'sales']),
    sales_train['sales'],
    test_size=0.1,
    shuffle=False,
)

# Pull the forecasting weights out of the feature matrices (used for weighted MAE).
train_weight, val_weight = X_train['weight'], X_val['weight']
X_train = X_train.drop(columns='weight')
X_val = X_val.drop(columns='weight')


# Make sure the categorical columns are plain strings in both feature matrices.
X_train[categorical_features] = X_train[categorical_features].astype(str)
X_val[categorical_features] = X_val[categorical_features].astype(str)

# log1p-transform the targets.
y_train_log, y_val_log = np.log1p(y_train), np.log1p(y_val)

clean_gpu_memory()

# ------------------------------
# Hyperparameter tuning: RandomizedSearchCV (with GPU memory limiting)
# ------------------------------
# Base estimator; the search overrides the parameters listed in param_grid.
base_model = CatBoostRegressor(
    cat_features=categorical_features,
    loss_function='MAE',
    boosting_type='Ordered',
    task_type='GPU',
    thread_count=-1,
    max_bin=128,
    early_stopping_rounds=10,
    random_seed=42,
    verbose=0,
)

# Candidate values sampled by the randomized search (3 options per parameter).
# NOTE(review): confirm that min_data_in_leaf and bagging_temperature take effect
# with the grow policy / bootstrap type this configuration ends up using.
param_grid = dict(
    iterations=[200, 500, 1000],
    learning_rate=[0.05, 0.1, 0.2],
    depth=[6, 8, 10],
    l2_leaf_reg=[0.1, 1, 10],
    min_data_in_leaf=[20, 30, 40],
    bagging_temperature=[0.2, 0.5, 0.8],
    random_strength=[0.2, 0.5, 0.8],
)

# Time-ordered cross-validation with 3 splits.
tscv = TimeSeriesSplit(n_splits=3)

# 100 sampled configurations, scored by negative MAE, fitted one at a time.
random_search = RandomizedSearchCV(
    base_model,
    param_grid,
    n_iter=100,
    scoring='neg_mean_absolute_error',
    cv=tscv,
    n_jobs=1,
    random_state=42,
    verbose=50,
)

clean_gpu_memory()

# Fit the randomized search on the log1p-transformed target, passing the
# per-row training weights through as sample_weight.
# NOTE(review): the configured scorer is 'neg_mean_absolute_error' on the log
# scale — confirm whether it is meant to use the weights as well.
print("Starting hyperparameter tuning...")
random_search.fit(X=X_train, y=y_train_log, sample_weight=train_weight)
best_params = random_search.best_params_
print("Best Parameters from tuning:", best_params)

clean_gpu_memory()

# ------------------------------
# Export the best hyperparameters
# ------------------------------
# One-row table (one column per tuned parameter), written without the index.
best_params_df = pd.DataFrame([best_params])
best_params_df.to_csv("best_hyperparameters.csv", index=False)
print("Best hyperparameters exported to 'best_hyperparameters.csv'")
