In [None]:
# This notebook loads and processes sales data, performs feature engineering,
# optimizes memory usage, and trains a CatBoost model for forecasting sales.
# The model uses pre-set hyperparameters (tuned in a separate notebook) and
# preprocessed data (prepared in a separate notebook).

# Import necessary libraries
import gc  # For garbage collection to free up memory
import os

import numpy as np  # For linear algebra operations
import pandas as pd  # For data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split  # Used by the data-splitting cell, which runs before the modelling imports
# Print the full path of every file found under the Kaggle input directory
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))



# LOAD DATA

In [None]:
# Report the date coverage of the raw train and test files.
# Only the 'date' column is needed for this, so the other columns are not read;
# this keeps the memory footprint of this purely informational cell small.
sales_train = pd.read_csv(
    '/kaggle/input/rohlik-sales-forecasting-challenge-v2/sales_train.csv',
    usecols=['date'],
)
sales_test = pd.read_csv(
    '/kaggle/input/rohlik-sales-forecasting-challenge-v2/sales_test.csv',
    usecols=['date'],
)

# Convert date columns to datetime format
sales_train['date'] = pd.to_datetime(sales_train['date'])
sales_test['date'] = pd.to_datetime(sales_test['date'])

# Print min and max dates for train and test sets
train_min_date = sales_train['date'].min()
train_max_date = sales_train['date'].max()
test_min_date = sales_test['date'].min()
test_max_date = sales_test['date'].max()
print("Sales Train - En Küçük Tarih:", train_min_date)
print("Sales Train - En Büyük Tarih:", train_max_date)
print("Sales Test - En Küçük Tarih:", test_min_date)
print("Sales Test - En Büyük Tarih:", test_max_date)

# The raw frames were only needed for the date report: drop them and reclaim memory
del sales_train
del sales_test
gc.collect()

In [None]:
# Display settings: never truncate columns or rows when a DataFrame is shown
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# Read the pre-processed training dataset
train_merged_corrected = pd.read_csv(
    '/kaggle/input/rohlik-dataprep/train_merged_corrected.csv',
    low_memory=True,
)

# Data Cleaning & Memory Optimization


In [None]:
# ------------------------------
# Memory Reduction Function
# ------------------------------
def reduce_mem_usage(df):
    """Shrink the memory footprint of ``df`` by downcasting its numeric columns in place.

    * Integer columns (signed or unsigned NumPy dtypes) are cast to the smallest
      signed type among int8/int16/int32/int64 whose range contains the column's
      minimum and maximum; the range bounds themselves are allowed.
    * Float columns wider than 32 bits are cast to float32. float32 keeps roughly
      7 significant decimal digits, so very large or very precise values are rounded.
    * Every other column (bool, object, category, datetime, extension dtypes, ...)
      is left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. Its columns are replaced in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    # Candidate integer types, narrowest first
    int_candidates = (np.int8, np.int16, np.int32, np.int64)

    for col in df.columns:
        col_type = df[col].dtype
        # Only plain NumPy int/float columns are downcast. Dispatching on the dtype
        # kind keeps bool columns at 1 byte per value (a float32 cast would make
        # them four times larger) and never attempts a float cast on other dtypes.
        kind = col_type.kind if isinstance(col_type, np.dtype) else None

        if kind in ('i', 'u') and len(df) > 0:
            # Plain Python ints make the range comparison independent of the dtype
            c_min, c_max = int(df[col].min()), int(df[col].max())
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    if col_type != np.dtype(candidate):
                        df[col] = df[col].astype(candidate)
                    break
            # No candidate fits only for uint64 values above the int64 range;
            # such a column is kept as it is.
        elif kind == 'f' and col_type.itemsize > 4:
            df[col] = df[col].astype(np.float32)

    end_mem = df.memory_usage().sum() / 1024**2
    print(f"Memory usage reduced from {start_mem:.2f} MB to {end_mem:.2f} MB")
    return df

# Downcast the numeric columns of the freshly loaded dataset
train_merged_corrected = reduce_mem_usage(df=train_merged_corrected)


In [None]:
# ------------------------------
# Data Cleaning & Memory Optimization
# ------------------------------
# Identifier columns: they stay in the frame for now (the date drives the
# train/test split and both are used to build submission ids) and are removed
# from the feature matrices in later cells.
columns_to_drop = ['unique_id', 'date']

# Remove the engineered features that the model does not use, grouped by family
train_merged_corrected = train_merged_corrected.drop(
    labels=[
        # calendar features
        'is_weekend', 'day_sin', 'day_cos', 'days_to_next_closed',
        # overall statistics of sales
        'sales_max', 'sales_mean', 'sales_median',
        'sales_std', 'sales_skew', 'sales_zero_ratio',
        # overall statistics of total_orders
        'total_orders_max', 'total_orders_mean', 'total_orders_median',
        'total_orders_std', 'total_orders_skew', 'total_orders_zero_ratio',
        # overall statistics of sell_price_main
        'sell_price_main_max', 'sell_price_main_mean', 'sell_price_main_median',
        'sell_price_main_std', 'sell_price_main_skew', 'sell_price_main_zero_ratio',
        # overall statistics of total_discount
        'total_discount_max', 'total_discount_mean', 'total_discount_median',
        'total_discount_std', 'total_discount_skew', 'total_discount_zero_ratio',
        # yearly statistics
        'sales_yearly_q25', 'sales_yearly_q75',
        'total_orders_yearly_q25', 'total_orders_yearly_q75',
        'total_orders_yearly_zero_ratio', 'total_orders_yearly_cv',
        'sell_price_main_yearly_q25', 'sell_price_main_yearly_q75',
        'sell_price_main_yearly_zero_ratio', 'sell_price_main_yearly_cv',
        'availability_yearly_max',
        'total_discount_yearly_q25', 'total_discount_yearly_q75',
        'total_discount_yearly_zero_ratio', 'total_discount_yearly_min',
    ],
    axis='columns',
)


In [None]:


def reduce_mem_usage(df):
    """Shrink the memory footprint of ``df`` by downcasting its numeric columns in place.

    NOTE: this redefines the ``reduce_mem_usage`` declared in an earlier cell and
    replaces it for every later call; consider keeping a single definition.

    * Integer columns (signed or unsigned NumPy dtypes) are cast to the smallest
      signed type among int8/int16/int32/int64 whose range contains the column's
      minimum and maximum; the range bounds themselves are allowed.
    * Float columns wider than 32 bits are cast to float32. float32 keeps roughly
      7 significant decimal digits, so very large or very precise values are rounded.
    * Every other column (bool, object, category, datetime, extension dtypes, ...)
      is left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. Its columns are replaced in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    # Candidate integer types, narrowest first
    int_candidates = (np.int8, np.int16, np.int32, np.int64)

    for col in df.columns:
        col_type = df[col].dtype
        # Only plain NumPy int/float columns are downcast. Dispatching on the dtype
        # kind keeps bool columns at 1 byte per value (a float32 cast would make
        # them four times larger) and never attempts a float cast on other dtypes.
        kind = col_type.kind if isinstance(col_type, np.dtype) else None

        if kind in ('i', 'u') and len(df) > 0:
            # Integer columns: plain Python ints make the range comparison
            # independent of the column dtype
            c_min, c_max = int(df[col].min()), int(df[col].max())
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    if col_type != np.dtype(candidate):
                        df[col] = df[col].astype(candidate)
                    break
            # No candidate fits only for uint64 values above the int64 range;
            # such a column is kept as it is.
        elif kind == 'f' and col_type.itemsize > 4:
            # Float columns
            df[col] = df[col].astype(np.float32)
        # Object columns are deliberately left alone; they could be converted to
        # 'category' here if they are truly categorical.

    end_mem = df.memory_usage().sum() / 1024**2
    print(f"Bellek kullanımı: {start_mem:.2f} MB -> {end_mem:.2f} MB")
    return df
# Run the memory reduction again on the frame with the unused columns removed
train_merged_corrected = reduce_mem_usage(train_merged_corrected)


# Feature Engineering & Data Splitting

In [None]:
# Parse the 'date' column and order the rows chronologically; the unshuffled
# train/validation split further down relies on this ordering.
train_merged_corrected['date'] = pd.to_datetime(train_merged_corrected['date'])
train_merged_corrected = train_merged_corrected.sort_values('date')

# Categorical features = every column, other than the identifier columns in
# `columns_to_drop`, whose dtype is neither numeric nor pandas "string".
# Only the dtypes matter here, so a zero-row slice is inspected instead of
# copying the whole frame.
non_num_str_df = (
    train_merged_corrected.head(0)
    .drop(columns=columns_to_drop)
    .select_dtypes(exclude=["number", "string"])
)

# List out the identified categorical features
categorical_features = non_num_str_df.columns.tolist()
# product_unique_id is always treated as categorical (presumably it is stored as a
# number and therefore not detected above); guard against listing it twice.
if 'product_unique_id' not in categorical_features:
    categorical_features.append('product_unique_id')

# Cast the categorical columns to plain strings, with "-1" as the placeholder for
# missing values. The fill must happen BEFORE the final astype(str): that cast
# turns missing values into ordinary text, after which fillna() finds nothing to fill.
for col in categorical_features:
    train_merged_corrected[col] = (
        train_merged_corrected[col].astype('string').fillna("-1").astype(str)
    )

# =============================================================================
# Data Splitting: Carve Out the Training and Test Periods by Date
# =============================================================================

# Training period for sales data: 2020-08-01 through 2024-06-02 (both inclusive)
sales_train = train_merged_corrected.loc[
    train_merged_corrected['date'].between(
        pd.Timestamp('2020-08-01'), pd.Timestamp('2024-06-02')
    )
]

# Test period for sales data: 2024-06-03 through 2024-06-16 (both inclusive)
sales_test = train_merged_corrected.loc[
    train_merged_corrected['date'].between(
        pd.Timestamp('2024-06-03'), pd.Timestamp('2024-06-16')
    )
]

# The dtype helper frame and the full frame are no longer needed
del non_num_str_df, train_merged_corrected
gc.collect()  # Clean up memory




# Chronological hold-out: the rows were sorted by date earlier, so with
# shuffle=False the last 10% of the training period becomes the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    sales_train.drop(columns=columns_to_drop + ['sales']),
    sales_train['sales'],
    test_size=0.1,
    shuffle=False,
)

# Build the test feature matrix with the same columns as X_train: besides the
# identifier columns, the target column 'sales' must not be passed to the model
# as a feature. It is removed only when present, so a frame without it still works.
X_test = sales_test.drop(
    columns=columns_to_drop + (['sales'] if 'sales' in sales_test.columns else [])
)




# =============================================================================
# Sample Weights and Memory Reduction for the Training, Validation, and Test Sets
# =============================================================================

# Keep the per-row weights aside before 'weight' is removed from the features
train_weight, val_weight, test_weight = (
    X_train['weight'],
    X_val['weight'],
    X_test['weight'],
)

# Downcast the numeric columns of each set, then remove 'weight' so that it is
# not used as a model feature (it is stored separately above)
X_train = reduce_mem_usage(X_train).drop(columns='weight')
X_val = reduce_mem_usage(X_val).drop(columns='weight')
X_test = reduce_mem_usage(X_test).drop(columns='weight')

# =============================================================================
# Categorical Features: Make Sure Every One Is Stored as a String
# =============================================================================

# Same cast for the training, validation, and test feature matrices
for feature_frame in (X_train, X_val, X_test):
    feature_frame[categorical_features] = feature_frame[categorical_features].astype(str)
del feature_frame  # do not leave an extra DataFrame name in the namespace

# Debug: one line per categorical feature (validation dtype, test unique values)
for cat_col in categorical_features:
    print(cat_col, X_val[cat_col].dtype, X_test[cat_col].unique())

# Show all items of long sequences for easier inspection
pd.set_option('display.max_seq_items', None)

print("Those are categorical features:")
print(categorical_features)

# Detailed view: dtype plus the unique values seen in validation and in test
for cat_col in categorical_features:
    print(cat_col, X_val[cat_col].dtype)
    print("X_val unique values:", X_val[cat_col].unique(), sep="\n")
    print("X_test unique values:", X_test[cat_col].unique(), sep="\n")
    print(f"\n{'=' * 50}\n")

# =============================================================================
# Log Transformation of the Target Variable
# =============================================================================

# The model is trained on log1p(sales); predictions are mapped back with expm1 later
y_train_log, y_val_log = (np.log1p(target) for target in (y_train, y_val))

# Model Training & Feature Importance Analysis

In [None]:
from catboost import CatBoostRegressor
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit, train_test_split
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
from catboost import Pool
# Hyper-parameters for the CatBoostRegressor model
best_params = dict(
    eval_metric='MAE',
    bagging_temperature=0.25,
    iterations=2500,
    learning_rate=0.05,
    max_depth=10,
    l2_leaf_reg=1,
    min_data_in_leaf=24,
    random_strength=0.25,
)

# Debug aid: names of all global variables that currently hold a pandas DataFrame
dfs_in_memory = [
    name
    for name in tuple(globals())
    if isinstance(globals()[name], pd.DataFrame)
]
gc.collect()  # Clean up memory
print("Aşağıdaki isimler pandas DataFrame nesnelerini temsil ediyor:", dfs_in_memory, sep="\n")

# Validation data wrapped in a Pool; it is used as the evaluation set during fitting
eval_pool = Pool(
    X_val,
    label=y_val_log,
    weight=val_weight,
    cat_features=categorical_features,
)

# CatBoost regressor configured with the parameters above and trained on GPU
best_model = CatBoostRegressor(
    loss_function='MAE',
    boosting_type='Ordered',
    task_type='GPU',
    max_bin=128,
    thread_count=-1,      # Use all available CPU cores
    random_seed=42,
    allow_writing_files=False,
    **best_params,
)

# Fit on the weighted training data; training stops early after 10 rounds
# without improvement on the validation pool
best_model.fit(
    X=X_train,
    y=y_train_log,
    cat_features=categorical_features,
    sample_weight=train_weight,
    eval_set=eval_pool,
    early_stopping_rounds=10,
    verbose=300,
)

# =============================================================================
# Predictions: Validation Set and Test Set
# =============================================================================

# The model predicts on the log1p scale; expm1 (its inverse) maps the output
# straight back to the original sales scale, so no log-scale arrays are kept.
y_val_pred = np.expm1(best_model.predict(X_val))

# Prediction pool for the test set
test_pool = Pool(X_test, weight=test_weight, cat_features=categorical_features)
y_test_pred = np.expm1(best_model.predict(test_pool))

gc.collect()




# Results

In [None]:
# =============================================================================
# Post-processing: Scale Up Predictions Above a Threshold
# =============================================================================

def _scale_above(predictions, threshold, factor=1.1):
    """Return ``predictions`` with every value strictly above ``threshold`` multiplied by ``factor``."""
    return np.where(predictions > threshold, predictions * factor, predictions)


# Variant 2: predictions above 5000 are scaled by 1.1
y_val_pred2 = _scale_above(y_val_pred, 5000)
y_test_pred2 = _scale_above(y_test_pred, 5000)

# Variant 3: predictions above 6000 are scaled by 1.1
y_val_pred3 = _scale_above(y_val_pred, 6000)
y_test_pred3 = _scale_above(y_test_pred, 6000)

In [None]:

# =============================================================================
# Feature Importance: Report, Plot, and Export
# =============================================================================
import matplotlib.pyplot as plt

# Importance of every feature of the trained model, computed with the training data
feature_names = X_train.columns.tolist()
feature_importances = best_model.get_feature_importance(
    Pool(X_train, cat_features=categorical_features)
)

# One row per feature, most important first
feature_importances_df = pd.DataFrame(
    {"Feature": feature_names, "Importance": feature_importances}
).sort_values(by="Importance", ascending=False)

print("Feature Importances:")
print(feature_importances_df)

# Horizontal bar chart; the y-axis is inverted so the most important feature is on top
fig, ax = plt.subplots(figsize=(10, 12))
ax.barh(feature_importances_df["Feature"], feature_importances_df["Importance"])
ax.set_xlabel("Feature Importance")
ax.set_ylabel("Feature")
ax.set_title("Feature Importances from CatBoost Model")
ax.invert_yaxis()
fig.tight_layout()
plt.show()

# Save the table next to the notebook
feature_importances_df.to_csv("feature_importances.csv", index=False)
print("Feature importances exported to 'feature_importances.csv'")

In [None]:

# =============================================================================
# Model Evaluation: MAE and Weighted MAE on the Validation Set
# =============================================================================

def _weighted_mae(y_true, y_pred, weights):
    """Weighted mean absolute error: sum(w * |y_true - y_pred|) / sum(w)."""
    return np.sum(weights * np.abs(y_true - y_pred)) / np.sum(weights)


# Every metric below is computed on the VALIDATION set. Each line gets its own
# label so the prediction variants can be told apart in the output.
mae_val = mean_absolute_error(y_val, y_val_pred)
print(f"Validation MAE (original predictions): {mae_val}")

# Weighted MAE of the adjusted predictions (values above 5000 scaled by 1.1)
weighted_mae_val_5000 = _weighted_mae(y_val, y_val_pred2, val_weight)
print(f"Weighted MAE VAL (predictions > 5000 scaled by 1.1): {weighted_mae_val_5000}")

# Weighted MAE of the adjusted predictions (values above 6000 scaled by 1.1)
weighted_mae_val_6000 = _weighted_mae(y_val, y_val_pred3, val_weight)
print(f"Weighted MAE VAL (predictions > 6000 scaled by 1.1): {weighted_mae_val_6000}")

# Weighted MAE of the original predictions; as before, `weighted_mae_val` ends up
# holding this value.
weighted_mae_val = _weighted_mae(y_val, y_val_pred, val_weight)
print(f"Weighted MAE VAL (original predictions): {weighted_mae_val}")

In [None]:
# =============================================================================
# Visualization: Actual vs Predicted Sales (Validation Set)
# =============================================================================

import matplotlib.pyplot as plt


def _plot_actual_vs_predicted(predicted):
    """Draw and show a scatter plot of the validation targets (x) against ``predicted`` (y)."""
    plt.figure(figsize=(10, 6))
    plt.scatter(y_val, predicted, alpha=0.5)
    plt.title("Actual vs Predicted Sales")
    plt.xlabel("Actual Sales")
    plt.ylabel("Predicted Sales")
    plt.grid(True)
    plt.show()


_plot_actual_vs_predicted(y_val_pred)   # original predictions
_plot_actual_vs_predicted(y_val_pred2)  # first adjustment
_plot_actual_vs_predicted(y_val_pred3)  # second adjustment

In [None]:
# =============================================================================
# Visualization: Residual Plots (Validation Set)
# =============================================================================

def _plot_residuals(predicted):
    """Plot the residuals (actual - predicted) against ``predicted`` and return them.

    The x-axis shows the same predictions the residuals were computed from, so
    each adjusted variant is plotted against its own predicted values.
    """
    resid = y_val - predicted
    plt.figure(figsize=(10, 6))
    plt.scatter(predicted, resid, alpha=0.6)
    plt.axhline(y=0, color='red', linestyle='--', linewidth=2)  # zero line for reference
    plt.xlabel("Tahmin Edilen Değerler (Sales)")
    plt.ylabel("Residual (Gerçek - Tahmin)")
    plt.title("Residual Grafiği")
    plt.show()
    return resid


# Original predictions
residuals = _plot_residuals(y_val_pred)

# First adjustment
residuals = _plot_residuals(y_val_pred2)

# Second adjustment; `residuals` keeps this last variant, as before
residuals = _plot_residuals(y_val_pred3)

In [None]:
# =============================================================================
# Post-processing for Test Set Predictions and Submission File Creation
# =============================================================================

# Sales cannot be negative: set negative predictions to 0 (in place, as before)
y_test_pred[y_test_pred < 0] = 0

# Submission id = '<unique_id>_<date>'
sales_test['id'] = sales_test['unique_id'].astype(str) + "_" + sales_test['date'].astype(str)

# First submission: original predictions
sales_test['sales_hat'] = y_test_pred
sales_test[['id', 'sales_hat']].to_csv("submission.csv", index=False)

# Second and third submissions: each file is written from its OWN adjusted
# prediction array, so the three files really contain the three variants.
# Negatives are set to 0 in a copy; y_test_pred2 / y_test_pred3 are not modified.
sales_test[['id']].assign(
    sales_hat=np.where(y_test_pred2 < 0, 0, y_test_pred2)
).to_csv("submission2.csv", index=False)

sales_test[['id']].assign(
    sales_hat=np.where(y_test_pred3 < 0, 0, y_test_pred3)
).to_csv("submission3.csv", index=False)