In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e1/sample_submission.csv
/kaggle/input/playground-series-s5e1/train.csv
/kaggle/input/playground-series-s5e1/test.csv


In [2]:
import warnings

# Suppress all deprecation warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)

# Suppress specific user warnings
warnings.filterwarnings('ignore', category=UserWarning, message=".*gpu_hist.*")


In [3]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_percentage_error
import optuna
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import KFold

# Load the dataset
train_df = pd.read_csv('/kaggle/input/playground-series-s5e1/train.csv')
test_df = pd.read_csv('/kaggle/input/playground-series-s5e1/test.csv')

# Inspect the shape and first few rows
print(train_df.shape)
print(test_df.shape)
train_df.head()

# Handle missing values
train_df = train_df.dropna(subset=['num_sold'])

# Convert date to datetime format
train_df['date'] = pd.to_datetime(train_df['date'])
test_df['date'] = pd.to_datetime(test_df['date'])

# Extract year, month, and day as new features
train_df['Year'] = train_df['date'].dt.year
train_df['Month'] = train_df['date'].dt.month
train_df['Day'] = train_df['date'].dt.day
test_df['Year'] = test_df['date'].dt.year
test_df['Month'] = test_df['date'].dt.month
test_df['Day'] = test_df['date'].dt.day

# Drop the date column
train_df.drop('date', axis=1, inplace=True)
test_df.drop('date', axis=1, inplace=True)

# Apply log transformation to 'num_sold'
train_df['num_sold'] = np.log1p(train_df['num_sold'])

# Label encode categorical columns
cat_cols = ['country', 'store', 'product']
label_encoders = {col: LabelEncoder() for col in cat_cols}
for col in cat_cols:
    train_df[col] = label_encoders[col].fit_transform(train_df[col])
    test_df[col] = label_encoders[col].transform(test_df[col])

# Define features and target
X = train_df.drop(['num_sold', 'id'], axis=1)  # Drop 'num_sold' and 'id' columns
y = train_df['num_sold']
X_test = test_df.drop(['id'], axis=1)  # Drop 'id' column for prediction

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Define MAPE metric
def mape(y_true, y_pred):
    return mean_absolute_percentage_error(y_true, y_pred)

# Hyperparameter optimization for XGBoost
def objective_xgb(trial):
    param = {
        'tree_method': 'hist',  # Change to 'hist' instead of 'gpu_hist'
        'device': 'cuda',  # Specify CUDA device for GPU
        'lambda': trial.suggest_float('lambda', 1e-8, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 10.0, log=True),
        'eta': trial.suggest_categorical('eta', [0.3, 0.4, 0.5, 0.6, 0.7]),
        'gamma': trial.suggest_int('gamma', 0, 10),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.01, 0.02, 0.05]),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0)
    }
    
    model = XGBRegressor(**param)
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=25, verbose=False)
    y_pred = model.predict(X_val)
    score = mape(y_val, y_pred)
    return score

# Optimize XGBoost using Optuna
study_xgb = optuna.create_study(direction='minimize')
study_xgb.optimize(objective_xgb, n_trials=100)

# Extract best parameters and train XGBoost model
best_xgb_params = study_xgb.best_trial.params
xgb_model = XGBRegressor(**best_xgb_params)
xgb_model.fit(X_train, y_train)
xgb_preds = xgb_model.predict(X_test)




# Create a submission DataFrame
submission_xgb = pd.DataFrame({'id': test_df['id'], 'num_sold': np.expm1(xgb_preds)})


# Save submissions to CSV
submission_xgb.to_csv('submission_xgb_original.csv', index=False)


print("XGBoost Submission Head:")
print(submission_xgb.head())




(230130, 6)
(98550, 5)


[I 2025-01-03 13:05:28,818] A new study created in memory with name: no-name-e5754b25-7fdb-46cc-9076-0e0e08ecdc81
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


[I 2025-01-03 13:05:29,906] Trial 0 finished with value: 0.047295507436236686 and parameters: {'lambda': 2.143744639467413e-08, 'alpha': 0.12463863463883076, 'eta': 0.3, 'gamma': 8, 'learning_rate': 0.02, 'colsample_bytree': 0.7750850377920581, 'n_estimators': 145, 'max_depth': 6, 'subsample': 0.7758327917462845}. Best is trial 0 with value: 0.047295507436236686.
[I 2025-01-03 13:05:31,957] Trial 1 finished with value: 0.018377388314437654 and parameters: {'lambda': 0.15818185844736285, 'alpha': 3.2460657286913906e-05, 'eta': 0.7, 'gamma': 8, 'learning_rate': 0.01, 'colsample_bytree': 0.8420252862045654, 'n_estimators': 1000, 'max_depth': 5, 'subsample': 0.7736761463204659}. Best is trial 1 with value: 0.01837738831443765

XGBoost Submission Head:
       id    num_sold
0  230130  133.447723
1  230131  696.953491
2  230132  646.496582
3  230133  321.949158
4  230134  398.780548


In [4]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_percentage_error
import optuna
from xgboost import XGBRegressor

# Load the dataset
train_df = pd.read_csv('/kaggle/input/playground-series-s5e1/train.csv')
test_df = pd.read_csv('/kaggle/input/playground-series-s5e1/test.csv')

# Handle missing values
train_df = train_df.dropna(subset=['num_sold'])

# Convert date to datetime format
train_df['date'] = pd.to_datetime(train_df['date'])
test_df['date'] = pd.to_datetime(test_df['date'])

# Feature Engineering: Extract year, month, and day
train_df['Year'] = train_df['date'].dt.year
train_df['Month'] = train_df['date'].dt.month
train_df['Day'] = train_df['date'].dt.day
test_df['Year'] = test_df['date'].dt.year
test_df['Month'] = test_df['date'].dt.month
test_df['Day'] = test_df['date'].dt.day

# Feature Engineering: Convert 'year_month' to numeric format
train_df['year_month'] = train_df['Year'] * 12 + train_df['Month']
test_df['year_month'] = test_df['Year'] * 12 + test_df['Month']

# Apply log transformation to 'num_sold'
train_df['num_sold'] = np.log1p(train_df['num_sold'])

# Label encode categorical columns
cat_cols = ['country', 'store', 'product']
label_encoders = {col: LabelEncoder() for col in cat_cols}
for col in cat_cols:
    train_df[col] = label_encoders[col].fit_transform(train_df[col])
    test_df[col] = label_encoders[col].transform(test_df[col])

# Define features and target
X = train_df.drop(['num_sold', 'id', 'date'], axis=1)  # Drop 'num_sold' and 'id' columns
y = train_df['num_sold']
X_test = test_df.drop(['id', 'date'], axis=1)  # Drop 'id' and 'date' columns for prediction

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Define MAPE metric
def mape(y_true, y_pred):
    return mean_absolute_percentage_error(y_true, y_pred)

# Hyperparameter optimization for XGBoost
def objective_xgb(trial):
    param = {
        'tree_method': 'hist',  # Change to 'hist' instead of 'gpu_hist'
        'device': 'cuda',  # Specify CUDA device for GPU
        'lambda': trial.suggest_float('lambda', 1e-8, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 10.0, log=True),
        'eta': trial.suggest_categorical('eta', [0.3, 0.4, 0.5, 0.6, 0.7]),
        'gamma': trial.suggest_int('gamma', 0, 10),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.01, 0.02, 0.05]),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 500, 2000),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0)
    }
    
    model = XGBRegressor(**param)
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=15, verbose=False)
    y_pred = model.predict(X_val)
    score = mape(y_val, y_pred)
    return score

# Optimize XGBoost using Optuna
study_xgb = optuna.create_study(direction='minimize')
study_xgb.optimize(objective_xgb, n_trials=100)  # Increased trials

# Extract best parameters and train XGBoost model
best_xgb_params = study_xgb.best_trial.params
xgb_model = XGBRegressor(**best_xgb_params)
xgb_model.fit(X_train, y_train)
xgb_preds = xgb_model.predict(X_test)

# Create a submission DataFrame
submission_xgb = pd.DataFrame({'id': test_df['id'], 'num_sold': np.expm1(xgb_preds)})

# Save submissions to CSV
submission_xgb.to_csv('submission_xgb.csv', index=False)

print("XGBoost Submission Head:")
print(submission_xgb.head())


[I 2025-01-03 13:12:10,532] A new study created in memory with name: no-name-51363fed-3bb5-4f33-b89a-3a5f3715e788
[I 2025-01-03 13:12:11,798] Trial 0 finished with value: 0.018713569655086273 and parameters: {'lambda': 4.25083917805299e-05, 'alpha': 1.2434747212861483, 'eta': 0.5, 'gamma': 9, 'learning_rate': 0.02, 'colsample_bytree': 0.9390590113926771, 'n_estimators': 943, 'max_depth': 9, 'subsample': 0.5778810930142289}. Best is trial 0 with value: 0.018713569655086273.
[I 2025-01-03 13:12:12,669] Trial 1 finished with value: 0.015416844312219036 and parameters: {'lambda': 0.00013450893642252061, 'alpha': 0.0007665393140795893, 'eta': 0.3, 'gamma': 1, 'learning_rate': 0.05, 'colsample_bytree': 0.562945468961384, 'n_estimators': 1146, 'max_depth': 6, 'subsample': 0.7979960887141613}. Best is trial 1 with value: 0.015416844312219036.
[I 2025-01-03 13:12:14,905] Trial 2 finished with value: 0.01830404012566137 and parameters: {'lambda': 5.19847334569893, 'alpha': 0.030199695391512633, 

XGBoost Submission Head:
       id    num_sold
0  230130  129.644135
1  230131  654.784790
2  230132  536.374390
3  230133  328.527985
4  230134  354.298126
