In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline



In [43]:
df_train = pd.read_csv("/kaggle/input/playground-series-s3e19/train.csv")
df_test = pd.read_csv('/kaggle/input/playground-series-s3e19/test.csv')

In [4]:
df_train.head()

Unnamed: 0,id,date,country,store,product,num_sold
0,0,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Improve Your Coding,63
1,1,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Train More LLMs,66
2,2,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Win Friends and Influence People,9
3,3,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Win More Kaggle Competitions,59
4,4,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Write Better,49


In [44]:
df_train['date'] = pd.to_datetime(df_train['date'])
df_test['date'] = pd.to_datetime(df_test['date'])

In [6]:
def create_datetime_features(df):
    df['day_of_week'] = df['date'].dt.dayofweek
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['quarter'] = df['date'].dt.quarter
    
    return df

In [45]:
df_train = create_datetime_features(df_train)
df_test = create_datetime_features(df_test)

In [46]:
df_train = pd.get_dummies(df_train)
df_test = pd.get_dummies(df_test)

In [9]:
X = df_train.drop(columns=['id', 'date', 'num_sold'], axis=1)
y = np.log(df_train['num_sold'])

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=42)

In [15]:
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb

param_dists = {
    "max_depth": [3, 5, 7],
    "n_estimators": [300, 500, 700],
    "learning_rate": np.exp(np.random.uniform(np.log(1e-4), np.log(1e-1), size=10)),
}

model = xgb.XGBRegressor()

random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dists,
    n_iter=10,
    cv=5,
    random_state=42
)

random_search.fit(X_train, y_train)

In [17]:
print("Best Hyperparameters: ", random_search.best_params_)
print("Best Score: ", random_search.best_score_)

Best Hyperparameters:  {'n_estimators': 700, 'max_depth': 7, 'learning_rate': 0.030995466311068174}
Best Score:  0.9954489022876383


In [25]:
best_model = random_search.best_estimator_

best_model.fit(X_train, y_train)

NameError: name 'train' is not defined

In [27]:
def calculate_smape(y_t, y_p):
    y_true_exp = np.exp(y_t)
    y_pred_exp = np.exp(y_p)
    
    numerator = 2 * np.abs(y_pred_exp - y_true_exp)
    denominator = np.abs(y_true_exp) + np.abs(y_pred_exp)

    return 100 / len(y_true_exp) * np.sum(numerator / denominator)

In [28]:
y_pred_train = best_model.predict(X_train)

smape_train = calculate_smape(y_train, y_pred_train)
smape_train

5.3650360326307345

In [29]:
y_pred_val = best_model.predict(X_valid)

smape_val = calculate_smape(y_valid, y_pred_val)
smape_val

5.574276517462911

In [48]:
X_test = df_test.drop(columns=['id', 'date'], axis=1)

In [49]:
X_test.head()

Unnamed: 0,day_of_week,year,month,quarter,country_Argentina,country_Canada,country_Estonia,country_Japan,country_Spain,store_Kagglazon,store_Kaggle Learn,store_Kaggle Store,product_Using LLMs to Improve Your Coding,product_Using LLMs to Train More LLMs,product_Using LLMs to Win Friends and Influence People,product_Using LLMs to Win More Kaggle Competitions,product_Using LLMs to Write Better
0,5,2022,1,1,1,0,0,0,0,0,1,0,1,0,0,0,0
1,5,2022,1,1,1,0,0,0,0,0,1,0,0,1,0,0,0
2,5,2022,1,1,1,0,0,0,0,0,1,0,0,0,1,0,0
3,5,2022,1,1,1,0,0,0,0,0,1,0,0,0,0,1,0
4,5,2022,1,1,1,0,0,0,0,0,1,0,0,0,0,0,1


In [50]:
y_pred_test = np.exp(best_model.predict(X_test))

In [54]:
idxs = df_test['id']

submission = pd.DataFrame(
    {
        "id": idxs,
        "num_sold": y_pred_test.astype(int)
    }
)

submission.to_csv("submission_2.csv", index=False)