In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import datetime

In [2]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [122]:
df = train_df.copy()
test = test_df.copy()

In [123]:
# Date to datetime
df['date'] = pd.to_datetime(df['date'])
test['date'] = pd.to_datetime(test['date'])

In [124]:
df = pd.concat([df,pd.get_dummies(df[['country','store','product']])],axis=1)
test = pd.concat([test,pd.get_dummies(test[['country','store','product']])],axis=1)

In [125]:
def add_date_features(data):
    data = data.copy()
    data['month'] = data['date'].dt.month
    data['day'] = data['date'].dt.day
    data['year'] = data['date'].dt.year
    data['dayofweek'] = data['date'].dt.dayofweek
    data['quarter'] = data['date'].dt.quarter
    data['dayofmonth'] = data['date'].dt.day
    data['weekofyear'] = data['date'].dt.weekofyear
    data['is_month_start'] = data.date.dt.is_month_start.astype(np.int8)
    data['is_month_end'] = data.date.dt.is_month_end.astype(np.int8)
    data['monday'] = data.date.dt.weekday.eq(0).astype(np.uint8)
    data['tuesday'] = data.date.dt.weekday.eq(1).astype(np.uint8)
    data['wednesday'] = data.date.dt.weekday.eq(2).astype(np.uint8)
    data['thursday'] = data.date.dt.weekday.eq(3).astype(np.uint8)
    data['friday'] = data.date.dt.weekday.eq(4).astype(np.uint8)
    data['saturday'] = data.date.dt.weekday.eq(5).astype(np.uint8)
    data['sunday'] = data.date.dt.weekday.eq(6).astype(np.uint8)
    data['is_quarter_end'] = data['date'].dt.is_quarter_end
    data['is_quarter_start'] = data['date'].dt.is_quarter_start
    data['is_year_end'] = data['date'].dt.is_year_end
    data['is_year_start'] = data['date'].dt.is_year_start
    
    return data

df = add_date_features(df)
test = add_date_features(test)

  if __name__ == "__main__":


numeric_features = list(df.select_dtypes(include=['int64','uint8']).columns) + ['date']
categorical_features = list(df.select_dtypes(['object','bool']).columns)

from sklearn.preprocessing import LabelEncoder

ohe = LabelEncoder()

for i in range(len(categorical_features)):
    ohe.fit(df[categorical_features[i]])
    df.loc[:,categorical_features[i]] = ohe.transform(df[categorical_features[i]])
    test.loc[:,categorical_features[i]] = ohe.transform(test[categorical_features[i]])

In [127]:
def math_features(data):
    data = data.copy()
    data['month_sin'] = np.sin(2*np.pi*data.month/12)
    data['month_cos'] = np.cos(2*np.pi*data.month/12)
    data['day_sin'] = np.sin(2*np.pi*data.day/24)
    data['day_cos'] = np.cos(2*np.pi*data.day/24)
    return data

df = math_features(df)
test = math_features(test)

In [128]:
df.head(2)

Unnamed: 0,id,date,country,store,product,num_sold,country_Argentina,country_Canada,country_Estonia,country_Japan,...,saturday,sunday,is_quarter_end,is_quarter_start,is_year_end,is_year_start,month_sin,month_cos,day_sin,day_cos
0,0,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Improve Your Coding,63,1,0,0,0,...,0,1,False,True,False,True,0.5,0.866025,0.258819,0.965926
1,1,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Train More LLMs,66,1,0,0,0,...,0,1,False,True,False,True,0.5,0.866025,0.258819,0.965926


In [129]:
from sklearn.preprocessing import SplineTransformer

In [130]:
def periodic_spline_transformer(period, n_splines=None, degree=3):
    """
    Kaynak: https://scikit-learn.org/stable/auto_examples/applications/plot_cyclical_feature_engineering.html
    """
    
    if n_splines is None:
        n_splines = period
    n_knots = n_splines + 1  # periodic and include_bias is True
    return SplineTransformer(
        degree=degree,
        n_knots=n_knots,
        knots=np.linspace(0, period, n_knots).reshape(n_knots, 1),
        extrapolation="periodic",
        include_bias=True)

In [131]:
def seasonality_spline_features(hours=np.arange(1,32)):
    hour_df = pd.DataFrame(np.linspace(1, 32, 32).reshape(-1, 1),columns=["day"])
    splines = periodic_spline_transformer(32, n_splines=4).fit_transform(hour_df)
    splines_df = pd.DataFrame(splines,columns=[f"spline_{i}" for i in range(splines.shape[1])])
    splines_df =pd.concat([pd.Series(hours,name='day'), splines_df], axis="columns")
    
    return splines_df

In [132]:
splines_df = seasonality_spline_features()

In [133]:
df["type"]="train"
test["type"]="test"

df = pd.concat([df,test])

In [134]:
df = df.merge(splines_df,on='day',how='left')

train = df[df["type"]=="train"].copy()
test = df[df["type"]=="test"].copy()
train.drop(columns=["type"],inplace=True)
test.drop(columns=["type"],inplace=True)

In [135]:
col=['id','country','date','store','product','num_sold']
X=train.drop(columns=col)
y=train['num_sold']

In [136]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.utils import shuffle

tscv = TimeSeriesSplit(n_splits=4)
scores = []
test_preds_res=[]
test_preds = np.zeros(len(test))
for fold,(train_index, test_index) in enumerate(tscv.split(X,y)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    print(X_train.shape)

(27390, 41)
(54780, 41)
(82170, 41)
(109560, 41)


In [137]:
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import QuantileTransformer, PolynomialFeatures

In [147]:
# Pipeline
from sklearn.pipeline import Pipeline
from catboost import CatBoostRegressor
# metrics
from sklearn.metrics import mean_squared_error
params={'n_estimators': 195, 'learning_rate': 0.07725732658711602, 'depth': 7,
        'l2_leaf_reg': 8.601133541582584, 'subsample': 0.4279526734063217, 'colsample_bylevel': 0.6767696482697301,
       "random_state":42}
model = CatBoostRegressor(**params, verbose=0)
# model = GaussianNB()

pre = QuantileTransformer(output_distribution='normal',n_quantiles=2000)
pol = PolynomialFeatures()
steps = [] # Create a list of tuples with the steps
# steps.append(('pol',pol))

steps.append(('model',model))
pipeline = Pipeline(steps=steps)

In [139]:
test = test.drop(col,axis=1)

In [148]:
import random

n_reapts = 3
n_splits = 5
# Fix seed
random.seed(42)
random_state_list = random.sample(range(9999), n_reapts)

from sklearn.metrics import make_scorer

# SMAPE metric
def smape(y_true, y_pred):
    return 1 / len(y_true) * np.sum(2 * np.abs(y_pred - y_true) / (np.abs(y_pred) + np.abs(y_true)) * 100)

smape_scorer = make_scorer(smape, greater_is_better=False)

In [149]:
from catboost import CatBoostRegressor
from sklearn.linear_model import LinearRegression, Ridge
from xgboost import XGBRegressor
import xgboost as xgb
from sklearn import neighbors
from sklearn.ensemble import RandomForestRegressor

import time
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [150]:
# TimeSeriesSplit using cross-validation

scores_smape = []
scores_mse = []
test_preds = np.zeros(len(test))

for state, random_state in enumerate(random_state_list):
    tscv = TimeSeriesSplit(n_splits=n_splits,test_size=27375,max_train_size=82200, gap=0)
    for fold, (train_index, val_index) in enumerate(tscv.split(X, y)):
#     for fold,(train_index, test_index) in enumerate(tscv.split(X,y)):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Pipeline fit
        X_train,y_train = shuffle(X_train,y_train, random_state=random_state)
        X_test,y_test = shuffle(X_test,y_test, random_state=random_state)
        pipeline.fit(X_train, y_train)

        y_pred = pipeline.predict(X_test)

        # SMAPE score
        score_smape = smape(y_test, y_pred)
        score_mse = mean_squared_error(y_test, y_pred)
        scores_smape.append(score_smape)
        scores_mse.append(score_mse)
        print(f"State {state+1} Fold {fold+1}: SMAPE = {score_smape}")
        print(f"State {state+1} Fold {fold+1}: MSE = {score_mse}")
    
        fold_preds = pipeline.predict(test)
    
        test_preds += fold_preds
    # Mean SMAPE score
    mean_smape_score = np.mean(scores_smape)
    mean_mse_score = np.mean(scores_mse)
    print("SMAPE =", mean_smape_score)
    print("MSE =", mean_mse_score)

State 1 Fold 1: SMAPE = 49.668304070873155
State 1 Fold 1: MSE = 18364.561162371174
State 1 Fold 2: SMAPE = 18.402655719559643
State 1 Fold 2: MSE = 1900.920819453771
State 1 Fold 3: SMAPE = 10.760621786804116
State 1 Fold 3: MSE = 1020.1256490405457
State 1 Fold 4: SMAPE = 11.29648826892228
State 1 Fold 4: MSE = 1145.3446958215625
State 1 Fold 5: SMAPE = 21.453194974732522
State 1 Fold 5: MSE = 2787.703871202618
SMAPE = 22.31625296417834
MSE = 5043.7312395779345
State 2 Fold 1: SMAPE = 49.91884738230083
State 2 Fold 1: MSE = 18398.12585981271
State 2 Fold 2: SMAPE = 17.831363472425735
State 2 Fold 2: MSE = 1885.5185671090546
State 2 Fold 3: SMAPE = 10.742462632782754
State 2 Fold 3: MSE = 1021.8427614406595
State 2 Fold 4: SMAPE = 11.49438170446243
State 2 Fold 4: MSE = 1142.0335572695187
State 2 Fold 5: SMAPE = 21.480016381736807
State 2 Fold 5: MSE = 2787.6347958605534
SMAPE = 22.304833639460025
MSE = 5045.381173938216
State 3 Fold 1: SMAPE = 49.91522196883682
State 3 Fold 1: MSE = 

In [151]:
test_preds/12

array([ 62.12602359,  64.25752772,  12.00346325, ..., 150.06719824,
       795.9684231 , 649.37743967])

In [152]:
submission = pd.DataFrame(test_df['id'].copy())
y_pred=test_preds/12
y_pred = y_pred.astype(int)
y_pred = np.where(y_pred < 0, 1, y_pred)
submission["num_sold"] = y_pred

submission.to_csv("submission_CatBoost_11.csv",index=False)