In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.impute import SimpleImputer
from matplotlib import pyplot as plt
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error, mean_absolute_error
# calculate the Pearson's correlation between two variables
from numpy.random import randn
from numpy.random import seed
from scipy.stats import pearsonr

import xgboost as xgb

import tensorflow as tf
from tensorflow import keras

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, Normalizer


# Seed NumPy's global random generator for the rest of the notebook.
np.random.seed(33)
try:
    import seaborn as sns
except ImportError:
    # pip has no supported in-process API (`pip._internal` is private), so
    # install by running `python -m pip` with the interpreter of this kernel.
    import importlib
    import subprocess
    import sys
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--user', 'seaborn'])
    # Make the import system notice the freshly installed package.
    importlib.invalidate_caches()
    import seaborn as sns
# Allow up to 1000 rows / columns before pandas truncates a displayed frame.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)

Goal: train a model to predict book sales.

In [None]:
# Folder that holds the competition CSV files (a Colab Drive path).
DATA_DIR = r'/content/drive/MyDrive/Colab Notebooks/datafiles/tabular-playground-series-sep-2022'

# Full paths of the three CSV files read by the loading cell.
data = f'{DATA_DIR}/train.csv'
testdata = f'{DATA_DIR}/test.csv'
samplesub = f'{DATA_DIR}/sample_submission.csv'

In [None]:
# Read the train, test and sample-submission CSV files into DataFrames.
df, test_df, sample_df = (
    pd.read_csv(csv_path) for csv_path in (data, testdata, samplesub)
)

In [None]:
# Display the training frame as loaded from train.csv.
df

Unnamed: 0,row_id,date,country,store,product,num_sold
0,0,2017-01-01,Belgium,KaggleMart,Kaggle Advanced Techniques,663
1,1,2017-01-01,Belgium,KaggleMart,Kaggle Getting Started,615
2,2,2017-01-01,Belgium,KaggleMart,Kaggle Recipe Book,480
3,3,2017-01-01,Belgium,KaggleMart,Kaggle for Kids: One Smart Goose,710
4,4,2017-01-01,Belgium,KaggleRama,Kaggle Advanced Techniques,240
...,...,...,...,...,...,...
70123,70123,2020-12-31,Spain,KaggleMart,Kaggle for Kids: One Smart Goose,614
70124,70124,2020-12-31,Spain,KaggleRama,Kaggle Advanced Techniques,215
70125,70125,2020-12-31,Spain,KaggleRama,Kaggle Getting Started,158
70126,70126,2020-12-31,Spain,KaggleRama,Kaggle Recipe Book,135


In [None]:
# num_sold of each store, kept in file order so the two series pair up row by row.
mart_sales = df.loc[df['store'] == 'KaggleMart', 'num_sold']
rama_sales = df.loc[df['store'] == 'KaggleRama', 'num_sold']

# Pearson correlation coefficient between the two stores' sales.
store_corr = pearsonr(mart_sales, rama_sales)[0]
print(f'Store correlation: {store_corr:.4f}')

# Ratio of the stores' total sales.
mult_factor = mart_sales.sum() / rama_sales.sum()
print(f'Multiplicative factor: {mult_factor:.4f}')

Store correlation: 0.9812
Multiplicative factor: 2.8837


In [None]:
def date_to_ordinal(date :str):
  """Parse a 'YYYY-MM-DD' string and return its proleptic Gregorian ordinal."""
  parsed = datetime.strptime(date, '%Y-%m-%d')
  return parsed.date().toordinal()

def standardize_features( dataframe: pd.DataFrame, feature_names: list, operation="standardize"):
  """Rescale the listed numeric columns and return the resulting DataFrame.

  Each column is transformed on its own, removed from its original position
  and re-attached as the last column of the frame.

  Args:
    dataframe: Source frame; it is not modified.
    feature_names: Names of the columns to rescale.
    operation: "normalize" selects `Normalizer`; any other value selects
      `StandardScaler`.

  Returns:
    A DataFrame in which every successfully transformed column has been
    replaced. Columns that are missing or cannot be transformed are left
    unchanged and reported with `warnings.warn`.
  """
  import warnings

  if operation == "normalize":
    # NOTE(review): sklearn's Normalizer scales each *sample* (row) to unit
    # norm; applied to the single-column arrays built below that is probably
    # not the intended per-feature scaling -- confirm before relying on it.
    operator = Normalizer()
  else:
    operator = StandardScaler()
  df = dataframe
  for column_name in feature_names:
    try:
      raw_data = np.array(df[column_name]).reshape(-1, 1)
      transformed = operator.fit_transform(raw_data)
      # Assign positionally instead of join()-ing a frame with a fresh
      # 0..n-1 index: join aligns on index labels and would misplace values
      # (or produce NaN) for frames with any other index.
      rescaled = df.drop(column_name, axis=1)
      rescaled[column_name] = transformed.ravel()
      df = rescaled
    except Exception as error:
      # Best effort is kept (the column stays as it was) but no longer silent.
      warnings.warn(
          f"standardize_features: column {column_name!r} left unchanged ({error!r})")
  return df

def kaggle_submission(trained_model, test_data, savename="my_predictions", neural=False,
                      row_ids=None, saving_folder=None):
  """Predict on `test_data` and write a two-column submission CSV.

  The file `<saving_folder>/<savename>.csv` gets the columns "row_id" and
  "num_sold", with predictions rounded to integers.

  Args:
    trained_model: Object exposing `predict(test_data)`.
    test_data: Features passed to `trained_model.predict` unchanged.
    savename: File name of the submission, without the ".csv" suffix.
    neural: Set to True when `predict` yields one single-element row per
      sample; each row is unwrapped before rounding.
    row_ids: Optional ids for the "row_id" column, one per prediction. When
      omitted, consecutive ids starting at 70128 are used.
    saving_folder: Output directory; defaults to the Drive predictions folder.

  Raises:
    ValueError: If `row_ids` is given and its length differs from the number
      of predictions.
  """
  SAVING_FOLDER = r'/content/drive/MyDrive/Colab Notebooks/predictions'
  FIRST_TEST_ROW_ID = 70128
  if saving_folder is None:
    saving_folder = SAVING_FOLDER

  predictions = trained_model.predict(test_data)
  if neural:
    predictions = [pred for [pred] in predictions]
  predictions = [round(num) for num in predictions]

  if row_ids is None:
    # One id per prediction, so no row is dropped when the input size differs
    # from the previously hard-coded 70128..87647 range.
    row_ids = range(FIRST_TEST_ROW_ID, FIRST_TEST_ROW_ID + len(predictions))
  else:
    row_ids = list(row_ids)
    if len(row_ids) != len(predictions):
      raise ValueError(
          f"got {len(row_ids)} row ids for {len(predictions)} predictions")

  sub_df = pd.DataFrame({"row_id": list(row_ids), "num_sold": predictions})
  # newline='' lets the csv writer control line endings itself (avoids blank
  # lines between rows on platforms that translate "\n").
  with open(f"{saving_folder}/{savename}.csv", 'w', encoding = 'utf-8-sig', newline='') as f:
    sub_df.to_csv(f, index=False)

def normalize_date(date:int):
  """Subtract 736330 from `date` and divide the result by 100."""
  shifted = date - 736330
  return shifted / 100

FEATURE ENGINEERING FROM THIS POST

https://www.kaggle.com/code/samuelcortinhas/tps-sept-22-timeseries-analysis#4.-Seasonality


In [None]:
# Copy of the training frame whose `date` column is parsed to datetime;
# `df` itself keeps the raw strings.
dff = df.assign(date=pd.to_datetime(df["date"]))

def get_date_features(df):
    """Add calendar feature columns derived from `df['date']` and return `df`.

    `df['date']` must be a datetime column. The frame is modified in place and
    also returned, so `frame = get_date_features(frame)` works.

    Columns added: day_of_week (0-6), day_of_month, day_of_year, week (ISO
    week number), month, year, pandemic_year (1 when the year is 2020, else 0)
    and is_weekend (1 when day_of_week is 5 or 6, else 0).
    """
    dates = df['date'].dt
    df['day_of_week'] = dates.dayofweek
    df['day_of_month'] = dates.day
    df['day_of_year'] = dates.dayofyear
    # isocalendar() yields a nullable unsigned dtype; cast to a plain integer.
    df['week'] = dates.isocalendar().week.astype('int')
    df['month'] = dates.month
    df['year'] = dates.year
    df['pandemic_year'] = (df['year'] == 2020).astype('int64')
    # Written to the frame passed in (previously this went to the global `dff`,
    # so other frames never received the column) and kept numeric rather than
    # object so it can be used directly as a model input.
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype('int64')
    return df



In [None]:
def get_fourier_features(df):  # from other tutorial
    """Return a copy of `df` with per-product Fourier seasonality columns.

    `df` needs a datetime `date` column and a `product` column. For each
    k in (1, 2, 4) and each distinct product, the columns `sin_{k}_{product}`
    and `cos_{k}_{product}` hold the sine / cosine of a two-year cycle on the
    rows of that product and 0 elsewhere.

    The input frame is left untouched (earlier versions leaked the k=1
    columns into it while returning a different object).
    """
    # Position within a two-year window: even years are shifted by 365 days.
    dayofbiyear = df['date'].dt.dayofyear + 365*(1-(df['date'].dt.year%2))

    result = df.copy()
    products = df['product'].unique()

    # k=1 -> 2 years, k=2 -> 1 year, k=4 -> 6 months
    for k in [1, 2, 4]:
        angle = 2 * np.pi * k * dayofbiyear / (2* 365)
        sin_k = np.sin(angle)
        cos_k = np.cos(angle)

        # Different products have different seasonality patterns
        for product in products:
            is_product = df['product'] == product
            result[f'sin_{k}_{product}'] = sin_k * is_product
            result[f'cos_{k}_{product}'] = cos_k * is_product

    return result

def fourier_features( df):
  """Add `month_cos` and `month_sin` (month mapped onto a 12-step circle) to `df` in place and return it."""
  month_angle = df['date'].dt.month * (2 * np.pi / 12)
  for column, wave in (("month_cos", np.cos), ("month_sin", np.sin)):
    df[column] = wave(month_angle)
  return df




In [None]:
# Calendar features first, then the cyclical month encoding.
dff = fourier_features(get_date_features(dff))
dff.head()

Unnamed: 0,row_id,date,country,store,product,num_sold,day_of_week,day_of_month,day_of_year,week,month,year,pandemic_year,is_weekend,month_cos,month_sin
0,0,2017-01-01,Belgium,KaggleMart,Kaggle Advanced Techniques,663,6,1,1,52,1,2017,0,1,0.866025,0.5
1,1,2017-01-01,Belgium,KaggleMart,Kaggle Getting Started,615,6,1,1,52,1,2017,0,1,0.866025,0.5
2,2,2017-01-01,Belgium,KaggleMart,Kaggle Recipe Book,480,6,1,1,52,1,2017,0,1,0.866025,0.5
3,3,2017-01-01,Belgium,KaggleMart,Kaggle for Kids: One Smart Goose,710,6,1,1,52,1,2017,0,1,0.866025,0.5
4,4,2017-01-01,Belgium,KaggleRama,Kaggle Advanced Techniques,240,6,1,1,52,1,2017,0,1,0.866025,0.5


END OF FEATURE ENGINEERING FROM POST

In [None]:
# Stack train and test rows so both go through the same encoding, and parse
# the date strings to datetimes.
temp = pd.concat([df, test_df]).assign(
    date=lambda frame: pd.to_datetime(frame["date"])
)

In [None]:
# One-hot encode the categorical columns; the first level of each is dropped
# and the dummy columns are named after the bare category value (no prefix).
cat_columns = ["country", "store", "product"]
encoded = pd.get_dummies(
    temp,
    columns=cat_columns,
    prefix='',
    prefix_sep='',
    drop_first=True,
)

In [None]:
# APPLYING THE ENGINEERED FEATURES

encoded = fourier_features(get_date_features(encoded))

# The raw timestamp is dropped once the features have been derived from it.
encoded = encoded.drop(columns=["date"])
# Replace the calendar year by its category code (0 for the earliest year).
encoded["year"] = encoded["year"].astype("category").cat.codes.astype("category")

In [None]:
# Independent copy of the encoded frame used for the remaining preprocessing.
preprocessed = encoded.copy()
# Disabled: the `date` column was already dropped from `encoded` above.
#preprocessed.date = preprocessed.date.apply(normalize_date)

In [None]:
# Order rows by row_id so the positional train/test split below is valid.
preprocessed = preprocessed.sort_values("row_id")
preprocessed.head()

Unnamed: 0,row_id,num_sold,France,Germany,Italy,Poland,Spain,KaggleRama,Kaggle Getting Started,Kaggle Recipe Book,Kaggle for Kids: One Smart Goose,day_of_week,day_of_month,day_of_year,week,month,year,pandemic_year,month_cos,month_sin
0,0,663.0,0,0,0,0,0,0,0,0,0,6,1,1,52,1,0,0,0.866025,0.5
1,1,615.0,0,0,0,0,0,0,1,0,0,6,1,1,52,1,0,0,0.866025,0.5
2,2,480.0,0,0,0,0,0,0,0,1,0,6,1,1,52,1,0,0,0.866025,0.5
3,3,710.0,0,0,0,0,0,0,0,0,1,6,1,1,52,1,0,0,0.866025,0.5
4,4,240.0,0,0,0,0,0,1,0,0,0,6,1,1,52,1,0,0,0.866025,0.5


In [None]:
# Positional split at row 70128: the first part is used for training, the
# remainder as the competition test set.
train_encoded, test_encoded = preprocessed.iloc[:70128], preprocessed.iloc[70128:]

In [None]:
# Features: everything except the row identifier and the target.
x = train_encoded.drop(columns=["row_id", "num_sold"])
# Target: number of units sold.
y = train_encoded["num_sold"]
x.head()

Unnamed: 0,France,Germany,Italy,Poland,Spain,KaggleRama,Kaggle Getting Started,Kaggle Recipe Book,Kaggle for Kids: One Smart Goose,day_of_week,day_of_month,day_of_year,week,month,year,pandemic_year,month_cos,month_sin
0,0,0,0,0,0,0,0,0,0,6,1,1,52,1,0,0,0.866025,0.5
1,0,0,0,0,0,0,1,0,0,6,1,1,52,1,0,0,0.866025,0.5
2,0,0,0,0,0,0,0,1,0,6,1,1,52,1,0,0,0.866025,0.5
3,0,0,0,0,0,0,0,0,1,6,1,1,52,1,0,0,0.866025,0.5
4,0,0,0,0,0,1,0,0,0,6,1,1,52,1,0,0,0.866025,0.5


In [None]:
# SHUFFLING
#x = x.sample(frac=1)
#x.head()

In [None]:
# Keep only the feature columns of the test rows.
# Note: re-running this cell fails because the columns are already gone.
test_encoded = test_encoded.drop(columns=["row_id", "num_sold"])
test_encoded.head()

Unnamed: 0,France,Germany,Italy,Poland,Spain,KaggleRama,Kaggle Getting Started,Kaggle Recipe Book,Kaggle for Kids: One Smart Goose,day_of_week,day_of_month,day_of_year,week,month,year,pandemic_year,month_cos,month_sin
0,0,0,0,0,0,0,0,0,0,4,1,1,53,1,4,0,0.866025,0.5
1,0,0,0,0,0,0,1,0,0,4,1,1,53,1,4,0,0.866025,0.5
2,0,0,0,0,0,0,0,1,0,4,1,1,53,1,4,0,0.866025,0.5
3,0,0,0,0,0,0,0,0,1,4,1,1,53,1,4,0,0.866025,0.5
4,0,0,0,0,0,1,0,0,0,4,1,1,53,1,4,0,0.866025,0.5


In [None]:
# Number of rows of the training portion used for fitting the network; the
# rows after this position form the validation set.
SPLIT_CUT = 50000

# Hold out 10% of the labelled rows as a test set.
x_train_full, x_test, y_train_full, y_test = train_test_split(x, y, test_size=0.1)
x_train, x_valid = x_train_full.iloc[:SPLIT_CUT], x_train_full.iloc[SPLIT_CUT:]
y_train, y_valid = y_train_full.iloc[:SPLIT_CUT], y_train_full.iloc[SPLIT_CUT:]

In [None]:
# Gradient-boosted trees fitted on the full training portion.
# NOTE(review): learning_rate=1 is unusually high -- confirm it is intended.
xgbr = xgb.XGBRegressor(learning_rate=1, n_estimators=200)
train_x, train_y = x_train_full.to_numpy(), y_train_full.to_numpy()
xgbr.fit(train_x, train_y)

# Mean absolute error on the held-out test split.
y_true, y_pred = y_test, xgbr.predict(x_test.to_numpy())
mean_absolute_error(y_true, y_pred)




11.887823728833103

In [None]:
# Pair each feature name with the importance reported by the fitted XGBoost model.
list(zip(x_train_full.columns,xgbr.feature_importances_))

[('France', 0.007110883),
 ('Germany', 0.019845152),
 ('Italy', 0.027706249),
 ('Poland', 0.15512604),
 ('Spain', 0.046377916),
 ('KaggleRama', 0.40629062),
 ('Kaggle Getting Started', 0.040094074),
 ('Kaggle Recipe Book', 0.15149724),
 ('Kaggle for Kids: One Smart Goose', 0.015456096),
 ('day_of_week', 0.027879499),
 ('day_of_month', 0.0006924967),
 ('day_of_year', 0.01759011),
 ('week', 0.004027621),
 ('month', 0.003768756),
 ('year', 0.060496353),
 ('pandemic_year', 0.0),
 ('month_cos', 0.011869733),
 ('month_sin', 0.0041711284)]

In [None]:
#kaggle_submission(xgbr, test_encoded.to_numpy(), "submission KTP sep  2")

In [None]:
# Stop after 3 epochs without improvement and restore the best weights.
# NOTE(review): monitor="loss" follows the *training* loss although validation
# data is passed to fit(); confirm that "val_loss" was not intended.
early_stopping_cb = keras.callbacks.EarlyStopping(
    monitor="loss",
    patience=3,
    restore_best_weights=True,
)

# Normalisation layer adapted on the training split.
sample_to_normalize = x_train
norm_layer = keras.layers.Normalization(axis=1)
norm_layer.adapt(tf.constant(sample_to_normalize))

# Two SELU hidden layers of 72 units followed by a linear output unit.
neural = keras.models.Sequential([
    norm_layer,
    keras.layers.Dense(
        72,
        activation="selu",
        kernel_initializer='lecun_normal',
        input_shape=x_train.shape[1:],
    ),
    #keras.layers.Dropout(rate=0.2),
    keras.layers.Dense(72, activation="selu", kernel_initializer='lecun_normal'),
    keras.layers.Dense(1, activation=None),
])

#optimizer = keras.optimizers.SGD(learning_rate=0.12, nesterov=True) #keras.optimizers.RMSprop(lr=0.001)
optimizer = keras.optimizers.Nadam(learning_rate=0.05)
# Not passed to compile(), which uses the "mse" string instead.
loss = tf.keras.losses.MeanSquaredError()
neural.compile(optimizer=optimizer, loss="mse")

history = neural.fit(
    x_train,
    y_train,
    validation_data=(x_valid, y_valid),
    epochs=30,
    callbacks=[early_stopping_cb],
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30


In [None]:
#kaggle_submission(neural, test_encoded, "neural>_> 8", neural=True)

Checking some predictions against the held-out targets

In [None]:
# predict() returns one single-element row per sample; unwrap and round them.
neural_pred = neural.predict(x_test).tolist()
predi = [row[0] for row in neural_pred]
rounded = list(map(round, predi))
rounded[:10]

[165, 286, 89, 57, 345, 146, 150, 99, 251, 312]

In [None]:
# True targets of the same first ten held-out rows, for comparison.
y_test.head(10)

32855    172.0
3448     285.0
33518     81.0
38158     58.0
12241    346.0
16756    137.0
11978    155.0
38143    105.0
53570    251.0
58875    322.0
Name: num_sold, dtype: float64

In [None]:
# Network predictions for the competition test rows.
preds0 = neural.predict(test_encoded)

In [None]:
# Predictions of both models on the held-out test split.
neural_preds = neural.predict(x_test)
xgb_preds = xgbr.predict(x_test.to_numpy())


In [None]:
# Mean squared error of the network on the held-out split (y_true first).
mean_squared_error(y_test, neural_preds)

312.1622276456109

In [None]:
# Mean squared error of XGBoost on the held-out split (y_true first).
mean_squared_error(y_test, xgb_preds)

278.2555500832948