# Kaggle Competition 1: Forecasting Sticker Sales - V0.3

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.optimizers import Adam, RMSprop
import lightgbm as lgb

## Import Data
### Train data

In [2]:
# Read the raw training table from the working directory and preview its first rows.
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,id,date,country,store,product,num_sold
0,0,2010-01-01,Canada,Discount Stickers,Holographic Goose,
1,1,2010-01-01,Canada,Discount Stickers,Kaggle,973.0
2,2,2010-01-01,Canada,Discount Stickers,Kaggle Tiers,906.0
3,3,2010-01-01,Canada,Discount Stickers,Kerneler,423.0
4,4,2010-01-01,Canada,Discount Stickers,Kerneler Dark Mode,491.0


In [3]:
# Remove exact duplicate rows and renumber the index from 0.
df = df.drop_duplicates().reset_index(drop=True)

### Test Data

In [4]:
# Read the test table; the `id` column is not a feature, so it is dropped here
# (it is re-read from disk later when the submission file is built).
df_test = pd.read_csv('test.csv').drop('id', axis=1)

## Handle missing values

In [5]:
# Discard rows with any missing value, drop the non-feature `id` column,
# and renumber the index so it is contiguous again.
df = (
    df
    .dropna()
    .drop('id', axis=1)
    .reset_index(drop=True)
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221259 entries, 0 to 221258
Data columns (total 5 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   date      221259 non-null  object 
 1   country   221259 non-null  object 
 2   store     221259 non-null  object 
 3   product   221259 non-null  object 
 4   num_sold  221259 non-null  float64
dtypes: float64(1), object(4)
memory usage: 8.4+ MB


## Quantify Data
### Date

In [6]:
# Extract date-related features
def extract_date_features(data):
    """Return a copy of ``data`` with ``date`` replaced by calendar features.

    The ``date`` column is parsed with ``pd.to_datetime`` and expanded into
    ``year``, ``month``, ``day``, ``weekday`` and ``weekofyear`` (ISO week
    number), appended in that order after the remaining columns.

    The input frame is left untouched: earlier versions wrote the parsed date
    and the new columns into the caller's DataFrame before returning, which
    silently changed any other reference to that frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``date`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        New frame without ``date`` and with the five calendar columns added.
    """
    parsed = pd.to_datetime(data['date'])
    calendar = {
        'year': parsed.dt.year,
        'month': parsed.dt.month,
        'day': parsed.dt.day,
        'weekday': parsed.dt.weekday,
        # isocalendar() yields a nullable unsigned dtype; cast to a plain int.
        'weekofyear': parsed.dt.isocalendar().week.astype(int),
    }
    # drop() returns a new frame, so assign() never writes into the caller's data.
    return data.drop(columns=['date']).assign(**calendar)

# Expand the date column of the train frame first, then the test frame.
df, df_test = (extract_date_features(frame) for frame in (df, df_test))

### One-Hot Encoding

In [7]:
# Train
# Replace each categorical column by its 0/1 indicator columns, one column at a time.
for cat_col in ('country', 'store', 'product'):
    indicators = pd.get_dummies(df[cat_col], prefix=cat_col).astype(int)
    df = df.join(indicators).drop(cat_col, axis=1)

In [8]:
# Test
# Same encoding as the train frame, applied to the test frame.
for cat_col in ('country', 'store', 'product'):
    indicators = pd.get_dummies(df_test[cat_col], prefix=cat_col).astype(int)
    df_test = df_test.join(indicators).drop(cat_col, axis=1)

In [9]:
# Correct dtypes
# The target was read as float64 (see the earlier info() output); store it as an integer.
df = df.astype({'num_sold': int})
df.head()

Unnamed: 0,num_sold,year,month,day,weekday,weekofyear,country_Canada,country_Finland,country_Italy,country_Kenya,country_Norway,country_Singapore,store_Discount Stickers,store_Premium Sticker Mart,store_Stickers for Less,product_Holographic Goose,product_Kaggle,product_Kaggle Tiers,product_Kerneler,product_Kerneler Dark Mode
0,973,2010,1,1,4,53,1,0,0,0,0,0,1,0,0,0,1,0,0,0
1,906,2010,1,1,4,53,1,0,0,0,0,0,1,0,0,0,0,1,0,0
2,423,2010,1,1,4,53,1,0,0,0,0,0,1,0,0,0,0,0,1,0
3,491,2010,1,1,4,53,1,0,0,0,0,0,1,0,0,0,0,0,0,1
4,300,2010,1,1,4,53,1,0,0,0,0,0,0,0,1,1,0,0,0,0


In [10]:
# Check column dtypes and non-null counts after encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221259 entries, 0 to 221258
Data columns (total 20 columns):
 #   Column                      Non-Null Count   Dtype
---  ------                      --------------   -----
 0   num_sold                    221259 non-null  int32
 1   year                        221259 non-null  int32
 2   month                       221259 non-null  int32
 3   day                         221259 non-null  int32
 4   weekday                     221259 non-null  int32
 5   weekofyear                  221259 non-null  int32
 6   country_Canada              221259 non-null  int32
 7   country_Finland             221259 non-null  int32
 8   country_Italy               221259 non-null  int32
 9   country_Kenya               221259 non-null  int32
 10  country_Norway              221259 non-null  int32
 11  country_Singapore           221259 non-null  int32
 12  store_Discount Stickers     221259 non-null  int32
 13  store_Premium Sticker Mart  221259 non-null 

In [11]:
# Features are every column except the target.
X = df.drop('num_sold', axis=1)
y = df['num_sold']

# Train and test were one-hot encoded independently, so nothing guarantees that
# they ended up with the same columns in the same order. Reindex the test frame
# onto the training feature columns: an indicator missing from test is filled
# with 0, and a column the model never saw is dropped.
X_test = df_test.reindex(columns=X.columns, fill_value=0)

In [55]:
# Positional hold-out: rows before the cut point train, the rest validate.
split_at = 200000

X_tr, X_val = X.iloc[:split_at], X.iloc[split_at:]
y_tr, y_val = y.iloc[:split_at], y.iloc[split_at:]

## LightGBM (LGBM)

In [56]:
# LightGBM model
def train_lightgbm(X_tr, y_tr, X_val, y_val):
    """Train a LightGBM regressor with a fixed set of hyper-parameters.

    Parameters
    ----------
    X_tr, y_tr
        Training features and target, wrapped in an ``lgb.Dataset``.
    X_val, y_val
        Validation features and target, passed to ``lgb.train`` as the only
        entry of ``valid_sets``.

    Returns
    -------
    The trained model object returned by ``lgb.train``.
    """
    # Hyper-parameters are hard-coded; the metric reported on the validation set is MAPE.
    hyperparams = dict(
        objective='regression',
        metric='mape',
        boosting_type='gbdt',
        n_estimators=400,
        num_leaves=25,
        max_depth=-1,
        learning_rate=0.119,
        reg_alpha=0.007,
        min_child_samples=90,
        colsample_bytree=0.65,
        force_row_wise=True,
    )

    dtrain = lgb.Dataset(X_tr, label=y_tr)
    dvalid = lgb.Dataset(X_val, label=y_val)

    return lgb.train(hyperparams, dtrain, valid_sets=[dvalid])

# Fit on the training slice; the validation slice is handed to LightGBM as its eval set.
lgb_model = train_lightgbm(X_tr, y_tr, X_val, y_val)

# Score the held-out rows with the fitted model.
lgb_predictions = lgb_model.predict(X_val)

# MAPE on the hold-out; the recorded output shows it as a fraction (0.13...), not a percentage.
lgb_mape = mean_absolute_percentage_error(y_val, lgb_predictions)

print(f"LightGBM MAPE: {lgb_mape}")



[LightGBM] [Info] Total Bins 142
[LightGBM] [Info] Number of data points in the train set: 200000, number of used features: 19
[LightGBM] [Info] Start training from score 761.251485
LightGBM MAPE: 0.13135170454731032


In [59]:
# Predict the test set with the LightGBM model and write the Kaggle submission file.
final_model = lgb_model
test_predictions = final_model.predict(X_test)
# `id` was dropped from df_test earlier, so it is re-read from the original file.
test_data = pd.read_csv('test.csv')
# Create submission
submission = pd.DataFrame({'id': test_data['id'], 'num_sold': test_predictions.astype(int)})
# A single variable feeds both the write and the message, so the reported
# filename is always the one that was written (the message used to say
# 'submission.csv' while the file was saved as 'submission_v06.csv').
submission_path = 'submission_v06.csv'
submission.to_csv(submission_path, index=False)
print(f"Submission file created as '{submission_path}'")

Submission file created as 'submission.csv'


## Neural Network (NN)

In [32]:
# Fully connected regression network on the tabular features.
# The input width is taken from the feature matrix instead of a hard-coded 19,
# and is declared with an explicit Input object: passing `input_shape` to the
# first layer is what triggered the Keras warning recorded under this cell.
model = tf.keras.Sequential([
    tf.keras.Input(shape=(X.shape[1],)),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    # ReLU on the output keeps predicted sales non-negative.
    tf.keras.layers.Dense(1, activation='relu')
])

  super().__init__(**kwargs)


In [33]:
# Optimise mean absolute error; MAPE is tracked as a metric only.
model.compile(
    optimizer=RMSprop(learning_rate=0.01),
    loss='mean_absolute_error',
    metrics=['mean_absolute_percentage_error'],
)

In [34]:
# Train on all of X / y for 10 epochs. No validation data is passed, so the
# loss and MAPE logged per epoch are training-set figures only.
hist = model.fit(X, y, epochs=10)

Epoch 1/10
[1m6915/6915[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 2ms/step - loss: 125.7204 - mean_absolute_percentage_error: 24.0319
Epoch 2/10
[1m6915/6915[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 2ms/step - loss: 75.7458 - mean_absolute_percentage_error: 10.7548
Epoch 3/10
[1m6915/6915[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 3ms/step - loss: 69.7993 - mean_absolute_percentage_error: 10.0475
Epoch 4/10
[1m6915/6915[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 3ms/step - loss: 66.4197 - mean_absolute_percentage_error: 9.4043
Epoch 5/10
[1m6915/6915[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 3ms/step - loss: 64.2516 - mean_absolute_percentage_error: 9.2820
Epoch 6/10
[1m6915/6915[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 2ms/step - loss: 63.3295 - mean_absolute_percentage_error: 8.9422
Epoch 7/10
[1m6915/6915[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 3ms/step - loss: 62.2978 - mean_absolute

In [35]:
# The network emits one value per row with shape (n_rows, 1). Flatten with -1 so
# the cell works for any number of test rows, not only the hard-coded 98550.
predictions = model.predict(X_test)
predictions = predictions.reshape(-1)

[1m3080/3080[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step


In [36]:
def make_submission(prediction, sub_name):
    """Write a two-column submission CSV named ``<sub_name>.csv``.

    The ``id`` column is read from ``test.csv`` in the working directory and
    paired with ``prediction`` as ``num_sold``. The file is written without
    the index and a confirmation line is printed.
    """
    test_ids = pd.read_csv('test.csv').id
    out_path = '{}.csv'.format(sub_name)
    pd.DataFrame({'id': test_ids, 'num_sold': prediction}).to_csv(out_path, index=False)
    print('A submission file has been made')

# Cast the network's float predictions to integers and write them as the NN submission file.
make_submission(predictions.astype(int),'submission(nn_v03)')

A submission file has been made
