In [1]:
import numpy as np
import pandas as pd
import datetime as dt


import matplotlib.pyplot as plt
import seaborn as sns 
%matplotlib inline

from pytorch_tabnet.tab_model import TabNetRegressor


from sklearn.model_selection import KFold

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the three raw CSV tables: training rows, test rows and the store table
# that is later joined on 'Store'.
# low_memory=False asks pandas to parse each file in one pass instead of in
# chunks, which avoids mixed-dtype inference within a column.
train_df, test_df, store_df = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in ("train.csv", "test.csv", "store.csv")
)

In [3]:
# Attach the columns of store_df to every train/test row via the shared 'Store' key.
# validate='many_to_one' makes pandas raise a MergeError if 'Store' is ever
# duplicated in store_df, instead of silently multiplying rows.
train_merged = pd.merge(train_df, store_df, how='inner', on='Store', validate='many_to_one')
# The test frame is joined with how='left' so that no test row can be dropped:
# the predictions are later written positionally into the submission file, so a
# row lost to an unmatched 'Store' would misalign (or break) that assignment.
test_merged = pd.merge(test_df, store_df, how='left', on='Store', validate='many_to_one')

In [4]:
# Split the merged frames into model inputs and the regression target.
# The target is kept as a one-column DataFrame (double brackets).
train_target = train_merged[['Sales']]
train_feature = train_merged.drop(columns=['Sales'])
# 'Id' is removed from the test inputs, and a constant-zero 'Customers' column is
# added so the test frame offers the same columns that are selected from train later.
test_feature = test_merged.drop(columns=['Id']).assign(Customers=0)

In [5]:
def column_addition(col):
    """Add calendar features derived from the ``Date`` column.

    The frame is modified in place and also returned.

    Parameters
    ----------
    col : pandas.DataFrame
        Frame with a ``Date`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        The same object, with ``Date`` converted to datetime and these columns
        added or overwritten: ``Month``, ``Year``, ``Day``, ``WeekOfYear``
        (ISO week number), ``DayOfWeek`` (0 = Monday ... 6 = Sunday) and
        ``weekday`` (0 on Saturday/Sunday, otherwise 1).
    """
    col['Date'] = pd.to_datetime(col['Date'])
    dates = col['Date'].dt

    # .to_list() hands pandas plain values, so the stored dtype is inferred
    # afresh rather than inherited from the accessor's result.
    col['Month'] = dates.month.to_list()
    col['Year'] = dates.year.to_list()
    col['Day'] = dates.day.to_list()

    # Series.dt.weekofyear was deprecated in pandas 1.1 and removed in 2.0;
    # dt.isocalendar().week is the replacement. It returns a nullable UInt32
    # column, so cast to int64 (or to float64 when a date is missing, which
    # yields NaN as the old accessor did).
    iso_week = dates.isocalendar().week
    week_dtype = 'float64' if iso_week.isna().any() else 'int64'
    col['WeekOfYear'] = iso_week.astype(week_dtype).to_list()

    col['DayOfWeek'] = dates.dayofweek.to_list()

    # 1 everywhere except Saturday (5) and Sunday (6). Rows with a missing date
    # keep the default of 1.
    is_weekend = col['DayOfWeek'].isin([5, 6])
    col['weekday'] = (~is_weekend).astype('int64')
    return col

# Derive the calendar features on both frames (column_addition returns the frame it was given).
train_feature, test_feature = (
    column_addition(frame) for frame in (train_feature, test_feature)
)

In [6]:
# Bucket the training columns by dtype: object -> categorical, the listed
# int/float dtypes -> numerical, everything else (e.g. the datetime 'Date'
# column) -> timestamp.
numeric_dtype_names = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
categorical, numerical, timestamp = [], [], []

for col, col_dtype in train_feature.dtypes.items():
    if col_dtype == object:
        bucket = categorical
    elif col_dtype in numeric_dtype_names:
        bucket = numerical
    else:
        bucket = timestamp
    bucket.append(col)

# Put both frames into the same column order (categorical, numerical, timestamp)
# and stack train on top of test so the preprocessing below is applied to both
# at once. The row labels of each frame are kept as they are (no ignore_index).
my_cols = [*categorical, *numerical, *timestamp]
train_feature = train_feature[my_cols].copy()
test_feature = test_feature[my_cols].copy()
feature = pd.concat([train_feature, test_feature])

In [7]:
# The year, month and week "since" columns are stored as floats (as seen from
# the info output). Cast them to pandas' nullable 'Int64' dtype so they hold
# integers while still allowing missing values.
since_columns = [
    'CompetitionOpenSinceMonth',
    'CompetitionOpenSinceYear',
    'Promo2SinceWeek',
    'Promo2SinceYear',
]
feature = feature.astype({name: 'Int64' for name in since_columns})

# Replace a numeric 0 in StateHoliday with the string "0" so the column holds a
# single representation of that value.
# A single .loc[row_mask, column] assignment is used on purpose: the chained
# form feature["StateHoliday"].loc[...] = ... writes to an intermediate object
# and is not guaranteed to update `feature` (it never does under copy-on-write).
feature.loc[feature["StateHoliday"] == 0, "StateHoliday"] = "0"

In [8]:
# Numerical gaps: fill with the column median, truncated to an int.
median_filled = [
    'CompetitionDistance',
    'CompetitionOpenSinceMonth',
    'CompetitionOpenSinceYear',
    'Promo2SinceWeek',
    'Promo2SinceYear',
]
for i in median_filled:
    median_value = int(feature[i].median())
    feature[i] = feature[i].fillna(median_value)

# Categorical gaps: fill with the most frequent value of the column.
for name in ('Open', 'PromoInterval'):
    most_frequent = feature[name].mode()[0]
    feature[name] = feature[name].fillna(most_frequent)

# One-hot encode the categorical columns; each block of dummy columns is
# prefixed with the name of the column it came from.
pm, st, assort, state_holiday = (
    pd.get_dummies(feature[name], prefix=name)
    for name in ('PromoInterval', 'StoreType', 'Assortment', 'StateHoliday')
)

In [9]:
# Append the one-hot blocks to the feature frame, column-wise.
final = [feature, state_holiday, st, pm, assort]
feature = pd.concat(final, axis=1, join='inner')

# The raw categorical columns are now represented by their dummies, and 'Date'
# by the calendar features, so drop them. Everything left is numeric.
feature = feature.drop(columns=['StoreType', 'PromoInterval', 'Assortment', 'StateHoliday', 'Date'])

# Min-max normalisation to [0, 1].
# Cast to float64 first so the arithmetic runs on one plain NumPy dtype:
# pandas >= 2.0 creates bool dummy columns (bool subtraction is not supported),
# and nullable Int64 columns would otherwise produce nullable float results
# that can surface as an object array in DataFrame.to_numpy().
feature = feature.astype('float64')
col_min = feature.min()
col_range = feature.max() - col_min
# A constant column has a zero range; dividing by it would turn the whole
# column into NaN. Divide by 1 instead so such a column becomes all zeros.
col_range = col_range.where(col_range != 0, 1.0)
feature = (feature - col_min) / col_range

In [10]:
# Split the stacked frame back into its train and test parts by position:
# the first len(train_feature) rows are the training rows, the rest are test rows.
n_train_rows = len(train_feature)
train_df_feature = feature.iloc[:n_train_rows]
# 'Customers' is dropped from the test part (it was only a zero placeholder there).
test_df_feature = feature.iloc[n_train_rows:].drop(columns="Customers")

In [11]:

# Work on a copy of the training features with the (un-normalised) target
# re-attached; the assignment is aligned on the row index.
train_copy = train_df_feature.assign(Sales=train_target['Sales'])

# Optional: uncomment the next line to min-max normalise Sales as well.
#train_copy['Sales'] =(train_copy['Sales']-train_copy['Sales'].min())/(train_copy['Sales'].max()-train_copy['Sales'].min())

In [12]:
# Test inputs are used as prepared above.
X_test = test_df_feature
# Training inputs: everything except the target and 'Customers'
# (which was already dropped from the test features).
X = train_copy.drop(['Sales', 'Customers'], axis=1)
# Train on log(1 + Sales); predictions are mapped back with expm1 after inference.
y = np.log1p(train_copy["Sales"])

In [13]:
# Convert the frames to NumPy arrays for the model. The target is reshaped to
# a single column, shape (n_rows, 1).
X, X_test = X.to_numpy(), X_test.to_numpy()
y = y.to_numpy().reshape(-1, 1)

In [14]:
# 5-fold cross-validation with shuffling. One TabNetRegressor is trained per
# fold; each fold contributes its best validation cost and a prediction for
# the full test set, mapped back from log1p space with expm1.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
predictions_array, CV_score_array = [], []

for train_index, test_index in kf.split(X):
    # test_index selects the held-out (validation) rows of this fold.
    X_train, y_train = X[train_index], y[train_index]
    X_valid, y_valid = X[test_index], y[test_index]

    regressor = TabNetRegressor(verbose=1, seed=42)
    regressor.fit(
        X_train=X_train,
        y_train=y_train,
        eval_set=[(X_valid, y_valid)],
        patience=10,
        max_epochs=10,
        batch_size=10000,
        eval_metric=['rmse'],
    )

    CV_score_array.append(regressor.best_cost)
    fold_prediction = regressor.predict(X_test)
    predictions_array.append(np.expm1(fold_prediction))

Device used : cpu
epoch 0  | loss: 4.68353 | val_0_rmse: 1.66069 |  0:00:30s
epoch 1  | loss: 0.15195 | val_0_rmse: 0.51531 |  0:01:01s
epoch 2  | loss: 0.13238 | val_0_rmse: 0.52031 |  0:01:33s
epoch 3  | loss: 0.13127 | val_0_rmse: 0.38754 |  0:02:04s
epoch 4  | loss: 0.12402 | val_0_rmse: 0.35397 |  0:02:35s
epoch 5  | loss: 0.12174 | val_0_rmse: 0.34217 |  0:03:05s
epoch 6  | loss: 0.11751 | val_0_rmse: 0.334   |  0:03:36s
epoch 7  | loss: 0.11459 | val_0_rmse: 0.33872 |  0:04:07s
epoch 8  | loss: 0.11609 | val_0_rmse: 0.33114 |  0:04:38s
epoch 9  | loss: 0.11255 | val_0_rmse: 0.3304  |  0:05:09s
Stop training because you reached max_epochs = 10 with best_epoch = 9 and best_val_0_rmse = 0.3304
Best weights from best epoch are automatically used!
Device used : cpu
epoch 0  | loss: 4.78547 | val_0_rmse: 2.68846 |  0:00:31s
epoch 1  | loss: 0.14747 | val_0_rmse: 1.09913 |  0:01:02s
epoch 2  | loss: 0.13524 | val_0_rmse: 0.40476 |  0:01:34s
epoch 3  | loss: 0.12666 | val_0_rmse: 0.3490

In [15]:
# Average the per-fold test predictions element-wise (across folds).
predictions = np.stack(predictions_array).mean(axis=0)

In [16]:
# Mean of the per-fold best validation costs (RMSE on the log1p-transformed target).
mean_cv_score = np.mean(CV_score_array, axis=0)
print("The CV score is %.5f" % mean_cv_score)

The CV score is 0.33387


In [17]:
# Eyeball the averaged predictions (NumPy abbreviates long arrays when printing).
print(predictions)

[[7.0984727e+03]
 [7.3432881e+03]
 [7.7243657e+03]
 ...
 [6.7809009e+03]
 [1.8635247e-02]
 [5.0073926e+03]]


In [18]:
# Number of predicted values; this has to match the number of rows in the submission file.
print(np.size(predictions))

41088


In [19]:
# Write the averaged predictions into the 'Sales' column of the sample
# submission (rows are matched by position) and save it without the index.
submission = pd.read_csv('sample_submission.csv', low_memory=False).assign(Sales=predictions)
submission.to_csv('submission_file.csv', index=False)