In [50]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are expected under the "dataset/input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('dataset/input')
    for name in names
)
for path in input_paths:
    print(path)

dataset/input\solution_example.csv
dataset/input\test.csv
dataset/input\test_calendar.csv
dataset/input\train.csv
dataset/input\train_calendar.csv


In [52]:
# Load the training rows, the rows to predict, and the example submission file.
train, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        'dataset/input/train.csv',
        'dataset/input/test.csv',
        'dataset/input/solution_example.csv',
    )
)

In [54]:
# Preview the first five training rows
train.head(n=5)

Unnamed: 0,warehouse,date,orders,holiday_name,holiday,shutdown,mini_shutdown,shops_closed,winter_school_holidays,school_holidays,blackout,mov_change,frankfurt_shutdown,precipitation,snow,user_activity_1,user_activity_2,id
0,Prague_1,2020-12-05,6895.0,,0,0,0,0,0,0,0,0.0,0,0.0,0.0,1722.0,32575.0,Prague_1_2020-12-05
1,Prague_1,2020-12-06,6584.0,,0,0,0,0,0,0,0,0.0,0,0.0,0.0,1688.0,32507.0,Prague_1_2020-12-06
2,Prague_1,2020-12-07,7030.0,,0,0,0,0,0,0,0,0.0,0,0.0,0.0,1696.0,32552.0,Prague_1_2020-12-07
3,Prague_1,2020-12-08,6550.0,,0,0,0,0,0,0,0,0.0,0,0.8,0.0,1681.0,32423.0,Prague_1_2020-12-08
4,Prague_1,2020-12-09,6910.0,,0,0,0,0,0,0,0,0.0,0,0.5,0.0,1704.0,32410.0,Prague_1_2020-12-09


In [56]:
# Check data
# Parse the date strings into datetime values.
train['date'] = pd.to_datetime(train['date'])

# Only the last expression of a cell is rendered, so the summary has to be
# printed explicitly; otherwise its result is silently discarded.
print(train['holiday_name'].describe())

# Number of rows that carry no holiday name.
train['holiday_name'].isna().sum()

7122

In [58]:
# Function to merge holiday and holiday name
def merge_columns(df, col1, col2):
    """Collapse two holiday indicators of a single row into one 0/1 flag.

    Parameters
    ----------
    df : mapping-like row
        Anything indexable by ``col1`` / ``col2``; in this notebook it is the
        row Series handed over by ``DataFrame.apply(..., axis=1)``.
    col1, col2 : hashable
        Keys of the two indicator values to combine.

    Returns
    -------
    int
        1 if at least one of the two values is "set", otherwise 0.
        A value is "set" when it is neither missing (NaN/None) nor equal to 0,
        so a row where both values are missing, or both are 0, yields 0.
    """
    def _is_set(value):
        # Check for missing first: NaN compares unequal to 0 and would
        # otherwise be mistaken for a set value.
        return not pd.isna(value) and value != 0

    return int(_is_set(df[col1]) or _is_set(df[col2]))
    
# Combine holiday_name and holiday into a single 0/1 indicator, row by row
train['merged_holiday'] = train.apply(merge_columns, axis=1, args=('holiday_name', 'holiday'))

# Drop the two source columns, which are no longer necessary
train.drop(columns=['holiday_name', 'holiday'], inplace=True)

In [60]:
# Expand the date column into calendar features and label-encode text columns
from sklearn.preprocessing import LabelEncoder
def transform_columns(df):
    """Turn the columns of ``df`` into numeric features, modifying it in place.

    * The ``date`` column is parsed with ``pd.to_datetime`` and replaced by
      four new columns appended at the end: ``year``, ``month``, ``day`` and
      ``weekend`` (True when the weekday number is 5 or 6).
    * Every other column whose dtype is ``object`` is overwritten with integer
      codes from a freshly fitted ``LabelEncoder``. A new encoder is fitted on
      each call, so codes from separate calls are not tied to each other.

    Returns the same DataFrame object that was passed in.
    """
    for name in df.columns:
        if name != 'date':
            if df[name].dtype == 'object':
                df[name] = LabelEncoder().fit_transform(df[name])
            continue

        # Parse once, derive the calendar parts, then discard the raw column.
        stamps = pd.to_datetime(df[name])
        df['year'] = stamps.dt.year
        df['month'] = stamps.dt.month
        df['day'] = stamps.dt.day
        df['weekend'] = stamps.dt.weekday.ge(5)
        df.drop(columns=[name], inplace=True)
    return df

In [62]:
# Replace `date` with year / month / day / weekend columns and label-encode
# the remaining text columns (transform_columns edits the frame in place and
# returns that same frame).
train = transform_columns(train)
train

Unnamed: 0,warehouse,orders,shutdown,mini_shutdown,shops_closed,winter_school_holidays,school_holidays,blackout,mov_change,frankfurt_shutdown,precipitation,snow,user_activity_1,user_activity_2,id,merged_holiday,year,month,day,weekend
0,4,6895.0,0,0,0,0,0,0,0.0,0,0.00,0.0,1722.0,32575.0,3761,0,2020,12,5,True
1,4,6584.0,0,0,0,0,0,0,0.0,0,0.00,0.0,1688.0,32507.0,3762,0,2020,12,6,True
2,4,7030.0,0,0,0,0,0,0,0.0,0,0.00,0.0,1696.0,32552.0,3763,0,2020,12,7,False
3,4,6550.0,0,0,0,0,0,0,0.0,0,0.80,0.0,1681.0,32423.0,3764,0,2020,12,8,False
4,4,6910.0,0,0,0,0,0,0,0.0,0,0.50,0.0,1704.0,32410.0,3765,0,2020,12,9,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7335,1,6733.0,0,0,0,0,0,0,0.0,0,1.51,0.0,2983.0,27111.0,2342,0,2024,3,10,True
7336,1,6492.0,0,0,0,0,0,0,0.0,0,1.03,0.0,2975.0,27133.0,2343,0,2024,3,11,False
7337,1,6661.0,0,0,0,0,0,0,0.0,0,0.21,0.0,2974.0,27151.0,2344,0,2024,3,12,False
7338,1,6843.0,0,0,0,0,0,0,0.0,0,0.00,0.0,2979.0,27180.0,2345,0,2024,3,13,False


In [64]:
# Keep the test ids for the submission file. The ids are still strings at
# this point (see the printed dtype), i.e. they have not been label-encoded.
submissionId = test.loc[:, 'id']
print(submissionId)

0        Prague_1_2024-03-16
1        Prague_1_2024-03-17
2        Prague_1_2024-03-18
3        Prague_1_2024-03-19
4        Prague_1_2024-03-20
               ...          
392    Budapest_1_2024-05-11
393    Budapest_1_2024-05-12
394    Budapest_1_2024-05-13
395    Budapest_1_2024-05-14
396    Budapest_1_2024-05-15
Name: id, Length: 397, dtype: object


In [66]:
# Build the feature frame X and the target y.
# The columns listed here are removed from the features; the test frame's
# column listing shows it has none of them.
drop_columns = [
    'shutdown',
    'mini_shutdown',
    'blackout',
    'mov_change',
    'frankfurt_shutdown',
    'precipitation',
    'snow',
    'user_activity_1',
    'user_activity_2',
]
# X still contains the target column 'orders' at this point; it is removed
# later, in the model-training cell.
# NOTE(review): X also keeps the label-encoded `id` column, whose codes are
# fitted separately for train and test by transform_columns -- consider
# whether it should be a feature at all.
X = train.drop(drop_columns, axis=1)
y = train.loc[:, "orders"]

In [68]:
# Apply the same holiday merge to the test rows as was applied to train
test['merged_holiday'] = test.apply(merge_columns, axis=1, args=('holiday_name', 'holiday'))
# Remove the two source columns in place
test.drop(columns=['holiday_name', 'holiday'], inplace=True)

In [70]:
# Expand `date` and label-encode the text columns of the test frame
# (edited in place; the same frame is returned), then list the columns.
test = transform_columns(test)
test.columns

Index(['warehouse', 'shops_closed', 'winter_school_holidays',
       'school_holidays', 'id', 'merged_holiday', 'year', 'month', 'day',
       'weekend'],
      dtype='object')

In [76]:
# Create and train model
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Hold out 20% of the rows so the fitted model can be checked on unseen data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# X carries the target column 'orders'. Remove it from BOTH splits: leaving it
# in X_test would make the hold-out features differ from the training features.
# A non-in-place drop avoids mutating a frame derived from X, and
# errors='ignore' keeps the cell runnable if X no longer contains 'orders'.
X_train = X_train.drop(columns=['orders'], errors='ignore')
X_test = X_test.drop(columns=['orders'], errors='ignore')

rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Report the coefficient of determination (R^2) on the hold-out split.
print(f'Hold-out R^2: {rf_model.score(X_test, y_test):.4f}')

In [78]:
# Predict orders for every row of the preprocessed test frame
pred = rf_model.predict(X=test)

In [80]:
# Build the submission table: the saved ids next to the predicted orders
submission = submissionId.to_frame(name='id').assign(orders=pred)
# Write the submission file without the index column
submission.to_csv('submission.csv', index=False)