In [1]:
import os

import pandas as pd
from sklearn.preprocessing import OneHotEncoder

from utils import create_features, create_holidays

In [2]:
# Folder the raw CSV files (train, test, store) are read from.
DATA_INPUT_PATH = 'data/input'
# Folder the prepared pickles are written to.
DATA_OUTPUT_PATH = 'data/prepared'

In [3]:
# Load the raw CSV files. The column parsed as a date sits at position 2 in
# train.csv and at position 3 in test.csv. low_memory=False makes pandas read
# each file in one pass instead of in chunks.
train_df = pd.read_csv(
    f'{DATA_INPUT_PATH}/train.csv',
    parse_dates=[2],
    low_memory=False,
)
test_df = pd.read_csv(
    f'{DATA_INPUT_PATH}/test.csv',
    parse_dates=[3],
    low_memory=False,
)
store_df = pd.read_csv(f'{DATA_INPUT_PATH}/store.csv')

In [4]:
# The 'Open' column of test_df has missing values; set them to 1 because all of
# the affected days are Monday to Saturday.
test_df['Open'] = test_df['Open'].fillna(1)

# Missing store information is filled with a plain 0, which seems to work better
# than median imputation (https://www.kaggle.com/xwxw2929/rossmann-sales-top1).
store_df = store_df.fillna(0)

In [5]:
# Attach the per-store attributes to every train and test row, matching on the
# shared 'Store' key (default inner join).
train_df = train_df.merge(store_df, on='Store')
test_df = test_df.merge(store_df, on='Store')

In [6]:
# Holidays for Prophet, built from the combined train and test rows
# (computed before feature creation, i.e. on the merged but otherwise raw frames).
holidays_df = create_holidays(pd.concat([train_df, test_df]))

# Feature creation
train_df = create_features(train_df)
test_df = create_features(test_df)

# One hot encoding: each listed column is replaced by its dummy columns,
# which are appended at the end of the frame.
dummy_cols = ['StateHoliday', 'StoreType', 'Assortment']

for col in dummy_cols:
    # Fit on the training column only and reuse the fitted encoder for test,
    # so both frames receive exactly the same dummy columns.
    encoder = OneHotEncoder()
    encoder.fit(train_df[[col]])
    feature_names = encoder.get_feature_names_out()

    # pd.concat(axis=1) aligns rows on index labels. Reuse each frame's own
    # index so the dummies stay attached to the right rows even if
    # create_features returned a frame without a default 0..n-1 index
    # (e.g. after filtering or sorting rows).
    train_dummies = pd.DataFrame(
        encoder.transform(train_df[[col]]).toarray(),
        columns=feature_names,
        index=train_df.index,
    )
    test_dummies = pd.DataFrame(
        encoder.transform(test_df[[col]]).toarray(),
        columns=feature_names,
        index=test_df.index,
    )

    # Swap the original categorical column for its one-hot representation.
    train_df = pd.concat([train_df.drop(col, axis=1), train_dummies], axis=1)
    test_df = pd.concat([test_df.drop(col, axis=1), test_dummies], axis=1)

In [7]:
# Save data to output folder.
# to_pickle does not create missing directories, so make sure the folder exists
# first; otherwise a fresh checkout fails on the first write.
os.makedirs(DATA_OUTPUT_PATH, exist_ok=True)

train_df.to_pickle(f'{DATA_OUTPUT_PATH}/train_df.pkl')
test_df.to_pickle(f'{DATA_OUTPUT_PATH}/test_df.pkl')
holidays_df.to_pickle(f'{DATA_OUTPUT_PATH}/holidays_df.pkl')