# Import Packages

In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import math
import itertools
sns.set(style="ticks")
%config IPCompleter.greedy = True
from sklearn.preprocessing import LabelEncoder

import utils

# Import Data

In [2]:
  # Load the raw M5 inputs through the project helper and unpack them into
  # df_calendar, df_sales and df_prices.
  # reduce_memory=False is passed here; utils.reduce_mem_usage is applied
  # later in the notebook on the merged frame instead.
  df_calendar, df_sales, df_prices= utils.import_m5_data(reduce_memory=False)

# Encode Categorical Variables if necessary

Problems with categorical variables:
- more memory (RAM) needed
- the notebook may crash due to categorical variables

In [3]:

def encode_categorical(df, cols, fillna=False):
    """Label-encode the given columns of ``df`` in place and return ``df``.

    Parameters
    ----------
    df : DataFrame whose columns are overwritten with integer codes.
    cols : iterable of column names to encode.
    fillna : when true, missing values in each column are replaced by the
        string "MISSING" before the encoder is fitted; otherwise the column
        is passed to the encoder unchanged.

    Returns
    -------
    The same ``df`` object that was passed in (it is modified, not copied).
    """
    for name in cols:
        values = df[name]
        if fillna:
            values = values.fillna("MISSING")
        # A fresh encoder per column: codes are independent between columns.
        df[name] = LabelEncoder().fit_transform(values)
    return df

# Optional encoding step, currently disabled by wrapping it in a string literal.
# Nothing below is executed; because the literal is the last expression of the
# cell, Jupyter echoes it as the cell output.
# To enable the step, remove the surrounding triple quotes.
'''
df_calendar = encode_categorical(
    df_calendar,
    ["event_name_1", "event_type_1", "event_name_2", "event_type_2"],
    fillna=True,
).pipe(utils.reduce_mem_usage)

df_sales = encode_categorical(
    df_sales, ["item_id", "dept_id", "cat_id", "store_id", "state_id"],
).pipe(utils.reduce_mem_usage)

df_prices = encode_categorical(df_prices, ["item_id", "store_id"]).pipe(
    utils.reduce_mem_usage
)
'''


'\ndf_calendar = encode_categorical(\n    df_calendar,\n    ["event_name_1", "event_type_1", "event_name_2", "event_type_2"],\n    fillna=True,\n).pipe(utils.reduce_mem_usage)\n\ndf_sales = encode_categorical(\n    df_sales, ["item_id", "dept_id", "cat_id", "store_id", "state_id"],\n).pipe(utils.reduce_mem_usage)\n\ndf_prices = encode_categorical(df_prices, ["item_id", "store_id"]).pipe(\n    utils.reduce_mem_usage\n)\n'

# Transform Dataframe

In [4]:
# df_sales stops at d_1913, so append placeholder columns d_1914 .. d_1969
# (filled with the string 'NA') to cover the remaining days.
for day_col in (f'd_{day}' for day in range(1914, 1970)):
    df_sales[day_col] = 'NA'

In [5]:
# Build the working frame `df` from the sales, calendar and price tables
# using the project helper utils.transform_dataframe.
# NOTE(review): the displayed result has one row per id and day (`d`, `sale`,
# `date`, `sell_price` columns), so this presumably melts and merges — confirm in utils.
df = utils.transform_dataframe(df_sales, df_calendar, df_prices)

In [6]:
# Add a `data_type` column that labels each row as training, validation or
# evaluation data, based on its `date`.

# 2011-01-29 ~ 2016-04-24 : d_1    ~ d_1913 ---> training
# 2016-04-25 ~ 2016-05-22 : d_1914 ~ d_1941 (public) ---> validation
# 2016-05-23 ~ 2016-06-19 : d_1942 ~ d_1969 (private) ---> evaluation


conditions = [
    (df['date'] <= '2016-04-24'),
    (df['date'] >= '2016-04-25') & (df['date'] <= '2016-05-22'),
    (df['date'] >= '2016-05-23')]
choices = ['train', 'validation', 'evaluation']
# Pass the fallback explicitly as a string. np.select's implicit default is the
# integer 0, which has no common dtype with the string labels: recent NumPy
# raises a TypeError for that combination, older NumPy silently produced '0'.
# '0' keeps that legacy label for rows matching no condition (e.g. missing dates).
df['data_type'] = np.select(conditions, choices, default='0')



In [7]:
# Reduce the memory footprint of the merged frame with the project helper;
# the helper prints the resulting size and the percentage saved.
# NOTE(review): how the reduction is achieved (e.g. dtype downcasting) is not
# visible here — see utils.reduce_mem_usage.
df = utils.reduce_mem_usage(df)

Mem. usage of decreased to 7958.26 Mb (27.6% reduction)


In [8]:
# Preview the first three rows of the transformed frame.
df.head(3)

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sale,date,wm_yr_wk,...,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price,data_type
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,...,2011,,,,,0,0,0,,train
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,...,2011,,,,,0,0,0,,train
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,...,2011,,,,,0,0,0,,train


# Save DataFrame as CSV

In [9]:
# Write the preprocessed frame as CSV (without the index column) below the
# M5 project root.
output_path = f'{utils.get_m5_root_dir()}/data/preprocessed/preprocessed_input_data.csv'
# to_csv does not create missing parent directories, so make sure the target
# folder exists first (it may be absent on a fresh checkout).
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)