# 0.0. Imports

In [12]:
import pandas as pd
import inflection

## 0.1. Loading Data

In [9]:
# Read the two CSV inputs from the data directory.
df_sales_raw = pd.read_csv('../data/train.csv', low_memory=False)
df_store_raw = pd.read_csv('../data/store.csv')

# Left join on 'Store' with the store frame on the left, so its columns come
# first in the result, followed by the sales columns.
df_raw = df_store_raw.merge(df_sales_raw, on='Store', how='left')

## 0.2. Helper Functions

# 1.0. Data Description

In [13]:
# Work on a copy so the steps in this section leave df_raw untouched.
df1 = df_raw.copy()

## 1.1. Rename Columns

In [15]:
# Original (CamelCase) column names expected in the merged frame.
cols_old = [ 'Store', 'StoreType', 'Assortment', 'CompetitionDistance',
             'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2',
             'Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval', 'DayOfWeek',
             'Date', 'Sales', 'Customers', 'Open', 'Promo', 'StateHoliday',
             'SchoolHoliday']


def snake_case(name):
    """Return `name` converted to snake_case via inflection.underscore."""
    return inflection.underscore(name)


# Rename by label instead of overwriting df1.columns positionally: a positional
# assignment silently mislabels columns if the merge order ever changes.
# rename() ignores labels that are not present, so re-running this cell after
# the columns are already snake_case is a no-op.
df1 = df1.rename(columns={col: snake_case(col) for col in cols_old})

## 1.2. Data Dimensions

In [20]:
# Report the row and column counts of df1, one per line.
print(f'Number of Rows: {df1.shape[0]}', f'Number of Cols: {df1.shape[1]}', sep='\n')

Number of Rows: 1017209
Number of Cols: 18


## 1.3. Data Types

In [22]:
# Parse the 'date' column into datetimes, then display the dtype of every column.
df1['date'] = pd.to_datetime(df1['date'])
df1.dtypes

store                                    int64
store_type                              object
assortment                              object
competition_distance                   float64
competition_open_since_month           float64
competition_open_since_year            float64
promo2                                   int64
promo2_since_week                      float64
promo2_since_year                      float64
promo_interval                          object
day_of_week                              int64
date                            datetime64[ns]
sales                                    int64
customers                                int64
open                                     int64
promo                                    int64
state_holiday                           object
school_holiday                           int64
dtype: object

## 1.4. Check NA

In [26]:
# Fraction of missing values in each column, displayed from highest to lowest.
na_fraction = df1.isna().mean()
na_fraction.sort_values(ascending=False)

promo_interval                  0.499436
promo2_since_week               0.499436
promo2_since_year               0.499436
competition_open_since_month    0.317878
competition_open_since_year     0.317878
competition_distance            0.002597
sales                           0.000000
state_holiday                   0.000000
promo                           0.000000
open                            0.000000
customers                       0.000000
store                           0.000000
date                            0.000000
day_of_week                     0.000000
store_type                      0.000000
promo2                          0.000000
assortment                      0.000000
school_holiday                  0.000000
dtype: float64

## 1.5. Fill Out NA

In [36]:
# Replace missing promo intervals with 0. The result is assigned back to the
# column: calling fillna(inplace=True) on a column selection is chained
# assignment, which pandas deprecates and which does not update df1 when
# Copy-on-Write is enabled.
df1['promo_interval'] = df1['promo_interval'].fillna(0)

# Month number -> month abbreviation.
# NOTE(review): 'Sept' (rather than 'Sep') presumably matches the spelling used
# inside promo_interval values — confirm against the data.
month_map = {
    1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun',
    7: 'Jul', 8: 'Aug', 9: 'Sept', 10: 'Oct', 11: 'Nov', 12: 'Dec',
}

# Abbreviated month name of each row's date.
df1['month_promo'] = df1['date'].dt.month.map(month_map)


In [37]:
df1.head()

Unnamed: 0,store,store_type,assortment,competition_distance,competition_open_since_month,competition_open_since_year,promo2,promo2_since_week,promo2_since_year,promo_interval,day_of_week,date,sales,customers,open,promo,state_holiday,school_holiday,month_promo
0,1,c,a,1270.0,9.0,2008.0,0,,,0,5,2015-07-31,5263,555,1,1,0,1,Jul
1,1,c,a,1270.0,9.0,2008.0,0,,,0,4,2015-07-30,5020,546,1,1,0,1,Jul
2,1,c,a,1270.0,9.0,2008.0,0,,,0,3,2015-07-29,4782,523,1,1,0,1,Jul
3,1,c,a,1270.0,9.0,2008.0,0,,,0,2,2015-07-28,5011,560,1,1,0,1,Jul
4,1,c,a,1270.0,9.0,2008.0,0,,,0,1,2015-07-27,6102,612,1,1,0,1,Jul
