In [1]:
import numpy as np
from scipy import stats
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()

import os

import warnings
warnings.filterwarnings('ignore')

In [2]:
import joblib
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Data load

In [3]:
# train = pd.read_csv('train.csv')
# test = pd.read_csv('test.csv')

# Read both competition files, parsing the 'date' column as datetime on load.
train, test = (
    pd.read_csv(csv_name, parse_dates=['date'])
    for csv_name in ('train.csv', 'test.csv')
)

# Column 설정

In [4]:
# Name of the label column to predict; it is split off from the features later.
target = 'sales'

In [5]:
# Columns that carry no predictive signal and are dropped before modelling.
columns_useless = ['id']

In [6]:
# Load the column groupings saved as .pkl files.
# NOTE(review): presumably these were written by an earlier (EDA) notebook and
# each one is a plain list of column names - confirm against the producer.
# joblib.load unpickles its input, so only load files from a trusted source.
columns_num_all = joblib.load('columns_num.pkl')
columns_cat_all = joblib.load('columns_cat.pkl')
columns_binary_num = joblib.load('columns_binary_num.pkl')
columns_binary_cat = joblib.load('columns_binary_cat.pkl')
columns_dt = joblib.load('columns_dt.pkl')

In [7]:
# The target must not be treated as a feature. list.remove raises ValueError
# when the value is absent; that is the only failure that is safe to ignore,
# so anything else (e.g. a NameError from a typo) is allowed to surface.
try:
    columns_num_all.remove(target)
except ValueError:
    pass

In [8]:
# Same guard for the categorical list: drop the target if it is listed there.
# Only the "value not in list" case (ValueError) is ignored.
try:
    columns_cat_all.remove(target)
except ValueError:
    pass

In [9]:
# Numeric feature columns: every numeric column that is not flagged as useless.
columns_num = [name for name in columns_num_all if name not in columns_useless]

columns_num

[]

In [10]:
# Categorical feature columns: every categorical column that is not flagged as useless.
columns_cat = [name for name in columns_cat_all if name not in columns_useless]

columns_cat

['store', 'item']

In [11]:
# Columns to scale: numeric features excluding the binary numeric ones.
columns_sc = [name for name in columns_num if name not in columns_binary_num]

columns_sc

[]

In [12]:
# columns_sc.remove('date_block_num')

In [13]:
# Columns to one-hot encode: categorical features that are not binary
# (neither in the binary numeric list nor in the binary categorical list).
columns_en = [
    name
    for name in columns_cat
    if not (name in columns_binary_num or name in columns_binary_cat)
]

columns_en

['store', 'item']

## datetime 설정

In [14]:
def data_summary(data):
    """Build a per-column overview of a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to describe.

    Returns
    -------
    pandas.DataFrame
        Indexed by the column names of ``data`` with the columns
        'Dtype', 'Unique value' (``nunique``), 'Null count' and
        'Null percent' (share of nulls in percent, rounded to 1 decimal).
    """
    # Count nulls once and reuse the array for both null columns.
    null_counts = data.isnull().sum().values

    overview = pd.DataFrame(data.dtypes, columns=['Dtype'])
    overview['Unique value'] = data.nunique().values
    overview['Null count'] = null_counts
    overview['Null percent'] = null_counts / len(data) * 100
    overview['Null percent'] = overview['Null percent'].round(1)

    return overview

In [15]:
def date_features(data, column):
    """Replace a datetime column with derived calendar features.

    The input frame is left untouched; a new frame is returned. This keeps the
    function safe to call repeatedly on the same object (the previous version
    added columns to and dropped ``column`` from the caller's frame in place).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``column``.
    column : str
        Name of a datetime64 column (the ``.dt`` accessor is used on it).

    Returns
    -------
    pandas.DataFrame
        The six date features ('year', 'quarter', 'month', 'day',
        'day_of_week', 'is_weekend') first, followed by every other original
        column in its original order. ``column`` itself is not included.

    Raises
    ------
    KeyError
        If ``column`` is not present in ``data``.
    """
    dates = data[column]

    calendar = pd.DataFrame(
        {
            'year': dates.dt.year,
            'quarter': dates.dt.quarter,
            # month must be converted to string during data preprocessing
            'month': dates.dt.month,
            'day': dates.dt.day,
            'day_of_week': dates.dt.dayofweek,   # Monday = 0, Sunday = 6
            # Saturday (5) and Sunday (6) floor-divide to 1, weekdays to 0
            'is_weekend': dates.dt.dayofweek // 5,
        },
        index=data.index,
    )

    # Drop the source column, plus any stale columns that share a feature
    # name, so the result never ends up with duplicated labels.
    remaining = data.drop(columns=[column, *calendar.columns], errors='ignore')

    # Explicit ordering instead of positional slicing of the last six columns.
    return pd.concat([calendar, remaining], axis=1)

In [16]:
# Replace the raw 'date' column of the training data with calendar features.
# NOTE: the returned frame no longer has a 'date' column, so running this cell
# a second time without reloading `train` raises KeyError.
train = date_features(train, 'date')
train

Unnamed: 0,year,quarter,month,day,day_of_week,is_weekend,store,item,sales
0,2013,1,1,1,1,0,1,1,13
1,2013,1,1,2,2,0,1,1,11
2,2013,1,1,3,3,0,1,1,14
3,2013,1,1,4,4,0,1,1,13
4,2013,1,1,5,5,1,1,1,10
...,...,...,...,...,...,...,...,...,...
912995,2017,4,12,27,2,0,10,50,63
912996,2017,4,12,28,3,0,10,50,59
912997,2017,4,12,29,4,0,10,50,74
912998,2017,4,12,30,5,1,10,50,62


In [17]:
# Dtype, number of unique values and null statistics for each training column.
train_summary = data_summary(train)
train_summary

Unnamed: 0,Dtype,Unique value,Null count,Null percent
year,int64,5,0,0.0
quarter,int64,4,0,0.0
month,int64,12,0,0.0
day,int64,31,0,0.0
day_of_week,int64,7,0,0.0
is_weekend,int64,2,0,0.0
store,int64,10,0,0.0
item,int64,50,0,0.0
sales,int64,213,0,0.0


In [18]:
# Register the derived date features in the column groupings.
# Only names that are not already present are added, so re-running this cell
# does not append duplicates (duplicate names would later be passed to
# pd.get_dummies as the columns to encode).
columns_num.extend([name for name in ['year'] if name not in columns_num])
columns_en.extend(
    [name for name in ['quarter', 'month', 'day', 'day_of_week'] if name not in columns_en]
)
# is_weekend is already a binary numeric column, so it needs no encoding

# Data 분리

## target 분리

In [19]:
# y = data[target]
# X = data.drop(target, axis=1)

In [20]:
# Separate the features from the label; keep an untouched copy of the
# features as they were before any further preprocessing.
X_train = train.drop(columns=[target])
y_train = train[target]
X_train_og = X_train.copy()

In [21]:
# Work on a copy so the loaded `test` frame stays unchanged.
X_test = test.copy()

## Train & test set 분리

In [22]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=30)
# X_train_og = X_train.copy()

# useless column 제거

In [23]:
### The useless columns exist only in the test dataset, so they may be absent here.
# errors='ignore' skips labels that are missing instead of raising, which
# replaces the former bare `except: pass` without hiding unrelated errors.
X_train = X_train.drop(columns=columns_useless, errors='ignore')

# 누락값 처리

# Error 및 outlier 처리

# 수치형 data 전처리

## Scaling

In [24]:
# scaler = StandardScaler()

In [25]:
# scaler.fit(X_train[columns_sc])
# X_train[columns_sc] = scaler.transform(X_train[columns_sc])

In [26]:
# X_train

# 범주형 data 전처리

## binary data 처리

In [27]:
# Inspect the binary categorical columns loaded from 'columns_binary_cat.pkl'.
columns_binary_cat

[]

## One-hot-encoding

In [28]:
# One-hot encode every column listed in columns_en; other columns pass through unchanged.
X_train = pd.get_dummies(X_train, columns=columns_en)
# encoder = OneHotEncoder()   # OneHotEncoder should be used when a feature has many unique values
# X_train = encoder.fit_transform(X_train)

In [29]:
# Display the one-hot encoded training features.
X_train

Unnamed: 0,year,is_weekend,store_1,store_2,store_3,store_4,store_5,store_6,store_7,store_8,...,day_29,day_30,day_31,day_of_week_0,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6
0,2013,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,2013,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,2013,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,2013,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,2013,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
912995,2017,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
912996,2017,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
912997,2017,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
912998,2017,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0


In [30]:
# (rows, columns) of the encoded training features.
X_train.shape

(913000, 116)

# Data preprocessing 함수
- 함수는 dump 안 됨

In [31]:
def preprocessing(data, X_data_og, columns_useless, columns_sc, columns_binary_cat, columns_en) :
    """Apply the notebook's preprocessing steps to a raw frame.

    Steps: derive calendar features from the 'date' column, drop the useless
    columns, then one-hot encode the categorical columns. Neither ``data`` nor
    any of the list arguments is modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame containing a datetime 'date' column.
    X_data_og : pandas.DataFrame
        Original training features. Currently unused; it is only referenced
        by the disabled scaling step below.
    columns_useless : list of str
        Columns to drop; names missing from ``data`` are skipped.
    columns_sc : list of str
        Columns to scale. Currently unused (scaling is disabled).
    columns_binary_cat : list of str
        Binary categorical columns. Currently unused.
    columns_en : list of str
        Columns to one-hot encode. The date-derived categorical features are
        added automatically when they are not already listed.

    Returns
    -------
    pandas.DataFrame
        The preprocessed, one-hot encoded frame.

    Notes
    -----
    pd.get_dummies only creates indicator columns for the values present in
    ``data``. NOTE(review): a frame covering fewer months/days than the
    training data will yield fewer columns - align it with the training
    columns (e.g. ``reindex(columns=..., fill_value=0)``) before predicting.
    """
    # datetime conversion
    # A copy is passed because date_features may modify its argument in place;
    # this keeps the caller's frame intact and the function re-runnable.
    data = date_features(data.copy(), 'date')

    # Build the encoding list locally instead of extending the caller's list
    # (and the global columns_num) on every call. dict.fromkeys removes
    # duplicates while keeping order, so passing a list that already contains
    # the date features is fine. 'is_weekend' is left out on purpose: it is a
    # binary numeric column and is not encoded for the training data either,
    # so encoding it here would make train and test columns disagree.
    date_columns_en = ['quarter', 'month', 'day', 'day_of_week']
    encode_columns = list(dict.fromkeys([*columns_en, *date_columns_en]))

    # drop useless columns (labels missing from the frame are ignored)
    data = data.drop(columns=columns_useless, errors='ignore')

    # scaling (currently disabled)
#     scaler = StandardScaler()
#     scaler.fit(X_data_og[columns_sc])
#     data[columns_sc] = scaler.transform(data[columns_sc])

    # One-hot-encoding
    data = pd.get_dummies(data, columns=encode_columns)
#     encoder = OneHotEncoder()   # OneHotEncoder should be used when a feature has many unique values
#     data = encoder.fit_transform(data)

    return data