In [1]:
import numpy as np
from scipy import stats
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()

import os

import warnings
warnings.filterwarnings('ignore')

In [2]:
import joblib
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Data load

In [3]:
# train = pd.read_csv('train.csv')
# test = pd.read_csv('test.csv')

train = pd.read_csv('train.csv', parse_dates=['date'])
test = pd.read_csv('test.csv', parse_dates=['date'])

In [4]:
items = pd.read_csv('items.csv')
items.drop('item_name', axis=1, inplace=True)

item_categories = pd.read_csv('item_categories_pre.csv')

In [5]:
train = pd.merge(train, items)
train = pd.merge(train, item_categories)

test = pd.merge(test, items)
test = pd.merge(test, item_categories)

In [6]:
train['revenue'] = train['item_price'] * train['item_cnt_day']

# Column 설정

In [7]:
target = 'item_cnt_day'

In [8]:
columns_useless = []

In [9]:
columns_num_all = joblib.load('columns_num.pkl')
columns_cat_all = joblib.load('columns_cat.pkl')
columns_binary_num = joblib.load('columns_binary_num.pkl')
columns_binary_cat = joblib.load('columns_binary_cat.pkl')
columns_dt = joblib.load('columns_dt.pkl')

In [10]:
try :
    columns_num_all.remove(target)
    
except :
    pass

In [11]:
try :
    columns_cat_all.remove(target)
    
except :
    pass

In [12]:
columns_num = []

for column in columns_num_all :
    
    if column not in columns_useless :
        columns_num.append(column)

In [13]:
columns_cat = []

for column in columns_cat_all :
    
    if column not in columns_useless :
        columns_cat.append(column)

In [14]:
columns_sc = []

for column in columns_num :
    
    if column not in columns_binary_num:
        columns_sc.append(column)
        
columns_sc

['date_block_num', 'item_price', 'revenue']

In [15]:
columns_sc.remove('date_block_num')

In [16]:
columns_en = []

for column in columns_cat :
    
    if (column not in columns_binary_num) and (column not in columns_binary_cat):
        columns_en.append(column)
        
columns_en

['shop_id', 'item_id', 'item_category_id', 'type_id', 'subtype_id']

## datetime 설정

In [17]:
def data_summary(data):
    summary = pd.DataFrame(data.dtypes, columns=['Dtype'])
    summary = summary.rename(columns={'index': 'Feature'})
    summary['Unique value'] = data.nunique().values
    summary['Null count'] = data.isnull().sum().values
    summary['Null percent'] = data.isnull().sum().values / len(data) * 100
    summary['Null percent'] = round(summary['Null percent'], 1)
    
    return summary

In [18]:
train['year'] = train['date'].apply(lambda x : x.split('.')[2])
train['month'] = train['date'].apply(lambda x : x.split('.')[1])
train['day'] = train['date'].apply(lambda x : x.split('.')[0])

In [19]:
train['year'] = train['year'].astype('str')
train['month'] = train['month'].astype('str')
train['day'] = train['day'].astype('str')

In [20]:
train.drop('date', axis=1, inplace=True)

In [21]:
train

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_category_id,type_id,subtype_id,revenue,year,month,day
0,0,59,22154,999.0,1.0,37,11,1,999.0,2013,01,02
1,0,24,22154,999.0,1.0,37,11,1,999.0,2013,01,23
2,0,27,22154,999.0,1.0,37,11,1,999.0,2013,01,20
3,0,25,22154,999.0,1.0,37,11,1,999.0,2013,01,02
4,0,25,22154,999.0,1.0,37,11,1,999.0,2013,01,03
...,...,...,...,...,...,...,...,...,...,...,...,...
2935844,33,55,13093,250.0,1.0,36,9,15,250.0,2015,10,22
2935845,32,55,13091,1000.0,1.0,36,9,15,1000.0,2015,09,21
2935846,32,55,13094,2500.0,1.0,36,9,15,2500.0,2015,09,16
2935847,32,55,13094,2500.0,2.0,36,9,15,5000.0,2015,09,22


In [22]:
train_summary = data_summary(train)
train_summary

Unnamed: 0,Dtype,Unique value,Null count,Null percent
date_block_num,int64,34,0,0.0
shop_id,int64,60,0,0.0
item_id,int64,21807,0,0.0
item_price,float64,19993,0,0.0
item_cnt_day,float64,198,0,0.0
item_category_id,int64,84,0,0.0
type_id,int64,20,0,0.0
subtype_id,int64,65,0,0.0
revenue,float64,24775,0,0.0
year,object,3,0,0.0


In [23]:
columns_cat.extend(['year', 'month', 'day'])

# Data 분리

## target 분리

In [24]:
# y = data[target]
# X = data.drop(target, axis=1)

In [25]:
y_train = train[target]
X_train = train.drop(target, axis=1)
X_train_og = X_train.copy()

In [26]:
X_test = test.copy()

## Train & test set 분리

In [27]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=30)
# X_train_og = X_train.copy()

# useless column 제거

In [28]:
X_train.drop(columns_useless, axis=1, inplace=True)

# 누락값 처리

# Error 및 outlier 처리

# 수치형 data 전처리

## Scaling

In [29]:
scaler = StandardScaler()

In [30]:
scaler.fit(X_train[columns_sc])
X_train[columns_sc] = scaler.transform(X_train[columns_sc])

In [31]:
X_train

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_category_id,type_id,subtype_id,revenue,year,month,day
0,0,59,22154,0.062520,37,11,1,-0.027928,2013,01,02
1,0,24,22154,0.062520,37,11,1,-0.027928,2013,01,23
2,0,27,22154,0.062520,37,11,1,-0.027928,2013,01,20
3,0,25,22154,0.062520,37,11,1,-0.027928,2013,01,02
4,0,25,22154,0.062520,37,11,1,-0.027928,2013,01,03
...,...,...,...,...,...,...,...,...,...,...,...
2935844,33,55,13093,-0.370478,36,9,15,-0.159711,2015,10,22
2935845,32,55,13091,0.063098,36,9,15,-0.027752,2015,09,21
2935846,32,55,13094,0.930251,36,9,15,0.236165,2015,09,16
2935847,32,55,13094,0.930251,36,9,15,0.676027,2015,09,22


# 범주형 data 전처리

## binary data 처리

## One-hot-encoding

In [32]:
# X_train = pd.get_dummies(X_train, columns=columns_en)
encoder = OneHotEncoder()   # 고윳값을 많이 가진 feature가 있는 경우는 OneHotEncoder를 사용해야 함
X_train = encoder.fit_transform(X_train)

In [33]:
X_train

<2935849x66878 sparse matrix of type '<class 'numpy.float64'>'
	with 32294339 stored elements in Compressed Sparse Row format>

In [34]:
X_train.shape

(2935849, 66878)

# Data preprocessing 함수
- 함수는 dump 안 됨

In [35]:
def preprocessing(data, X_data_og, columns_useless, columns_sc, columns_binary_cat, columns_en) :
    
    # datetime 변환
    data['year'] = data['date'].apply(lambda x : x.split('.')[2])
    data['month'] = data['date'].apply(lambda x : x.split('.')[1])
    data['day'] = data['date'].apply(lambda x : x.split('.')[0])
    
    data['year'] = data['year'].astype('str')
    data['month'] = data['month'].astype('str')
    data['day'] = data['day'].astype('str')
    
    data.drop('date', axis=1, inplace=True)
    
    columns_cat.extend(['year', 'month', 'day'])
    
    # useless column 제거
    data.drop(columns_useless, axis=1, inplace=True)
    
    # scaling
    scaler = StandardScaler()
    scaler.fit(X_data_og[columns_sc])
    data[columns_sc] = scaler.transform(data[columns_sc])
   
    # One-hot-encoding
#     data = pd.get_dummies(data, columns=columns_en)
    encoder = OneHotEncoder()   # 고윳값을 많이 가진 feature가 있는 경우는 OneHotEncoder를 사용해야 함
    data = encoder.fit_transform(data)
    
    return data