In [1]:
import numpy as np
from scipy import stats
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()

import os

import warnings
warnings.filterwarnings('ignore')

In [2]:
import joblib
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Data load

In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Column 설정

In [4]:
target = 'target'

In [5]:
# Columns removed before any preprocessing: the row identifier plus a fixed
# set of nominal/ordinal features.
columns_useless = [
    'id',
    'nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9',
    'ord_5',
]

In [6]:
# Load the column-name groups from joblib pickle files in the working directory.
# presumably these files were written by an earlier notebook — verify.
# NOTE(review): joblib.load unpickles arbitrary objects; only load files from a
# trusted source.
columns_num_all = joblib.load('columns_num.pkl')
columns_cat_all = joblib.load('columns_cat.pkl')
columns_binary_num = joblib.load('columns_binary_num.pkl')
columns_binary_cat = joblib.load('columns_binary_cat.pkl')
columns_dt = joblib.load('columns_dt.pkl')

In [7]:
# The target must not be treated as a feature: take it out of the numeric
# column list when it is there. list.remove raises ValueError when the value is
# absent, and that is the only condition deliberately ignored — any other
# failure (e.g. the list was never loaded) should surface instead of being
# silently swallowed.
try :
    columns_num_all.remove(target)

except ValueError :
    pass

In [8]:
# The target must not be treated as a feature: take it out of the categorical
# column list when it is there. list.remove raises ValueError when the value is
# absent, and that is the only condition deliberately ignored — any other
# failure (e.g. the list was never loaded) should surface instead of being
# silently swallowed.
try :
    columns_cat_all.remove(target)

except ValueError :
    pass

In [9]:
# Numeric columns that survive the removal of the useless columns,
# in their original order.
columns_num = [name for name in columns_num_all if name not in columns_useless]

In [10]:
# Categorical columns that survive the removal of the useless columns,
# in their original order.
columns_cat = [name for name in columns_cat_all if name not in columns_useless]

In [11]:
# Columns to scale: every kept numeric column that is not a numeric binary flag.
columns_sc = [name for name in columns_num if name not in columns_binary_num]

columns_sc

[]

In [12]:
# Columns to one-hot encode: every kept categorical column that is in neither
# of the two binary column groups.
columns_en = [
    name
    for name in columns_cat
    if not (name in columns_binary_num or name in columns_binary_cat)
]

columns_en

['nom_0',
 'nom_1',
 'nom_2',
 'nom_3',
 'nom_4',
 'ord_1',
 'ord_2',
 'ord_3',
 'ord_4',
 'ord_0',
 'day',
 'month']

## datetime 설정

# Data 분리

## target 분리

In [13]:
# y = data[target]
# X = data.drop(target, axis=1)

In [14]:
# Split the training frame into features and label. X_train_og keeps an
# untouched copy of the features because X_train is transformed further below.
X_train = train.drop(columns=target)
X_train_og = X_train.copy()
y_train = train[target]

In [15]:
X_test = test.copy()

## Train & test set 분리

In [16]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=30)
# X_train_og = X_train.copy()

# useless column 제거

In [17]:
X_train.drop(columns_useless, axis=1, inplace=True)

# 누락값 처리

# Error 및 outlier 처리

# 수치형 data 전처리

## Scaling

In [18]:
scaler = StandardScaler()

In [19]:
# scaler.fit(X_train[columns_sc])
# X_train[columns_sc] = scaler.transform(X_train[columns_sc])

In [20]:
X_train

Unnamed: 0,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,ord_0,ord_1,ord_2,ord_3,ord_4,day,month
0,0,0,0,T,Y,Green,Triangle,Snake,Finland,Bassoon,2,Grandmaster,Cold,h,D,2,2
1,0,1,0,T,Y,Green,Trapezoid,Hamster,Russia,Piano,1,Grandmaster,Hot,a,A,7,8
2,0,0,0,F,Y,Blue,Trapezoid,Lion,Russia,Theremin,1,Expert,Lava Hot,h,R,7,2
3,0,1,0,F,Y,Red,Trapezoid,Snake,Canada,Oboe,1,Grandmaster,Boiling Hot,i,D,2,1
4,0,0,0,F,N,Red,Trapezoid,Lion,Canada,Oboe,1,Grandmaster,Freezing,a,R,7,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,0,0,0,T,N,Red,Trapezoid,Snake,India,Oboe,1,Contributor,Freezing,k,K,3,8
299996,0,0,0,F,Y,Green,Trapezoid,Lion,Russia,Piano,2,Novice,Freezing,h,W,3,2
299997,0,0,0,F,Y,Blue,Star,Axolotl,Russia,Oboe,3,Novice,Boiling Hot,o,A,7,9
299998,0,1,0,F,Y,Green,Square,Axolotl,Costa Rica,Piano,1,Master,Boiling Hot,h,W,3,8


# 범주형 data 전처리

## binary data 처리

In [21]:
for column in columns_binary_cat :
    print(column, '\n')
    print(X_train[column].value_counts(ascending=False), '\n')
    print('=========================================================', '\n')

bin_3 

T    153535
F    146465
Name: bin_3, dtype: int64 


bin_4 

Y    191633
N    108367
Name: bin_4, dtype: int64 




In [22]:
X_train['bin_3'] = X_train['bin_3'].map({'F':0, 'T':1})
X_train['bin_4'] = X_train['bin_4'].map({'N':0, 'Y':1})

In [23]:
X_train

Unnamed: 0,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,ord_0,ord_1,ord_2,ord_3,ord_4,day,month
0,0,0,0,1,1,Green,Triangle,Snake,Finland,Bassoon,2,Grandmaster,Cold,h,D,2,2
1,0,1,0,1,1,Green,Trapezoid,Hamster,Russia,Piano,1,Grandmaster,Hot,a,A,7,8
2,0,0,0,0,1,Blue,Trapezoid,Lion,Russia,Theremin,1,Expert,Lava Hot,h,R,7,2
3,0,1,0,0,1,Red,Trapezoid,Snake,Canada,Oboe,1,Grandmaster,Boiling Hot,i,D,2,1
4,0,0,0,0,0,Red,Trapezoid,Lion,Canada,Oboe,1,Grandmaster,Freezing,a,R,7,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,0,0,0,1,0,Red,Trapezoid,Snake,India,Oboe,1,Contributor,Freezing,k,K,3,8
299996,0,0,0,0,1,Green,Trapezoid,Lion,Russia,Piano,2,Novice,Freezing,h,W,3,2
299997,0,0,0,0,1,Blue,Star,Axolotl,Russia,Oboe,3,Novice,Boiling Hot,o,A,7,9
299998,0,1,0,0,1,Green,Square,Axolotl,Costa Rica,Piano,1,Master,Boiling Hot,h,W,3,8


In [24]:
X_train[columns_binary_cat] = X_train[columns_binary_cat].astype('int64')

## One-hot-encoding

In [25]:
X_train = pd.get_dummies(X_train, columns=columns_en)

In [26]:
X_train

Unnamed: 0,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0_Blue,nom_0_Green,nom_0_Red,nom_1_Circle,nom_1_Polygon,...,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12
0,0,0,0,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,1,1,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
299996,0,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
299997,0,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
299998,0,1,0,0,1,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [27]:
X_train.dtypes.value_counts()

uint8    99
int64     5
dtype: int64

# Data preprocessing 함수
- 함수는 joblib/pickle로 dump해도 함수 본문이 아니라 이름(참조)만 저장되므로, 같은 함수 정의가 없는 다른 세션에서는 load 안 됨

In [28]:
def preprocessing(data, X_train_og, columns_useless, columns_sc, columns_binary_cat, columns_en) :
    """Build the one-hot encoded feature frame from a raw feature frame.

    Steps: drop ``columns_useless``, map the string flags ``bin_3`` (F/T) and
    ``bin_4`` (N/Y) to 0/1, cast ``columns_binary_cat`` to int64, then one-hot
    encode ``columns_en``.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw features. The frame is not modified; a new frame is returned, so
        the function can be called repeatedly on the same input.
    X_train_og : pandas.DataFrame or None
        Untransformed training features. For every column of ``columns_en``
        that it contains, the category levels are taken from this frame, so
        the result has the same dummy columns as the training set even when
        ``data`` lacks a level or contains an unseen one (an unseen level
        produces all zeros for that feature). Pass None to encode ``data``
        using only its own levels.
    columns_useless : list
        Columns to drop.
    columns_sc : list
        Columns to scale. Unused while scaling is disabled; kept so existing
        calls keep working.
    columns_binary_cat : list
        Binary columns to cast to int64 after the label mapping.
    columns_en : list
        Columns to one-hot encode.

    Returns
    -------
    pandas.DataFrame
        The transformed copy of ``data``.

    Raises
    ------
    ValueError
        If a column of ``columns_binary_cat`` holds a missing value after the
        label mapping (i.e. it contained a missing or unexpected label).
    """
    # useless column removal: drop() returns a new frame, which keeps the
    # caller's object untouched.
    data = data.drop(columns_useless, axis=1)

    # scaling is currently disabled; columns_sc is intentionally unused.

    # binary labels -> 0/1 (labels outside the mapping become NaN)
    data['bin_3'] = data['bin_3'].map({'F':0, 'T':1})
    data['bin_4'] = data['bin_4'].map({'N':0, 'Y':1})

    # The int64 cast cannot represent NaN; report the offending columns by
    # name instead of failing with a generic cast error.
    binary = data[columns_binary_cat]
    with_missing = list(binary.columns[binary.isna().any().to_numpy(dtype=bool)])
    if with_missing :
        raise ValueError(
            'missing or unexpected labels in binary columns: {}'.format(with_missing)
        )
    data[columns_binary_cat] = binary.astype('int64')

    # One-hot-encoding. Pin the levels to those of the training frame so that
    # train and test always end up with identical dummy columns.
    if X_train_og is not None :
        for name in columns_en :
            if name in X_train_og.columns :
                levels = pd.Categorical(X_train_og[name]).categories
                data[name] = pd.Categorical(data[name], categories=levels)

    data = pd.get_dummies(data, columns=columns_en)

    return data

In [29]:
X_test = preprocessing(X_test, X_train_og, columns_useless, columns_sc, columns_binary_cat, columns_en)
X_test

Unnamed: 0,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0_Blue,nom_0_Green,nom_0_Red,nom_1_Circle,nom_1_Polygon,...,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12
0,0,0,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,0,1,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
2,1,0,1,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,1,1,1,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
4,0,1,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,0,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
199996,1,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
199997,0,1,1,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
199998,1,0,0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
# if X_train.shape[1] == X_test.shape[1] :
#     print(X_train.shape)
#     print(X_test.shape)
    
# else :
#     raise ValueError   # shape이 다를 경우 train과 test set을 concat해서 one-hot-encoding 해야함

# Column transformer 사용

In [31]:
# Variant meant to be combined with a ColumnTransformer: scaling and one-hot
# encoding are left to the transformer.
def preprocessing(data, columns_useless, columns_binary_cat) :
    """Drop the useless columns and turn the binary label columns into 0/1.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw features. The frame is not modified; a new frame is returned, so
        the function can be called repeatedly on the same input.
    columns_useless : list
        Columns to drop.
    columns_binary_cat : list
        Binary columns to cast to int64 after the label mapping.

    Returns
    -------
    pandas.DataFrame
        The transformed copy of ``data``.

    Raises
    ------
    ValueError
        If a column of ``columns_binary_cat`` holds a missing value after the
        label mapping (i.e. it contained a missing or unexpected label).
    """
    # useless column removal: drop() returns a new frame, which keeps the
    # caller's object untouched.
    data = data.drop(columns_useless, axis=1)

    # binary labels -> 0/1 (labels outside the mapping become NaN)
    data['bin_3'] = data['bin_3'].map({'F':0, 'T':1})
    data['bin_4'] = data['bin_4'].map({'N':0, 'Y':1})

    # The int64 cast cannot represent NaN; report the offending columns by
    # name instead of failing with a generic cast error.
    binary = data[columns_binary_cat]
    with_missing = list(binary.columns[binary.isna().any().to_numpy(dtype=bool)])
    if with_missing :
        raise ValueError(
            'missing or unexpected labels in binary columns: {}'.format(with_missing)
        )
    data[columns_binary_cat] = binary.astype('int64')

    return data

In [32]:
# X_train = preprocessing(X_train, columns_useless, columns_binary_cat)
# X_test = preprocessing(X_test, columns_useless, columns_binary_cat)

In [33]:
# ct = make_column_transformer(
#     (StandardScaler(), columns_sc),
#     (OneHotEncoder(sparse=False), columns_en)
# )

In [34]:
# ct.fit(X_train)
# X_train = ct.transform(X_train)
# X_test = ct.transform(X_test)

In [35]:
# if X_train.shape[1] == X_test.shape[1] :
#     print(X_train.shape)
#     print(X_test.shape)
    
# else :
#     raise ValueError   # shape이 다를 경우 train과 test set을 concat해서 one-hot-encoding 해야함

# Data preprocessing class
- class 인스턴스는 dump는 되지만, load하는 세션에 같은 class 정의가 없으면 load 안됨

In [36]:
class preprocessing(BaseEstimator, TransformerMixin) :
    """Transformer that drops columns, encodes binary labels and one-hot encodes.

    Parameters
    ----------
    columns_useless : list
        Columns to drop.
    columns_binary_cat : list
        Binary columns to cast to int64 after the label mapping.
    columns_en : list
        Columns to one-hot encode.
    columns_sc : list, optional
        Columns to scale. Stored only; no scaling is performed here. It is an
        explicit parameter (default None) so the instance no longer depends
        on a notebook-level global of the same name.

    Attributes
    ----------
    categories_ : dict
        Set by ``fit``: category levels per encoded column. When present,
        ``transform`` emits exactly these dummy columns, so train and test
        outputs line up. Without a prior ``fit`` the levels of the frame being
        transformed are used.
    """
    def __init__(self, columns_useless, columns_binary_cat, columns_en, columns_sc=None) :
        self.columns_useless = columns_useless
        self.columns_sc = columns_sc
        self.columns_binary_cat = columns_binary_cat
        self.columns_en = columns_en

    def fit(self, data, y=None) :
        """Learn the category levels of the columns to encode.

        ``y`` is ignored; it is accepted so the transformer can also be fitted
        when a target is supplied. Columns missing from ``data`` are skipped
        here and reported by ``transform``.
        """
        self.categories_ = {
            name: pd.Categorical(data[name]).categories
            for name in self.columns_en
            if name in data.columns
        }
        return self

    def transform(self, data) :
        """Return a transformed copy of ``data``; the input is not modified.

        Raises ValueError if a column of ``columns_binary_cat`` holds a missing
        value after the label mapping.
        """
        # useless column removal: drop() returns a new frame, which keeps the
        # caller's object untouched.
        data = data.drop(self.columns_useless, axis=1)

        # binary labels -> 0/1 (labels outside the mapping become NaN)
        data['bin_3'] = data['bin_3'].map({'F':0, 'T':1})
        data['bin_4'] = data['bin_4'].map({'N':0, 'Y':1})

        # The int64 cast cannot represent NaN; report the offending columns by
        # name instead of failing with a generic cast error.
        binary = data[self.columns_binary_cat]
        with_missing = list(binary.columns[binary.isna().any().to_numpy(dtype=bool)])
        if with_missing :
            raise ValueError(
                'missing or unexpected labels in binary columns: {}'.format(with_missing)
            )
        data[self.columns_binary_cat] = binary.astype('int64')

        # One-hot-encoding, pinned to the fitted levels when fit() was called.
        learned = getattr(self, 'categories_', {})
        for name in self.columns_en :
            if name in learned and name in data.columns :
                data[name] = pd.Categorical(data[name], categories=learned[name])

        data = pd.get_dummies(data, columns=self.columns_en)

        return data

In [37]:
# prep = preprocessing(columns_useless, columns_binary_cat, columns_en)

In [38]:
# X_train = prep.transform(X_train)   # fit_transform을 사용해도 상관없음
# X_test = prep.transform(X_test)

In [39]:
# if X_train.shape[1] == X_test.shape[1] :
#     print(X_train.shape)
#     print(X_test.shape)
    
# else :
#     raise ValueError   # shape이 다를 경우 train과 test set을 concat해서 one-hot-encoding 해야함

# Pipeline 사용

In [40]:
class preprocessing(BaseEstimator, TransformerMixin) :
    """Pipeline step that drops columns and turns binary labels into 0/1.

    Parameters
    ----------
    columns_useless : list
        Columns to drop.
    columns_binary_cat : list
        Binary columns to cast to int64 after the label mapping.
    """
    def __init__(self, columns_useless, columns_binary_cat) :
        self.columns_useless = columns_useless
        self.columns_binary_cat = columns_binary_cat

    def fit(self, data, y=None) :
        """Nothing is learned. ``y`` is ignored; it is accepted so the step
        can also be fitted when a target is supplied."""
        return self

    def transform(self, data) :
        """Return a transformed copy of ``data``; the input is not modified.

        Uses the column lists stored on the instance, not notebook globals.
        Raises ValueError if a column of ``columns_binary_cat`` holds a missing
        value after the label mapping.
        """
        # useless column removal: drop() returns a new frame, which keeps the
        # caller's object untouched.
        data = data.drop(self.columns_useless, axis=1)

        # binary labels -> 0/1 (labels outside the mapping become NaN)
        data['bin_3'] = data['bin_3'].map({'F':0, 'T':1})
        data['bin_4'] = data['bin_4'].map({'N':0, 'Y':1})

        # The int64 cast cannot represent NaN; report the offending columns by
        # name instead of failing with a generic cast error.
        binary = data[self.columns_binary_cat]
        with_missing = list(binary.columns[binary.isna().any().to_numpy(dtype=bool)])
        if with_missing :
            raise ValueError(
                'missing or unexpected labels in binary columns: {}'.format(with_missing)
            )
        data[self.columns_binary_cat] = binary.astype('int64')

        return data

In [41]:
# Two-step pipeline: the custom `preprocessing` transformer runs first and its
# output is passed to OneHotEncoder. The scaling step is left disabled.
# NOTE(review): OneHotEncoder is given no column selection here, so it
# presumably encodes every column it receives — confirm that is intended.
pipe = make_pipeline(
    preprocessing(columns_useless, columns_binary_cat),
#     StandardScaler(),
    OneHotEncoder()
)

In [42]:
# X_train = pipe.fit_transform(X_train)
# X_test = pipe.transform(X_test)

In [43]:
# if X_train.shape[1] == X_test.shape[1] :
#     print(X_train.shape)
#     print(X_test.shape)
    
# else :
#     raise ValueError   # shape이 다를 경우 train과 test set을 concat해서 one-hot-encoding 해야함

In [44]:
# joblib.dump(pipe, 'pipeline.pkl')