In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
# Silence every Python warning for the rest of the session (applies globally).
warnings.filterwarnings("ignore")



In [2]:
# Load the raw train / test splits from the local data directory.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [3]:
# Separate the label from the features; `drop` returns a new frame, so the
# raw `train` / `test` frames are left untouched.
y_train = train['target']
X_train = train.drop(columns=['target'])
X_test = test.copy()

### 결측치 최빈값 대체

In [4]:
# Columns passed to the most-frequent imputer below, grouped by family.
cat = [
    'id',
    'bin_0', 'bin_1', 'bin_2', 'bin_3', 'bin_4',
    'nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4',
    'nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9',
    'ord_0', 'ord_1', 'ord_2', 'ord_3', 'ord_4', 'ord_5',
    'day', 'month',
]

In [5]:
from sklearn.impute import SimpleImputer

# Fill missing values with each column's most frequent value.
# The imputer is fitted on the training split only and then reused for the
# test split, so both splits are filled with the same (training) modes and no
# test-set statistics enter the preprocessing.
imp = SimpleImputer(strategy="most_frequent")
X_train[cat] = imp.fit_transform(X_train[cat])
X_test[cat] = imp.transform(X_test[cat])

* * *

## bin_0, bin_1, bin_2, bin_3, bin_4
**2-cardinality, binary**

: one-hot을 사용하는 것이 좋은 유형의 변수이므로, *binary encoding*

In [6]:
# Binary columns handled in this section.
cat = [
    'bin_0',
    'bin_1',
    'bin_2',
    'bin_3',
    'bin_4',
]

In [7]:
from sklearn.preprocessing import OrdinalEncoder

# Integer-code the columns in `cat`. Categories are learned from the training
# split only; a value not seen during fit is encoded as -1.
le = OrdinalEncoder(
    dtype=int,
    handle_unknown='use_encoded_value',
    unknown_value=-1,
)
le.fit(X_train[cat])

# Encode into copies so X_train / X_test themselves are not modified here.
X_train_oe = X_train.copy()
X_test_oe = X_test.copy()
X_train_oe[cat] = le.transform(X_train[cat])
X_test_oe[cat] = le.transform(X_test[cat])

X_train_oe[cat]

Unnamed: 0,bin_0,bin_1,bin_2,bin_3,bin_4
0,0,0,0,0,0
1,1,1,0,0,1
2,0,1,0,0,0
3,0,0,0,0,0
4,0,0,0,1,0
...,...,...,...,...,...
599995,0,1,0,1,0
599996,1,0,0,1,1
599997,0,0,0,0,1
599998,1,1,0,0,1


In [8]:
# Copy the encoded `cat` columns back into the working frames, then inspect dtypes.
for frame, encoded in ((X_train, X_train_oe), (X_test, X_test_oe)):
    frame[cat] = encoded[cat]

X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600000 entries, 0 to 599999
Data columns (total 24 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   id      600000 non-null  object
 1   bin_0   600000 non-null  int64 
 2   bin_1   600000 non-null  int64 
 3   bin_2   600000 non-null  int64 
 4   bin_3   600000 non-null  int64 
 5   bin_4   600000 non-null  int64 
 6   nom_0   600000 non-null  object
 7   nom_1   600000 non-null  object
 8   nom_2   600000 non-null  object
 9   nom_3   600000 non-null  object
 10  nom_4   600000 non-null  object
 11  nom_5   600000 non-null  object
 12  nom_6   600000 non-null  object
 13  nom_7   600000 non-null  object
 14  nom_8   600000 non-null  object
 15  nom_9   600000 non-null  object
 16  ord_0   600000 non-null  object
 17  ord_1   600000 non-null  object
 18  ord_2   600000 non-null  object
 19  ord_3   600000 non-null  object
 20  ord_4   600000 non-null  object
 21  ord_5   600000 non-null  object
 

## nom_0, nom_1, nom_2, nom_3, nom_4
**low cardinality, nominal**

: cardinality가 낮고, DT는 label encoding을 해도 부여받은 값에 영향을 받지 않으므로, *label(ordinal) encoding*

In [9]:
# Low-cardinality nominal columns handled in this section.
cat = [
    'nom_0',
    'nom_1',
    'nom_2',
    'nom_3',
    'nom_4',
]

In [10]:
from sklearn.preprocessing import OrdinalEncoder

# Fresh encoder for the columns in `cat`: integer codes learned from the
# training split, with -1 reserved for values that were not seen during fit.
encoder_options = dict(
    handle_unknown='use_encoded_value',
    unknown_value=-1,
    dtype=int,
)
le = OrdinalEncoder(**encoder_options)
le.fit(X_train[cat])

# Encoded results go into copies; the working frames are not modified here.
X_train_oe = X_train.copy()
X_train_oe[cat] = le.transform(X_train[cat])

X_test_oe = X_test.copy()
X_test_oe[cat] = le.transform(X_test[cat])

X_train_oe[cat]

Unnamed: 0,nom_0,nom_1,nom_2,nom_3,nom_4
0,2,4,3,5,0
1,2,3,0,4,3
2,2,5,3,0,0
3,2,0,3,3,3
4,2,5,3,2,3
...,...,...,...,...,...
599995,2,1,0,4,3
599996,0,1,2,2,1
599997,2,0,0,5,3
599998,2,1,0,4,2


In [11]:
# Copy the encoded `cat` columns back into the working frames, then inspect dtypes.
for frame, encoded in ((X_train, X_train_oe), (X_test, X_test_oe)):
    frame[cat] = encoded[cat]

X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600000 entries, 0 to 599999
Data columns (total 24 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   id      600000 non-null  object
 1   bin_0   600000 non-null  int64 
 2   bin_1   600000 non-null  int64 
 3   bin_2   600000 non-null  int64 
 4   bin_3   600000 non-null  int64 
 5   bin_4   600000 non-null  int64 
 6   nom_0   600000 non-null  int64 
 7   nom_1   600000 non-null  int64 
 8   nom_2   600000 non-null  int64 
 9   nom_3   600000 non-null  int64 
 10  nom_4   600000 non-null  int64 
 11  nom_5   600000 non-null  object
 12  nom_6   600000 non-null  object
 13  nom_7   600000 non-null  object
 14  nom_8   600000 non-null  object
 15  nom_9   600000 non-null  object
 16  ord_0   600000 non-null  object
 17  ord_1   600000 non-null  object
 18  ord_2   600000 non-null  object
 19  ord_3   600000 non-null  object
 20  ord_4   600000 non-null  object
 21  ord_5   600000 non-null  object
 

## nom_5, nom_6, nom_7, nom_8, nom_9
**high cardinality, nominal**

: cardinality가 높으므로 one-hot과 같은 encoding 기법은 비효율적. *target encoding*

In [12]:
# High-cardinality nominal columns handled in this section.
cat = [
    'nom_5',
    'nom_6',
    'nom_7',
    'nom_8',
    'nom_9',
]

In [13]:
from category_encoders import TargetEncoder

# Target-encode the columns in `cat`: fit on the training columns together
# with y_train, then apply the same fitted encoder to the test columns.
# NOTE(review): the training rows are encoded by an encoder fitted on those
# same rows and their targets - confirm this is intended rather than an
# out-of-fold scheme.
te = TargetEncoder(cols=cat, smoothing=30, min_samples_leaf=5)
train_encoded = te.fit_transform(X_train[cat], y_train)
test_encoded = te.transform(X_test[cat])

# Store the encoded columns in copies; the working frames are not modified here.
X_train_te = X_train.copy()
X_test_te = X_test.copy()
X_train_te[cat] = train_encoded
X_test_te[cat] = test_encoded

X_train_te[cat]

Unnamed: 0,nom_5,nom_6,nom_7,nom_8,nom_9
0,0.209756,0.161609,0.103301,0.128509,0.164279
1,0.193767,0.200913,0.129012,0.197291,0.152830
2,0.181311,0.163690,0.262747,0.172607,0.188486
3,0.207358,0.203390,0.144197,0.224036,0.267793
4,0.152913,0.162062,0.195469,0.184636,0.250000
...,...,...,...,...,...
599995,0.145946,0.147591,0.214965,0.224930,0.155801
599996,0.182991,0.191126,0.196864,0.192748,0.156328
599997,0.189258,0.176087,0.219978,0.111148,0.203046
599998,0.206133,0.153386,0.261794,0.288534,0.179810


In [14]:
# Copy the target-encoded `cat` columns back into the working frames, then inspect dtypes.
for frame, encoded in ((X_train, X_train_te), (X_test, X_test_te)):
    frame[cat] = encoded[cat]

X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600000 entries, 0 to 599999
Data columns (total 24 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   id      600000 non-null  object 
 1   bin_0   600000 non-null  int64  
 2   bin_1   600000 non-null  int64  
 3   bin_2   600000 non-null  int64  
 4   bin_3   600000 non-null  int64  
 5   bin_4   600000 non-null  int64  
 6   nom_0   600000 non-null  int64  
 7   nom_1   600000 non-null  int64  
 8   nom_2   600000 non-null  int64  
 9   nom_3   600000 non-null  int64  
 10  nom_4   600000 non-null  int64  
 11  nom_5   600000 non-null  float64
 12  nom_6   600000 non-null  float64
 13  nom_7   600000 non-null  float64
 14  nom_8   600000 non-null  float64
 15  nom_9   600000 non-null  float64
 16  ord_0   600000 non-null  object 
 17  ord_1   600000 non-null  object 
 18  ord_2   600000 non-null  object 
 19  ord_3   600000 non-null  object 
 20  ord_4   600000 non-null  object 
 21  ord_5   60

## ord_0, ord_1, ord_2, ord_3
**low cardinality, ordinal**

: 순서가 존재하나 cardinality가 작기 때문에, binary 혹은 WoE, Target, james-stein을 사용하는 것이 맞는 듯. *james-stein*

In [15]:
# Low-cardinality ordinal columns handled in this section.
cat = [
    'ord_0',
    'ord_1',
    'ord_2',
    'ord_3',
]

In [16]:
from category_encoders.james_stein import JamesSteinEncoder

# Encoded results go into copies; the working frames are not modified here.
X_train_jse = X_train.copy()
X_test_jse = X_test.copy()

# Fit on the training columns together with y_train, then apply the same
# fitted encoder to the test columns.
JSE = JamesSteinEncoder()
train_ord_encoded = JSE.fit_transform(X_train_jse[cat], y_train)
test_ord_encoded = JSE.transform(X_test_jse[cat])

X_train_jse[cat] = train_ord_encoded
X_test_jse[cat] = test_ord_encoded

In [17]:
# Show the James-Stein encoded columns.
X_train_jse.loc[:, cat]

Unnamed: 0,ord_0,ord_1,ord_2,ord_3
0,0.239775,0.176421,0.199598,0.138666
1,0.239775,0.220396,0.178854,0.154787
2,0.239775,0.165748,0.151238,0.229472
3,0.145403,0.165748,0.255008,0.120484
4,0.239775,0.220396,0.161842,0.184254
...,...,...,...,...
599995,0.239775,0.165748,0.151238,0.120484
599996,0.186523,0.165748,0.225328,0.229472
599997,0.186523,0.176421,0.151238,0.229472
599998,0.145403,0.205257,0.178854,0.228037


In [18]:
# Copy the James-Stein encoded `cat` columns back into the working frames, then inspect dtypes.
for frame, encoded in ((X_train, X_train_jse), (X_test, X_test_jse)):
    frame[cat] = encoded[cat]

X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600000 entries, 0 to 599999
Data columns (total 24 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   id      600000 non-null  object 
 1   bin_0   600000 non-null  int64  
 2   bin_1   600000 non-null  int64  
 3   bin_2   600000 non-null  int64  
 4   bin_3   600000 non-null  int64  
 5   bin_4   600000 non-null  int64  
 6   nom_0   600000 non-null  int64  
 7   nom_1   600000 non-null  int64  
 8   nom_2   600000 non-null  int64  
 9   nom_3   600000 non-null  int64  
 10  nom_4   600000 non-null  int64  
 11  nom_5   600000 non-null  float64
 12  nom_6   600000 non-null  float64
 13  nom_7   600000 non-null  float64
 14  nom_8   600000 non-null  float64
 15  nom_9   600000 non-null  float64
 16  ord_0   600000 non-null  float64
 17  ord_1   600000 non-null  float64
 18  ord_2   600000 non-null  float64
 19  ord_3   600000 non-null  float64
 20  ord_4   600000 non-null  object 
 21  ord_5   60

## ord_4, ord_5
**high cardinality, ordinal?**

: cardinality가 높으므로 binary는 제외. 명확하게 어떤 형식으로 순서를 가지고 있는지 알 수 없으므로 hashing 사용.

In [27]:
# High-cardinality ordinal columns handled in this section.
cat = [
    'ord_4',
    'ord_5',
]

In [29]:
from category_encoders.hashing import HashingEncoder

# Hash-encode the columns in `cat` with an MD5-based encoder.
# NOTE(review): the original author's note said n_components=8 "can represent
# up to 255" values - confirm against the HashingEncoder documentation how
# n_components maps to the output columns.
HA = HashingEncoder(hash_method='md5', n_components=8)

# NOTE(review): unlike the *_oe / *_te / *_jse frames in the earlier sections,
# X_train_ha / X_test_ha hold only the encoder output for the `cat` columns,
# not a full copy of the feature frame.
X_train_ha = HA.fit_transform(X_train[cat], y_train)
X_test_ha = HA.transform(X_test[cat])

KeyboardInterrupt: 