In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
# Suppress every warning for the rest of the session (no category or module
# filter is given, so the "ignore" action applies to all of them).
warnings.filterwarnings("ignore")

In [None]:
# Load the train and test splits from the local data/ directory.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [None]:
# Separate the label from the training features. `drop` already returns a new
# frame, so `train` itself is left untouched.
y_train = train['target']
X_train = train.drop(columns=['target'])
# Independent copy: later cells assign into X_test and must not alter `test`.
X_test = test.copy()

#### 결측치 최빈값으로 대체

In [None]:
# The 23 feature columns handled as categorical: bin_0..bin_4, nom_0..nom_9,
# ord_0..ord_5, followed by day and month (order matters for column selection).
cat = (
    [f'bin_{i}' for i in range(5)]
    + [f'nom_{i}' for i in range(10)]
    + [f'ord_{i}' for i in range(6)]
    + ['day', 'month']
)

In [None]:
from sklearn.impute import SimpleImputer 

# Fill missing categorical values with each column's most frequent value.
# The fill values are learned from the training frame only and then reused for
# the test frame, so both splits are imputed consistently and no statistic is
# estimated from the test data.
imp = SimpleImputer(strategy="most_frequent")
X_train[cat] = imp.fit_transform(X_train[cat])
X_test[cat] = imp.transform(X_test[cat])

In [None]:
# Print a summary of the training features (dtypes and non-null counts per column).
X_train.info()

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fixed seed: scikit-learn permutes candidate features at each split, so an
# unseeded tree can yield different submissions from identical inputs.
tree = DecisionTreeClassifier(random_state=42)


def sub(X_train_encode, X_test_encode, encodingname, features=None):
    """Fit the shared decision tree on encoded features and write a submission CSV.

    Parameters
    ----------
    X_train_encode : pandas.DataFrame
        Encoded training features; must contain every column in ``features``.
    X_test_encode : pandas.DataFrame
        Encoded test features with the same columns, row-aligned with ``X_test``.
    encodingname : str
        Label inserted into the output file name ``DT_<encodingname>.csv``.
    features : sequence of column names, optional
        Columns to train and predict on. When omitted, the notebook-level
        ``cat`` is used as it is bound at call time (the original behaviour).

    Side effects
    ------------
    Refits the notebook-level ``tree`` on ``y_train`` and writes a CSV with the
    ``id`` column of ``X_test`` plus a ``target`` column holding the predicted
    probability of the second class in ``tree.classes_``.
    """
    if features is None:
        features = cat
    tree.fit(X_train_encode[features], y_train)
    submission = X_test.id.to_frame()
    submission['target'] = tree.predict_proba(X_test_encode[features])[:, 1]
    submission.to_csv('DT_' + encodingname + '.csv', index=False)

### 1) Label Encoding (Ordinal)

In [None]:
from sklearn.preprocessing import OrdinalEncoder


# Work on copies so the imputed frames stay available to the other encoders.
X_train_oe = X_train.copy()
X_test_oe = X_test.copy()

# Integer-code every category; values not seen while fitting are encoded as -1.
le = OrdinalEncoder(
    dtype=int,
    handle_unknown='use_encoded_value',
    unknown_value=-1,
)

# Learn the category -> code mapping on the training frame, then apply the
# same mapping to both splits.
le.fit(X_train[cat])
X_train_oe[cat] = le.transform(X_train[cat])
X_test_oe[cat] = le.transform(X_test[cat])

# Display the encoded training columns.
X_train_oe[cat]

In [None]:
# Fit the tree on the ordinal-encoded columns and write DT_OrdinalEncoding.csv.
sub(X_train_oe, X_test_oe, 'OrdinalEncoding')

### 2) Target Encoding

In [None]:
from category_encoders import TargetEncoder


# Independent copies that will receive the target-encoded columns.
X_train_te = X_train.copy()
X_test_te = X_test.copy()

te = TargetEncoder(cols=cat, smoothing=50, min_samples_leaf=5)

# Fit on the training features together with the labels; the test split is
# only transformed with the statistics learned from train.
X_train_te[cat] = te.fit_transform(X_train[cat], y_train)
X_test_te[cat] = te.transform(X_test[cat])

# Display the encoded training columns.
X_train_te[cat]

In [None]:
# Fit the tree on the target-encoded columns and write DT_TargetEncoding.csv.
sub(X_train_te, X_test_te, 'TargetEncoding')

### 3) Helmert Encoding

In [None]:
import category_encoders as ce

# Pass the column list by keyword, as the TargetEncoder call does: given
# positionally it binds to the encoder's first parameter (`verbose` in
# category_encoders) instead of `cols`.
HE = ce.HelmertEncoder(cols=cat)

# fit_transform (not fit) so X_train_he holds the encoded frame; `fit` alone
# returns the fitted encoder object, which cannot be used as training data.
X_train_he = HE.fit_transform(X_train[cat], y_train)
X_test_he = HE.transform(X_test[cat])

# Display the encoded training frame.
X_train_he

In [None]:
# `sub` selects columns through the notebook-level `cat`, but the Helmert frame
# carries the encoder's own column names. Point `cat` at those columns for this
# fit only, then restore the raw names that the Binary Encoding cell still uses.
raw_cat = cat
cat = X_train_he.columns
try:
    # Label fixed from 'TargetEncoding', which overwrote DT_TargetEncoding.csv.
    sub(X_train_he, X_test_he, 'HelmertEncoding')
finally:
    cat = raw_cat

### 4) Binary Encoding

In [None]:
# Pass the column list by keyword, as the TargetEncoder call does: given
# positionally it binds to the encoder's first parameter (`verbose` in
# category_encoders) instead of `cols`.
BE = ce.binary.BinaryEncoder(cols=cat)

# The encoder is fitted on the training split only and reused for the test
# split. The former `.drop(['id'])` copies were overwritten immediately, so
# they have been removed.
X_train_be = BE.fit_transform(X_train[cat], y_train)
X_test_be = BE.transform(X_test[cat])

# Display the encoded training frame.
X_train_be

In [None]:
# `sub` selects columns through the notebook-level `cat`, so rebind it to the
# binary-encoded frame's columns before fitting.
# NOTE(review): this overwrites the raw column list; cells above that index
# X_train/X_test with `cat` need the original definition re-run first.
cat = X_train_be.columns
sub(X_train_be, X_test_be, 'BinaryEncoding')

### 5) Frequency Encoding

In [None]:
def FE(colname):
    """Frequency-encode ``colname`` in ``X_train_fe`` and ``X_test_fe`` in place.

    Each value is replaced by the share of ``X_train`` rows that carry it.
    Values that do not occur in ``X_train`` have no entry in the lookup and
    therefore become NaN.
    """
    level_counts = X_train.groupby(colname).size()
    share = level_counts / len(X_train)
    # Same lookup for both splits: frequencies always come from the training frame.
    for frame in (X_train_fe, X_test_fe):
        frame.loc[:, colname] = frame[colname].map(share)

In [None]:
# Frames that FE() fills in place; `drop` returns new objects, so the imputed
# X_train / X_test stay unchanged. The `id` column is not a feature.
X_train_fe = X_train.drop(columns=['id'])
X_test_fe = X_test.drop(columns=['id'])

In [None]:
# Frequency-encode every remaining column of both FE frames, one column at a time.
for x in (X_train_fe.columns):
    FE(x)

In [None]:
# Inspect how often each encoded value occurs in the test nom_6 column
# (value_counts leaves NaN out by default).
X_test_fe.nom_6.value_counts()

In [None]:
# nom_6 values absent from the training frame have no frequency and are NaN
# after FE(); fill them with the most frequent encoded value of the column.
# (The original line held only a placeholder comment inside the call, which
# commented out the closing parenthesis and made the cell a SyntaxError.)
X_test_fe.nom_6 = X_test_fe.nom_6.fillna(X_test_fe.nom_6.mode().iloc[0])

In [None]:
# `sub` selects columns through the notebook-level `cat`; rebind it to the
# frequency-encoded frame's columns, then fit and write DT_FrequencyEncoding.csv.
cat = X_train_fe.columns
sub(X_train_fe, X_test_fe, 'FrequencyEncoding')

### 6) Weight of Evidence Encoding

In [None]:
# Frames that Woe() fills in place; `drop` returns new objects, so the imputed
# X_train / X_test stay unchanged. The `id` column is not a feature.
X_train_woe = X_train.drop(columns=['id'])
X_test_woe = X_test.drop(columns=['id'])

def Woe(colname):
    """Encode ``colname`` in ``X_train_woe`` and ``X_test_woe`` in place.

    For every category the mean of ``target`` in the raw ``train`` frame is
    taken as the "good" rate and its complement as the "bad" rate; the
    category is replaced by ``log(good / bad)``. A "bad" rate of exactly 0 is
    substituted with 0.000001 to avoid dividing by zero. A "good" rate of 0 is
    not adjusted and yields ``-inf``. Categories that do not occur in
    ``train`` become NaN.
    """
    stats = train.groupby(colname)['target'].mean().to_frame(name='good')
    complement = 1 - stats['good']
    # Guard the denominator only; the numerator may still be 0.
    stats['bad'] = complement.mask(complement == 0, 0.000001)
    stats['WOE'] = np.log(stats['good'] / stats['bad'])

    # Same lookup for both splits: the statistics always come from `train`.
    for frame in (X_train_woe, X_test_woe):
        frame.loc[:, colname] = frame[colname].map(stats['WOE'])

In [None]:
# Apply Woe() to every remaining column of both WOE frames, one column at a time.
for x in (X_train_woe.columns):
    Woe(x)

In [None]:
# Fill the test nom_6 values that are still NaN after the mapping with a fixed constant.
# NOTE(review): -1.540948 is hard-coded and its derivation is not shown in this
# notebook — confirm where it comes from and whether other columns need the same.
X_test_woe.nom_6 = X_test_woe.nom_6.fillna(-1.540948)

In [None]:
# Replace every -inf (produced by np.log in Woe() when a category's target
# mean is 0) with the finite value -10 in both frames.
X_train_woe = X_train_woe.replace([-np.inf], -10)
X_test_woe = X_test_woe.replace([-np.inf], -10)

In [None]:
# `sub` selects columns through the notebook-level `cat`; rebind it to the WOE
# frame's columns, then fit and write DT_WeightofEvidenceEncoding.csv.
cat = X_train_woe.columns
sub(X_train_woe, X_test_woe, 'WeightofEvidenceEncoding')

### 7) James-Stein Encoding

In [None]:
from category_encoders.james_stein import JamesSteinEncoder

# Feature frames without the row identifier; `cat` is rebound to exactly
# these columns.
X_train_jse = X_train.drop(columns=['id'])
X_test_jse = X_test.drop(columns=['id'])
cat = X_train_jse.columns

# Encoder with default settings: fitted on the training split together with
# the labels, then reused unchanged for the test split.
JSE_encoder = JamesSteinEncoder()
X_train_jse = JSE_encoder.fit_transform(X_train_jse[cat], y_train)
X_test_jse = JSE_encoder.transform(X_test_jse[cat])

In [None]:
# Refresh `cat` from the encoded frame's columns (read by `sub`), then fit the
# tree and write DT_JamesSteinEncoding.csv.
cat = X_train_jse.columns
sub(X_train_jse, X_test_jse, 'JamesSteinEncoding')

### 8) Leave-One-Out Encoding

In [None]:
from category_encoders.leave_one_out import LeaveOneOutEncoder

# Feature frames without the row identifier; `cat` is rebound to exactly
# these columns.
X_train_looe = X_train.drop(columns=['id'])
X_test_looe = X_test.drop(columns=['id'])
cat = X_train_looe.columns

# Encoder with default settings. The training split is encoded through
# fit_transform with its labels; the test split is transformed without labels.
LOOE_encoder = LeaveOneOutEncoder()
X_train_looe = LOOE_encoder.fit_transform(X_train_looe[cat], y_train)
X_test_looe = LOOE_encoder.transform(X_test_looe[cat])

In [None]:
# Refresh `cat` from the encoded frame's columns (read by `sub`), then fit the
# tree and write DT_LeaveOneOutEncoding.csv.
cat = X_train_looe.columns
sub(X_train_looe, X_test_looe, 'LeaveOneOutEncoding')

### 9) Catboost Encoding

In [None]:
from category_encoders.cat_boost import CatBoostEncoder

# Feature frames without the row identifier; `cat` is rebound to exactly
# these columns.
X_train_cbe = X_train.drop(columns=['id'])
X_test_cbe = X_test.drop(columns=['id'])
cat = X_train_cbe.columns

# Encoder with default settings. The training split is encoded through
# fit_transform with its labels; the test split is transformed without labels.
CBE_encoder = CatBoostEncoder()
X_train_cbe = CBE_encoder.fit_transform(X_train_cbe[cat], y_train)
X_test_cbe = CBE_encoder.transform(X_test_cbe[cat])

In [None]:
# Refresh `cat` from the encoded frame's columns (read by `sub`), then fit the
# tree and write DT_CatboostEncoding.csv.
cat = X_train_cbe.columns
sub(X_train_cbe, X_test_cbe, 'CatboostEncoding')

* * *