In [1]:
import numpy as np
from scipy import stats
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# sns.set()

import os

import warnings
warnings.filterwarnings('ignore')

In [2]:
import joblib
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_selection import SelectPercentile, f_classif, f_regression, SelectFromModel, RFE
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [3]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
# from lightgbm import LGBMClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier

In [4]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Flatten, Dense, Input, concatenate, Embedding, Dot
from tensorflow.keras.layers import Activation, ReLU, LeakyReLU, PReLU
from tensorflow.keras.layers import BatchNormalization, Dropout, AlphaDropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.utils import plot_model

from functools import partial

# Data load

In [5]:
# Read the train and test tables from CSV files in the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Column 설정

In [6]:
# Name of the label column; it is split off from `train` further down.
target = 'target'

In [7]:
# Columns that are dropped from the feature frame during preprocessing.
columns_useless = ['id']
# columns_useless = columns_useless + ['nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9', 'ord_5']

In [8]:
# Column-name collections stored as joblib pickles (presumably written by an
# earlier analysis step — they are not created anywhere in this notebook).
# NOTE(review): joblib.load unpickles arbitrary objects; only load files from a trusted source.
columns_num_all = joblib.load('columns_num.pkl')
columns_cat_all = joblib.load('columns_cat.pkl')
columns_binary_num = joblib.load('columns_binary_num.pkl')
columns_binary_cat = joblib.load('columns_binary_cat.pkl')

In [9]:
# Make sure the label is never treated as a numeric feature.
# Filtering replaces `list.remove` wrapped in a bare `except: pass`, which
# silently swallowed every error (for example a container without `.remove`)
# and could leave the target inside the feature list.
columns_num_all = [column for column in columns_num_all if column != target]

In [10]:
# Make sure the label is never treated as a categorical feature.
# Filtering replaces `list.remove` wrapped in a bare `except: pass`, which
# silently swallowed every error (for example a container without `.remove`)
# and could leave the target inside the feature list.
columns_cat_all = [column for column in columns_cat_all if column != target]

In [11]:
# Numeric feature columns: every numeric column that is not marked as useless.
columns_num = [name for name in columns_num_all if name not in columns_useless]

In [12]:
# Categorical feature columns: every categorical column that is not marked as useless.
columns_cat = [name for name in columns_cat_all if name not in columns_useless]

In [13]:
# Numeric columns that are not binary flags (the candidates for scaling).
columns_sc = [name for name in columns_num if name not in columns_binary_num]

columns_sc

[]

In [14]:
# Categorical columns that are not binary flags (the candidates for encoding).
columns_en = [
    name
    for name in columns_cat
    if not (name in columns_binary_num or name in columns_binary_cat)
]

columns_en

['nom_0',
 'nom_1',
 'nom_2',
 'nom_3',
 'nom_4',
 'nom_5',
 'nom_6',
 'nom_7',
 'nom_8',
 'nom_9',
 'ord_1',
 'ord_2',
 'ord_3',
 'ord_4',
 'ord_5',
 'ord_0',
 'day',
 'month']

# Target 분리

In [15]:
# y = data[target]
# X = data.drop(target, axis=1)

In [16]:
# Split the training table into the feature frame and the label series.
X_train = train.drop(columns=target)
y_train = train[target]

# Untouched copies, kept so later sections can start again from the originals.
X_train_og = X_train.copy()
y_train_og = y_train.copy()

In [17]:
# Independent copy of the test table, used as the test feature frame.
X_test = test.copy()

# 전체 dataset concat

In [18]:
# Stack train and test rows so both receive the same encoding.
# ignore_index=True gives every row a unique label. Without it the train and
# test labels overlap, and label-based `.loc` assignments on the combined frame
# touch rows from both splits. Row order is unchanged (train first, then test).
all_data = pd.concat([X_train, X_test], ignore_index=True)

# Data preprocessing

In [19]:
def preprocessing(data, X_train_og, columns_useless, columns_sc, columns_binary_cat, columns_en) :
    """Drop unused columns, encode the two binary flag columns and one-hot encode the frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature frame to transform. The caller's object is left unmodified.
    X_train_og, columns_sc, columns_en
        Currently unused; kept so existing calls keep working. They belong to
        the scaling / ``get_dummies`` variants that are disabled below.
    columns_useless : list of str
        Columns removed before encoding.
    columns_binary_cat : sequence of str
        The first entry names a column holding 'T'/'F', the second a column
        holding 'Y'/'N'. Both are converted to integer 1/0.

    Returns
    -------
    The result of ``OneHotEncoder().fit_transform`` on the prepared frame.
    """
    # Remove useless columns. drop() returns a new frame, so the input is not mutated.
    data = data.drop(columns=columns_useless)

    # scaling (disabled)
    #     scaler = StandardScaler()
    #     scaler.fit(X_train_og[columns_sc])
    #     data[columns_sc] = scaler.transform(data[columns_sc])

    # Binary columns: (column name, label mapped to 1, label mapped to 0).
    binary_labels = (
        (columns_binary_cat[0], 'T', 'F'),
        (columns_binary_cat[1], 'Y', 'N'),
    )
    for column, label_1, label_0 in binary_labels:
        # Boolean row masks instead of index labels: with a non-unique index
        # (e.g. train and test concatenated without resetting it) label-based
        # assignment would overwrite unrelated rows sharing the same label.
        # Both masks are computed before any value is replaced.
        is_1 = data[column] == label_1
        is_0 = data[column] == label_0
        data.loc[is_1, column] = 1
        data.loc[is_0, column] = 0

    data[columns_binary_cat] = data[columns_binary_cat].astype('int64')

    # One-hot encoding
    #     data = pd.get_dummies(data, columns=columns_en)
    # OneHotEncoder has to be used when a feature has many unique values.
    encoder = OneHotEncoder()
    data = encoder.fit_transform(data)

    return data

In [20]:
# Encode train and test together. `all_data` is rebound from the DataFrame to
# the encoder output, so re-running this cell requires recreating `all_data` first.
all_data = preprocessing(all_data, X_train_og, columns_useless, columns_sc, columns_binary_cat, columns_en)

# Train & test set 분리

In [21]:
# Split the encoded rows back apart: the first len(train) rows came from the
# training frame because it was first in the concat.
X_train = all_data[:len(train)]
X_test = all_data[len(train):]

In [22]:
# Snapshots of the encoded matrices taken before any feature selection;
# later sections restore X_train / X_test from them.
X_train_before = X_train.copy()
X_test_before = X_test.copy()

# Select Percentile

In [23]:
# Univariate feature selection with f_classif at percentile=30.
# NOTE(review): the selector is fit on every training row, before the validation
# split below, so the held-out rows influence which features are kept.
select = SelectPercentile(score_func=f_classif, percentile=30)
select.fit(X_train, y_train)

SelectPercentile(percentile=30)

In [24]:
# Reduce the training matrix to the selected features and show its shape.
X_train = select.transform(X_train)
X_train.shape

(300000, 4965)

In [25]:
# Apply the same selection to the test matrix and show its shape.
X_test = select.transform(X_test)
X_test.shape

(200000, 4965)

## Validation set 분리
- Train dataset이 너무 커서 cross validation에 너무 오랜 시간이 걸리는 경우
- OneHotEncoder를 사용하면 sparse matrix를 return하기 때문에 neural network에서 validation_split를 사용할 수 없는 경우

In [26]:
# Hold out a validation split (default split size). X_train / y_train are
# rebound to the remaining rows, so this cell is not safe to run twice.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, random_state=30)

## ML model cross validation

In [27]:
# Candidate classifiers with fixed hyperparameters. Only those enabled in the
# `models` dict are actually evaluated.
knn = KNeighborsClassifier(n_neighbors=5)
logit = LogisticRegression(C=1.0, random_state=30)
sgd = SGDClassifier(max_iter=1000, eta0=0.0, random_state=30)
linsvm = LinearSVC(C=1.0, random_state=30)
svm = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=30)
dt = DecisionTreeClassifier(max_depth=None, random_state=30)
rf = RandomForestClassifier(n_estimators=100, max_depth=None, n_jobs=-1, random_state=30)
et = ExtraTreesClassifier(n_estimators=100, max_depth=None, n_jobs=-1, random_state=30)
gb = GradientBoostingClassifier(learning_rate=0.1, n_estimators=1000, max_depth=3, random_state=30)
xgb = XGBClassifier(learning_rate=0.1, n_estimators=100, max_depth=3, random_state=30)
# lgb = LGBMClassifier(random_state=30)
adb = AdaBoostClassifier(random_state=30)
mlp = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam', alpha=0.0001, max_iter=200,
                    learning_rate='constant', learning_rate_init=0.001,random_state=30)

In [28]:
# Models to evaluate, keyed by a short label; uncomment an entry to include it.
# ('lgb' additionally needs the commented-out LGBMClassifier import and definition.)
models = {
#     'knn' : knn,
    'logit' : logit,
    'sgd' : sgd,
    'linsvm' : linsvm,
#     'svm' : svm,
#     'dt' : dt,
#     'rf' : rf,
#     'et' : et,
#     'gb' : gb,
#     'xgb' : xgb,
#     'lgb' : lgb,
#     'adb' : adb,
#     'mlp' : mlp
}

In [29]:
# 5-fold cross-validation (ROC AUC) for every enabled model; each entry holds
# the per-column mean over the folds.
result = [
    pd.DataFrame(
        cross_validate(
            estimator,
            X_train,
            y_train,
            cv=5,
            scoring='roc_auc',
            return_train_score=True,
            n_jobs=-1,
        )
    ).mean()
    for estimator in models.values()
]

In [30]:
# One column per model, labelled with the model keys ...
scores = pd.concat(result, axis=1)
scores.columns = models.keys()

# ... then one row per model, columns in a fixed order, rounded to 3 decimals.
result = scores.T[['test_score', 'train_score', 'fit_time', 'score_time']].round(3)
result

Unnamed: 0,test_score,train_score,fit_time,score_time
logit,0.807,0.824,4.793,0.034
sgd,0.802,0.814,1.748,0.031
linsvm,0.807,0.831,69.246,0.026


In [31]:
# Refit each enabled model on the training split and print its hold-out score.
# NOTE(review): `.score` is the estimator's own default metric (accuracy for
# scikit-learn classifiers, as far as I know), not the ROC AUC used in the
# cross-validation table — confirm the two are meant to be compared.
for name, estimator in models.items() :
    estimator.fit(X_train, y_train)
    holdout_score = estimator.score(X_val, y_val)
    print(name, ':', round(holdout_score, 3))

logit : 0.768
sgd : 0.764
linsvm : 0.77


## Neural network

In [32]:
def _resolve_metric_key(records, metric) :
    """Return the key in `records` that holds the training values for `metric`.

    Accepts the exact name, or the name followed by ``_<digits>`` (Keras can
    append such a counter, e.g. 'auc_1', when several models are created in
    one session). Raises KeyError when nothing matches.
    """
    if metric in records :
        return metric

    prefix = metric + '_'
    candidates = sorted(
        key for key in records
        if key.startswith(prefix) and key[len(prefix):].isdigit()
    )
    if candidates :
        return candidates[0]

    raise KeyError(f"metric {metric!r} not found in history; available keys: {sorted(records)}")


def plot_result(history, metric='auc') :
    """Plot training vs. validation curves for the loss and for one metric.

    Parameters
    ----------
    history : object with a ``history`` dict
        Typically the object returned by ``model.fit``. It must contain 'loss',
        'val_loss', the metric key and its 'val_'-prefixed counterpart.
    metric : str, default 'auc'
        Base name of the metric to draw in the right-hand panel. A numbered
        variant such as 'auc_1' is picked up automatically.

    Raises
    ------
    KeyError
        If a required key is missing from ``history.history``.
    """
    records = history.history
    metric_key = _resolve_metric_key(records, metric)

    fig, (ax_loss, ax_metric) = plt.subplots(1, 2, figsize=(12, 4))

    # (axis, key in the history dict, label shown in the legend)
    panels = (
        (ax_loss, 'loss', 'loss'),
        (ax_metric, metric_key, metric),
    )
    for ax, key, label in panels :
        ax.plot(records[key], 'b--', label=label)
        ax.plot(records['val_' + key], 'r-', label='val_' + label)
        ax.set_xlabel('Epoch')
        ax.grid(True)
        ax.legend()

In [33]:
# Fix the NumPy and TensorFlow global random seeds before building the network.
np.random.seed(30)
tf.random.set_seed(30)

In [34]:
# Fully connected network: six SELU hidden layers of 100 units (the first one
# declares the input width), followed by a 2-unit softmax output.
network_layers = [
    Dense(100, activation='selu', kernel_initializer='lecun_normal', input_shape=(X_train.shape[1],))
]
network_layers += [
    Dense(100, activation='selu', kernel_initializer='lecun_normal')
    for _ in range(5)
]
network_layers.append(Dense(2, activation='softmax'))

model = Sequential(network_layers)

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 100)               496600    
_________________________________________________________________
dense_1 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_2 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_3 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_4 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_5 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_6 (Dense)              (None, 2)                 2

In [35]:
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and is rejected by recent Keras releases.
optimizer = keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999)

In [36]:
# Labels become 2-column one-hot arrays to match the 2-unit softmax output.
# y_train / y_val are rebound here, so the scikit-learn cells above can no
# longer be re-run with these names without restoring the original labels.
y_train = tf.keras.utils.to_categorical(y_train, num_classes=2)   # one-hot encoding is used so that 'AUC' can be used as the metric
y_val = tf.keras.utils.to_categorical(y_val, num_classes=2)

In [37]:
# Categorical cross-entropy on the one-hot labels; AUC is tracked as the metric.
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['AUC'])

In [38]:
# Early stopping with patience=10; the best weights are restored when training stops.
early_stopping_cb = keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)

In [39]:
# history = model.fit(X_train, y_train, epochs=20, validation_data=(X_val, y_val), callbacks=early_stopping_cb)

In [40]:
# plot_result(history)

# Select from model

In [41]:
# Start over from the encoded matrices saved before feature selection and
# from the original (not one-hot encoded) training labels.
X_train = X_train_before.copy()
X_test = X_test_before.copy()

y_train = y_train_og.copy()

In [42]:
# Model-based feature selection driven by the logistic regression, with the
# importance threshold set to '1.5*mean'.
# NOTE(review): the selector is fit on every training row, before the validation
# split below, so the held-out rows influence which features are kept.
select = SelectFromModel(logit, threshold='1.5*mean')
select.fit(X_train, y_train)

SelectFromModel(estimator=LogisticRegression(random_state=30),
                threshold='1.5*mean')

In [43]:
# Reduce the training matrix to the selected features and show its shape.
X_train = select.transform(X_train)
X_train.shape

(300000, 3903)

In [44]:
# Apply the same selection to the test matrix and show its shape.
X_test = select.transform(X_test)
X_test.shape

(200000, 3903)

## Validation set 분리
- Train dataset이 너무 커서 cross validation에 너무 오랜 시간이 걸리는 경우
- OneHotEncoder를 사용하면 sparse matrix를 return하기 때문에 neural network에서 validation_split를 사용할 수 없는 경우

In [45]:
# Hold out a validation split (default split size). X_train / y_train are
# rebound to the remaining rows, so this cell is not safe to run twice.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, random_state=30)

## ML model cross validation

In [46]:
# Fresh, unfitted candidate classifiers with the same hyperparameters as in the
# SelectPercentile section. Only those enabled in `models` are evaluated.
knn = KNeighborsClassifier(n_neighbors=5)
logit = LogisticRegression(C=1.0, random_state=30)
sgd = SGDClassifier(max_iter=1000, eta0=0.0, random_state=30)
linsvm = LinearSVC(C=1.0, random_state=30)
svm = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=30)
dt = DecisionTreeClassifier(max_depth=None, random_state=30)
rf = RandomForestClassifier(n_estimators=100, max_depth=None, n_jobs=-1, random_state=30)
et = ExtraTreesClassifier(n_estimators=100, max_depth=None, n_jobs=-1, random_state=30)
gb = GradientBoostingClassifier(learning_rate=0.1, n_estimators=1000, max_depth=3, random_state=30)
xgb = XGBClassifier(learning_rate=0.1, n_estimators=100, max_depth=3, random_state=30)
# lgb = LGBMClassifier(random_state=30)
adb = AdaBoostClassifier(random_state=30)
mlp = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam', alpha=0.0001, max_iter=200,
                    learning_rate='constant', learning_rate_init=0.001,random_state=30)

In [47]:
# Models to evaluate, keyed by a short label; uncomment an entry to include it.
# ('lgb' additionally needs the commented-out LGBMClassifier import and definition.)
models = {
#     'knn' : knn,
    'logit' : logit,
    'sgd' : sgd,
    'linsvm' : linsvm,
#     'svm' : svm,
#     'dt' : dt,
#     'rf' : rf,
#     'et' : et,
#     'gb' : gb,
#     'xgb' : xgb,
#     'lgb' : lgb,
#     'adb' : adb,
#     'mlp' : mlp
}

In [48]:
# 5-fold cross-validation (ROC AUC) for every enabled model; each entry holds
# the per-column mean over the folds.
result = [
    pd.DataFrame(
        cross_validate(
            estimator,
            X_train,
            y_train,
            cv=5,
            scoring='roc_auc',
            return_train_score=True,
            n_jobs=-1,
        )
    ).mean()
    for estimator in models.values()
]

In [49]:
# One column per model, labelled with the model keys ...
scores = pd.concat(result, axis=1)
scores.columns = models.keys()

# ... then one row per model, columns in a fixed order, rounded to 3 decimals.
result = scores.T[['test_score', 'train_score', 'fit_time', 'score_time']].round(3)
result

Unnamed: 0,test_score,train_score,fit_time,score_time
logit,0.775,0.794,3.043,0.027
sgd,0.77,0.782,0.805,0.021
linsvm,0.774,0.797,22.146,0.025


In [50]:
# Refit each enabled model on the training split and print its hold-out score.
# NOTE(review): `.score` is the estimator's own default metric (accuracy for
# scikit-learn classifiers, as far as I know), not the ROC AUC used in the
# cross-validation table — confirm the two are meant to be compared.
for name, estimator in models.items() :
    estimator.fit(X_train, y_train)
    holdout_score = estimator.score(X_val, y_val)
    print(name, ':', round(holdout_score, 3))

logit : 0.75
sgd : 0.746
linsvm : 0.75


## Neural network

In [51]:
# Fully connected network: six SELU hidden layers of 100 units (the first one
# declares the input width), followed by a 2-unit softmax output.
network_layers = [
    Dense(100, activation='selu', kernel_initializer='lecun_normal', input_shape=(X_train.shape[1],))
]
network_layers += [
    Dense(100, activation='selu', kernel_initializer='lecun_normal')
    for _ in range(5)
]
network_layers.append(Dense(2, activation='softmax'))

model = Sequential(network_layers)

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_7 (Dense)              (None, 100)               390400    
_________________________________________________________________
dense_8 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_9 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_10 (Dense)             (None, 100)               10100     
_________________________________________________________________
dense_11 (Dense)             (None, 100)               10100     
_________________________________________________________________
dense_12 (Dense)             (None, 100)               10100     
_________________________________________________________________
dense_13 (Dense)             (None, 2)                

In [52]:
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and is rejected by recent Keras releases.
optimizer = keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999)

In [53]:
# Labels become 2-column one-hot arrays to match the 2-unit softmax output.
# y_train / y_val are rebound here, so the scikit-learn cells above can no
# longer be re-run with these names without restoring the original labels.
y_train = tf.keras.utils.to_categorical(y_train, num_classes=2)   # one-hot encoding is used so that 'AUC' can be used as the metric
y_val = tf.keras.utils.to_categorical(y_val, num_classes=2)

In [54]:
# Categorical cross-entropy on the one-hot labels; AUC is tracked as the metric.
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['AUC'])

In [55]:
# Early stopping with patience=10; the best weights are restored when training stops.
early_stopping_cb = keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)

In [56]:
# history = model.fit(X_train, y_train, epochs=20, validation_data=(X_val, y_val), callbacks=early_stopping_cb)

In [57]:
# plot_result(history)

# Recursive feature elimination

In [58]:
# Start over from the encoded matrices saved before feature selection and
# from the original (not one-hot encoded) training labels.
X_train = X_train_before.copy()
X_test = X_test_before.copy()

y_train = y_train_og.copy()

In [59]:
# select = RFE(logit, n_features_to_select=100)   # feature 수가 많으면 너무 오래 걸림
# select.fit(X_train, y_train)

In [60]:
# X_train = select.transform(X_train)
# X_train.shape

In [61]:
# X_test = select.transform(X_test)
# X_test.shape