In [10]:
import numpy as np
import pandas as pd

In [11]:
# Read the raw train / test tables from the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [12]:
# Keep the test ids aside before the column is removed. The guard makes the
# cell safe to re-run after `id` has already been dropped.
if 'id' in test.columns:
    test_id = test['id']

# Rebind instead of mutating in place so a second execution is a no-op
# rather than a KeyError.
train = train.drop(columns=['id'], errors='ignore')
test = test.drop(columns=['id'], errors='ignore')

# Features / target split. X_test is the same object as `test`.
X_train = train.drop(columns=['target'])
y_train = train['target']
X_test = test

## Feature engineering

In [13]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

In [14]:
# Column-wise skewness of the original features only. It is computed once from
# `train` (which this cell never extends) with the target removed, so the
# column groups stay the same when the cell is executed again after the
# engineered columns below have been added to X_train.
feature_skew = train.drop(columns=['target']).skew()
h_skew = feature_skew.index[feature_skew >= 2]  # highly skewed columns
l_skew = feature_skew.index[feature_skew < 2]   # remaining columns (labelled "bimodal" by the author)

# Row-wise statistics added per column group, as (columns, suffix, statistics).
# New columns are named '<statistic>_<suffix>', e.g. 'median_h', 'std_l'.
row_stats = [
    (h_skew, 'h', ['median', 'var']),
    (l_skew, 'l', ['mean', 'std', 'median', 'skew', 'max', 'var']),
]

for cols, suffix, stat_names in row_stats:
    for stat in stat_names:
        # Apply the identical transformation to train and test.
        for frame in (X_train, X_test):
            frame[f'{stat}_{suffix}'] = getattr(frame[cols], stat)(axis=1)

## Model

In [15]:
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import callbacks
import tensorflow as tf
import random
import os
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler, RobustScaler, PowerTransformer

# seed
seed = 42


def seedAll(seed):
    """Seed every random source used in this notebook with `seed`.

    Sets the PYTHONHASHSEED environment variable and seeds Python's
    `random` module, NumPy's global generator and TensorFlow.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)         # Python stdlib RNG
    np.random.seed(seed)      # NumPy global RNG
    tf.random.set_seed(seed)  # TensorFlow global seed


seedAll(seed)

In [8]:
# create model
def createModel(n_features=None):
    """Build a compiled binary classifier together with its training callbacks.

    Parameters
    ----------
    n_features : int, optional
        Width of the input layer. Defaults to ``X_train.shape[1]`` (the
        notebook-level training frame), which is what the original
        zero-argument call used.

    Returns
    -------
    tuple
        ``(model, early_stopping, plateau)`` - the compiled Keras model, an
        EarlyStopping callback and a ReduceLROnPlateau callback.
    """
    if n_features is None:
        n_features = X_train.shape[1]

    # Early stopping.
    # monitor: quantity that decides when to stop
    # min_delta: minimum change that counts as an improvement
    # patience: number of epochs without improvement that is tolerated
    # baseline: minimum reference score
    # restore_best_weights: if True, restore the best weights seen
    early_stopping = callbacks.EarlyStopping(
        monitor='val_loss',
        mode='min',
        patience=20,
        min_delta=0,
        baseline=None,
        restore_best_weights=True,
        verbose=0
    )

    # Learning-rate reduction on plateau.
    # monitor: quantity that triggers the reduction
    # factor: new learning rate = current learning rate * factor
    # patience: epochs without improvement before the rate is reduced
    # mode: direction in which the monitored quantity should move
    plateau = callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        mode='min',
        patience=7,
        factor=.2,
        verbose=0
    )

    # Sequential stack of fully-connected (Dense) layers.
    # First Dense argument: number of units; second: activation function.
    model = keras.Sequential([
        layers.Dense(108, activation='swish', input_shape=[n_features]),
        layers.Dense(64, activation='swish'),
        layers.Dense(32, activation='swish'),
        layers.Dense(1, activation='sigmoid')]
    )

    # Compile: optimizer, loss and metrics.
    # Binary classification -> binary cross-entropy loss.
    # The AUC metric gets an explicit name: with the string shortcut, Keras
    # appends a counter ('auc_1', 'auc_2', ...) for every further model built
    # in the same session, so the history keys would differ between folds.
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=0.0007),
        loss='binary_crossentropy',
        metrics=[keras.metrics.AUC(name='auc')]
    )

    return model, early_stopping, plateau

In [9]:
# (intentionally empty) The original cell held a single stray character that
# raised NameError when executed.

In [None]:
kf = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)

preds_valid_f = {}    # out-of-fold predictions keyed by row position
pred_test = []        # one test-prediction vector per fold
total_auc = []        # validation ROC AUC per fold
fold_histories = []   # per-fold training histories, concatenated after the loop

# ---------------------------------------------------------------------
# preprocessing
# NOTE(review): `transformer_high_skew` is not defined in any visible cell;
# it has to be created before this cell is run.
# The transformer is fitted on the full training frame exactly as before; the
# result does not depend on the fold, so it is computed once instead of once
# per fold.
# NOTE(review): fitting on all rows includes each fold's validation rows -
# confirm this is intended.
xtrain_scaled = pd.DataFrame(
    transformer_high_skew.fit_transform(X_train),
    columns=X_train.columns
)
xtest = pd.DataFrame(
    transformer_high_skew.transform(X_test),
    columns=X_test.columns
)

for i, (train_idx, valid_idx) in enumerate(kf.split(X_train, y_train)):
    # -----------------------------------------------------------------
    # split (kf.split yields positional indices, hence .iloc everywhere)
    xtrain, ytrain = xtrain_scaled.iloc[train_idx, :], y_train.iloc[train_idx].astype('float')
    xvalid, yvalid = xtrain_scaled.iloc[valid_idx, :], y_train.iloc[valid_idx].astype('float')

    # -----------------------------------------------------------------
    # model
    model, early_stopping, plateau = createModel()
    history = model.fit(xtrain, ytrain,
                        validation_data=(xvalid, yvalid),
                        batch_size=2048,
                        epochs=700,
                        callbacks=[early_stopping, plateau],
                        shuffle=True,
                        verbose=0)

    # -----------------------------------------------------------------
    # oof: flatten the (n, 1) model output to a 1-D vector
    pred_valid = model.predict(xvalid).ravel()

    # -----------------------------------------------------------------
    # test predictions
    pred_test.append(model.predict(xtest).ravel())

    # -----------------------------------------------------------------
    # Saving scores to plot the end
    histories = pd.DataFrame(history.history)
    histories['fold'] = i
    fold_histories.append(histories)

    # -----------------------------------------------------------------
    # concatenating valid preds
    preds_valid_f.update(dict(zip(valid_idx, pred_valid)))

    # Getting score for a fold model
    valid_score = roc_auc_score(yvalid, pred_valid)
    print(f'Fold {i} roc_auc_score: {valid_score}')

    # Total auc
    total_auc.append(valid_score)

# One concatenation instead of growing the frame fold by fold.
total_histories = pd.concat(fold_histories, axis=0)

print(f'mean roc_auc_score: {np.mean(total_auc)}, std: {np.std(total_auc)}')

In [None]:
import matplotlib.pyplot as plt

# Plot the training curves of every fold from `total_histories`, the frame
# built by the cross-validation loop (one row per epoch, with a 'fold' column).
for fold in sorted(total_histories['fold'].unique()):
    # Keep this fold's rows and drop columns that are empty for it, so that
    # per-fold metric names such as 'auc_1' / 'val_auc_1' are handled as well.
    history_f = (
        total_histories[total_histories['fold'] == fold]
        .dropna(axis=1, how='all')
    )
    auc_cols = [col for col in history_f.columns if 'auc' in col]

    fig, (ax_loss, ax_auc) = plt.subplots(1, 2, tight_layout=True, figsize=(14, 4))
    fig.suptitle('Fold : ' + str(fold), fontsize=14)

    for col in ['loss', 'val_loss']:
        ax_loss.plot(history_f[col], label=col)
    ax_loss.legend(fontsize=15)
    ax_loss.grid()

    for col in auc_cols:
        ax_auc.plot(history_f[col], label=col)
    ax_auc.legend(fontsize=15)
    ax_auc.grid()

    print("Validation Loss: {:0.4f}".format(history_f['val_loss'].min()))

In [None]:
sub = pd.read_csv("../input/tabular-playground-series-nov-2021/sample_submission.csv")
# Average the per-fold test predictions; the cross-validation loop collects
# them in `pred_test`.
sub['target'] = np.mean(pred_test, axis=0)
sub.to_csv('submission.csv', index=False)
sub.head()