In [58]:
import pandas as pd
import numpy as np
import warnings
from tensorflow import keras
from tensorflow.keras.utils import to_categorical
from tensorflow.python.keras import callbacks
from keras import backend as K

from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from keras.layers import Dense, Dropout
from keras.optimizers import Adam, SGD
from keras.callbacks import EarlyStopping

warnings.filterwarnings(action='ignore')

In [59]:
df = pd.read_csv("modified_train_0412.csv")
x_train = df.drop(columns=['loan_status', 'addr_state', 'Unnamed: 0'])
y_train = df['loan_status']

In [60]:
nan_indices = np.isnan(x_train).any(axis=1)
x_train = x_train[~nan_indices]
y_train = y_train[~nan_indices]

In [61]:
y_train.shape

(1091767,)

In [62]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(x_train)

# PCA로 차원 축소
pca = PCA(n_components=90)
X_pca = pca.fit_transform(X_scaled)
x_train = X_pca.astype('float32')
# stratify로 학습에 용이하게 비율 유지
# X_Train, X_Validation, Y_Train, Y_Validation = train_test_split(x_train, y_train, test_size=0.2, stratify=y_train)

In [64]:
# Y_Train=to_categorical(Y_Train, 2).astype(int)
# y_train=to_categorical(y_train, 2).astype(int)
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
y_train = y_train.values
y_train = encoder.fit_transform(y_train.reshape(-1, 1))

In [71]:
y_train = y_train.toarray()

In [65]:
def f1_score(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())    
    f1_val = 2 * (precision * recall) / (precision + recall + K.epsilon())
    return f1_val

def recall(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

In [66]:
y_train.shape

(1091767, 2)

In [72]:
from sklearn.model_selection import StratifiedKFold

# Define the number of splits for k-fold cross-validation
k = 5
skf = StratifiedKFold(n_splits=k, shuffle=True)

# Initialize lists to store evaluation metrics
valid_acc_list = []
valid_f1_list = []
valid_recall_list = []
valid_precision_list = []

# Perform k-fold cross-validation
for train, val in skf.split(x_train, y_train):
    X_train_fold, X_val_fold = x_train[train], x_train[val]
    Y_train_fold, Y_val_fold = y_train[train], y_train[val]
    
    # Define and compile the model
    model = keras.Sequential([
        Dense(45, input_dim=90, activation='relu'),
        Dropout(0.2),
        Dense(45, activation='relu'),
        Dropout(0.2),
        Dense(45, activation='relu'),
        Dropout(0,2),
        Dense(10),
        Dense(2, activation='softmax')
    ])
    optimizer = Adam(learning_rate=0.01)
    
    model.compile(loss="categorical_crossentropy", optimizer=optimizer, metrics=["accuracy", f1_score, recall, precision])
    
    
    epochs = 100
    batch_size = 200000
    early_stopping = EarlyStopping(min_delta=0.01)

    # Train the model
    model.fit(X_train_fold, Y_train_fold, batch_size=batch_size, epochs=epochs, verbose=0, callbacks=[early_stopping])
    
    # Evaluate the model on validation data
    valid_loss, valid_acc, valid_f1, valid_recall, valid_precision = model.evaluate(X_val_fold, Y_val_fold, verbose=0)
    valid_acc_list.append(valid_acc)
    valid_f1_list.append(valid_f1)
    valid_recall_list.append(valid_recall)
    valid_precision_list.append(valid_precision)

# Calculate the average metrics across all folds
avg_valid_acc = sum(valid_acc_list) / k
avg_valid_f1 = sum(valid_f1_list) / k
avg_valid_recall = sum(valid_recall_list) / k
avg_valid_precision = sum(valid_precision_list) / k

# Print the average evaluation metrics
print("===================================")
print("Average Validation accuracy:", avg_valid_acc)
print("Average Validation F1-score:", avg_valid_f1)
print("Average Validation recall:", avg_valid_recall)
print("Average Validation precesion:", avg_valid_precision)


ValueError: Supported target types are: ('binary', 'multiclass'). Got 'multilabel-indicator' instead.