In [None]:
# library for feature engineering and EDA
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import Image
from datetime import datetime
import random

# library for statistic
from scipy import stats
from scipy.stats import chi2_contingency, kruskal
from scipy.stats import boxcox, norm
from scipy.stats import skew
from scipy.stats import kurtosis
from scipy.stats import uniform as sp_randFloat
from scipy.stats import randint as sp_randInt

# library for sampling
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SVMSMOTE

# library for machine learning
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, RocCurveDisplay, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve

import shap
%matplotlib inline # Matplotlib의 시각화 결과를 노트북 내에서 바로 표시되도록 설정


In [None]:
# Load the training table from a path relative to the notebook's working directory.
df = pd.read_csv("./datas/application_train.csv", delimiter=",")
# Preview the first rows.
df.head()

In [None]:
# Count fully duplicated rows and drop them when at least one exists.
duplicate_count = int(df.duplicated().sum())
print("중복된 항목 수 :", duplicate_count)

has_duplicated = duplicate_count != 0
if has_duplicated:
    df = df.drop_duplicates()

In [None]:
# Print the frame summary (columns, dtypes, non-null counts).
df.info()

In [None]:
# Show the per-column missing-value counts. They are printed explicitly because
# a bare expression that is not the last statement of a cell is never displayed.
print(df.isna().sum())

# Drop every column that contains at least one missing value. Rebinding `df`
# (instead of `inplace=True`) keeps the cell safe to re-run.
df = df.dropna(axis=1)

In [None]:
# categorical and numeric columns
for column_name in list(df.columns):
    print(column_name, df[column_name].dtype, df[column_name].unique())

In [None]:
# Name of the dependent variable.
target_column = "TARGET"

# Object-dtype columns are treated as categorical, float64/int64 as numeric.
list_categorical_columns = df.select_dtypes(include=['object']).columns.tolist()
list_numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns.tolist()

# Row count, column count, number of categorical and numeric columns.
for size in (len(df), len(df.columns), len(list_categorical_columns), len(list_numeric_columns)):
    print(size)

In [None]:
# Count the missing values per column.
df.isna().sum()

### 2-2 Dependent(종속) Data Exploration

In [None]:
# Keep the target out of the categorical feature list. The list only holds
# object-dtype columns, so a target that is not object-typed (e.g. a 0/1 integer
# label) is not in it, and an unconditional list.remove() would raise ValueError.
# The membership check also makes the cell safe to run more than once.
if target_column in list_categorical_columns:
    list_categorical_columns.remove(target_column)

In [None]:
# Summary statistics of the target column.
df[target_column].describe()

In [None]:
# Frequency of each target value.
df[target_column].value_counts()

In [None]:
# Bar chart of the target value frequencies.
sns.countplot(x=target_column, data=df)

### 2-3 Independent(독립) Data Exploration

In [None]:
# Number of distinct values per categorical column, in ascending order.
df[list_categorical_columns].nunique().sort_values()
# No obviously unnecessary column is visible.

In [None]:
# Distribution of every categorical column: one bar chart of value counts per
# column, placed on a 6x3 subplot grid.
plt.figure(figsize=(15, 30))
plt.subplots_adjust(top=0.99, bottom=0.01, hspace=0.6, wspace=0.2)
for x, column_name in enumerate(list_categorical_columns, start=1):
    plt.subplot(6, 3, x)
    category_counts = df[column_name].value_counts().sort_index()
    category_counts.plot(kind='bar')
    plt.title(column_name)
plt.show()

In [None]:
# categorical column과 dependent data(target column) 분포 분석
df_poutcome_dependent = pd.crosstab(df[target_column], df[list_categorical_columns[-1]])
df_poutcome_dependent.plot(kind='bar')
# 애초에 target_column(y) 비중이 다르기 때문에, 아래와 같이 count 수를 비교하는 것은 데이터 분포를 파악하는데 적합하지 않음

In [None]:
# Display the row-normalised table: within each target value, the share of each category.
pd.crosstab(df[target_column], df[list_categorical_columns[-1]], normalize="index")

In [None]:
# Same row-normalised table as a bar chart, so the target classes are comparable
# regardless of how many rows each class has.
df_poutcome_dependent_ratio = pd.crosstab(df[target_column], df[list_categorical_columns[-1]], normalize="index")
df_poutcome_dependent_ratio.plot.bar(figsize=(10,5))

In [None]:
# In this situation plotting ratios helps in understanding the distribution:
# draw the row-normalised crosstab of every categorical column against the target.
for column_name in list_categorical_columns:
    ratio_table = pd.crosstab(df[target_column], df[column_name], normalize="index")
    ratio_table.plot.bar()
    plt.title(column_name)
plt.show()

In [None]:
# Chi-square test of independence between each categorical column and the target.
# Columns with p-value <= 0.05 are kept as meaningful.
list_meaningful_column_by_chi = []

for column_name in list_categorical_columns:
    contingency_table = pd.crosstab(df[target_column], df[column_name])
    statistic, pvalue, *_ = chi2_contingency(contingency_table)
    print(column_name, statistic, pvalue)
    if pvalue <= 0.05:
        list_meaningful_column_by_chi.append(column_name)

print("all categorical columns : ", len(list_categorical_columns))
print("selected columns by chi : ", len(list_meaningful_column_by_chi), list_meaningful_column_by_chi)

#### 2) Numeric Data Analysis

In [None]:
# Number of distinct values per numeric column, in ascending order.
df[list_numeric_columns].nunique().sort_values()

In [None]:
# Summary statistics of the numeric columns.
df[list_numeric_columns].describe()

In [None]:
# Distribution of the numeric columns: violin plots of the first 15 columns
# on a 4x4 subplot grid.
plt.figure(figsize=(20, 10))
plt.subplots_adjust(top=0.99, bottom=0.01, hspace=0.4, wspace=0.2)
for x, column_name in enumerate(list_numeric_columns[:15], start=1):
    plt.subplot(4, 4, x)
    sns.violinplot(x=column_name, data=df)
    plt.title(column_name)
plt.show()

In [None]:
# Print skewness and kurtosis of every numeric column.
for column_name in list_numeric_columns:
    column_values = df[column_name]
    print(column_name, "skew : ", skew(column_values), "kur : ", kurtosis(column_values))

# This confirms the need for scaling-based feature preprocessing later on.

In [None]:
# Full correlation matrix of the numeric columns (used by the following cells).
df_corr = df[list_numeric_columns].corr()
plt.figure(figsize=(8,8))
# Only the first 15 numeric columns are drawn in the heatmap.
df_corr_for_view = df[list_numeric_columns[:15]].corr()
sns.heatmap(df_corr_for_view, annot=True)

In [None]:
# Locate feature pairs whose correlation is too high so they can be removed.
# Whether the correlation value itself can be trusted still has to be checked.
# Criterion: absolute correlation above 0.75.
#
# The diagonal is masked by position instead of testing `df_corr != 1`, so only
# a column's correlation with itself is ignored. Two different columns that are
# perfectly correlated (e.g. duplicated features) are still reported.
is_off_diagonal = ~np.eye(len(df_corr), dtype=bool)
index_corr_over_75 = np.where((df_corr.abs().to_numpy() > 0.75) & is_off_diagonal)
index_corr_over_75

In [None]:
# Translate the row/column positions into column names and print every pair.
left_columns = df_corr.columns[index_corr_over_75[0]]
right_columns = df_corr.columns[index_corr_over_75[1]]
len_corr_over_75 = len(index_corr_over_75[0])
for left_name, right_name in zip(left_columns, right_columns):
    print(left_name, "<->", right_name)

In [None]:
# Column names on either side of each highly correlated pair.
left_columns_over_75 = df_corr.columns[index_corr_over_75[0]]
right_columns_over_75 = df_corr.columns[index_corr_over_75[1]]

# Distinct columns that take part in at least one such pair.
over_75_columns = list(set(left_columns_over_75) | set(right_columns_over_75))
print(len(over_75_columns), over_75_columns)

# All of these columns are marked for removal.
list_removed_by_correlation = over_75_columns

In [None]:
# Box plots of the first 12 numeric columns, split by target value, on a 4x3 grid.
plt.figure(figsize=(18, 12))
plt.subplots_adjust(top=0.99, bottom=0.01, hspace=0.2, wspace=0.2)
for x, column_name in enumerate(list_numeric_columns[:12], start=1):
    plt.subplot(4, 3, x)
    sns.boxplot(data=df, x=target_column, y=column_name)
plt.show()

# The `previous` variable is distributed differently depending on the dependent variable.
# The `duration` variable is distributed differently depending on the dependent variable.
# NOTE(review): `previous` and `duration` are not referenced anywhere else in the
# visible code - presumably notes carried over from another dataset; confirm.

In [None]:
# Normality check of the numeric columns: normal probability plots of the first
# 12 columns on a 3x4 subplot grid.
plt.figure(figsize=(18, 10))
plt.subplots_adjust(top=0.99, bottom=0.01, hspace=0.4, wspace=0.2)
for x, column_name in enumerate(list_numeric_columns[:12], start=1):
    plt.subplot(3, 4, x)
    stats.probplot(df[column_name], dist=stats.norm, plot=plt)
    plt.title(column_name)
plt.show()

In [None]:
# Shapiro-Wilk test on every numeric column. Columns with p-value > 0.05 are
# recorded as satisfying normality.
list_column_normality = []
for column_name in list_numeric_columns:
    shapiro_result = stats.shapiro(df[column_name])
    statistic, pvalue = shapiro_result
    print(column_name, ", statistic : ",statistic,", pvalue : ", pvalue)
    if pvalue > 0.05:
        list_column_normality.append(column_name)
print("정규성 만족하는 column 수 : ", len(list_column_normality))

In [None]:
# Kruskal-Wallis H-test per numeric column: the column's values are grouped by
# target value, and columns with p-value <= 0.05 are kept as meaningful.
list_meaningful_column_by_kruskall = []
list_target_unique = df[target_column].unique()

for column_name in list_numeric_columns:
    # One array of non-missing values per target value.
    list_by_target_value = [
        df.loc[df[target_column] == target_value, column_name].dropna().to_numpy()
        for target_value in list_target_unique
    ]
    statistic, pvalue = kruskal(*list_by_target_value)
    print(column_name, ", ", statistic, ", ", pvalue)
    if pvalue <= 0.05:
        list_meaningful_column_by_kruskall.append(column_name)

print("all numerical columns : ", len(list_numeric_columns))
print("selected columns by kruskall : ", len(list_meaningful_column_by_kruskall), list_meaningful_column_by_kruskall)

In [None]:
# Keep the Kruskal-selected numeric columns that were not flagged for removal
# by the correlation analysis.
columns_to_exclude = set(list_removed_by_correlation)
list_selected_numerical = [
    column_name
    for column_name in list_meaningful_column_by_kruskall
    if column_name not in columns_to_exclude
]

In [None]:
# Assemble the feature-selected frame from the chi-square-selected categorical
# columns and the selected numeric columns.
selected_columns = list_meaningful_column_by_chi + list_selected_numerical

# The target must always be part of the frame because it is read from df_fs
# afterwards. Include it explicitly instead of relying on it having survived
# the statistical selection and the correlation filter.
if target_column not in selected_columns:
    selected_columns.append(target_column)

df_fs = df[selected_columns]
df_fs.head(2)

In [None]:
# Separate the dependent variable (Y) from the independent variables (X).
X = df_fs.drop(columns=[target_column])
Y = df_fs[target_column]

In [None]:
# Display the feature frame.
X

In [None]:
# Display the target series.
Y

In [None]:
# Encode the target values as integer class labels; `le` keeps the fitted mapping.
le = LabelEncoder()
Y_encoded = le.fit_transform(Y)

In [None]:
# Display the encoded target array.
Y_encoded

In [None]:
# Recompute the categorical / numeric column lists on the feature-selected frame.
list_categorical_columns = df_fs.select_dtypes(include=['object']).columns.tolist()
list_numeric_columns = df_fs.select_dtypes(include=['float64', 'int64']).columns.tolist()
for size in (len(list_categorical_columns), len(list_numeric_columns)):
    print(size)

In [None]:
# Preview the first row of the feature-selected frame.
df_fs.head(1)

In [None]:
# Min-max scale the numeric feature columns of X.
scaler = MinMaxScaler()

# The target is not a feature and is not present in X, so leave it out.
list_numeric_columns = [col for col in list_numeric_columns if col != target_column]

# Assign whole columns rather than `X.loc[:, cols] = ...`: the scaled values are
# floats, and writing them in place into integer columns is an incompatible-dtype
# assignment that recent pandas versions deprecate. Whole-column assignment
# replaces the columns, so their dtype simply becomes float.
# NOTE(review): the scaler is fitted on every row before the train/validation
# split, so validation data influences the scaling - consider fitting on the
# training split only.
if list_numeric_columns:
    X[list_numeric_columns] = scaler.fit_transform(X[list_numeric_columns])
X.head(1)

In [None]:
# One-hot encode X with pandas' default get_dummies settings.
X_base = pd.get_dummies(X)
X_base.head(1)

In [None]:
# Stratified 80/20 hold-out split. A fixed random_state makes the split, and
# every metric computed from it, reproducible across kernel restarts.
X_train, X_validation, y_train, y_validation = train_test_split(
    X_base, Y_encoded, test_size=0.2, stratify=Y_encoded, random_state=42
)

In [None]:
import mlflow

# Point the MLflow client at the local tracking server.
mlflow.set_tracking_uri("http://127.0.0.1:5000")

# create_experiment raises when an experiment with this name already exists,
# which would break every re-run of the notebook. Create it only when missing.
if mlflow.get_experiment_by_name("bank_marketing_model") is None:
    mlflow.create_experiment("bank_marketing_model")

In [None]:
# Make "bank_marketing_model" the active experiment for subsequent runs.
mlflow.set_experiment("bank_marketing_model")

In [None]:
# Stratified k-fold cross-validation of an MLP classifier on the training split.
# Each fold's model is scored twice: on the fold's held-out part ("test") and on
# the separate hold-out split X_validation / y_validation ("validation").
# The MLflow run / param logging below is currently commented out.
# with mlflow.start_run():
n_split = 3
skf = StratifiedKFold(n_splits=n_split)
# The return value is not used; the folds are generated by skf.split() below.
skf.get_n_splits(X_base, Y_encoded)

# Per-fold results: (precision, recall, fscore, support) tuples and ROC AUC values.
list_fold_result_test = []
list_fold_result_validation = []
list_fold_roc_test = []
list_fold_roc_validation = []
# Hyper-parameters passed to MLPClassifier.
num_hidden_layer_sizes = 10
activation = 'relu'
learning_rate_init = 0.001

# mlflow.log_param("split num", n_split)
# mlflow.log_param("hidden_layer_sizes", num_hidden_layer_sizes)
# mlflow.log_param("activation", activation)
# mlflow.log_param("learning_rate_init", learning_rate_init)
# mlflow.log_param("sampling", "None")

for i, (train_index, test_index) in enumerate(skf.split(X_train, y_train)):
    print("Split " + str(i+1))
    # Positional selection of this fold's training and held-out rows.
    X_train_cv, X_test_cv = X_train.iloc[train_index, :], X_train.iloc[test_index, :]
    y_train_cv, y_test_cv = y_train[train_index], y_train[test_index]

    # model set
    # NOTE: despite the `rf_` prefix, the estimator is an MLPClassifier. No
    # random_state is passed, so results can differ between runs.
    rf_clf = MLPClassifier(hidden_layer_sizes=num_hidden_layer_sizes, activation=activation, learning_rate_init=learning_rate_init)
    rf_clf.fit(X_train_cv, y_train_cv)

    # model inference on test set
    y_prd_test = rf_clf.predict(X_test_cv)
    # Probability of the last class in the classifier's class ordering.
    y_prd_proba_test = rf_clf.predict_proba(X_test_cv)[:,-1]

    # score evaluation on test set
    prf_score_test = precision_recall_fscore_support(y_test_cv, y_prd_test, average='macro')
    score_test = roc_auc_score(y_test_cv, y_prd_proba_test, average='macro')
    print(i, " precision, recall, f1score : ", prf_score_test)
    print(i, " roc_auc_score : ", score_test)
    list_fold_roc_test.append(score_test)
    list_fold_result_test.append(prf_score_test)

    # model inference on validation data set
    y_prd_validation = rf_clf.predict(X_validation)
    y_prd_proba_validation = rf_clf.predict_proba(X_validation)[:,-1]

    # score evaluation on validation data set
    prf_score_validation_macro = precision_recall_fscore_support(y_validation, y_prd_validation, average='macro')
    score_validation = roc_auc_score(y_validation, y_prd_proba_validation, average='macro')
    print("validation : precision, recall, f1score macro : ", prf_score_validation_macro)
    print("validation : roc_auc_score : ", score_validation)
    list_fold_roc_validation.append(score_validation)
    list_fold_result_validation.append(prf_score_validation_macro)
# After the loop, rf_clf and y_prd_validation refer to the model and predictions
# of the last fold only.


def get_prf_average(list_of_result):
    """Average precision, recall and F1 over a list of per-fold results.

    Args:
        list_of_result: sequence of per-fold score tuples whose first three
            items are precision, recall and F1 (for example the tuples
            returned by ``precision_recall_fscore_support``). Any further
            items are ignored.

    Returns:
        A ``(precision, recall, f1)`` tuple holding the arithmetic mean of
        each score over the given results. An empty input yields
        ``(0.0, 0.0, 0.0)``.
    """
    # Average the list that was passed in, and divide by its own length, so the
    # function does not depend on notebook globals.
    if not list_of_result:
        return 0.0, 0.0, 0.0

    n_results = len(list_of_result)
    pre = sum(result[0] for result in list_of_result)
    rec = sum(result[1] for result in list_of_result)
    f1 = sum(result[2] for result in list_of_result)
    return pre / n_results, rec / n_results, f1 / n_results

# Average precision / recall / F1 over the folds, for the fold hold-out ("test")
# results and for the validation-split results.
pre, rec, f1 = get_prf_average(list_fold_result_test)
pre_val, rec_val, f1_val = get_prf_average(list_fold_result_validation)

# Mean ROC AUC over the folds.
roc = sum(list_fold_roc_test)/n_split
roc_val = sum(list_fold_roc_validation)/n_split

# mlflow.log_metric("precision_on_test", pre)
# mlflow.log_metric("recall_on_test", rec)
# mlflow.log_metric("f1score_on_test", f1)
# mlflow.log_metric("roc_on_test", roc)


# mlflow.log_metric("precision_on_validation", pre_val)
# mlflow.log_metric("recall_on_validation", rec_val)
# mlflow.log_metric("f1score_on_validation", f1_val)
# mlflow.log_metric("roc_on_validation", roc_val)


def save_artifact(model, X_validation, y_validation, y_pred):
    """Save a ROC curve and a confusion matrix as PNG files and log them to MLflow.

    Args:
        model: fitted estimator passed to ``RocCurveDisplay.from_estimator``.
        X_validation: feature data used to draw the ROC curve.
        y_validation: true labels for ``X_validation``.
        y_pred: predicted labels compared against ``y_validation`` in the
            confusion matrix.

    Side effects:
        Writes ``model_roc_plot.png`` and ``model_conf_matrix.png`` to the
        current working directory, shows the ROC figure, and logs both files
        with ``mlflow.log_artifact``.
    """
    # ROC curve: draw, save, show, then clear the current figure.
    RocCurveDisplay.from_estimator(
        model, X_validation, y_validation, name='ML ROC CURVE'
    )
    plt.savefig('model_roc_plot.png')
    plt.show()
    plt.clf()

    # Confusion matrix drawn as an annotated heatmap.
    matrix = confusion_matrix(y_validation, y_pred)
    sns.heatmap(matrix, annot=True, fmt='g', cmap='YlGnBu_r')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title("Confusion Matrix")
    plt.savefig('model_conf_matrix.png')

    # Log both image files as MLflow artifacts.
    for artifact_file in ('model_roc_plot.png', 'model_conf_matrix.png'):
        mlflow.log_artifact(artifact_file)
# Save and log the plots for the classifier left over from the last fold.
# NOTE(review): the explicit mlflow.start_run()/end_run() calls are commented
# out in this cell - confirm which run these artifacts are meant to belong to.
save_artifact(rf_clf, X_validation, y_validation, y_prd_validation)
# mlflow.sklearn.log_model(rf_clf, "model")


# mlflow.end_run()
