# 데이터 불러오기

In [1]:
!pip install imblearn|

/bin/bash: -c: line 2: syntax error: unexpected end of file


In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

# Load the 2015 accepted-loans CSV into `df`; the path sits under /content/drive
# (presumably a Colab Google Drive mount — it must be mounted before this cell runs).
df = pd.read_csv('/content/drive/MyDrive/DS/accepted_2015.csv')
# Show (n_rows, n_columns) as the cell output.
df.shape

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/DS/accepted_2015.csv'

# 전처리

In [None]:
# emp_title --> 직업별 평균 연소득(emp_title_avg_inc)

import re

def func_emp_title(x):
  """Normalize a raw employment title into a lowercase, punctuation-free key.

  Args:
    x: Raw title value. It is converted with ``str()`` first, so a missing
      value such as NaN becomes the literal text ``'nan'``.

  Returns:
    The cleaned title: lowercased, with tabs and punctuation removed, every
    run of whitespace collapsed to one space, and no leading or trailing
    whitespace (so ``' teacher'`` and ``'teacher'`` map to the same key).
  """
  text = str(x).lower().replace('\t', '')  # drop tab characters outright

  # Delete (not space-replace) everything except word characters
  # (letters, digits, underscore) and whitespace.
  text = re.sub(r'[^\w\s]', '', text)

  # Collapse whitespace runs to a single space, then trim the edges so that
  # padding differences do not create distinct titles.
  return re.sub(r'\s+', ' ', text).strip()

# Normalize every employment title so that spelling variants share one group key.
df['emp_title'] = df['emp_title'].apply(func_emp_title)

# Mean annual income per normalized title, computed over the whole DataFrame.
# NOTE(review): this runs before any train/test split, so held-out rows contribute to the means.
emp_title_avg_annual_inc = df.groupby('emp_title')['annual_inc'].mean()

# Replace the free-text title with its group mean income, then drop the raw column.
df['emp_title_avg_inc'] = df['emp_title'].map(emp_title_avg_annual_inc)
df = df.drop('emp_title', axis=1)

In [None]:
# addr_state --> addr_state_avg_annual_inc

# Mean annual income per state, computed over the whole DataFrame.
addr_state_avg_annual_inc = df.groupby('addr_state')['annual_inc'].mean()

# Encode the state by its mean income, then drop the original categorical column.
df['addr_state_avg_annual_inc'] = df['addr_state'].map(addr_state_avg_annual_inc)
df = df.drop(['addr_state'], axis=1)

In [None]:
# earliest_cr_line --> 신용거래기간 (credit_history_length)

def create_credit_history(df):
    """Return the length of each borrower's credit history at loan issue, in months.

    Only the two date columns are parsed; the input DataFrame is neither
    copied nor modified.

    Args:
        df: DataFrame with 'earliest_cr_line' and 'issue_d' columns holding
            dates formatted as '%b-%Y' (e.g. 'Jan-2015').

    Returns:
        A Series named 'credit_history_length', aligned with ``df.index``,
        holding the calendar-month difference issue_d - earliest_cr_line.
    """
    first_credit = pd.to_datetime(df['earliest_cr_line'], format='%b-%Y')
    issued = pd.to_datetime(df['issue_d'], format='%b-%Y')

    # Month difference = 12 * (year difference) + (month-of-year difference).
    months = ((issued.dt.year - first_credit.dt.year) * 12
              + (issued.dt.month - first_credit.dt.month))

    return months.rename('credit_history_length')

# Add the derived feature and drop the raw date column it replaces
# ('issue_d' is kept because a later cell still reads it).
df['credit_history_length'] = create_credit_history(df)
df = df.drop(['earliest_cr_line'], axis=1)

In [None]:
# last_credit_pull_d --> 마지막 신용 조회 후 경과기간(time_since_last_credit_check)

def create_time_since_last_credit_check(df):
    """Return the months from the last credit pull to loan issue, floored at zero.

    Only the two date columns are parsed; the input DataFrame is neither
    copied nor modified.

    Args:
        df: DataFrame with 'issue_d' and 'last_credit_pull_d' columns holding
            dates formatted as '%b-%Y' (e.g. 'Jan-2015').

    Returns:
        A Series named 'time_since_last_credit_check', aligned with
        ``df.index``, holding the calendar-month difference
        issue_d - last_credit_pull_d. Negative differences (pull dated after
        issue) are clamped to 0; missing values stay missing.
    """
    issued = pd.to_datetime(df['issue_d'], format='%b-%Y')
    last_pull = pd.to_datetime(df['last_credit_pull_d'], format='%b-%Y')

    # Month difference = 12 * (year difference) + (month-of-year difference).
    months = ((issued.dt.year - last_pull.dt.year) * 12
              + (issued.dt.month - last_pull.dt.month))

    # clip() floors negatives at 0 and leaves NaN untouched.
    return months.clip(lower=0).rename('time_since_last_credit_check')

# Add the derived feature and drop the raw date column it replaces.
df['time_since_last_credit_check'] = create_time_since_last_credit_check(df)
df = df.drop(['last_credit_pull_d'], axis=1)

In [None]:
# sub_grade --> sub_grade_score

# Enumerate the 35 sub-grades in order: A1..A5, B1..B5, ..., G1..G5.
grades = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
sub_grades_ordered = [f'{g}{i}' for g in grades for i in range(1, 6)]

# Ordinal score: the first sub-grade (A1) gets 35 and the last (G5) gets 1.
sub_grade_map = {sub: len(sub_grades_ordered) - i for i, sub in enumerate(sub_grades_ordered)}

# Values not present in the map become NaN; the raw column is then dropped.
df['sub_grade_score'] = df['sub_grade'].map(sub_grade_map)
df = df.drop(['sub_grade'], axis=1)

In [None]:
# title의 count=1인 case --> other
def func_title(df):
  """Collapse every loan title that occurs at most once into the label 'Other'.

  Args:
    df: DataFrame with a 'title' column. It is modified in place.

  Returns:
    The same DataFrame object, with 'title' rewritten. Missing titles are
    not counted by ``value_counts()`` and therefore stay missing.
  """
  title_counts = df['title'].value_counts()
  singletons = title_counts[title_counts <= 1].index

  # One vectorized membership mask; avoids handing replace() a list that can
  # contain one entry per unique rare title.
  df['title'] = df['title'].mask(df['title'].isin(singletons), 'Other')
  return df

# Group titles that appear only once under 'Other' (func_title also edits df in place).
df = func_title(df)

In [None]:
# Missing employment length is filled with 0.
# NOTE(review): assumes 'emp_length' is already numeric in this CSV — confirm.
df['emp_length'] = df['emp_length'].fillna(0)
# Missing titles become the literal string 'None' so they form their own category.
df['title'] = df['title'].fillna('None')
# Encode the target as booleans; any label other than these two would map to NaN.
df['is_default'] = df['is_default'].map({'Default':True, 'NonDefault':False})
# 'issue_d' was only needed for the date-difference features above; 'id' is dropped as well.
df = df.drop(['issue_d', 'id'], axis=1)

In [None]:
# Print column dtypes and non-null counts after preprocessing.
df.info()

In [None]:
from sklearn.model_selection import train_test_split

# Separate features (X) and target (y)
X = df.drop('is_default', axis=1)
y = df['is_default']

# Split data into training and testing sets: 80/20, fixed seed, stratified on the target
# so both parts keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y) # Use stratify for imbalanced target

In [None]:
# Show the shapes of the four split parts as the cell output.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Class ratio of the training target (fractions, not counts)
y_train.value_counts(normalize=True)

# 모델 학습 (데이터 누수 문제 해결)

## holdout 방식

### custom 함수
- train_and_evaluate_model(model, X, y)
- train_and_evaluate_model_with_log_smote(model, X, y)
- plot_feature_importance(model_pipeline)
- train_evaluate_and_get_metrics_with_smote(model, X, y)
- train_evaluate_and_get_metrics_without_smote(model, X, y)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score # Import roc_auc_score
import numpy as np
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline # Use a different name for imblearn Pipeline to avoid conflict
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def train_and_evaluate_model(model, X, y):
    """
    Fit ``model`` behind a scale/one-hot preprocessing pipeline and print hold-out results.

    The data is split 80/20 (stratified on ``y``, ``random_state=42``). Numeric
    columns are standardized, object columns are one-hot encoded (unseen
    categories are ignored at predict time) and any other column is passed
    through unchanged.

    Args:
        model: Unfitted classifier placed as the last pipeline step.
        X: The feature DataFrame.
        y: The target Series.

    Returns:
        The fitted pipeline (steps 'preprocessor' and 'classifier').
    """
    # Stratified hold-out split with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Column groups are derived from the training part's dtypes.
    object_columns = X_train.select_dtypes('object').columns.tolist()
    numeric_columns = X_train.select_dtypes(include=np.number).columns.tolist()

    preprocessing = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_columns),
            ('cat', OneHotEncoder(handle_unknown='ignore'), object_columns),
        ],
        remainder='passthrough',
    )

    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessing),
        ('classifier', model),
    ])

    print("Training the model...")
    model_pipeline.fit(X_train, y_train)
    print("Training complete.")

    # Score on the held-out 20%.
    predictions = model_pipeline.predict(X_test)

    print("\nClassification Report:")
    print(classification_report(y_test, predictions))

    print("\nConfusion Matrix :")
    print(confusion_matrix(y_test, predictions))

    return model_pipeline

# plot_feature_importance
def plot_feature_importance(model_pipeline):
  """Plot the 20 largest feature importances of a fitted pipeline and return all of them.

  Args:
    model_pipeline: Fitted pipeline with a 'preprocessor' step offering
      ``get_feature_names_out()`` and a 'classifier' step exposing
      ``feature_importances_``.

  Returns:
    A Series of every feature importance, indexed by the preprocessor's
    output feature name and sorted in descending order.
  """
  top_n = 20
  steps = model_pipeline.named_steps

  importances = steps['classifier'].feature_importances_
  names = steps['preprocessor'].get_feature_names_out()

  ranked = pd.Series(importances, index=names).sort_values(ascending=False)
  top_features = ranked.head(top_n)

  # Horizontal bar chart: one bar per top feature.
  plt.figure(figsize=(12, 8))
  sns.barplot(x=top_features.values, y=top_features.index)
  plt.title('Top {} Feature Importances'.format(top_n))
  plt.xlabel('Importance')
  plt.ylabel('Feature')

  return ranked


def train_and_evaluate_model_with_log_smote(model, X, y):
    """
    Fit ``model`` behind log/scale/one-hot preprocessing plus SMOTE and print hold-out results.

    The data is split 80/20 (stratified on ``y``, ``random_state=42``). A fixed
    list of numeric columns is passed through ``np.log1p`` and then
    standardized, the remaining numeric columns are only standardized, object
    columns are one-hot encoded and anything else is passed through. SMOTE sits
    between preprocessing and the classifier inside an imblearn pipeline.

    Args:
        model: Unfitted classifier placed as the last pipeline step.
        X: The feature DataFrame.
        y: The target Series.

    Returns:
        The fitted imblearn pipeline (steps 'preprocessor', 'smote', 'classifier').
    """
    # Numeric columns that get log1p before scaling (only those present in X are used).
    log_candidates = {
        'int_rate', 'installment', 'annual_inc', 'dti', 'open_acc',
        'revol_bal', 'total_acc', 'total_rec_int', 'mths_since_rcnt_il', 'il_util',
        'max_bal_bc', 'total_rev_hi_lim', 'avg_cur_bal', 'bc_open_to_buy', 'dti_joint',
    }

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    object_columns = X_train.select_dtypes('object').columns.tolist()
    numeric_columns = X_train.select_dtypes(include=np.number).columns.tolist()

    # Partition numeric columns, keeping their original order within each group.
    logged_columns, scaled_only_columns = [], []
    for column in numeric_columns:
        target = logged_columns if column in log_candidates else scaled_only_columns
        target.append(column)

    log_then_scale = Pipeline([
        ('log', FunctionTransformer(func=np.log1p, validate=False)),
        ('scaler', StandardScaler()),
    ])
    preprocessing = ColumnTransformer(
        transformers=[
            ('num_log', log_then_scale, logged_columns),
            ('num_scale', StandardScaler(), scaled_only_columns),
            ('cat', OneHotEncoder(handle_unknown='ignore'), object_columns),
        ],
        remainder='passthrough',
    )

    # preprocessor -> smote -> classifier
    model_pipeline = ImbPipeline(steps=[
        ('preprocessor', preprocessing),
        ('smote', SMOTE(random_state=42)),
        ('classifier', model),
    ])

    print("Training the model...")
    model_pipeline.fit(X_train, y_train)
    predictions = model_pipeline.predict(X_test)

    print("\nClassification Report:")
    print(classification_report(y_test, predictions))

    print("\nConfusion Matrix :")
    print(confusion_matrix(y_test, predictions))

    return model_pipeline


def train_evaluate_and_get_metrics_with_smote(model, X, y):
    """
    Fit ``model`` behind log/scale/one-hot preprocessing plus SMOTE and collect hold-out metrics.

    The data is split 80/20 (stratified on ``y``, ``random_state=42``). A fixed
    list of numeric columns is passed through ``np.log1p`` and then
    standardized, the remaining numeric columns are only standardized, object
    columns are one-hot encoded and anything else is passed through. SMOTE sits
    between preprocessing and the classifier inside an imblearn pipeline.

    Args:
        model: Unfitted classifier placed as the last pipeline step.
        X: The feature DataFrame.
        y: The target Series (the positive class is ``True``).

    Returns:
        A tuple ``(pipeline, metrics)``:
        - the fitted imblearn pipeline;
        - a dict with 'Accuracy', 'Precision', 'Recall' and 'F1-Score' for the
          positive class, plus 'AUC' when the pipeline exposes
          ``predict_proba``. The dict is empty when ``True`` does not occur in
          the test target.
        ``(None, None)`` is returned if any exception is raised while fitting,
        predicting or scoring; the error message is printed.
    """
    # Numeric columns that get log1p before scaling (only those present in X are used).
    log_candidates = {
        'int_rate', 'installment', 'annual_inc', 'dti', 'open_acc',
        'revol_bal', 'total_acc', 'total_rec_int', 'mths_since_rcnt_il', 'il_util',
        'max_bal_bc', 'total_rev_hi_lim', 'avg_cur_bal', 'bc_open_to_buy', 'dti_joint',
    }

    # Stratified split keeps the class ratio in both parts.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    object_columns = X_train.select_dtypes('object').columns.tolist()
    numeric_columns = X_train.select_dtypes(include=np.number).columns.tolist()

    # Partition numeric columns, keeping their original order within each group.
    logged_columns, scaled_only_columns = [], []
    for column in numeric_columns:
        target = logged_columns if column in log_candidates else scaled_only_columns
        target.append(column)

    log_then_scale = Pipeline([
        ('log', FunctionTransformer(func=np.log1p, validate=False)),
        ('scaler', StandardScaler()),
    ])
    preprocessing = ColumnTransformer(
        transformers=[
            ('num_log', log_then_scale, logged_columns),
            ('num_scale', StandardScaler(), scaled_only_columns),
            ('cat', OneHotEncoder(handle_unknown='ignore'), object_columns),
        ],
        remainder='passthrough',
    )

    model_pipeline = ImbPipeline(steps=[
        ('preprocessor', preprocessing),
        ('smote', SMOTE(random_state=42)),
        ('classifier', model),
    ])

    print("Training the model...")
    try:
        model_pipeline.fit(X_train, y_train)
        y_pred = model_pipeline.predict(X_test)

        print("\nClassification Report:")
        print(classification_report(y_test, y_pred))

        print("\nConfusion Matrix :")
        print(confusion_matrix(y_test, y_pred))

        metrics = {}
        # Positive-class metrics are only meaningful when True occurs in the test target.
        if True not in y_test.unique():
            print("Warning: Positive class (True) not found in y_test. Metrics for positive class not calculated.")
            return model_pipeline, metrics

        metrics['Accuracy'] = accuracy_score(y_test, y_pred)
        metrics['Precision'] = precision_score(y_test, y_pred, pos_label=True)
        metrics['Recall'] = recall_score(y_test, y_pred, pos_label=True)
        metrics['F1-Score'] = f1_score(y_test, y_pred, pos_label=True)

        # AUC needs class probabilities; column 1 is used as the positive-class score.
        if hasattr(model_pipeline, 'predict_proba'):
            positive_scores = model_pipeline.predict_proba(X_test)[:, 1]
            metrics['AUC'] = roc_auc_score(y_test, positive_scores)

        return model_pipeline, metrics

    except Exception as e:
        # Deliberate best-effort: report the failure and let the caller continue with other models.
        print(f"An error occurred during model training or metric calculation: {e}")
        return None, None

def train_evaluate_and_get_metrics_without_smote(model, X, y):
    """
    Fit ``model`` behind log/scale/one-hot preprocessing (no resampling) and collect hold-out metrics.

    The data is split 80/20 (stratified on ``y``, ``random_state=42``). A fixed
    list of numeric columns is passed through ``np.log1p`` and then
    standardized, the remaining numeric columns are only standardized, object
    columns are one-hot encoded and anything else is passed through. The
    pipeline is built with whatever class the module-level name ``Pipeline``
    refers to and has no SMOTE step.

    Args:
        model: Unfitted classifier placed as the last pipeline step.
        X: The feature DataFrame.
        y: The target Series (the positive class is ``True``).

    Returns:
        A tuple ``(pipeline, metrics)``:
        - the fitted pipeline (steps 'preprocessor' and 'classifier');
        - a dict with 'Accuracy', 'Precision', 'Recall' and 'F1-Score' for the
          positive class, plus 'AUC' when the pipeline exposes
          ``predict_proba``. The dict is empty when ``True`` does not occur in
          the test target.
        ``(None, None)`` is returned if any exception is raised while fitting,
        predicting or scoring; the error message is printed.
    """
    # Numeric columns that get log1p before scaling (only those present in X are used).
    # NOTE(review): unlike the SMOTE variant, this list does not contain 'dti_joint' — confirm intent.
    log_candidates = {
        'int_rate', 'installment', 'annual_inc', 'dti', 'open_acc',
        'revol_bal', 'total_acc', 'total_rec_int', 'mths_since_rcnt_il', 'il_util',
        'max_bal_bc', 'total_rev_hi_lim', 'avg_cur_bal', 'bc_open_to_buy',
    }

    # Stratified split keeps the class ratio in both parts.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    object_columns = X_train.select_dtypes('object').columns.tolist()
    numeric_columns = X_train.select_dtypes(include=np.number).columns.tolist()

    # Partition numeric columns, keeping their original order within each group.
    logged_columns, scaled_only_columns = [], []
    for column in numeric_columns:
        target = logged_columns if column in log_candidates else scaled_only_columns
        target.append(column)

    log_then_scale = Pipeline([
        ('log', FunctionTransformer(func=np.log1p, validate=False)),
        ('scaler', StandardScaler()),
    ])
    preprocessing = ColumnTransformer(
        transformers=[
            ('num_log', log_then_scale, logged_columns),
            ('num_scale', StandardScaler(), scaled_only_columns),
            ('cat', OneHotEncoder(handle_unknown='ignore'), object_columns),
        ],
        remainder='passthrough',
    )

    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessing),
        ('classifier', model),
    ])

    print("Training the model...")
    try:
        model_pipeline.fit(X_train, y_train)
        y_pred = model_pipeline.predict(X_test)

        print("\nClassification Report:")
        print(classification_report(y_test, y_pred))

        print("\nConfusion Matrix :")
        print(confusion_matrix(y_test, y_pred))

        metrics = {}
        # Positive-class metrics are only meaningful when True occurs in the test target.
        if True not in y_test.unique():
            print("Warning: Positive class (True) not found in y_test. Metrics for positive class not calculated.")
            return model_pipeline, metrics

        metrics['Accuracy'] = accuracy_score(y_test, y_pred)
        metrics['Precision'] = precision_score(y_test, y_pred, pos_label=True)
        metrics['Recall'] = recall_score(y_test, y_pred, pos_label=True)
        metrics['F1-Score'] = f1_score(y_test, y_pred, pos_label=True)

        # AUC needs class probabilities; column 1 is used as the positive-class score.
        if hasattr(model_pipeline, 'predict_proba'):
            positive_scores = model_pipeline.predict_proba(X_test)[:, 1]
            metrics['AUC'] = roc_auc_score(y_test, positive_scores)

        return model_pipeline, metrics

    except Exception as e:
        # Deliberate best-effort: report the failure and let the caller continue with other models.
        print(f"An error occurred during model training or metric calculation: {e}")
        return None, None

### 학습
___
첫번째 학습

In [None]:
# First training run: every remaining column is used as a feature (no leakage filtering yet).
X = df.drop('is_default', axis=1)
y = df['is_default']

# Random forest with class weights balanced to offset the minority 'default' class.
model = RandomForestClassifier(random_state=42, n_jobs=-1, class_weight='balanced')
model_pipeline = train_and_evaluate_model(model, X, y)


# --- 결과 기록 ---
# Training the model...
# Training complete.

# Classification Report:
#               precision    recall  f1-score   support

#        False       0.96      1.00      0.98     68609
#         True       1.00      0.82      0.90     15488

#     accuracy                           0.97     84097
#    macro avg       0.98      0.91      0.94     84097
# weighted avg       0.97      0.97      0.97     84097


# Confusion Matrix :
# [[68609     0]
#  [ 2787 12701]]

In [None]:
# Plot the top feature importances of the first run and keep the full ranking for later inspection.
feature_importance = plot_feature_importance(model_pipeline)

두번째 학습
- features_leakage_related + features_to_check 제외

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb

# Columns excluded as leakage-related in the runs below (payment, recovery and latest-FICO fields).
features_leakage_related = [
    'last_fico_range_high', 'last_fico_range_low', 'total_pymnt_inv', 'last_pymnt_amnt', 'out_prncp_inv', 'out_prncp',
    'total_rec_prncp', 'total_pymnt', 'recoveries', 'collection_recovery_fee', ]

# One member of each strongly correlated feature pair (see the correlation-analysis section).
features_highly_correlated = ['fico_range_low', 'loan_amnt', 'funded_amnt_inv', 'funded_amnt', 'num_sats',
                          'num_rev_tl_bal_gt_0', 'tot_cur_bal',  'mo_sin_old_rev_tl_op']

# Additional columns whose effect is checked by excluding them in some runs.
features_to_check = [
    'delinq_2yrs', 'fico_range_high', 'inq_last_6mths', 'pub_rec',
    'total_rec_late_fee', 'collections_12_mths_ex_med', 'acc_now_delinq', 'chargeoff_within_12_mths', 'delinq_amnt'
]

X = df.drop(['is_default'] + features_leakage_related + features_to_check, axis=1)
y = df['is_default']

# Calculate scale_pos_weight for XGBoost and LightGBM
# scale_pos_weight = count(negative class) / count(positive class)
classes = y.unique()
neg_class_label = False # Or the actual label for the negative class if not boolean
pos_class_label = True # Or the actual label for the positive class if not boolean

# Count each class once instead of recomputing value_counts() for every lookup.
class_counts = y.value_counts()

if neg_class_label in class_counts.index and pos_class_label in class_counts.index:
    neg_count = class_counts[neg_class_label]
    pos_count = class_counts[pos_class_label]
    scale_pos_weight_value = neg_count / pos_count
    print(f"Calculated scale_pos_weight: {scale_pos_weight_value:.4f}")
else:
     print("Warning: Could not find expected negative or positive class labels in y.")
     scale_pos_weight_value = 1.0 # Default to 1 if labels not found

# Candidate models, all configured to compensate for class imbalance.
# - LogisticRegression: no n_jobs, because it has no effect with solver='liblinear' and only triggers a warning.
# - XGBClassifier: no use_label_encoder, an obsolete argument that recent XGBoost releases no longer accept.
models = [{"name": "Logistic Regression",
           "model": LogisticRegression(random_state=42, solver='liblinear', class_weight='balanced')},
          {"name": "Random Forest",
           "model": RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced', n_jobs=-1)},
          {"name": "XGBoost",
           "model": xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss',
                                      random_state=42, n_jobs=-1, scale_pos_weight=scale_pos_weight_value)},
          {"name": "LightGBM",
           "model": lgb.LGBMClassifier(objective='binary', metric='binary_logloss', random_state=42, n_jobs=-1,
                                       scale_pos_weight=scale_pos_weight_value)}
  ]

# Model name -> metrics dict (None when training failed inside the helper).
result = {}

for model_info in models:
    print(f"\n--- Training and Evaluating: {model_info['name']} ---")
    current_pipeline, metrics = train_evaluate_and_get_metrics_without_smote(model_info['model'], X, y)

    result[model_info['name']] = metrics
    print(f"Metrics stored: {metrics}")

In [None]:
# Metrics table of the second run: one column per model, one row per metric.
result_2nd = pd.DataFrame(result); display(result_2nd)

세번째 학습
- features_leakage_related + features_highly_correlated 제외

In [None]:
# Third run: exclude the leakage-related and the highly correlated columns.
X = df.drop(['is_default'] + features_leakage_related + features_highly_correlated, axis=1)
y = df['is_default']

# Model name -> metrics dict (None when training failed inside the helper).
result = {}

# Reuses the estimator objects in `models`; each fit replaces the previous run's fitted state.
for model_info in models:
    print(f"\n--- Training and Evaluating: {model_info['name']} ---")
    current_pipeline, metrics = train_evaluate_and_get_metrics_without_smote(model_info['model'], X, y)

    result[model_info['name']] = metrics
    print(f"Metrics stored: {metrics}")

In [None]:
# Metrics table of the third run: one column per model, one row per metric.
result_3rd = pd.DataFrame(result); display(result_3rd)

네번째 학습
- features_leakage_related + features_highly_correlated + features_to_check 제외

In [None]:
# Fourth run: exclude all three groups (leakage-related, highly correlated, to-check).
X = df.drop(['is_default'] + features_leakage_related + features_highly_correlated + features_to_check, axis=1)
y = df['is_default']

# Model name -> metrics dict (None when training failed inside the helper).
result = {}

# Reuses the estimator objects in `models`; each fit replaces the previous run's fitted state.
for model_info in models:
    print(f"\n--- Training and Evaluating: {model_info['name']} ---")
    current_pipeline, metrics = train_evaluate_and_get_metrics_without_smote(model_info['model'], X, y)

    result[model_info['name']] = metrics
    print(f"Metrics stored: {metrics}")

In [None]:
# NO SMOTE
# Excludes all of features_leakage_related + features_highly_correlated + features_to_check
result_4th = pd.DataFrame(result); display(result_4th)

- Feature Importance Plot을 통해 특징별 Importance가 이전 학습결과와 다르게 비교적 고르게 분포하고 있음을 확인하였습니다.

- 분류 성능도 `True`( `Default` )에 대한 Recall 과 F1-score 를 봤을 때, 전형적인 클래스 불균형 문제를 가진 분류 문제에서의 성능이 도출되었습니다.

- 따라서, 데이터 누수 문제는 해결되었다고 판단하고 분류 성능 향상을 위한 추가 작업을 진행하였습니다.

### 결과 비교

In [None]:
# Print the metrics tables of runs 2-4 one after another, each under a header naming
# the excluded feature groups.
print("두번째 학습 결과: features_leakage_related + features_to_check 제외")
print(result_2nd)
print("\n세번째 학습 결과: features_leakage_related + features_highly_correlated 제외")
print(result_3rd)
print("\n네번째 학습 결과: features_leakage_related + features_highly_correlated + features_to_check 제외")
print(result_4th)

## 교차검증
-  첫번째 학습:  features_leakage_related + features_to_check 제외
-  두번째 학습: features_leakage_related + features_highly_correlated 제외
- 세번째 학습 결과: features_leakage_related + features_highly_correlated + features_to_check 제외


### custom 함수
- evaluate_model_with_cross_validation(model_pipeline, X, y, cv, scoring)

In [None]:
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, accuracy_score, make_scorer
import pandas as pd
import numpy as np

def evaluate_model_with_cross_validation(model_pipeline, X, y, cv, scoring):
    """
    Run ``cross_validate`` on a pipeline and return its raw result dictionary.

    Only test-fold scores are requested (``return_train_score=False``) and
    folds are evaluated in parallel on all available cores (``n_jobs=-1``).

    Args:
        model_pipeline: The estimator/pipeline to evaluate.
        X: The feature DataFrame.
        y: The target Series.
        cv: Cross-validation splitter (e.g., StratifiedKFold).
        scoring: A dictionary of scoring metrics.

    Returns:
        The dictionary returned by ``cross_validate``.
    """
    print("Performing cross-validation...")

    options = dict(
        cv=cv,
        scoring=scoring,
        return_train_score=False,
        n_jobs=-1,
    )
    cv_results = cross_validate(model_pipeline, X, y, **options)

    print("Cross-validation complete.")
    return cv_results

# Define scoring metrics for cross-validation.
# The threshold metrics score the positive class (True) via make_scorer.
# ROC AUC uses scikit-learn's built-in 'roc_auc' scorer name: the former
# make_scorer(..., needs_proba=True) form is deprecated and no longer accepted
# by current scikit-learn releases, while the named scorer works across versions.
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, pos_label=True),
    'recall': make_scorer(recall_score, pos_label=True),
    'f1_score': make_scorer(f1_score, pos_label=True),
    'roc_auc': 'roc_auc',
}

### 첫번째 학습
- features_leakage_related + features_to_check 제외

In [None]:
# CV run 1: exclude the leakage-related and the to-check columns.
X = df.drop(['is_default'] + features_leakage_related + features_to_check, axis=1)
y = df['is_default']

# Models evaluated with cross-validation.
# - LogisticRegression: no n_jobs, because it has no effect with solver='liblinear' and only triggers a warning.
# - XGBClassifier: no use_label_encoder, an obsolete argument that recent XGBoost releases no longer accept.
models = [{"name": "Logistic Regression",
           "model": LogisticRegression(random_state=42, solver='liblinear', class_weight='balanced')},
          {"name": "Random Forest",
           "model": RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced', n_jobs=-1)},
          {"name": "XGBoost",
           "model": xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss',
                                      random_state=42, n_jobs=-1, scale_pos_weight=scale_pos_weight_value)},
          {"name": "LightGBM",
           "model": lgb.LGBMClassifier(objective='binary', metric='binary_logloss', random_state=42, n_jobs=-1,
                                       scale_pos_weight=scale_pos_weight_value)}
  ]

# Preprocessing shared by every model: log1p + scale for the listed numeric columns,
# scale only for the other numeric columns, one-hot for object columns.
categorical_features = X.select_dtypes('object').columns.tolist()
numerical_features = X.select_dtypes(include=np.number).columns.tolist()
features_to_log_transform = [
    'int_rate', 'installment', 'annual_inc', 'dti', 'open_acc',
    'revol_bal', 'total_acc', 'total_rec_int', 'mths_since_rcnt_il', 'il_util',
    'max_bal_bc', 'total_rev_hi_lim', 'avg_cur_bal', 'bc_open_to_buy'
]
numerical_log_features = [f for f in numerical_features if f in features_to_log_transform]
numerical_scale_only_features = [f for f in numerical_features if f not in features_to_log_transform]

log_transformer = FunctionTransformer(func=np.log1p, validate=False)
numerical_scaler = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num_log', Pipeline([('log', log_transformer), ('scaler', numerical_scaler)]), numerical_log_features),
        ('num_scale', numerical_scaler, numerical_scale_only_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='passthrough'
)

# Stratified, shuffled 5-fold split with a fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Model name -> raw cross_validate() result (per-fold timings and scores).
cv_results = {}
for model_info in models:
    print(f"\n--- Cross-validation: {model_info['name']} ---")

    # Preprocessing lives inside the pipeline, so it is re-fitted on each training fold.
    model_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                     ('classifier', model_info['model'])])

    cv_results[model_info['name']] = evaluate_model_with_cross_validation(model_pipeline, X, y, cv, scoring)
    print("\nCross-validation results:")
    display(pd.DataFrame(cv_results[model_info['name']]))

In [None]:
# Average every cross_validate() result array over the folds (this includes the
# fit_time / score_time entries, not only the test scores) and show one row per model.
print("\n--- Cross-validation Summary (Mean Metrics) ---")
mean_cv_results = {}
for model_name, results in cv_results.items():
    mean_cv_results[model_name] = {metric: np.mean(scores) for metric, scores in results.items()}
display(pd.DataFrame(mean_cv_results).T)

### 두번째 학습
- features_leakage_related + features_highly_correlated 제외

In [None]:
# CV run 2: exclude the leakage-related and the highly correlated columns.
X = df.drop(['is_default'] + features_leakage_related + features_highly_correlated, axis=1)
y = df['is_default']


# Define the preprocessing pipeline used for cross-validation
# (rebuilt here because the column lists depend on the current X).
categorical_features = X.select_dtypes('object').columns.tolist()
numerical_features = X.select_dtypes(include=np.number).columns.tolist()
features_to_log_transform = [
    'int_rate', 'installment', 'annual_inc', 'dti', 'open_acc',
    'revol_bal', 'total_acc', 'total_rec_int', 'mths_since_rcnt_il', 'il_util',
    'max_bal_bc', 'total_rev_hi_lim', 'avg_cur_bal', 'bc_open_to_buy'
]
# Numeric columns that get log1p + scaling vs. scaling only.
numerical_log_features = [f for f in numerical_features if f in features_to_log_transform]
numerical_scale_only_features = [f for f in numerical_features if f not in features_to_log_transform]

log_transformer = FunctionTransformer(func=np.log1p, validate=False)
numerical_scaler = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num_log', Pipeline([('log', log_transformer), ('scaler', numerical_scaler)]), numerical_log_features),
        ('num_scale', numerical_scaler, numerical_scale_only_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='passthrough'
)

# Stratified, shuffled 5-fold split with a fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Model name -> raw cross_validate() result; `models` comes from the previous CV cell.
cv_results = {}
for model_info in models:
    print(f"\n--- Cross-validation: {model_info['name']} ---")

    model_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                      ('classifier', model_info['model'])])

    cv_results[model_info['name']] = evaluate_model_with_cross_validation(model_pipeline, X, y, cv, scoring)
    print("\nCross-validation results:")
    display(pd.DataFrame(cv_results[model_info['name']]))

# 상관 분석
- 강건한 분류 성능과 다중공선성 해결 위해 0.9 이상의 강한 상관 관계를 가지는 특징에 대한 전처리를 수행하였습니다.
- 삭제할 특징 선택 기준은 이전 학습 모델의 Feature Importance 를 활용하였습니다.

In [None]:
def get_unstacked_corr_matrix(df):
  """List feature pairs whose absolute correlation exceeds 0.5.

  Args:
    df: DataFrame; only its numeric and boolean columns are correlated.

  Returns:
    A DataFrame with columns 'feature1', 'feature2' and 'correlation', one
    row per unordered pair (feature1 sorts after feature2, so self-pairs and
    mirrored duplicates are dropped), sorted by correlation in descending
    order.
  """
  long_form = df.select_dtypes(['number', 'bool']).corr().unstack().reset_index()
  long_form.columns = ['feature1', 'feature2', 'correlation']

  # The strict name comparison keeps exactly one ordering of every pair.
  one_per_pair = long_form[long_form['feature1'] > long_form['feature2']]

  # Keep only pairs with |correlation| above 0.5.
  strong_pairs = one_per_pair[one_per_pair['correlation'].abs() > 0.5]

  return strong_pairs.sort_values(by='correlation', ascending=False)

def prep_corrlated_feautres(corr_matrix):
  """Return the names of all features involved in a very strong correlation.

  Args:
    corr_matrix: Pair table with columns 'feature1', 'feature2' and
      'correlation' (the layout produced by get_unstacked_corr_matrix).

  Returns:
    A sorted NumPy array of the unique feature names that appear in at least
    one pair whose absolute correlation is >= 0.9, counting strong negative
    correlations as well as positive ones.
  """
  # Take the absolute value of the correlation itself, not of the boolean mask.
  strong_pairs = corr_matrix[corr_matrix['correlation'].abs() >= 0.9]

  names = strong_pairs['feature1'].tolist() + strong_pairs['feature2'].tolist()

  return np.unique(np.array(names))

In [None]:
# Correlation with is_default
corr_matrix = df.select_dtypes(['number', 'bool']).corr()

# Ten largest absolute correlations with the target (the target itself is included in this ranking).
abs(corr_matrix['is_default']).sort_values(ascending=False)[:10]

In [None]:
# Correlation between features: print the pairs with |correlation| >= 0.9.
# abs() is applied to the correlation values (not to the boolean mask), so strong
# negative correlations are listed as well.
high_corr_pairs = get_unstacked_corr_matrix(X)
print(high_corr_pairs[high_corr_pairs['correlation'].abs() >= 0.9])

# --- 결과 기록 ---
#                         feature1               feature2  correlation
# 821               fico_range_low        fico_range_high     1.000000
# 1                      loan_amnt            funded_amnt     0.999999
# 163              funded_amnt_inv            funded_amnt     0.999995
# 2                      loan_amnt        funded_amnt_inv     0.999994
# 1115                    open_acc               num_sats     0.999096
# 4996         num_rev_tl_bal_gt_0        num_actv_rev_tl     0.982402
# 5775             tot_hi_cred_lim            tot_cur_bal     0.974933
# 6024  total_il_high_credit_limit           total_bal_il     0.952069
# 406                  installment            funded_amnt     0.945982
# 5                      loan_amnt            installment     0.945981
# 407                  installment        funded_amnt_inv     0.945880
# 3885        mo_sin_old_rev_tl_op  credit_history_length     0.912267
# 2502                total_bal_il      total_bal_ex_mort     0.900444

In [None]:
# feature importance based feature selection
# 상관계수가 0.9 이상인 특징 쌍 중 feature importance가 낮은 특징을 삭제 처리

# feature_list = prep_corrlated_feautres(high_corr_pairs)

# print(feature_importance[[col for col in feature_importance.index if any(keyword in col for keyword in feature_list)]])

In [None]:
# 삭제대상
# features_to_be_deleted = ['fico_range_low', 'loan_amnt', 'funded_amnt_inv', 'funded_amnt', 'num_sats',
#                           'num_rev_tl_bal_gt_0', 'tot_cur_bal', 'total_bal_il', 'mo_sin_old_rev_tl_op']

# 모델 학습 with SMOTE

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

def train_and_evaluate_model_using_SMOTE(model, X, y):
    """
    Fit ``model`` behind a preprocessing + SMOTE pipeline and print hold-out metrics.

    The data is split 80/20, stratified on ``y`` with a fixed seed. Numeric
    columns are standardized, object columns are one-hot encoded, and every
    remaining column is passed through unchanged. The classification report
    and confusion matrix for the 20% hold-out split are printed.

    Args:
        model: Estimator placed as the final ('classifier') pipeline step.
        X: The feature DataFrame.
        y: The target Series.

    Returns:
        The fitted imblearn Pipeline (preprocessor -> smote -> classifier).
    """
    # Stratified hold-out split so both splits keep the class ratio of y
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Column groups are derived from the training split only
    num_cols = X_train.select_dtypes(include=np.number).columns.tolist()
    cat_cols = X_train.select_dtypes('object').columns.tolist()

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), num_cols),
            ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
        ],
        remainder='passthrough',
    )

    # Oversampling sits between preprocessing and the classifier
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('smote', SMOTE(random_state=42)),
        ('classifier', model),
    ])

    print("Training the model...")
    model_pipeline.fit(X_train, y_train)
    print("Training complete.")

    # Score the untouched hold-out split
    y_pred = model_pipeline.predict(X_test)

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    print("\nConfusion Matrix :")
    print(confusion_matrix(y_test, y_pred))

    return model_pipeline

In [None]:
# feature importance
# feature_importance = plot_feature_importance(model_pipeline)

- SMOTE를 수행했음에도 Default에 대한 분류 성능을 봤을 때, 데이터 전처리가 더 필요하다고 판단됩니다.
- 수치형 특징에 대한 데이터 탐색 후 적절한 추가 전처리를 수행한 다음 다시 학습을 진행하겠습니다.

# 데이터 탐색

In [None]:
# Features removed because they form a >= 0.9 correlated pair with a feature that is kept
features_to_be_deleted = [
    'fico_range_low', 'loan_amnt', 'funded_amnt_inv', 'funded_amnt', 'num_sats',
    'num_rev_tl_bal_gt_0', 'tot_cur_bal', 'total_bal_il', 'mo_sin_old_rev_tl_op',
]

# Drop the target, the payment/recovery/last-FICO columns, and the correlated features
X = df.drop(
    columns=[
        'is_default',
        'last_fico_range_high', 'last_fico_range_low',
        'recoveries', 'collection_recovery_fee',
        'total_rec_prncp', 'out_prncp_inv', 'total_pymnt_inv',
        'last_pymnt_amnt', 'out_prncp', 'total_pymnt',
    ] + features_to_be_deleted
)
y = df['is_default']

# Stratified 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

## Int 타입

In [None]:
# Column names of the training split, grouped by dtype family
float_features = X_train.select_dtypes(include='float').columns
int_features = X_train.select_dtypes(include='int').columns

(len(int_features), len(float_features))

In [None]:
# Render floats with four decimals in every table displayed from here on
pd.options.display.float_format = '{:.4f}'.format

X_train.loc[:, int_features].describe()

- 정수 타입 특징 중 `time_since_last_credit_check`의 기술통계량을 보고 한쪽으로 치우친 분포를 가지고 있다고 판단하여 시각화를 진행하였습니다.

In [None]:
plt.figure(figsize=(12, 5))

# Left panel: raw values
plt.subplot(1, 2, 1)
sns.histplot(data=X_train['time_since_last_credit_check'])
plt.title('Original Distribution of time_since_last_credit_check')

# Right panel: the same column after log1p
plt.subplot(1, 2, 2)
sns.histplot(data=np.log1p(X_train['time_since_last_credit_check']))
plt.title('Log-transformed Distribution of time_since_last_credit_check')

In [None]:
plt.figure(figsize=(8, 5))

# One bar group per elapsed-time value, split by target class
sns.countplot(
    x='time_since_last_credit_check',
    hue='is_default',
    palette='viridis',
    data=df,
)
plt.xlabel('Time Since Last Credit Check')
plt.ylabel('Count')
plt.title('Distribution of time_since_last_credit_check by is_default')
plt.show()

- `time_since_last_credit_check`는 한쪽으로 크게 기울어진 분포를 가진 특징이지만 타겟 특징과의 분포 확인 결과, 분류에 유의미할 것으로 판단하였습니다.
- 로그변환은 하지 않았습니다

## Float 타입
- 전처리된 데이터 기준으로 총 67개로 구성되며, 모두 분포를 시각화하며 확인할 수 없기 때문에 기술통계량을 기준으로 필터링 후 시각화를 통해 분포를 확인하였습니다.


In [None]:
# Summary statistics for every float column of the training split
float_features = X_train.select_dtypes(include='float').columns
display(X_train.loc[:, float_features].describe())

분포-불균형 특징 식별
- 기술 통계량 결과를 바탕으로, 평균 및 표준편차와 비교했을 때 최대/최소 값의 차이가 큰 특징들을 식별하였습니다.
- 기준 : `max > mean + 3*std` 또는 `min < mean - 3*std`

In [None]:
# Screen float features using only describe() output: a feature is flagged when
# its max lies above mean + 3*std or its min lies below mean - 3*std.

float_features_desc = X_train[float_features].describe()

potential_outlier_features = []

for col in float_features_desc.columns:
    mean_val = float_features_desc.loc['mean', col]
    std_val = float_features_desc.loc['std', col]
    min_val = float_features_desc.loc['min', col]
    max_val = float_features_desc.loc['max', col]

    has_extreme_max = max_val > mean_val + 3 * std_val
    has_extreme_min = min_val < mean_val - 3 * std_val

    # Both messages are still printed when a feature is extreme on both sides
    if has_extreme_max:
        print(f"'{col}': Max value ({max_val:.4f}) is significantly larger than mean ({mean_val:.4f}) + 3*std ({3 * std_val:.4f})")

    if has_extreme_min:
        print(f"'{col}': Min value ({min_val:.4f}) is significantly smaller than mean ({mean_val:.4f}) - 3*std ({3 * std_val:.4f})")

    # Record each flagged feature exactly once, in column order. De-duplicating
    # through set() afterwards would reorder the list differently from run to run.
    if has_extreme_max or has_extreme_min:
        potential_outlier_features.append(col)

print("\nFeatures identified with potential extreme values:")
print(potential_outlier_features)

## 분포-불균형 의심특징 분포 시각화

- 식별된 특징들에 대해 히스토그램 또는 상자 그림(boxplot)을 그려 분포를 시각적으로 확인하였습니다.


In [None]:
# for col in potential_outlier_features:
#     plt.figure(figsize=(12, 5))

#     plt.subplot(1, 2, 1)
#     sns.histplot(X_train[col], kde=True)
#     plt.title(f'Original Distribution of {col}')

#     plt.subplot(1, 2, 2)
#     sns.histplot(np.log1p(X_train[col]), kde=True)
#     plt.title(f'Log1p Transformed Distribution of {col}')

#     plt.tight_layout()
#     plt.show()

1. 로그 변환이 필요한 특징
- `int_rate`, `installment`, `annual_inc`, `dti_joint`, `dti`, `open_acc`, `revol_bal`, `total_acc`, `total_rec_int`, `mths_since_rcnt_il`, `il_util`, `max_bal_bc`, `total_rev_hi_lim`, `avg_cur_bal`, `bc_open_to_buy`, `tot_coll_amt`

2. 정보량 부족한 특징
- target 과의 분포 확인하고 처리
- 대상: `delinq_2yrs`, `fico_range_high`, `inq_last_6mths`, `pub_rec`, `total_rec_late_fee`, `collections_12_mths_ex_med`, `acc_now_delinq`, `chargeoff_within_12_mths`, `delinq_amnt`

3. int 변환이 필요해보이는 특징
  - `open_acc_6m`, `open_act_il`, `open_il_12m`, `open_il_24m`, `open_rv_12m`, `open_rv_24m`, `inq_fi`, `total_cu_tl`, `inq_last_12m`

4. 변환 필요 없음
- 대상: `revol_util`, `all_util`, `bc_util`



## 추가 분석

로그변환은 함수화하여 변환

In [None]:
# # 로그변환 대상 특징
# features_to_log_transform = [
#     'int_rate', 'installment', 'annual_inc', 'dti', 'open_acc',
#     'revol_bal', 'total_acc', 'total_rec_int', 'mths_since_rcnt_il', 'il_util',
#     'max_bal_bc', 'total_rev_hi_lim', 'avg_cur_bal', 'bc_open_to_buy', 'tot_coll_amt'
# ]

# for feature in features_to_log_transform:
#     if feature in X_train.columns:
#         X_train[feature] = np.log1p(X_train[feature])
#         # Apply the same transformation to the test set
#         if feature in X_test.columns:
#             X_test[feature] = np.log1p(X_test[feature])
#     else:
#         print(f"Warning: Feature '{feature}' not found in X_train.")

# print("로그 변환 완료.")

정보량 부족한 특징
- target과의 분포 확인 후 조치하였습니다

In [None]:
# 대상 특징
# features_to_check = [
#     'delinq_2yrs', 'fico_range_high', 'inq_last_6mths', 'pub_rec',
#     'total_rec_late_fee', 'collections_12_mths_ex_med', 'acc_now_delinq', 'chargeoff_within_12_mths', 'delinq_amnt'
# ]

# # Determine plot type based on the number of unique values
# def plot_feature_distribution_by_target(df, feature, target='is_default'):
#     plt.figure(figsize=(10, 6))
#     if df[feature].nunique() < 20: # Use countplot for features with few unique values
#         sns.countplot(data=df, x=feature, hue=target, palette='viridis')
#         plt.title(f'Distribution of {feature} by {target}')
#         plt.xticks(rotation=45)

#     else: # Use histplot for features with many unique values
#         sns.histplot(data=df, x=feature, hue=target, kde=True, palette='viridis', common_norm=False)
#         plt.title(f'Distribution of {feature} by {target}')

#     plt.xlabel(feature)
#     plt.ylabel('Count' if df[feature].nunique() < 20 else 'Density')
#     plt.show()

# # Plot distributions for the features to check
# for feature in features_to_check:
#     if feature in df.columns:
#         plot_feature_distribution_by_target(df, feature)
#     else:
#         print(f"Warning: Feature '{feature}' not found in DataFrame.")

In [None]:
# features_to_check의 feature importance 확인
# print(feature_importance[[col for col in feature_importance.index if any(keyword in col for keyword in features_to_check)]])

# num__total_rec_late_fee           0.0358
# num__inq_last_6mths               0.0156
# num__fico_range_high              0.0089
# num__delinq_2yrs                  0.0025
# num__pub_rec                      0.0012
# num__pub_rec_bankruptcies         0.0009
# num__collections_12_mths_ex_med   0.0003
# num__chargeoff_within_12_mths     0.0002
# num__delinq_amnt                  0.0001
# num__acc_now_delinq               0.0001
# dtype: float64

- 타겟 특징과의 분포도 확인해봤지만 유의미한 특징으로 보이지 않음
- 따라서 해당 특징들은 학습에서 제외하였음

In [None]:
# features_to_check = [
#     'delinq_2yrs', 'fico_range_high', 'inq_last_6mths', 'pub_rec',
#     'total_rec_late_fee', 'collections_12_mths_ex_med', 'acc_now_delinq', 'chargeoff_within_12_mths', 'delinq_amnt'
# ]
# features_to_be_deleted = features_to_be_deleted + features_to_check

int 변환이 필요해보이는 특징
- 이 특징들은 Float type 이지만 int type 특징처럼 bar 분포를 가지고 있었습니다.
- 이에, 타겟 변수와의 분포 확인 후 유의미하다면, binning 후 범주형 특징으로 변환하면 좋을 것 같습니다.

- `open_acc_6m`, `open_act_il`, `open_il_12m`, `open_il_24m`, `open_rv_12m`, `open_rv_24m`, `inq_fi`, `total_cu_tl`, `inq_last_12m`

In [None]:
# 정수형에 가까운 float 특징 목록
# features_to_check_2nd = [
#     'open_acc_6m', 'open_act_il', 'open_il_12m', 'open_il_24m',
#     'open_rv_12m', 'open_rv_24m',
#     'inq_fi', 'total_cu_tl', 'inq_last_12m',
# ]

# # Calculate and plot the proportion of 'Default' for each feature value
# for feature in features_to_check_2nd:
#     if feature in df.columns:
#         # Calculate proportions of is_default for each unique value of the feature
#         # Group by the feature and then calculate the mean of 'is_default' (True=1, False=0)
#         default_proportion = df.groupby(feature)['is_default'].mean().reset_index()
#         default_proportion.columns = [feature, 'default_proportion']

#         # Plot the proportions
#         plt.figure(figsize=(10, 6))
#         # Modified sns.barplot call to address FutureWarning
#         sns.barplot(data=default_proportion, x=feature, y='default_proportion', hue=feature, palette='viridis', legend=False)
#         plt.title(f'Proportion of Default by {feature}')
#         plt.xlabel(feature)
#         plt.ylabel('Proportion of Default')
#         plt.xticks(rotation=45) # Rotate labels for better readability
#         plt.show()
#     else:
#         print(f"Warning: Feature '{feature}' not found in DataFrame.")

In [None]:
# print(feature_importance[[col for col in feature_importance.index if any(keyword in col for keyword in features_to_check_2nd)]])

# num__open_rv_24m    0.0179
# num__inq_fi         0.0173
# num__inq_last_12m   0.0168
# num__total_cu_tl    0.0144
# num__open_rv_12m    0.0106
# num__open_acc_6m    0.0089
# num__open_il_24m    0.0070
# num__open_act_il    0.0055
# num__open_il_12m    0.0045
# dtype: float64

이 특징들 또한 학습에서 제외하는 것이 맞다고 판단하였습니다.

# 모델 재학습
데이터 탐색 및 피처 엔지니어링을 적용한 후 모델 학습을 진행

## 첫번째 학습

### 학습할 피처

In [None]:
# Columns excluded as target leakage (per the list name) -- payment, recovery and
# last-FICO fields.
features_leakage_related = [
    'last_fico_range_high', 'last_fico_range_low', 'total_pymnt_inv', 'last_pymnt_amnt', 'out_prncp_inv', 'out_prncp',
    'total_rec_prncp', 'total_pymnt', 'recoveries', 'collection_recovery_fee', ]

# One member of each feature pair whose correlation was >= 0.9
features_highly_correlated = ['fico_range_low', 'loan_amnt', 'funded_amnt_inv', 'funded_amnt', 'num_sats',
                          'num_rev_tl_bal_gt_0', 'tot_cur_bal', 'total_bal_il', 'mo_sin_old_rev_tl_op']

# Features judged uninformative after checking their distribution against the target
features_to_check = [
    'delinq_2yrs', 'fico_range_high', 'inq_last_6mths', 'pub_rec',
    'total_rec_late_fee', 'collections_12_mths_ex_med', 'acc_now_delinq', 'chargeoff_within_12_mths', 'delinq_amnt'
]

# Integer-like float features that were also excluded from training
features_to_check_2nd = [
    'open_acc_6m', 'open_act_il', 'open_il_12m', 'open_il_24m',
    'open_rv_12m', 'open_rv_24m',
    'inq_fi', 'total_cu_tl', 'inq_last_12m',
]

X = df.drop(['is_default'] + features_leakage_related + features_highly_correlated
                           + features_to_check + features_to_check_2nd, axis=1)
y = df['is_default']
# Earlier cells (corr() over number/bool columns, groupby(...).mean()) treat
# is_default as numeric/boolean. Mapping string labels onto such a column would
# turn every value into NaN, so the label map is applied only to non-numeric data.
if pd.api.types.is_numeric_dtype(y):
    y = y.astype(bool)
else:
    y = y.map({'Default':True, 'NonDefault':False})

# Calculate scale_pos_weight for XGBoost and LightGBM (needed for Case 2 models)
# scale_pos_weight = count(negative class) / count(positive class)
classes = y.unique()
neg_class_label = False # Or the actual label for the negative class if not boolean
pos_class_label = True # Or the actual label for the positive class if not boolean

# Count the classes once instead of re-running value_counts() for every lookup
class_counts = y.value_counts()

if neg_class_label in class_counts.index and pos_class_label in class_counts.index:
    neg_count = class_counts[neg_class_label]
    pos_count = class_counts[pos_class_label]
    scale_pos_weight_value = neg_count / pos_count
    print(f"Calculated scale_pos_weight: {scale_pos_weight_value:.4f}")
else:
     print("Warning: Could not find expected negative or positive class labels in y.")
     scale_pos_weight_value = 1.0 # Default to 1 if labels not found

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb

# Define the models for Case 1 (SMOTE used in pipeline, no class weight in model)
# NOTE: the "Random Forest" entry previously ended with a stray quote character
# (`"Random Forest",'`), which made the whole cell a SyntaxError.
models_case1 = [{"name": "Logistic Regression",
                 "model": LogisticRegression(random_state=42, solver='liblinear', n_jobs=-1)},
                {"name": "Random Forest",
                 "model": RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)},
                {"name": "XGBoost",
                 "model": xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss',
                                            use_label_encoder=False, random_state=42, n_jobs=-1)},
                {"name": "LightGBM",
                 "model": lgb.LGBMClassifier(objective='binary', metric='binary_logloss', random_state=42, n_jobs=-1)}
]

# Define the models for Case 2 (No SMOTE, class weight in model)
models_case2 = [{"name": "Logistic Regression",
                 "model": LogisticRegression(random_state=42, solver='liblinear', class_weight='balanced', n_jobs=-1)},
                {"name": "Random Forest",
                 "model": RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced', n_jobs=-1)},
                {"name": "XGBoost",
                 "model": xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', use_label_encoder=False,
                                            random_state=42, n_jobs=-1, scale_pos_weight=scale_pos_weight_value)},
                {"name": "LightGBM",
                 "model": lgb.LGBMClassifier(objective='binary', metric='binary_logloss', random_state=42, n_jobs=-1,
                                             scale_pos_weight=scale_pos_weight_value)}
]


# Initialize dictionary to store results, nested by case
result = {"Case 1 (SMOTE)": {}, "Case 2 (Class Weight)": {}}

# Train and evaluate models for Case 1 (using train_evaluate_and_get_metrics with SMOTE).
# The helper is defined elsewhere in the notebook; it is unpacked here as
# (trained pipeline, metrics).
print("\n--- Training and Evaluating: Case 1 (SMOTE) ---")

for model_info in models_case1:
    print(f"\n--- Training and Evaluating: {model_info['name']} (Case 1) ---")
    trained_pipeline, metrics = train_evaluate_and_get_metrics(model_info['model'], X, y)

    # Only store metrics when the helper returned something truthy
    if metrics:
        result["Case 1 (SMOTE)"][model_info['name']] = metrics
        print(f"--- Finished: {model_info['name']} (Case 1) ---")
        print(f"Metrics stored: {metrics}")
    else:
        print(f"--- Finished: {model_info['name']} (Case 1) (Metrics not available) ---")


# Train and evaluate models for Case 2 (using train_evaluate_and_get_metrics_without_smote)
print("\n--- Training and Evaluating: Case 2 (Class Weight) ---")

for model_info in models_case2:
    print(f"\n--- Training and Evaluating: {model_info['name']} (Case 2) ---")
    trained_pipeline, metrics = train_evaluate_and_get_metrics_without_smote(model_info['model'], X, y)

    if metrics:
        result["Case 2 (Class Weight)"][model_info['name']] = metrics
        print(f"--- Finished: {model_info['name']} (Case 2) ---")
        print(f"Metrics stored: {metrics}")
    else:
        print(f"--- Finished: {model_info['name']} (Case 2) (Metrics not available) ---")


# Display the results dictionary
print("\n--- Model Performance Metrics Summary ---")
display(result)


# -- 결과저장 --
#--- Model Performance Metrics Summary ---
# {'Case 1 (SMOTE)': {'Logistic Regression': {'Accuracy': 0.7348342622469932,
#    'Precision': 0.23753360095681314,
#    'Recall': 0.7167256226104978,
#    'F1-Score': 0.3568137751925574},
#   'Random Forest': {'Accuracy': 0.8977559401584042,
#    'Precision': 0.5261023821591485,
#    'Recall': 0.03708864830099689,
#    'F1-Score': 0.06929238985313751},
#   'LightGBM': {'Accuracy': 0.8999156644177178,
#    'Precision': 0.5579176431201874,
#    'Recall': 0.11909100653875013,
#    'F1-Score': 0.1962839727923206}},
#  'Case 2 (Class Weight)': {'Logistic Regression': {'Accuracy': 0.7323041947785274,
#    'Precision': 0.2368017960080914,
#    'Recall': 0.7236216814949799,
#    'F1-Score': 0.3568319971808651},
#   'Random Forest': {'Accuracy': 0.8978952772073922,
#    'Precision': 0.6153846153846154,
#    'Recall': 0.013434809018472863,
#    'F1-Score': 0.026295545143016994},
#   'LightGBM': {'Accuracy': 0.7851312701672044,
#    'Precision': 0.2935861473729636,
#    'Recall': 0.7778611498195591,
#    'F1-Score': 0.4262818316216137}}}

## 2nd
데이터 탐색 결과로 삭제한 특징을 학습에 포함시킨 후 재학습 진행

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb

# Calculate scale_pos_weight for XGBoost and LightGBM (needed for Case 2 models)
# scale_pos_weight = count(negative class) / count(positive class)
classes = y.unique()
neg_class_label = False # Or the actual label for the negative class if not boolean
pos_class_label = True # Or the actual label for the positive class if not boolean

# Count the classes once instead of re-running value_counts() for every lookup
class_counts = y.value_counts()

if neg_class_label in class_counts.index and pos_class_label in class_counts.index:
    neg_count = class_counts[neg_class_label]
    pos_count = class_counts[pos_class_label]
    scale_pos_weight_value = neg_count / pos_count
    print(f"Calculated scale_pos_weight: {scale_pos_weight_value:.4f}")
else:
     print("Warning: Could not find expected negative or positive class labels in y.")
     scale_pos_weight_value = 1.0 # Default to 1 if labels not found

# Define the models for Case 1 (SMOTE used in pipeline, no class weight in model)
# NOTE: the "Random Forest" entry previously ended with a stray quote character
# (`"Random Forest",'`), which made the whole cell a SyntaxError.
models_case1 = [{"name": "Logistic Regression",
                 "model": LogisticRegression(random_state=42, solver='liblinear', n_jobs=-1)},
                {"name": "Random Forest",
                 "model": RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)},
                {"name": "XGBoost",
                 "model": xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss',
                                            use_label_encoder=False, random_state=42, n_jobs=-1)},
                {"name": "LightGBM",
                 "model": lgb.LGBMClassifier(objective='binary', metric='binary_logloss', random_state=42, n_jobs=-1)}
]

# Define the models for Case 2 (No SMOTE, class weight in model)
models_case2 = [{"name": "Logistic Regression",
                 "model": LogisticRegression(random_state=42, solver='liblinear', class_weight='balanced', n_jobs=-1)},
                {"name": "Random Forest",
                 "model": RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced', n_jobs=-1)},
                {"name": "XGBoost",
                 "model": xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', use_label_encoder=False,
                                            random_state=42, n_jobs=-1, scale_pos_weight=scale_pos_weight_value)},
                {"name": "LightGBM",
                 "model": lgb.LGBMClassifier(objective='binary', metric='binary_logloss', random_state=42, n_jobs=-1,
                                             scale_pos_weight=scale_pos_weight_value)}
]


# Initialize dictionary to store results, nested by case
result = {"Case 1 (SMOTE)": {}, "Case 2 (Class Weight)": {}}

# Train and evaluate models for Case 1 (using train_evaluate_and_get_metrics with SMOTE).
# The helper is defined elsewhere in the notebook; it is unpacked here as
# (trained pipeline, metrics).
print("\n--- Training and Evaluating: Case 1 (SMOTE) ---")

for model_info in models_case1:
    print(f"\n--- Training and Evaluating: {model_info['name']} (Case 1) ---")
    trained_pipeline, metrics = train_evaluate_and_get_metrics(model_info['model'], X, y)

    # Only store metrics when the helper returned something truthy
    if metrics:
        result["Case 1 (SMOTE)"][model_info['name']] = metrics
        print(f"--- Finished: {model_info['name']} (Case 1) ---")
        print(f"Metrics stored: {metrics}")
    else:
        print(f"--- Finished: {model_info['name']} (Case 1) (Metrics not available) ---")


# Train and evaluate models for Case 2 (using train_evaluate_and_get_metrics_without_smote)
print("\n--- Training and Evaluating: Case 2 (Class Weight) ---")

for model_info in models_case2:
    print(f"\n--- Training and Evaluating: {model_info['name']} (Case 2) ---")
    trained_pipeline, metrics = train_evaluate_and_get_metrics_without_smote(model_info['model'], X, y)

    if metrics:
        result["Case 2 (Class Weight)"][model_info['name']] = metrics
        print(f"--- Finished: {model_info['name']} (Case 2) ---")
        print(f"Metrics stored: {metrics}")
    else:
        print(f"--- Finished: {model_info['name']} (Case 2) (Metrics not available) ---")


# Display the results dictionary
print("\n--- Model Performance Metrics Summary ---")
display(result)


# -- 결과저장 --
#--- Model Performance Metrics Summary ---
# {'Case 1 (SMOTE)': {'Logistic Regression': {'Accuracy': 0.7348342622469932,
#    'Precision': 0.23753360095681314,
#    'Recall': 0.7167256226104978,
#    'F1-Score': 0.3568137751925574},
#   'Random Forest': {'Accuracy': 0.8977559401584042,
#    'Precision': 0.5261023821591485,
#    'Recall': 0.03708864830099689,
#    'F1-Score': 0.06929238985313751},
#   'LightGBM': {'Accuracy': 0.8999156644177178,
#    'Precision': 0.5579176431201874,
#    'Recall': 0.11909100653875013,
#    'F1-Score': 0.1962839727923206}},
#  'Case 2 (Class Weight)': {'Logistic Regression': {'Accuracy': 0.7323041947785274,
#    'Precision': 0.2368017960080914,
#    'Recall': 0.7236216814949799,
#    'F1-Score': 0.3568319971808651},
#   'Random Forest': {'Accuracy': 0.8978952772073922,
#    'Precision': 0.6153846153846154,
#    'Recall': 0.013434809018472863,
#    'F1-Score': 0.026295545143016994},
#   'LightGBM': {'Accuracy': 0.7851312701672044,
#    'Precision': 0.2935861473729636,
#    'Recall': 0.7778611498195591,
#    'F1-Score': 0.4262818316216137}}}