### Lasso 기반 피처선택 및 SMOTE 적용

- 전처리된 데이터 불러오기
- VIF 기반으로 1차 선별된 피처(`features_v1_vif.csv`)에 Lasso(L1) 로지스틱 회귀를 적용해 최종 변수 선택
- 선택된 변수만 추출, 표준화
- BorderlineSMOTE로 학습 데이터의 클래스 불균형 보정
- 학습/검증 데이터 csv 저장

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE, BorderlineSMOTE

def load_and_prepare_data(path):
    """Load the preprocessed feature table and split it into features and target.

    The CSV is read with ``stock_code`` and ``year`` kept as strings, the rows
    are ordered by ``year``, and only numeric columns are kept as features.

    Args:
        path: Location of the CSV file to read.

    Returns:
        A tuple ``(X, y, df)``: ``X`` holds the numeric feature columns, ``y``
        is the ``is_defaulted`` column, and ``df`` is the full frame sorted by
        ``year``.

    Raises:
        KeyError: If the file has no ``year`` or ``is_defaulted`` column.
    """
    df = pd.read_csv(path, dtype={'stock_code': str, 'year': str})
    # 'Unnamed: 0' is only present when the CSV was written together with its
    # index; tolerate files saved with index=False instead of raising KeyError.
    df = df.drop(columns=['Unnamed: 0'], errors='ignore')
    df = df.sort_values(by='year')
    X = df.drop(columns=['is_defaulted', 'corp_nm', 'year'], errors='ignore')
    y = df['is_defaulted']
    # String identifiers such as stock_code are removed here because they are
    # read as str and therefore are not numeric.
    X = X.select_dtypes(include='number')
    return X, y, df

def time_series_split(X, y, n_splits=5):
    """Split ``X`` and ``y`` using the final fold of a ``TimeSeriesSplit``.

    Args:
        X: Feature table, indexable by position through ``.iloc``.
        y: Target values, indexable by position through ``.iloc``.
        n_splits: Number of folds requested from ``TimeSeriesSplit``.

    Returns:
        ``(X_train, X_val, y_train, y_val)`` taken from the last fold the
        splitter yields; earlier folds are not used.
    """
    splitter = TimeSeriesSplit(n_splits=n_splits)
    # Exhaust the generator and keep only its last (train, validation) pair.
    *_, (fit_idx, holdout_idx) = splitter.split(X)
    return (
        X.iloc[fit_idx],
        X.iloc[holdout_idx],
        y.iloc[fit_idx],
        y.iloc[holdout_idx],
    )

def lasso_feature_selection(X_train, y_train, X_val, random_state=42):
    """Standardize the features and keep those retained by an L1 logistic model.

    The scaler is fitted on the training data only and then applied to the
    validation data. An L1-penalized ``LogisticRegression`` is fitted on the
    scaled training data and wrapped in ``SelectFromModel`` to pick columns.

    Args:
        X_train: Training features; must expose ``.columns``.
        y_train: Training target.
        X_val: Validation features with the same columns as ``X_train``.
        random_state: Seed passed to ``LogisticRegression``.

    Returns:
        ``(X_train_selected, X_val_selected, selected_features, scaler,
        selector)``: the scaled and column-reduced training and validation
        matrices, the names of the kept columns, and the fitted scaler and
        selector.
    """
    scaler = StandardScaler()
    train_std = scaler.fit_transform(X_train)
    val_std = scaler.transform(X_val)

    l1_logit = LogisticRegression(
        penalty='l1',
        solver='liblinear',
        random_state=random_state,
    )
    l1_logit.fit(train_std, y_train)

    # prefit=True: reuse the model fitted above rather than refitting it.
    selector = SelectFromModel(l1_logit, prefit=True)
    train_kept = selector.transform(train_std)
    val_kept = selector.transform(val_std)

    keep_mask = selector.get_support()
    selected_features = X_train.columns[keep_mask]
    print("✅ 선택된 피처:", selected_features.tolist())
    return train_kept, val_kept, selected_features, scaler, selector

def apply_smote(X_train, y_train, random_state=42):
    """Oversample the training data with ``BorderlineSMOTE``.

    Args:
        X_train: Training features; must expose ``.shape``.
        y_train: Training target as a pandas object; its index is reset
            before resampling.
        random_state: Seed passed to ``BorderlineSMOTE``.

    Returns:
        ``(X_train_smote, y_train_smote)`` as returned by ``fit_resample``.
    """
    sampler = BorderlineSMOTE(random_state=random_state)
    # Drop the original row labels from the target before resampling.
    y_positional = y_train.reset_index(drop=True)
    X_train_smote, y_train_smote = sampler.fit_resample(X_train, y_positional)
    print(f"✅ SMOTE 적용: {X_train.shape} → {X_train_smote.shape}")
    return X_train_smote, y_train_smote

def _join_features_and_target(features, target):
    """Return ``features`` with ``target`` appended as an ``is_defaulted`` column."""
    feature_frame = pd.DataFrame(features).reset_index(drop=True)
    target_column = pd.Series(target, name='is_defaulted').reset_index(drop=True)
    if len(feature_frame) != len(target_column):
        raise ValueError(
            f"features have {len(feature_frame)} rows but target has "
            f"{len(target_column)}"
        )
    # Both indices were reset above, so concat pairs rows by position. Without
    # that, differing index labels would be outer-joined into extra NaN rows.
    return pd.concat([feature_frame, target_column], axis=1)


def save_to_csv(X_train, y_train, X_val, y_val, train_path, val_path):
    """Write the training and validation sets to CSV, target in the last column.

    Features and target are paired row by row in their given order; index
    labels are ignored and are not written to the files.

    Args:
        X_train: Training features (DataFrame or 2-D array).
        y_train: Training target, same length as ``X_train``.
        X_val: Validation features (DataFrame or 2-D array).
        y_val: Validation target, same length as ``X_val``.
        train_path: Destination of the training CSV.
        val_path: Destination of the validation CSV.

    Raises:
        ValueError: If a feature table and its target differ in length.
    """
    train_output = _join_features_and_target(X_train, y_train)
    val_output = _join_features_and_target(X_val, y_val)
    train_output.to_csv(train_path, index=False)
    val_output.to_csv(val_path, index=False)
    print(f"✅ 학습/검증 데이터 저장: {train_path}, {val_path}")

# === Run the pipeline ===
# 1) Load the feature table and order it by year.
X, y, df = load_and_prepare_data('../../2_EDA/JY/features_v1_vif.csv')

# 2) Hold out the final time-series fold for validation.
X_train, X_val, y_train, y_val = time_series_split(X, y)

# 3) Standardize and keep the columns selected by the L1 logistic model.
(
    X_train_selected,
    X_val_selected,
    selected_features,
    scaler,
    selector,
) = lasso_feature_selection(X_train, y_train, X_val)

# 4) Re-attach column names to the selected matrices.
X_train_final = pd.DataFrame(X_train_selected, columns=selected_features)
X_val_final = pd.DataFrame(X_val_selected, columns=selected_features)

# 5) Oversample the training split only; validation data is left untouched.
X_train_smote, y_train_smote = apply_smote(X_train_final, y_train)

# 6) Persist both splits; y_val is re-indexed to line up with X_val_final.
save_to_csv(
    X_train_smote,
    y_train_smote,
    X_val_final,
    y_val.reset_index(drop=True),
    'train_data_with_smote.csv',
    'val_data.csv',
)


✅ 선택된 피처: ['유동비율', '순운전자본비율', '현금비율', '현금흐름부채비율', '자본잠식여부', '총자산증가율', '유동자산증가율', '매출액증가율', '순이익증가율', '영업이익증가율', '자산대비영업현금흐름', 'ROE', 'ROA', '총자산영업이익율', '이익잉여금비율', '비유동자산회전율', '판관비율', '매출채권회전율', '재고자산회전율', '재무활동의존도']
✅ SMOTE 적용: (13899, 20) → (27484, 20)
✅ SMOTE 적용: (13899, 20) → (27484, 20)
✅ 학습/검증 데이터 저장: train_data_with_smote.csv, val_data.csv
✅ 학습/검증 데이터 저장: train_data_with_smote.csv, val_data.csv
