In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# Load the preprocessed dataset.
df = pd.read_csv('../../1_preprocessing/YANG/preprocessed_data.csv')

# Order rows chronologically by the '회계년도' (fiscal year) column.
# A stable sort is requested explicitly: the default quicksort gives no
# guarantee about the relative order of rows that share the same year, and
# the positional train/validation split performed later in this script
# depends on that order. With a stable sort, tied rows keep their CSV order,
# so the split is reproducible across machines and library versions.
df = df.sort_values(by='회계년도', kind='mergesort')

# Separate the target from the features. The company name and fiscal year
# columns are removed if present (errors='ignore'), and only numeric columns
# are kept as model inputs.
# NOTE(review): '거래소코드' looks like an exchange/ticker code, i.e. an
# identifier rather than a financial measure. It is numeric, so it survives
# this filter and can be picked up by feature selection - confirm whether it
# should be excluded as well.
X = df.drop(columns=['is_defaulted', '회사명', '회계년도'], errors='ignore')
y = df['is_defaulted']
X = X.select_dtypes(include='number')

# Chronological hold-out: build the 5 TimeSeriesSplit folds and keep only the
# last one. The folds are positional, so they rely on X already being sorted
# in time order. The last fold is selected directly instead of looping over
# every fold and re-slicing X and y each time just to keep the final result.
tscv = TimeSeriesSplit(n_splits=5)
train_index, val_index = list(tscv.split(X))[-1]
X_train, X_val = X.iloc[train_index], X.iloc[val_index]
y_train, y_val = y.iloc[train_index], y.iloc[val_index]

# Standardize the features. The scaler is fitted on the training fold only
# and the same fitted transform is then applied to both folds, so no
# validation statistics enter the scaling.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled, X_val_scaled = (
    scaler.transform(fold) for fold in (X_train, X_val)
)

# Feature selection with an L1-penalized logistic regression fitted on the
# scaled training fold. SelectFromModel wraps the already-fitted estimator
# (prefit=True) and its mask is applied to both folds.
model = LogisticRegression(
    penalty='l1',
    solver='liblinear',
    random_state=42,
).fit(X_train_scaled, y_train)

selector = SelectFromModel(model, prefit=True)
X_train_selected, X_val_selected = (
    selector.transform(scaled) for scaled in (X_train_scaled, X_val_scaled)
)

# Map the boolean support mask back to the original column names.
selected_features = X.columns[selector.get_support()]
print("✅ 선택된 피처:", selected_features.tolist())

# Wrap the selected (scaled) arrays in DataFrames labelled with those names.
# Both frames get a fresh default index.
X_train_final, X_val_final = (
    pd.DataFrame(selected, columns=selected_features)
    for selected in (X_train_selected, X_val_selected)
)

# Oversample with SMOTE on the training fold only; the validation fold is
# left untouched. The target index is reset so it lines up positionally with
# the freshly indexed training feature frame.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(
    X_train_final,
    y_train.reset_index(drop=True),
)

# Assemble features + target side by side for each split.
train_output = pd.concat(
    [X_train_smote, pd.Series(y_train_smote, name='is_defaulted')],
    axis='columns',
)
val_output = pd.concat(
    [X_val_final.reset_index(drop=True), y_val.reset_index(drop=True)],
    axis='columns',
)

# Write both splits to CSV in the working directory, without the index.
train_output.to_csv('train_data_with_smote.csv', index=False)
val_output.to_csv('val_data.csv', index=False)

print("✅ SMOTE 적용된 학습 데이터 저장: train_data_with_smote.csv")
print("✅ 검증 데이터 저장: val_data.csv")


✅ 선택된 피처: ['거래소코드', '자산(*)(IFRS)(천원)', '유동자산(*)(IFRS)(천원)', '이익잉여금(결손금)(*)(IFRS)(천원)', '비유동부채 (*)(IFRS)(천원)', '유동부채(*)(IFRS)(천원)', '현금및현금성자산(*)(IFRS)(천원)', '영업활동으로 인한 현금흐름(간접법)(*)(IFRS)(천원)', '재무활동으로 인한 현금흐름(*)(IFRS)(천원)', '기초 현금및현금성자산(IFRS)(천원)', '매출액(수익)(*)(IFRS)(천원)', '매출총이익(손실)(IFRS)(천원)', '* (정상)영업손익(보고서기재)(IFRS)(천원)', '매출원가(*)(IFRS)(천원)', '당기순이익(손실)(IFRS)(천원)', '매출채권(IFRS)(천원)']
✅ SMOTE 적용된 학습 데이터 저장: train_data_with_smote.csv
✅ 검증 데이터 저장: val_data.csv
