In [None]:
import pandas as pd 
import warnings
# Suppress every warning category for the rest of the session.
warnings.filterwarnings("ignore")

# Lift pandas' display truncation so frames render all rows and all columns.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

# Data Load

In [None]:
# Load the raw survey table. The commented-out path is the earlier export.
# data_path = "/workspace/data/0_Subtask/[DICCD 분석] 251107.csv" # ~251110
data_path = "/workspace/data/0_Subtask/[DICCD 분석] 251111_SRD 추가.csv" # 251111
raw_df = pd.read_csv(data_path)

# y = target variable / x = predictor variables
# except_cols: columns always excluded from the inputs.
# except_cols = ['DICCD_CP', 'ODD_Cur', 'ODD_Pas', 'DICCD_C', 'DICCD_P', 'MentD']
except_cols = ['Z_KDBDRS', 'K_ODD', 'KDBDRS', 'wt_s', 'DICCD_CP', 'ODD_Cur', 'ODD_Pas', 'DICCD_C', 'DICCD_P', 'MentD']
# Two parallel column groups; exactly one group is dropped below.
# NOTE(review): names suggest standardized (Z-score) vs. raw versions of the
# same measures — confirm against the data dictionary.
standard_cols = ['ZAI_Incom', 'Z_K_ODD', 'Z_K_CD', 'Z_K_IA', 'Z_K_HI', 'Z_GAD', 'Z_PHQ', 'Z_SAS']
nonstandard_cols = ['Incom', 'K_ODD', 'K_CD', 'K_IA', 'K_HI', 'GAD', 'PHQ', 'SAS']

# Drop the non-standard group (keeps `standard_cols`). 'K_ODD' appears in both
# `except_cols` and `nonstandard_cols`, so de-duplicate while preserving order.
drop_cols = list(dict.fromkeys(except_cols + nonstandard_cols))
# drop_cols = list(dict.fromkeys(except_cols + standard_cols))

print(f"기존 변수 수 : {len(raw_df.columns)}")
raw_df = raw_df.drop(columns=drop_cols) # input variables
# Report the columns that were actually dropped (previously this printed
# except_cols + standard_cols, which did not match `drop_cols`).
print(f"제거된 변수: {drop_cols}")
print(f"남은 변수 수: {len(raw_df.columns)}")

## Missing value imputation


In [None]:
from utils.data_imputation import filter_by_missing_ratio

# Apply the project's missing-ratio filter to the loaded frame.
# NOTE(review): presumably removes columns whose missing ratio exceeds
# `threshold` — confirm against utils.data_imputation.
df = filter_by_missing_ratio(raw_df, threshold=0.25, visualize=False)

# Separate the target column from the predictors.
y = df['ODD_CP']
X = df.drop(columns=['ODD_CP'])

# Data Preprocess

### 데이터 전처리 검토

1. 범주형

    1-1 Ordinal : Imputation(Median) => OrdinalEncoder

    1-2 Nominal : Imputation(Unknown) => OneHotEncoder

2. 수치형

    2-1 결측치 존재 시, Imputation(Median) 적용 => Z-표준화(StandardScaler)

In [None]:
from utils.data_preprocessor import check_preprocessing_needs, preprocess_dataframe
from utils.data_preprocessor import data_preprocess_pipeline
from utils.data_analyzer import analyze_correlation_matrix
# Review what preprocessing is needed (currently disabled)
# recommendations = check_preprocessing_needs(X_train, target_col='ODD_CP')

# Resampling strategy name; consumed later by the train/test split cell.
oversample_method = "SMOTETomek" # SMOTE / SMOTEEN / ADASYN / SMOTETomek

# Type-convert / prune columns according to the recommendations (optional).
# NOTE(review): 'IGD_P' is listed in both `convert_categorical` and
# `convert_binary`; which conversion wins depends on the helper — confirm the
# intended type and remove it from one list.
# NOTE(review): 'MentD' is in `except_cols` and is dropped in the data-load
# cell, so its entry in `convert_binary` presumably has no effect — confirm the
# helper ignores missing columns.
X = preprocess_dataframe(
    X,
    target_col='ODD_CP',
    drop_weight=True,  # drop weight variables
    convert_categorical=['Answ', 'IGD_P', 'FEdu', 'MEdu', 'FJob', 'MJob', 'Age_Grp', 'P_Marr'],
    # Duplicated entries (MAlc, FAlc, MTob, FTob were each listed twice) removed;
    # first-occurrence order is preserved.
    convert_ordinal=['ST1', 'ST2', 'ST3', 'ST4', 'PAF', 'MAlc', 'FAlc', 'MTob', 'FTob', 'GAlc', 'GTob'],
    convert_binary=['SRD_CP', 'IGD_P', 'Sex', 'PSleep', 'SBV', 'SBP', 'CBV', 'CBP', 'GDec', 'BF', 'RFG', 'MentD', 'AdolSlp', 'MoodD', 'AnxD'],
    drop_low_variance=False,  # low-variance column removal is switched off
    drop_leakage=True  # drop variables flagged as data-leakage risks
)

# NOTE(review): this runs on the full feature frame before the train/test
# split; if the pipeline fits imputers/scalers, test-set statistics leak into
# training — confirm, and consider fitting on the training split only.
X = data_preprocess_pipeline(X) # preprocessed data

# X_preprocessed.info()
# analyze_correlation_matrix(X, y)

### Stratified Splitting

In [None]:
from utils.data_splitter import oversample_train_test_split, downsample_train_test_split

# Features and target re-joined into one frame; within the visible cells this
# is only consumed by the commented-out downsampling variant below.
df = pd.concat([X, y], axis=1)

# Keyword options for the oversampling split, gathered in one place.
split_options = dict(
    target_col='ODD_CP',
    test_size_per_class=60,
    train_size_per_class=3145,  # w/o random sampling: 3145, w/ random sampling: 240
    random_state=42,
    verbose=True,
    method=oversample_method,  # SMOTE / SMOTEEN / ADASYN / SMOTETomek
)
X_train, X_test, y_train, y_test = oversample_train_test_split(X, y, **split_options)

# Alternative: random downsampling only
# X_train, X_test, y_train, y_test = downsample_train_test_split(
#     df,
#     target_col='ODD_CP',
#     n_train_class0=None,    # take all of them
#     n_test_per_class=60,   # 60 samples per class in the test set
#     random_state=42,
#     verbose=True
# )



# Train & Eval

In [None]:
from utils.ml_model import MultiModelFoldTrainer

# Example: compare all models.
# Per the notebook author's note, models_to_train=None selects every available
# model; pass an explicit list to restrict the comparison, e.g.
# ['CatBoost', 'XGBoost', 'LightGBM', 'RandomForest', 'GradientBoosting', 'LogisticRegression'].
trainer_options = {
    'models_to_train': None,
    'n_splits': 5,
    'random_state': 42,
    'T': 0.01,  # NOTE(review): meaning of T is defined by the trainer — confirm
}
multi_model_trainer = MultiModelFoldTrainer(**trainer_options)

# Train on the training split; the test split is passed to the same call.
multi_model_trainer.fit(X_train, y_train, X_test, y_test=y_test)

# 저장

In [None]:
# Persist the results of every trained model to a single pickle file.
import pickle
from pathlib import Path

# numpy is needed for np.mean below; it is not imported in the notebook's
# import cell, so import it here to keep Restart & Run All working.
import numpy as np

# Test-set inputs plus the per-model outputs exposed by the trainer's getters.
save_dict = {
    'test_inputs': X_test,
    'test_labels': multi_model_trainer.get_test_labels(),
    'test_proba': multi_model_trainer.get_test_proba(),
    'test_preds': multi_model_trainer.get_test_preds(),
    'test_metrics': multi_model_trainer.get_test_metrics(),
    'fold_thresholds': multi_model_trainer.get_fold_thresholds(),
    'shap_values_test': multi_model_trainer.get_shap_values_test(),
}

# Feature importances, one DataFrame per model.
feature_importances = multi_model_trainer.get_feature_importances()
feature_importance_dfs = {}
for model_name, fold_importances in feature_importances.items():
    # Models with no recorded importances are skipped.
    if len(fold_importances) > 0:
        # Average across the first axis (presumably one entry per fold — confirm).
        avg_importance = np.mean(fold_importances, axis=0)
        feature_importance_dfs[model_name] = pd.DataFrame({
            'feature': X_train.columns,
            'importance': avg_importance
        }).sort_values(by='importance', ascending=False)

save_dict['feature_importances'] = feature_importance_dfs

# Also store the model-comparison summaries.
save_dict['comparison_results'] = {
    'validation': multi_model_trainer.weighted_avg_metrics,
    'test': multi_model_trainer.weighted_avg_test_metrics
}

save_path = "/workspace/data/results/models_comparison.pkl"
# Create the output directory up front so open() cannot fail on a missing folder.
Path(save_path).parent.mkdir(parents=True, exist_ok=True)
with open(save_path, 'wb') as f:
    pickle.dump(save_dict, f)

print(f"✓ 결과 저장 완료: {save_path}")
# NOTE(review): the trainer was constructed with models_to_train=None; this
# line assumes the trainer replaces None with the resolved model list — confirm.
print(f"  저장된 모델: {list(multi_model_trainer.models_to_train)}")

# 모델 비교