In [None]:
from glob import glob 

# Directory that is scanned for pickled result files (*.pkl) further down;
# the CSV exports of the feature-importance section are written to the same place.
result_root_dir = "/workspace/data/results"

In [None]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, 
    roc_auc_score, roc_curve, confusion_matrix, classification_report
)
from scipy.stats import mode

def pkl_load(path):
    """Read one pickled object from ``path`` and return it.

    The file is opened in binary mode and closed again before returning.

    NOTE(review): unpickling can execute arbitrary code, so only point this
    at result files produced by a trusted pipeline.
    """
    with open(path, 'rb') as stream:
        loaded = pickle.load(stream)
    return loaded

# Load every pickled result file found in the results directory.
# glob() returns paths in arbitrary filesystem order, so they are sorted to
# make the load order (and the log printed below) reproducible.
pkl_files = sorted(glob(f"{result_root_dir}/*.pkl"))
results_dict = {}

for pkl_file in pkl_files:
    # Strip only the final extension. Cutting at the first dot would turn
    # "run.v1.pkl" and "run.v2.pkl" into the same key "run", and the second
    # file would silently overwrite the first one in results_dict.
    model_name = pkl_file.split('/')[-1].rsplit('.', 1)[0]
    pkl_dict = pkl_load(pkl_file)
    results_dict[model_name] = pkl_dict
    print(f"✓ {model_name} 로드 완료")



## 4. Feature Importance 비교


In [None]:
import json 

# Read the variable-category definitions. A context manager is used so the
# file handle is closed deterministically instead of being left to the GC.
with open('/workspace/data/variable_category.json', 'r') as category_file:
    feature_category = json.load(category_file)

# Fetch the LightGBM feature-importance table of the downsample_SMOTEEN run
# (it is iterated below as a DataFrame with 'feature' and 'importance' columns).
feature_importances = results_dict['models_comparison_downsample_SMOTEEN']['feature_importances']['LightGBM']

# Names of the parent-side and adolescent-side variables
parent_var_names = [var['name'] for var in feature_category['parent_variables']]
adolescent_var_names = [var['name'] for var in feature_category['adolescent_variables']]

def aggregate_onehot_features(df, var_names):
    """
    Collapse one-hot encoded columns back onto their source variable.

    Example: P_Marr_1, P_Marr_2 -> P_Marr (importances are summed).

    A column is treated as a one-hot level of ``var`` when its name is
    ``var + '_' + suffix`` and the suffix consists of digits, optionally with
    dots (e.g. ``1`` or ``1.0``). A column whose name is itself listed in
    ``var_names`` is always kept under its own name, even if it also looks
    like a one-hot level of a shorter variable (``A_1`` vs. ``A``).

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a 'feature' column (column names) and an 'importance'
        column (numeric scores).
    var_names : iterable of str
        Names of the original, pre-encoding variables.

    Returns
    -------
    pandas.DataFrame
        Columns 'feature' and 'importance', one row per base variable, sorted
        by importance in descending order. Features that match no variable
        are kept under their own name.
    """
    var_names = list(var_names)
    exact_names = set(var_names)

    def resolve_base_name(feature_name):
        # Exact names win: the prefix rule below must not swallow a real
        # variable that merely ends in "_<digits>".
        if feature_name in exact_names:
            return feature_name
        for var_name in var_names:
            prefix = var_name + '_'
            if feature_name.startswith(prefix):
                suffix = feature_name[len(prefix):]
                # Digits with optional dots ("1", "1.0"); a pure-digit suffix
                # is unchanged by the replace, so one check covers both forms.
                if suffix.replace('.', '').isdigit():
                    return var_name
        # Not a known variable: keep the column name as it is.
        return feature_name

    aggregated = {}
    for feature_name, importance in zip(df['feature'], df['importance']):
        base_name = resolve_base_name(feature_name)
        aggregated[base_name] = aggregated.get(base_name, 0) + importance

    # Convert to a DataFrame, most important variable first
    result_df = pd.DataFrame({
        'feature': list(aggregated.keys()),
        'importance': list(aggregated.values())
    }).sort_values('importance', ascending=False)

    return result_df

# Assign every model input column to the parent or adolescent group (or to
# "other") and record it under the name of its source variable.
parent_features = []
adolescent_features = []
other_features = []


def _match_onehot_prefix(feature_name, var_names):
    """Return the variable that `feature_name` is a one-hot level of, else None.

    A match requires the form ``var + '_' + suffix`` where the suffix consists
    of digits, optionally with dots (e.g. ``1`` or ``1.0``).
    """
    for var_name in var_names:
        prefix = var_name + '_'
        if feature_name.startswith(prefix):
            suffix = feature_name[len(prefix):]
            if suffix.replace('.', '').isdigit():
                return var_name
    return None


for feature_name, importance in zip(feature_importances['feature'], feature_importances['importance']):
    # Exact names are resolved first, in both groups. Otherwise a listed
    # variable such as "X_1" would be mistaken for a one-hot level of "X".
    if feature_name in parent_var_names:
        base_name, var_category = feature_name, 'parent'
    elif feature_name in adolescent_var_names:
        base_name, var_category = feature_name, 'adolescent'
    else:
        base_name = _match_onehot_prefix(feature_name, parent_var_names)
        var_category = 'parent' if base_name is not None else None
        if base_name is None:
            base_name = _match_onehot_prefix(feature_name, adolescent_var_names)
            var_category = 'adolescent' if base_name is not None else None

    # Route the record to its group; unmatched columns keep their own name
    if var_category == 'parent':
        parent_features.append({'feature': base_name, 'importance': importance})
    elif var_category == 'adolescent':
        adolescent_features.append({'feature': base_name, 'importance': importance})
    else:
        other_features.append({'feature': feature_name, 'importance': importance})

# Merge step: add up the importance of all records that share a feature name
def merge_features(feature_list):
    """Sum importances per feature name.

    Parameters
    ----------
    feature_list : list of dict
        Records with the keys 'feature' and 'importance'.

    Returns
    -------
    pandas.DataFrame
        Columns 'feature' and 'importance', one row per distinct feature,
        sorted by importance in descending order.
    """
    totals = {}
    for record in feature_list:
        key = record['feature']
        if key not in totals:
            totals[key] = record['importance']
        else:
            totals[key] += record['importance']

    names = list(totals.keys())
    scores = list(totals.values())
    table = pd.DataFrame({'feature': names, 'importance': scores})
    return table.sort_values('importance', ascending=False)

# Per-group importance tables plus the combined parent + adolescent ranking
parent_df = merge_features(parent_features)
adolescent_df = merge_features(adolescent_features)
total_df = pd.concat([parent_df, adolescent_df]).sort_values('importance', ascending=False)

# Keep the seven strongest variables of each group
top7_parent, top7_adolescent = parent_df.head(7), adolescent_df.head(7)

separator = "=" * 80
for leading, heading, table in (
    ("", "부모 변인 상위 7개", top7_parent),
    ("\n", "청소년 변인 상위 7개", top7_adolescent),
):
    print(leading + separator)
    print(heading)
    print(separator)
    display(table)

print("\n✅ 부모 변인 상위 7개와 청소년 변인 상위 7개 추출 완료")

# The overall top seven can only come from the two per-group top-seven lists
top7_total = (
    pd.concat([top7_parent, top7_adolescent])
    .sort_values('importance', ascending=False)
    .head(7)
    .reset_index(drop=True)
)
display(top7_total)

# Persist the tables next to the model results
for table, csv_path in (
    (top7_parent, '/workspace/data/results/top7_parent.csv'),
    (top7_adolescent, '/workspace/data/results/top7_adolescent.csv'),
    (top7_total, '/workspace/data/results/top7_total.csv'),
    (total_df, '/workspace/data/results/total_feature_importance.csv'),
):
    table.to_csv(csv_path)

In [None]:
# Visualize Feature Importances for Parent, Adolescent, and Combined (Top 7 for each) as a 1x3 subplot

import matplotlib.pyplot as plt
import numpy as np

# One horizontal bar chart per table: parent, adolescent, combined
feature_dfs = [top7_parent, top7_adolescent, top7_total.head(7)]
titles = ['Top 7 Parent Features', 'Top 7 Adolescent Features', 'Top 7 Combined Features']
cmap = plt.cm.viridis

fig, axes = plt.subplots(1, 3, figsize=(21, 6), sharey=False)
for ax, panel_df, panel_title in zip(axes, feature_dfs, titles):
    n_bars = len(panel_df)
    bar_positions = range(n_bars)
    # Spread the colormap evenly over the bars of this panel
    bar_colors = cmap(np.linspace(0, 1, n_bars))

    ax.barh(bar_positions, panel_df['importance'], color=bar_colors)
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(panel_df['feature'], fontsize=10)
    ax.invert_yaxis()  # first row of the table ends up at the top
    ax.set_xlabel('Importance', fontsize=12, fontweight='bold')
    ax.set_title(panel_title, fontsize=13, fontweight='bold', pad=10)
    ax.grid(axis='x', alpha=0.3, linestyle='--')

    # Print the numeric value at the end of every bar
    for bar_pos, value in enumerate(panel_df['importance']):
        ax.text(value, bar_pos, f' {value:.1f}', va='center', fontsize=9)

fig.tight_layout()
plt.show()

# Show the tables behind the three panels
for heading, table in (
    ("\n=== Top 7 Parent Features ===", top7_parent),
    ("\n=== Top 7 Adolescent Features ===", top7_adolescent),
    ("\n=== Top 7 Combined (Parent + Adolescent) Features ===", top7_total.head(7)),
):
    print(heading)
    display(table)

## 5. SHAP Value 비교

In [None]:
import shap
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import json

# SHAP visualisation input: only LightGBM from the downsample_SMOTEEN run
all_combinations = []
for sampling_name, result in results_dict.items():
    # Prefer the stored sampling method; otherwise derive it from the result key
    fallback_method = sampling_name.replace('models_comparison_', '')
    sampling_method = result.get('sampling_method', fallback_method)
    if sampling_method == 'downsample_SMOTEEN':
        shap_values_test = result.get('shap_values_test', {})
        test_inputs = result.get('test_inputs', None)

        # Both the test inputs and a per-model SHAP dict are required
        usable = test_inputs is not None and isinstance(shap_values_test, dict)
        if usable and 'LightGBM' in shap_values_test:
            shap_values_list = shap_values_test['LightGBM']
            # Only a non-empty list is accepted
            if isinstance(shap_values_list, list) and shap_values_list:
                all_combinations.append((sampling_method, 'LightGBM', shap_values_list, test_inputs))

if len(all_combinations) == 0:
    print("⚠️  downsample_SMOTEEN의 LightGBM SHAP values 데이터가 없습니다.")
else:
    for plot_idx, (sampling_method, model_name, shap_values_list, test_inputs) in enumerate(all_combinations):
        # SHAP values are stored as one entry per fold, so average over folds.
        # Shapes below are the ones the original author noted — TODO confirm
        # against the pipeline that writes the pickle.
        shap_values_array = np.array(shap_values_list)  # (n_folds, n_samples, n_features)
        shap_values_mean = np.mean(shap_values_array, axis=0)  # (n_samples, n_features)

        feature_names = list(test_inputs.columns)

        # Global importance per column: mean absolute SHAP value over samples
        shap_importances = np.abs(shap_values_mean).mean(axis=0)  # (n_features,)

        # Table of per-column SHAP importance
        shap_df = pd.DataFrame({'feature': feature_names, 'shap_importance': shap_importances})

        # Reload the parent / adolescent variable lists. A context manager is
        # used so no file handle is leaked on each loop iteration.
        with open('/workspace/data/variable_category.json', 'r') as category_file:
            feature_category = json.load(category_file)
        parent_var_names = [var['name'] for var in feature_category['parent_variables']]
        adolescent_var_names = [var['name'] for var in feature_category['adolescent_variables']]

        # Map a (possibly one-hot encoded) column name back to its source variable
        def extract_base_feature(name, group_names):
            """Return the variable in ``group_names`` that ``name`` belongs to.

            ``name`` belongs to a variable when it equals the variable name, or
            when it has the form ``variable + '_' + suffix`` with a suffix made
            of digits and optional dots (e.g. ``1`` or ``1.0``). Returns None
            when nothing matches.
            """
            # Exact names win: without this, a listed variable such as "A_1"
            # would be reported as a one-hot level of "A" whenever "A"
            # happens to come first in group_names.
            if name in group_names:
                return name
            for v in group_names:
                prefix = v + '_'
                if name.startswith(prefix):
                    tail = name[len(prefix):]
                    # A pure-digit tail is unchanged by the replace, so this
                    # single check covers both "1" and "1.0".
                    if tail.replace('.', '').isdigit():
                        return v
            return None

        # Map every input column to its source variable (merging one-hot
        # levels) and keep the reverse mapping base -> original columns.
        feature_to_base = {}
        base_to_original_features = {}

        for fname in feature_names:
            if fname in parent_var_names or fname in adolescent_var_names:
                # A column that is itself a listed variable keeps its own
                # name. Checking this for both groups first prevents e.g. an
                # adolescent variable "X_1" from being folded into parent "X".
                base = fname
            else:
                # Parent variables are tried first, then adolescent ones
                base = extract_base_feature(fname, parent_var_names)
                if base is None:
                    base = extract_base_feature(fname, adolescent_var_names)
                if base is None:
                    base = fname  # anything else stays as it is

            feature_to_base[fname] = base
            base_to_original_features.setdefault(base, []).append(fname)

        # Combine the SHAP values and the input values of all columns that
        # belong to one base variable
        def aggregate_shap_for_base(base_name, shap_values_mean, test_inputs, feature_names):
            """Aggregate per-sample SHAP and input values for one base variable.

            The columns of ``base_name`` are looked up in the enclosing
            ``base_to_original_features`` mapping. Their SHAP values are summed
            per sample and their input values are averaged per sample.

            Returns ``(shap_combined, feature_combined)``, or ``(None, None)``
            when none of the columns is present in ``feature_names``.

            NOTE(review): for a one-hot group the averaged input value is the
            mean of its indicator columns, not the original category — keep
            that in mind when reading the colour scale.
            """
            column_positions = []
            for column in base_to_original_features[base_name]:
                if column in feature_names:
                    column_positions.append(feature_names.index(column))

            if not column_positions:
                return None, None

            shap_block = shap_values_mean[:, column_positions]
            input_block = test_inputs.iloc[:, column_positions]

            shap_combined = shap_block.sum(axis=1)             # sum over the group's columns
            feature_combined = input_block.mean(axis=1).values  # mean over the group's columns
            return shap_combined, feature_combined

        # Split the per-column SHAP importances into parent / adolescent / other
        parent_shap_list = []
        adolescent_shap_list = []
        other_shap_list = []

        for fname, s_imp in zip(shap_df['feature'], shap_df['shap_importance']):
            base = feature_to_base[fname]
            entry = {'feature': base, 'shap_importance': s_imp}

            # The group is decided by the base variable, not the raw column name
            if base in parent_var_names:
                parent_shap_list.append(entry)
            elif base in adolescent_var_names:
                adolescent_shap_list.append(entry)
            else:
                other_shap_list.append(entry)

        # Grouping helper: add up the SHAP importance per base variable
        def merge_shap_list(shap_list):
            """Sum 'shap_importance' per 'feature' over a list of records.

            Returns a DataFrame with the columns 'feature' and
            'shap_importance', sorted by importance in descending order.
            """
            totals = {}
            for entry in shap_list:
                key = entry['feature']
                if key not in totals:
                    totals[key] = entry['shap_importance']
                else:
                    totals[key] += entry['shap_importance']

            merged_df = pd.DataFrame({
                'feature': list(totals.keys()),
                'shap_importance': list(totals.values())
            })
            return merged_df.sort_values('shap_importance', ascending=False)

        # Per-group SHAP importance tables
        parent_shap_df = merge_shap_list(parent_shap_list)
        adolescent_shap_df = merge_shap_list(adolescent_shap_list)
        # Combined ranking covers parent + adolescent only; "other" is left out
        combined_shap_df = pd.concat([parent_shap_df, adolescent_shap_df])
        total_shap_df = combined_shap_df.sort_values('shap_importance', ascending=False)

        # How many variables each panel shows
        top_n_parent = 7
        top_n_adolescent = 7
        top_n_total = 14  # parent and adolescent together

        parent_top_features = parent_shap_df['feature'].head(top_n_parent).tolist()
        adolescent_top_features = adolescent_shap_df['feature'].head(top_n_adolescent).tolist()
        total_top_features = total_shap_df['feature'].head(top_n_total).tolist()

        # Create the 1x3 panel figure
        fig, axes = plt.subplots(1, 3, figsize=(21, 8))

        # Collect the aggregated input values of every plotted variable so all
        # panels can share one colour scale.
        all_feature_values = []
        for base_name in list(set(parent_top_features + adolescent_top_features + total_top_features)):
            shap_vals, feat_vals = aggregate_shap_for_base(base_name, shap_values_mean, test_inputs, feature_names)
            if feat_vals is not None:
                all_feature_values.extend(feat_vals)

        # Use finite values only: np.min / np.max return NaN as soon as one
        # input is NaN, which would leave the shared colour scale undefined.
        finite_feature_values = np.asarray(all_feature_values, dtype=float)
        finite_feature_values = finite_feature_values[np.isfinite(finite_feature_values)]

        # Fall back to the range [0, 1] when nothing usable was collected
        vmin = finite_feature_values.min() if finite_feature_values.size > 0 else 0
        vmax = finite_feature_values.max() if finite_feature_values.size > 0 else 1
        
        # Panel layout: combined ranking first, then parent, then adolescent
        plot_configs = [
            ('Total', total_top_features, 'Top14 SHAP Features'),
            ('Parent', parent_top_features, 'Top7 Parent SHAP Features'),
            ('Adolescent', adolescent_top_features, 'Top7 Adolescent SHAP Features'),
        ]

        for ax, (category, top_features, title) in zip(axes, plot_configs):
            # Gather (row position, label, SHAP values, input values) per variable.
            # The list is reversed so the strongest variable gets the highest
            # y position, i.e. is drawn at the top.
            panel_rows = []
            for y_pos, feat_name in enumerate(reversed(top_features)):
                shap_vals, feat_vals = aggregate_shap_for_base(feat_name, shap_values_mean, test_inputs, feature_names)
                if shap_vals is None or feat_vals is None:
                    continue
                panel_rows.append((y_pos, feat_name, shap_vals, feat_vals))

            if not panel_rows:
                ax.text(0.5, 0.5, 'No data available', ha='center', va='center', transform=ax.transAxes)
                ax.set_title(title, fontsize=14, fontweight='bold')
                continue

            # One strip of points per variable, coloured by the input value
            # (no per-panel colorbar; a shared one is added afterwards)
            for y_pos, _label, shap_vals, feat_vals in panel_rows:
                ax.scatter(
                    shap_vals, [y_pos] * len(shap_vals),
                    c=feat_vals, cmap='RdBu_r',
                    vmin=vmin, vmax=vmax,
                    s=20, alpha=0.6, edgecolors='none',
                )

            # Y axis: one labelled tick per variable
            y_positions = [row[0] for row in panel_rows]
            feature_labels = [row[1] for row in panel_rows]
            ax.set_yticks(y_positions)
            ax.set_yticklabels(feature_labels, fontsize=10)
            ax.set_ylim(-0.5, len(y_positions) - 0.5)

            # X axis, title and a reference line at SHAP = 0
            ax.set_xlabel('SHAP value', fontsize=12, fontweight='bold')
            ax.set_title(title, fontsize=14, fontweight='bold', pad=10)
            ax.grid(True, alpha=0.3, axis='x')
            ax.axvline(x=0, color='black', linestyle='-', linewidth=0.8, alpha=0.5)
        
        # Leave a margin on the right for the shared colorbar
        fig.tight_layout(rect=[0, 0, 0.92, 1.0])

        # Dedicated axes for the colorbar: [left, bottom, width, height]
        cbar_ax = fig.add_axes([0.93, 0.15, 0.015, 0.7])

        # The colorbar uses the same colormap and value range as the scatter points
        color_norm = plt.Normalize(vmin=vmin, vmax=vmax)
        sm = plt.cm.ScalarMappable(cmap='RdBu_r', norm=color_norm)
        sm.set_array([])
        cbar = fig.colorbar(sm, cax=cbar_ax)
        cbar.set_label('Feature Value', fontsize=11, fontweight='bold')
        cbar.ax.tick_params(labelsize=9)

        fig.suptitle('SHAP Summary Plots by Category', fontsize=16, fontweight='bold', y=1.02)
        plt.show()

        # Display SHAP importance 테이블
        print("\n=== Top 7 Parent SHAP Features ===")
        display(parent_shap_df.head(7))
        print("\n=== Top 7 Adolescent SHAP Features ===")
        display(adolescent_shap_df.head(7))
        print("\n=== Top 14 Parent+Adolescent SHAP Features ===")
        display(total_shap_df.head(14))

        print(f"\n✅ SHAP summary plot 3개(부모/청소년/전체) 1x3 subplot으로 출력 완료")

    print(f"\n✅ downsample_SMOTEEN의 LightGBM SHAP summary plot 1x3 분할 시각화 완료")