# 시트 컬럼 추출 및 병합 (SHAP 기반 Top 100)

이 노트북은 `GENERAL_shap_vif12.23.csv` 파일에서 SHAP 중요도가 0보다 큰 컬럼들을 추출하되,
**상위 100개까지만** 제한하여 병합합니다.
(파일 경로 자동 탐색 기능 포함)

In [17]:
import pandas as pd
import os

# 1. Load the SHAP file and choose the target columns
shap_csv_path = 'GENERAL_shap_vif12.23.csv'
top_n_limit = 100


def rank_shap_features(shap_df):
    """Return distinct feature names ordered by descending SHAP importance.

    Rows whose ``shap_importance`` is not strictly positive (including NaN)
    and rows whose ``feature`` is missing are ignored. When a feature name
    occurs more than once, only its highest-importance row counts, so the
    returned list never contains duplicates.

    Args:
        shap_df: DataFrame with ``feature`` and ``shap_importance`` columns.

    Returns:
        List of feature names, most important first.

    Raises:
        KeyError: If either required column is absent.
    """
    positive = shap_df[shap_df['shap_importance'] > 0]
    # A stable sort keeps the file order among rows with equal importance,
    # which makes the selection reproducible at the cut-off boundary.
    ranked = positive.sort_values(
        by='shap_importance', ascending=False, kind='mergesort'
    )
    # Clean up BEFORE any top-N cut: otherwise NaN or repeated feature names
    # inside the top rows would silently shrink the final selection.
    ranked = ranked.dropna(subset=['feature'])
    ranked = ranked.drop_duplicates(subset='feature', keep='first')
    return ranked['feature'].tolist()


if os.path.exists(shap_csv_path):
    print(f"Reading SHAP values from {shap_csv_path}...")
    try:
        target_columns = rank_shap_features(pd.read_csv(shap_csv_path))

        # Keep only the top-N features by importance.
        if len(target_columns) > top_n_limit:
            print(f"Initial valid feature count: {len(target_columns)}")
            print(f"Limiting to top {top_n_limit} features by importance.")
            target_columns = target_columns[:top_n_limit]

        print(f"Target columns loaded: {len(target_columns)} columns")
        print("Top 5 targets:", target_columns[:5])

    except Exception as e:
        # Deliberate best-effort at the cell boundary: report the problem and
        # fall back to an empty selection so the following cells still run.
        print(f"Error reading SHAP csv: {e}")
        target_columns = []
else:
    print(f"Warning: {shap_csv_path} not found. Please check file path.")
    target_columns = []

Reading SHAP values from GENERAL_shap_vif12.23.csv...
Initial valid feature count: 191
Limiting to top 100 features by importance.
Target columns loaded: 100 columns
Top 5 targets: ['최종이용일자_신판', '잔액_일시불_B2M', '이용금액_체크_R12M', '카드이용한도금액_B2M', '잔액_신판최대한도소진율_r6m']


In [18]:
# 2. Set the target folder and discover the CSV files automatically
# Enter the path of the folder that holds the CSV files to analyse.
target_folder = '../30만원본/GENERAL/' # <-- enter the path here


def find_csv_files(folder):
    """Return every ``.csv`` file found under ``folder``, sorted by path.

    The search is recursive and the extension match is case-insensitive.
    ``os.walk`` yields directory entries in arbitrary order, so the result is
    sorted to keep downstream processing order reproducible across machines.

    Args:
        folder: Directory to scan.

    Returns:
        Sorted list of file paths (each is ``folder`` joined with the
        relative location of the file). Empty if nothing matches or the
        folder does not exist.
    """
    matches = []
    for root, _dirs, names in os.walk(folder):
        matches.extend(
            os.path.join(root, name)
            for name in names
            if name.lower().endswith('.csv')
        )
    matches.sort()
    return matches


file_list = []

if os.path.exists(target_folder):
    print(f"Scanning files in {target_folder}...")
    file_list = find_csv_files(target_folder)

    print(f"Found {len(file_list)} CSV files.")
    for f in file_list[:3]:
        print(f" - {os.path.basename(f)}")
else:
    print(f"Folder not found: {target_folder}")

Scanning files in ../30만원본/GENERAL/...
Found 7 CSV files.
 - 300k_general_성과정보.csv
 - 300k_general_승인정보.csv
 - 300k_general_신용정보.csv


In [19]:
# 3. Walk through the discovered files and merge the target columns

key_cols = ['발급회원번호', '기준년월'] 

final_df = pd.DataFrame()
found_log = []

# Non-key target columns that are already part of final_df. A column offered
# by a later file as well is skipped there; merging it again would make
# pandas emit suffixed duplicates (`_x` / `_y`) instead of a single column.
merged_cols = set()

# Explicit "first file already loaded" flag. Testing `final_df.empty` for
# this is unreliable: a first file with zero rows would look uninitialised
# and be overwritten by the next file, losing its columns.
merge_started = False

print("\nStarting Merge Process...")
print("="*50)

for file_path in file_list:
    try:
        # 1) Read the header only, so files without anything useful are
        #    rejected without loading their data.
        temp_header = pd.read_csv(file_path, nrows=0)
        file_cols = list(temp_header.columns)
        file_col_set = set(file_cols)

        # 2) A file lacking any key column cannot be joined; skip it.
        missing_keys = [k for k in key_cols if k not in file_col_set]
        if missing_keys:
            continue

        # 3) Target columns this file offers (kept in SHAP-rank order),
        #    minus the ones an earlier file already contributed.
        available_cols = [col for col in target_columns if col in file_col_set]
        cols_to_read = [col for col in available_cols if col not in merged_cols]

        if len(cols_to_read) < len(available_cols):
            already_merged = [col for col in available_cols if col in merged_cols]
            print(f"[NOTE] {os.path.basename(file_path)} -> Skipping already merged columns: {already_merged}")

        if not cols_to_read:
            continue

        # 4) Load only the keys plus the new target columns.
        #    dict.fromkeys de-duplicates while keeping a deterministic order.
        use_cols = list(dict.fromkeys(key_cols + cols_to_read))
        print(f"[READ] {os.path.basename(file_path)} -> Found {len(cols_to_read)} target columns.")

        current_df = pd.read_csv(file_path, usecols=use_cols)

        # 5) Merge: the first usable file seeds final_df, later ones are
        #    outer-joined on the key columns.
        if not merge_started:
            final_df = current_df
            merge_started = True
            print(f"       -> Initialized final_df with shape {final_df.shape}")
        else:
            before_shape = final_df.shape
            final_df = pd.merge(final_df, current_df, on=key_cols, how='outer')
            print(f"       -> Merged. Shape: {before_shape} -> {final_df.shape}")

        # Record the contribution only after the merge succeeded. Key columns
        # are never recorded, because every file legitimately carries them.
        merged_cols.update(col for col in cols_to_read if col not in key_cols)
        found_log.append(f"{os.path.basename(file_path)}: {cols_to_read}")

    except Exception as e:
        # Deliberate best-effort: report the failing file and carry on with
        # the remaining ones.
        print(f"[ERROR] reading {file_path}: {e}")

print("="*50)
print("Merge Completed.")


Starting Merge Process...
[READ] 300k_general_성과정보.csv -> Found 19 target columns.
       -> Initialized final_df with shape (240636, 21)
[READ] 300k_general_승인정보.csv -> Found 48 target columns.
       -> Merged. Shape: (240636, 21) -> (240636, 69)
[READ] 300k_general_신용정보.csv -> Found 8 target columns.
       -> Merged. Shape: (240636, 69) -> (240636, 77)
[READ] 300k_general_잔액정보.csv -> Found 8 target columns.
       -> Merged. Shape: (240636, 77) -> (240636, 85)
[READ] 300k_general_채널정보.csv -> Found 1 target columns.
       -> Merged. Shape: (240636, 85) -> (240636, 86)
[READ] 300k_general_청구정보.csv -> Found 8 target columns.
       -> Merged. Shape: (240636, 86) -> (240636, 94)
[READ] 300k_general_회원정보.csv -> Found 8 target columns.
       -> Merged. Shape: (240636, 94) -> (240636, 102)
Merge Completed.


In [None]:
# 4. Inspect the final result
merged_shape = final_df.shape
print("Final DataFrame Shape:", merged_shape)

# Only list the columns and preview rows when there is something to show.
if not final_df.empty:
    column_names = final_df.columns.tolist()
    print("Columns:", column_names)
    preview = final_df.head()
    display(preview)
    


Final DataFrame Shape: (240636, 102)
Columns: ['기준년월', '발급회원번호', '증감율_이용건수_신용_전월', '증감율_이용건수_체크_전월', '증감율_이용금액_신용_전월', '증감율_이용금액_신판_전월', '증감율_이용금액_일시불_전월', '증감율_이용금액_신용_분기', '증감율_이용금액_일시불_분기', '잔액_신판평균한도소진율_r6m', '잔액_신판최대한도소진율_r6m', '잔액_신판평균한도소진율_r3m', '잔액_신판최대한도소진율_r3m', '잔액_신판ca평균한도소진율_r6m', '잔액_신판ca최대한도소진율_r6m', '잔액_신판ca최대한도소진율_r3m', '변동률_일시불평잔', '변동률_잔액_B1M', '변동률_잔액_일시불_B1M', '혜택수혜율_R3M', '혜택수혜율_B0M', '최종이용일자_기본', '최종이용일자_신판', '최종이용일자_체크', '최종이용일자_할부', '이용금액_신용_B0M', '이용건수_신판_R12M', '이용건수_체크_R12M', '이용금액_신용_R12M', '이용금액_할부_R12M', '이용금액_체크_R12M', '최대이용금액_신용_R12M', '최대이용금액_신판_R12M', '이용건수_신용_R6M', '이용건수_신판_R6M', '이용건수_일시불_R6M', '이용금액_신판_R6M', '이용금액_일시불_R6M', '이용금액_부분무이자_R6M', '이용건수_신용_R3M', '이용건수_일시불_R3M', '이용금액_할부_무이자_R3M', '이용금액_부분무이자_R3M', '이용금액_체크_R3M', '이용가맹점수', '이용금액_요식', '이용금액_납부', '이용금액_여유생활', '이용금액_사교활동', '쇼핑_도소매_이용금액', '쇼핑_슈퍼마켓_이용금액', '쇼핑_기타_이용금액', '교통_버스지하철이용금액', '_1순위업종_이용금액', '_1순위납부업종_이용금액', '할부금액_6M_R12M', '할부금액_유이자_3M_R12M', 'RP금액_B0M', '이용금액_온라인_R6M', '이용금액_오프라인_R6M

Unnamed: 0,기준년월,발급회원번호,증감율_이용건수_신용_전월,증감율_이용건수_체크_전월,증감율_이용금액_신용_전월,증감율_이용금액_신판_전월,증감율_이용금액_일시불_전월,증감율_이용금액_신용_분기,증감율_이용금액_일시불_분기,잔액_신판평균한도소진율_r6m,...,혜택수혜금액_R3M,혜택수혜금액,입회일자_신용,입회경과개월수_신용,최종탈회후경과월,이용카드수_신용체크,이용금액_R3M_신용체크,_1순위카드이용금액,최종유효년월_신용_이용,최종카드발급일자
0,201807,SYN_1000048,0.020007,0.0,0.249761,0.249761,0.249761,-0.420396,0.057435,0.223303,...,0,0,20040801,168,0,2,2681730,1572745,202205.0,20170918.0
1,201808,SYN_1000048,-0.073442,0.0,0.239299,0.239299,0.239299,-0.377131,0.024356,0.180827,...,0,0,20040801,169,0,2,2203913,1385707,202208.0,20170918.0
2,201809,SYN_1000048,0.074553,0.0,0.286712,0.286712,0.286712,0.121721,0.121721,0.147647,...,0,0,20040801,170,0,2,2409617,1544347,202302.0,20170918.0
3,201810,SYN_1000048,0.091676,-0.403293,0.328827,0.328827,0.328827,0.01974,0.01974,0.059904,...,0,0,20040801,171,0,4,2753884,1651927,202301.0,20170918.0
4,201811,SYN_1000048,-0.128753,-0.585977,0.174194,0.174194,0.174194,0.624895,0.624895,0.0524,...,0,0,20040801,172,0,4,2793228,1652493,202208.0,20170918.0


In [21]:
# Save the merged frame as CSV (run only when needed); the row index is
# left out of the file.
output_csv_path = "merged_result_top100.csv"
final_df.to_csv(output_csv_path, index=False, encoding='utf-8')