# 시트 컬럼 추출 및 병합 (SHAP 기반 + 폴더 순회)

이 노트북은 `VIP_shap_vif12.23.csv` 파일에서 SHAP 중요도가 0이 아닌 컬럼들을 추출하고,
지정된 **폴더 내의 모든 CSV 파일**을 자동으로 탐색하여 해당 컬럼들의 데이터를 병합합니다.

In [1]:
import pandas as pd
import os

# 1. Load the SHAP file and select the target columns.
#
# Reads the CSV at `shap_csv_path`, which must provide the columns
# `shap_importance` and `feature`, and builds `target_columns`: the unique,
# non-missing feature names whose importance is non-zero. If the file is
# missing or cannot be processed, `target_columns` is left as an empty list
# so the later cells still run.
shap_csv_path = 'VIP_shap_vif12.23.csv'
# Maximum number of selected feature names echoed back as a sanity check.
preview_limit = 20

if os.path.exists(shap_csv_path):
    print(f"Reading SHAP values from {shap_csv_path}...")
    try:
        shap_df = pd.read_csv(shap_csv_path)

        # Keep only the features whose SHAP importance is not 0.
        # NOTE(review): NaN != 0 evaluates to True, so rows with a missing
        # importance value also pass this filter -- confirm that is intended.
        nonzero_mask = shap_df['shap_importance'] != 0
        target_columns = shap_df.loc[nonzero_mask, 'feature'].dropna().unique().tolist()

        print(f"Target columns loaded: {len(target_columns)} columns (SHAP != 0)")
        # The label is derived from the number of names actually shown; it
        # used to say "Top 5" while 20 names were printed.
        preview = target_columns[:preview_limit]
        print(f"Top {len(preview)} targets:", preview)

    except Exception as e:
        # Best-effort notebook cell: report the problem and fall back to an
        # empty selection instead of aborting the run.
        print(f"Error reading SHAP csv: {e}")
        target_columns = []
else:
    print(f"Warning: {shap_csv_path} not found. Please check file path.")
    target_columns = []

Reading SHAP values from VIP_shap_vif12.23.csv...
Target columns loaded: 76 columns (SHAP != 0)
Top 5 targets: ['정상청구원금_B5M', 'CA이자율_할인전', '최종이용일자_기본', 'rv최초시작후경과일', '변동률_잔액_일시불_B1M', '이용금액_오프라인_R6M', '이용금액_쇼핑', '연체입금원금_B0M', '평잔_일시불_3M', '이용건수_신용_R12M', '이용금액_일시불_R6M', '이용금액_체크_R12M', '잔액_일시불_B2M', '월중평잔_일시불_B0M', '할부금액_무이자_3M_R12M', '최종유효년월_신용_이용가능', '잔액_신판평균한도소진율_r6m', '잔액_신판최대한도소진율_r6m', '쇼핑_편의점_이용금액', '이용건수_신용_R6M']


In [None]:
# 2. Set the target folder and discover its CSV files automatically.
#
# Builds `file_list`: the paths of every file under `target_folder`
# (sub-folders included) whose name ends in ".csv", matched
# case-insensitively. `file_list` stays empty when the folder is missing.
# Enter the path of the folder that holds the CSV files to analyse.
target_folder = '../30만원본/VIP'  # <-- set the path here

file_list = []

# isdir (rather than exists) so that a path pointing at a regular file is
# reported as a missing folder instead of silently yielding zero files.
if os.path.isdir(target_folder):
    print(f"Scanning files in {target_folder}...")
    # os.walk yields entries in arbitrary, filesystem-dependent order. Sorting
    # makes the processing order of the files reproducible between runs and
    # machines.
    file_list = sorted(
        os.path.join(root, name)
        for root, _dirs, names in os.walk(target_folder)
        for name in names
        if name.lower().endswith('.csv')
    )

    print(f"Found {len(file_list)} CSV files.")
    # Print only the first 3 as a quick check.
    for csv_path in file_list[:3]:
        print(f" - {os.path.basename(csv_path)}")
else:
    print(f"Folder not found: {target_folder}")

Scanning files in ../30만원본/VIP...
Found 7 CSV files.
 - 300k_vip_성과정보.csv
 - 300k_vip_승인정보.csv
 - 300k_vip_신용정보.csv


['../30만원본/VIP\\300k_vip_성과정보.csv',
 '../30만원본/VIP\\300k_vip_승인정보.csv',
 '../30만원본/VIP\\300k_vip_신용정보.csv',
 '../30만원본/VIP\\300k_vip_잔액정보.csv',
 '../30만원본/VIP\\300k_vip_채널정보.csv',
 '../30만원본/VIP\\300k_vip_청구정보.csv',
 '../30만원본/VIP\\300k_vip_회원정보.csv']

In [8]:
# 3. Walk the files and merge their data.
#
# For every path in `file_list`, reads the key columns plus whichever
# `target_columns` that file contains and outer-joins the result into
# `final_df` on `key_cols`. `found_log` records, per merged file, which
# target columns were taken from it. A file that fails to load or merge is
# reported and skipped; the remaining files are still processed.

# Key columns used as join keys; a file lacking any of them is skipped.
key_cols = ['발급회원번호', '기준년월']

final_df = pd.DataFrame()
found_log = []
# Explicit "first file already loaded" flag. Testing `final_df.empty` is not
# enough: it is also True for a file with a header but zero rows, in which
# case the next file would overwrite final_df and drop its columns.
merge_started = False
# Columns already present in final_df. A target column that shows up in more
# than one file is taken from the first file only; merging it again would
# create `_x`/`_y` suffixed duplicates instead of the expected column name.
loaded_cols = set(key_cols)

print("\nStarting Merge Process...")
print("="*50)

for file_path in file_list:
    file_name = os.path.basename(file_path)
    try:
        # 1) Read only the header (no encoding is passed, so the pandas
        #    default applies; pass encoding=... here if a file needs it,
        #    e.g. cp949 or utf-8).
        temp_header = pd.read_csv(file_path, nrows=0)
        file_cols = list(temp_header.columns)
        file_col_set = set(file_cols)

        # 2) Check the key columns.
        missing_keys = [k for k in key_cols if k not in file_col_set]
        if missing_keys:
            print(f"[SKIP] {file_name} -> Missing keys: {missing_keys}")
            continue

        # 3) Check the target columns.
        matched_cols = [col for col in target_columns if col in file_col_set]

        # Skip the file when it has no target column at all.
        if not matched_cols:
            print(f"[PASS] {file_name} -> No target columns found.")
            continue

        # Drop the targets that an earlier file already supplied.
        cols_to_read = [col for col in matched_cols if col not in loaded_cols]
        if not cols_to_read:
            print(f"[PASS] {file_name} -> All {len(matched_cols)} target columns already loaded.")
            continue

        # 4) Load the data (keys + targets). cols_to_read never contains a
        #    key column, so the concatenation has no duplicates.
        use_cols = key_cols + cols_to_read
        print(f"[READ] {file_name} -> Found {len(cols_to_read)} target columns.")

        current_df = pd.read_csv(file_path, usecols=use_cols)

        # 5) Merge (outer join).
        if not merge_started:
            final_df = current_df
            merge_started = True
            print(f"       -> Initialized final_df with shape {final_df.shape}")
        else:
            before_shape = final_df.shape
            # Outer join on the key columns keeps rows that exist in only
            # one of the two frames.
            final_df = pd.merge(final_df, current_df, on=key_cols, how='outer')
            print(f"       -> Merged. Shape: {before_shape} -> {final_df.shape}")

        # Only record the columns once the load/merge has succeeded.
        loaded_cols.update(cols_to_read)
        found_log.append(f"{file_name}: {cols_to_read}")

    except Exception as e:
        # Best-effort: report the failing file and continue with the rest.
        print(f"[ERROR] reading {file_path}: {e}")

print("="*50)
print("Merge Completed.")


Starting Merge Process...
[READ] 300k_vip_성과정보.csv -> Found 14 target columns.
       -> Initialized final_df with shape (59364, 16)
[READ] 300k_vip_승인정보.csv -> Found 41 target columns.
       -> Merged. Shape: (59364, 16) -> (59364, 57)
[READ] 300k_vip_신용정보.csv -> Found 8 target columns.
       -> Merged. Shape: (59364, 57) -> (59364, 65)
[READ] 300k_vip_잔액정보.csv -> Found 5 target columns.
       -> Merged. Shape: (59364, 65) -> (59364, 70)
[READ] 300k_vip_채널정보.csv -> Found 2 target columns.
       -> Merged. Shape: (59364, 70) -> (59364, 72)
[READ] 300k_vip_청구정보.csv -> Found 1 target columns.
       -> Merged. Shape: (59364, 72) -> (59364, 73)
[READ] 300k_vip_회원정보.csv -> Found 5 target columns.
       -> Merged. Shape: (59364, 73) -> (59364, 78)
Merge Completed.


In [None]:
# 4. Inspect the merged result (saving is done separately).
# Always reports the shape; the column list and a head() preview are shown
# only when final_df is not empty.
merged_shape = final_df.shape
print("Final DataFrame Shape:", merged_shape)

has_data = not final_df.empty
if has_data:
    column_names = final_df.columns.tolist()
    print("Columns:", column_names)
    preview_rows = final_df.head()
    display(preview_rows)
    

Final DataFrame Shape: (59364, 78)
Columns: ['기준년월', '발급회원번호', '증감율_이용금액_신판_전월', '증감율_이용금액_일시불_전월', '증감율_이용건수_신용_분기', '증감율_이용금액_신용_분기', '증감율_이용금액_일시불_분기', '잔액_신판평균한도소진율_r6m', '잔액_신판최대한도소진율_r6m', '잔액_신판ca평균한도소진율_r6m', '잔액_신판ca평균한도소진율_r3m', '변동률_일시불평잔', '변동률_RV일시불평잔', '변동률_할부평잔', '변동률_잔액_일시불_B1M', '혜택수혜율_R3M', '최종이용일자_기본', '최종이용일자_체크', '최종이용일자_할부', '이용금액_신판_B0M', '이용금액_일시불_B0M', '이용건수_신용_R12M', '이용금액_할부_유이자_R12M', '이용금액_체크_R12M', '최대이용금액_신용_R12M', '최대이용금액_신판_R12M', '이용건수_신용_R6M', '이용금액_신판_R6M', '이용금액_일시불_R6M', '이용금액_할부_R6M', '이용금액_부분무이자_R6M', '이용금액_신판_R3M', '이용금액_부분무이자_R3M', '이용가맹점수', '이용금액_쇼핑', '이용금액_사교활동', '쇼핑_마트_이용금액', '쇼핑_편의점_이용금액', '쇼핑_기타_이용금액', '교통_주유이용금액', '_1순위업종_이용금액', '_2순위업종_이용금액', '_3순위업종_이용금액', '_3순위쇼핑업종_이용금액', '_2순위교통업종_이용금액', '할부금액_무이자_3M_R12M', '최종카드론_대출이율', '이용금액_오프라인_R6M', '이용건수_오프라인_R6M', '이용건수_오프라인_B0M', '이용금액_페이_온라인_R6M', '정상청구원금_B0M', '연체입금원금_B0M', '정상청구원금_B2M', '연체입금원금_B2M', '정상청구원금_B5M', '연체입금원금_B5M', '카드이용한도금액', '일시상환론한도금액', '월상환론한도금액', 'CA이자율_할인전', 'RV신청일자', '상향

Unnamed: 0,기준년월,발급회원번호,증감율_이용금액_신판_전월,증감율_이용금액_일시불_전월,증감율_이용건수_신용_분기,증감율_이용금액_신용_분기,증감율_이용금액_일시불_분기,잔액_신판평균한도소진율_r6m,잔액_신판최대한도소진율_r6m,잔액_신판ca평균한도소진율_r6m,...,잔액_일시불_B2M,평잔_일시불_3M,인입월수_ARS_R6M,방문횟수_앱_R6M,청구금액_B0,이용금액_R3M_신용체크,이용금액_R3M_신용,최종유효년월_신용_이용가능,최종유효년월_신용_이용,최종카드발급경과월
0,201807,SYN_100022,0.260559,0.260559,-0.116826,0.003011,0.003011,0.315508,0.410814,0.324979,...,1516945,1991562,1,37,702874,1967446,1967446,201910.0,202001.0,47
1,201808,SYN_100022,0.301082,0.301082,-0.387184,-0.303957,-0.303957,0.319732,0.402875,0.338338,...,1546018,1983755,1,47,507508,1672527,1672527,202001.0,202001.0,48
2,201809,SYN_100022,0.234625,0.234625,-0.294737,-0.355487,-0.355487,0.371649,0.41452,0.405172,...,1571048,1984430,1,45,594177,1541344,1541344,201910.0,201912.0,49
3,201810,SYN_100022,0.276458,0.276458,-0.156311,-0.15805,-0.15805,0.349783,0.369626,0.387961,...,2436470,1585222,0,43,549161,1584928,1584928,202003.0,202005.0,50
4,201811,SYN_100022,0.262132,0.262132,-0.098511,-0.038224,-0.038224,0.327325,0.363699,0.351894,...,2445215,1763295,0,44,691786,1755897,1755897,202003.0,202005.0,51


In [None]:
# Save the merged result if needed (left disabled; uncomment to write the CSV).
# final_df.to_csv("VIP_merged_result_columns_select_78cols.csv", index=False, encoding='utf-8')