In [1]:
import pandas as pd

# Step 1 - read both wide-format CSVs and keep a single row per member id.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which avoids mixed-dtype columns on large files.
# drop_duplicates keeps the first row encountered for each '발급회원번호'.
df_target = (
    pd.read_csv('VIP_타겟_WideFormat.csv', low_memory=False)
    .drop_duplicates(subset=['발급회원번호'])
)
df_50k = (
    pd.read_csv('50K_VIP_WiderFormat.csv', low_memory=False)
    .drop_duplicates(subset=['발급회원번호'])
)

# Step 2 - left join with the VIP target frame as the base.
# Every df_target row is kept and members that exist only in df_50k are left
# out. The result carries a new default integer index (df_target's own index
# labels are not preserved), and any non-key column present in both frames
# comes out with pandas' default _x / _y suffixes.
merged_df = df_target.merge(df_50k, on='발급회원번호', how='left')

# Both keys are unique at this point, so the two counts should match.
print(f"기준(Target) 데이터 행 개수: {len(df_target)}")
print(f"병합 후 데이터 행 개수: {len(merged_df)}")

기준(Target) 데이터 행 개수: 8902
병합 후 데이터 행 개수: 8902


In [7]:
# Display the column labels of the de-duplicated target frame.
df_target.columns

Index(['발급회원번호', '이용건수_신용_B0M_07', '이용건수_신용_B0M_08', '이용건수_신용_B0M_09',
       '이용건수_신용_B0M_10', '이용건수_신용_B0M_11', '이용건수_신용_B0M_12', '이용금액_신용_B0M_07',
       '이용금액_신용_B0M_08', '이용금액_신용_B0M_09', '이용금액_신용_B0M_10', '이용금액_신용_B0M_11',
       '이용금액_신용_B0M_12', '잔액_B0M_07', '잔액_B0M_08', '잔액_B0M_09', '잔액_B0M_10',
       '잔액_B0M_11', '잔액_B0M_12', '잔액_현금서비스_B0M_07', '잔액_현금서비스_B0M_08',
       '잔액_현금서비스_B0M_09', '잔액_현금서비스_B0M_10', '잔액_현금서비스_B0M_11',
       '잔액_현금서비스_B0M_12', '잔액_카드론_B0M_07', '잔액_카드론_B0M_08', '잔액_카드론_B0M_09',
       '잔액_카드론_B0M_10', '잔액_카드론_B0M_11', '잔액_카드론_B0M_12', '연체잔액_B0M_07',
       '연체잔액_B0M_08', '연체잔액_B0M_09', '연체잔액_B0M_10', '연체잔액_B0M_11',
       '연체잔액_B0M_12', '월중평잔_07', '월중평잔_08', '월중평잔_09', '월중평잔_10', '월중평잔_11',
       '월중평잔_12', 'Score_BadDebt', 'Score_Delinq', 'Score_Activity',
       'Score_Asset', 'Score_Status_Total', 'Slope_Spend', 'Slope_Balance',
       'Slope_Count', 'Norm_Slope_Spend', 'Norm_Slope_Balance',
       'Norm_Slope_Count', 'Score_Slope_Total', 'Final_T

In [2]:
# Merge the target frame with the 50K frame so that every column appears once.
#
# Columns present in both frames are taken from df_target. Instead of merging
# everything and then deleting whatever ends in a '_drop' suffix (which would
# also delete any column whose real name ends in '_drop', and builds every
# duplicate column only to throw it away), only the columns that df_target
# does not already have are selected from df_50k before the join.
join_key = '발급회원번호'
target_columns = set(df_target.columns)
extra_columns = [col for col in df_50k.columns if col not in target_columns]

merged_df = (
    df_target
    # Left join: keep every df_target row, attach the df_50k-only columns.
    .merge(df_50k[[join_key, *extra_columns]], on=join_key, how='left')
    # Safety net: if a key matched several df_50k rows, keep the first match.
    .drop_duplicates(subset=[join_key], keep='first')
    # Renumber the rows 0..n-1 after the possible row removal above.
    .reset_index(drop=True)
)

# The two counts should be equal when df_target has one row per member id.
print("✨ 병합 및 정리 완료! ✨")
print(f"기준 파일 행 개수: {len(df_target)}")
print(f"최종 결과물 행 개수: {len(merged_df)}")

✨ 병합 및 정리 완료! ✨
기준 파일 행 개수: 8902
최종 결과물 행 개수: 8902


In [3]:
# Preview the first five rows of the merged frame.
merged_df.head()

Unnamed: 0,발급회원번호,이용건수_신용_B0M_07,이용건수_신용_B0M_08,이용건수_신용_B0M_09,이용건수_신용_B0M_10,이용건수_신용_B0M_11,이용건수_신용_B0M_12,이용금액_신용_B0M_07,이용금액_신용_B0M_08,이용금액_신용_B0M_09,...,혜택수혜율_R3M_09,혜택수혜율_R3M_10,혜택수혜율_R3M_11,혜택수혜율_R3M_12,혜택수혜율_B0M_07,혜택수혜율_B0M_08,혜택수혜율_B0M_09,혜택수혜율_B0M_10,혜택수혜율_B0M_11,혜택수혜율_B0M_12
0,SYN_100022,19,14,17,15,22,20,480339,521058,468354,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,SYN_1000502,62,64,61,65,68,64,384296,374107,333770,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,SYN_1000924,45,45,47,47,44,40,1321537,1317041,1210077,...,1.201645,1.905175,1.777966,1.435725,1.74769,2.78567,2.724771,1.103773,2.390778,1.239013
3,SYN_1001012,25,25,26,27,30,28,359449,465522,522558,...,3.125902,2.32954,2.694161,2.960395,3.144258,2.945304,2.686417,3.191364,3.027355,3.451466
4,SYN_1001025,72,74,71,76,72,72,2005304,1864818,1688989,...,1.132257,1.559776,0.909197,1.360722,1.279682,0.453615,2.174966,1.673895,2.314231,1.322371


In [6]:
# Show the 'Target' column of the merged frame; a KeyError here would mean the
# column did not survive the merge.
# NOTE(review): the truncated preview only shows the first and last five rows -
# use merged_df['Target'].value_counts() to see the actual distribution.
merged_df['Target']

0       0
1       0
2       0
3       0
4       0
       ..
8897    0
8898    0
8899    0
8900    0
8901    0
Name: Target, Length: 8902, dtype: int64

In [4]:
# Summarise the merged frame: index range, column count, dtype counts and
# memory usage.
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8902 entries, 0 to 8901
Columns: 4940 entries, 발급회원번호 to 혜택수혜율_B0M_12
dtypes: float64(1480), int64(3279), object(181)
memory usage: 335.5+ MB


In [5]:
# Export step, currently disabled: uncomment both lines to write merged_df to
# CSV without the index. 'utf-8-sig' prepends a UTF-8 BOM to the file.
# NOTE(review): the file name hard-codes the 8902 x 4940 shape - update it if
# the merged frame changes.
# file_name = 'VIP_모델학습용_8902_4940.csv'


# merged_df.to_csv(file_name, index=False, encoding='utf-8-sig')
