In [7]:
import os
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

# ===== 1️⃣ Load CSV =====
# Read the transaction, alert and prediction-target tables from one directory,
# in that order, and bind them to df_txn / df_alert / df_test.
dir_path = "初賽資料/"
df_txn, df_alert, df_test = (
    pd.read_csv(os.path.join(dir_path, csv_name))
    for csv_name in ('acct_transaction.csv', 'acct_alert.csv', 'acct_predict.csv')
)
print("(Finish) Load Dataset.")

# ===== 2️⃣ PreProcessing =====
# Per-account statistics of the transaction amount, computed separately for
# the sending side (from_acct) and the receiving side (to_acct).
amt_by_sender = df_txn.groupby('from_acct')['txn_amt']
amt_by_receiver = df_txn.groupby('to_acct')['txn_amt']

send = amt_by_sender.sum().rename('total_send_amt')
recv = amt_by_receiver.sum().rename('total_recv_amt')

max_send = amt_by_sender.max().rename('max_send_amt')
min_send = amt_by_sender.min().rename('min_send_amt')
avg_send = amt_by_sender.mean().rename('avg_send_amt')

max_recv = amt_by_receiver.max().rename('max_recv_amt')
min_recv = amt_by_receiver.min().rename('min_recv_amt')
avg_recv = amt_by_receiver.mean().rename('avg_recv_amt')

# Align all statistics on the account id. An account that never sent (or never
# received) gets 0 for the missing side. The concat order below fixes the
# feature column order seen by the model.
stat_columns = [
    max_send, min_send, avg_send,
    max_recv, min_recv, avg_recv,
    send, recv,
]
df_result = (
    pd.concat(stat_columns, axis=1)
    .fillna(0)
    .reset_index()
    .rename(columns={'index': 'acct'})
)

# Account-type flag ("is E.SUN account"), collected from both sides of every
# transaction and reduced to the distinct (acct, is_esun) pairs.
df_from = df_txn[['from_acct', 'from_acct_type']].set_axis(['acct', 'is_esun'], axis=1)
df_to = df_txn[['to_acct', 'to_acct_type']].set_axis(['acct', 'is_esun'], axis=1)
acct_types = pd.concat([df_from, df_to], ignore_index=True)
df_acc = acct_types.drop_duplicates().reset_index(drop=True)

# Attach the account-type flag to the transaction statistics.
df_X = df_result.merge(df_acc, on='acct', how='left')
print("(Finish) PreProcessing.")

# ===== 3️⃣ Train/Test Split =====
in_test_list = df_X['acct'].isin(df_test['acct'])
is_esun_acct = df_X['is_esun'] == 1

# Training set: accounts that are NOT in the prediction list and are flagged
# is_esun == 1. The label is 1 when the account appears in the alert table.
X_train = df_X[~in_test_list & is_esun_acct].drop(columns=['is_esun']).copy()
y_train = X_train['acct'].isin(df_alert['acct']).astype(int)

# Test set: every account that is in the prediction list.
X_test = df_X[in_test_list].drop(columns=['is_esun']).copy()

# Preview the first rows of each piece, separated by a rule line.
separator = "-----------------------------------"
for preview in (X_train, y_train, X_test):
    print(separator)
    print(preview.head())
print(separator)
print("(Finish) Train-Test-Split")

# ===== 4️⃣ Modeling =====
# Fit a decision tree on every column except the account id, then predict a
# label for each test row using the same feature columns in the same order.
# NOTE(review): the training labels are heavily imbalanced (the value_counts
# output in this notebook shows 1004 positives out of 328988 rows) and no class
# weighting is applied here - confirm this is intended.
feature_cols = [col for col in X_train.columns if col != 'acct']
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train[feature_cols], y_train)
y_pred = model.predict(X_test[feature_cols])
print("(Finish) Modeling")

# ===== 5️⃣ Output CSV =====
# Pair every prediction with its account id.
# df_acc is de-duplicated on the (acct, is_esun) pair, so an account seen with
# more than one type value yields several X_test rows with identical features
# (and therefore identical predictions). Keep one prediction per account so
# the merge below cannot multiply rows of the submission.
df_pred = pd.DataFrame({
    'acct': X_test['acct'].values,
    'label': y_pred
}).drop_duplicates(subset='acct')

# Keep the row order of the prediction list. Accounts in the list that have no
# transactions never reach X_test, so the left merge leaves their label as NaN
# (and silently turns the whole column into float). Default those accounts to
# 0 and restore an integer 0/1 column.
df_out = df_test[['acct']].merge(df_pred, on='acct', how='left')
df_out['label'] = df_out['label'].fillna(0).astype(int)

out_path = "result.csv"
df_out.to_csv(out_path, index=False)
print(f"(Finish) Output saved to {out_path}")


(Finish) Load Dataset.
(Finish) PreProcessing.
-----------------------------------
                                                 acct  max_send_amt  \
4   00007e8eec41727b71aca0e00cf26c75bd96babafa5203...       12500.0   
7   000093515bcc2248cb669fa5af67656fce904f5206ef29...        1250.0   
11  0000e57c3c5cb730d620e2fa80b40e06fa6fd54f38e1aa...        1450.0   
13  0000efc77b357bf596f34c3062bc2a04933cb373d0e909...        1550.0   
14  000102a7a1ae87401d7f8fdc24c8549708aadce52ad954...      125000.0   

    min_send_amt   avg_send_amt  max_recv_amt  min_recv_amt  avg_recv_amt  \
4         1050.0    4500.000000           0.0           0.0           0.0   
7         1250.0    1250.000000           0.0           0.0           0.0   
11        1450.0    1450.000000           0.0           0.0           0.0   
13         815.0    1138.333333        7550.0        1050.0        3910.0   
14      125000.0  125000.000000           0.0           0.0           0.0   

    total_send_amt  total_r

In [12]:
# Report the dimensions of the training features, training labels and test features.
for split_part in (X_train, y_train, X_test):
    print(split_part.shape)

(328988, 9)
(328988,)
(4780, 9)


In [18]:
# Display the training labels (pandas shows a truncated view of the Series).
y_train

4          0
7          0
11         0
13         0
14         0
          ..
1800056    0
1800060    0
1800066    0
1800082    0
1800103    0
Name: acct, Length: 328988, dtype: int64

In [21]:
# Count how many training rows carry each label value (class balance check).
y_train.value_counts()

acct
0    327984
1      1004
Name: count, dtype: int64

In [19]:
# Display the training feature table (account id plus the amount statistics).
X_train

Unnamed: 0,acct,max_send_amt,min_send_amt,avg_send_amt,max_recv_amt,min_recv_amt,avg_recv_amt,total_send_amt,total_recv_amt
4,00007e8eec41727b71aca0e00cf26c75bd96babafa5203...,12500.0,1050.0,4500.000000,0.0,0.0,0.0,31500.0,0.0
7,000093515bcc2248cb669fa5af67656fce904f5206ef29...,1250.0,1250.0,1250.000000,0.0,0.0,0.0,1250.0,0.0
11,0000e57c3c5cb730d620e2fa80b40e06fa6fd54f38e1aa...,1450.0,1450.0,1450.000000,0.0,0.0,0.0,1450.0,0.0
13,0000efc77b357bf596f34c3062bc2a04933cb373d0e909...,1550.0,815.0,1138.333333,7550.0,1050.0,3910.0,3415.0,19550.0
14,000102a7a1ae87401d7f8fdc24c8549708aadce52ad954...,125000.0,125000.0,125000.000000,0.0,0.0,0.0,125000.0,0.0
...,...,...,...,...,...,...,...,...,...
1800056,fffd69ecb9b7310514a9c11ff7291fbce412f03428b6cd...,0.0,0.0,0.000000,6250.0,6250.0,6250.0,0.0,12500.0
1800060,fffdb24bbb306d1bb0a4a8b59b1af1aa77f1b8cca417fc...,0.0,0.0,0.000000,1650.0,1550.0,1600.0,0.0,3200.0
1800066,fffdd9f43addbe4aea36fa2c2595dfdcd09fbfd6c0fdd6...,0.0,0.0,0.000000,60500.0,1950.0,18387.5,0.0,73550.0
1800082,fffe99d938144c52489d90783ab6c371f178c7bbd49199...,0.0,0.0,0.000000,315000.0,315000.0,315000.0,0.0,315000.0


In [23]:
# Number of rows in the training feature table.
len(X_train)

328988

In [20]:
# Display the test feature table (account id plus the amount statistics).
X_test

Unnamed: 0,acct,max_send_amt,min_send_amt,avg_send_amt,max_recv_amt,min_recv_amt,avg_recv_amt,total_send_amt,total_recv_amt
200,000ef1e36f55fd9f71dd7e02fe747af1f6240417f9e807...,205000.0,105.0,42614.387755,385000.0,6050.0,93851.250000,2088105.0,3754050.0
249,00129e51dddd8e4265489bad36a7c1a66f813ee1c86a79...,33500.0,255.0,6076.428571,32500.0,505.0,4843.500000,212675.0,145305.0
394,001d6ee4c8bdfc61447fdb5624c57a897426e278ba10b9...,3050000.0,3150.0,403318.292683,1450000.0,2250.0,386680.000000,16536050.0,1933400.0
690,0036584bcfa9a386492e6538fc0c6ae0728f098d11e640...,9550.0,1550.0,3100.000000,11500.0,1550.0,8066.666667,18600.0,48400.0
892,0045920c6a09906468d855f0343972685d925000266ffb...,38500.0,435.0,8905.757576,205000.0,35.0,20775.405405,293890.0,768690.0
...,...,...,...,...,...,...,...,...,...
1779248,fa864c33441bbe3e522367a3c38bbb457f2a9ab7cf569b...,0.0,0.0,0.000000,4650.0,205.0,1096.875000,0.0,17550.0
1784011,fbc3c34c22c96e7be386129dd6ad23dda158d1fc4dee42...,0.0,0.0,0.000000,50500.0,55.0,5581.250000,0.0,111625.0
1787293,fca2d2de5d3f988f9fac909bcfd2141cefdad1d50c3f24...,0.0,0.0,0.000000,84500.0,7850.0,30680.000000,0.0,153400.0
1792132,fde8e164f13ad5970170aef31383ac6e015fb1c64a3ced...,0.0,0.0,0.000000,18500.0,1050.0,3713.333333,0.0,111400.0


In [22]:
# Number of rows in the test feature table.
len(X_test)

4780

按帳戶劃分：

X_train：帳戶不在測試帳戶清單中、且為玉山帳戶（is_esun == 1）的帳戶層級特徵列（由交易紀錄彙總而得，並非逐筆交易）

y_train：X_train 各列對應的 Label（0 / 1）；帳戶出現在 acct_alert.csv 時為 1，否則為 0

X_test：帳戶在測試帳戶清單（acct_predict.csv）中的帳戶層級特徵列（由交易紀錄彙總而得，並非逐筆交易）