In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from collections import deque
import random

import networkx as nx
# import igraph as ig

from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, roc_auc_score, precision_recall_curve

pd.set_option('display.max_columns', None)

In [None]:
# Read the raw transaction CSV in a single pass (low_memory=False).
raw_csv_path = 'C:/Users/DATOP/data/dataset_002/HF_TRNS_TRAM.csv'
hf_df = pd.read_csv(raw_csv_path, low_memory=False)

# Confirmation message followed by a plain-text preview of the first rows.
print("데이터 로드 완료")
print(hf_df.head())

In [None]:
# Rows whose 'ff_sp_ai' value is '01', '02' or 'SP' (kept in full) versus all other rows.
flagged_mask = hf_df['ff_sp_ai'].isin(['01', '02', 'SP'])
hf_df_filtered_1 = hf_df[flagged_mask]
hf_df_filtered_2 = hf_df[~flagged_mask]

print(hf_df_filtered_1.shape)
print(hf_df_filtered_2.shape)
# Shapes recorded by the original author: (9321, 10) and (10542643, 10)

# Draw a reproducible 1% random sample of the non-flagged rows.
# Recorded shape: (105426, 10)
hf_df_filtered_2_sampled = hf_df_filtered_2.sample(frac=0.01, random_state=42)

# Stack all flagged rows on top of the sampled non-flagged rows with a fresh 0..n-1 index.
# Recorded shape: (114747, 10)
hf_sample_df = pd.concat([hf_df_filtered_1, hf_df_filtered_2_sampled], ignore_index=True)

hf_sample_df.head()

In [None]:
# Build the transaction timestamp: calendar date from 'tran_dt' (YYYYMMDD) plus the
# hour offset stored in 'tran_tmrg'.
# pd.to_timedelta(..., unit='h') is used instead of .astype('timedelta64[h]') because
# hour resolution is not a supported timedelta dtype in pandas 2.x.
# pd.to_numeric makes the conversion work whether the hour column was parsed as
# integers or as numeric strings.
hour_offset = pd.to_timedelta(pd.to_numeric(hf_sample_df['tran_tmrg']), unit='h')
hf_sample_df['datetime'] = pd.to_datetime(hf_sample_df['tran_dt'], format='%Y%m%d') + hour_offset

# Sender account key: string concatenation of wd_fc_sn and wd_ac_sn.
# NOTE(review): there is no separator, so e.g. ('1', '23') and ('12', '3') produce the
# same key; a later cell casts this key to int64, so it is left unchanged here.
hf_sample_df['wd_account'] = hf_sample_df['wd_fc_sn'].astype(str) + hf_sample_df['wd_ac_sn'].astype(str)

# Receiver account key: string concatenation of dps_fc_sn and dps_ac_sn.
hf_sample_df['dps_account'] = hf_sample_df['dps_fc_sn'].astype(str) + hf_sample_df['dps_ac_sn'].astype(str)

# 기초 통계량 피쳐

In [None]:
# Working frame for the basic-statistics features: the sampled transactions ordered by
# sender account and then by time. sort_values returns a new frame, so hf_sample_df
# itself is not modified by the feature columns added later.
hf_basic_stats_df = (
    hf_sample_df
    .sort_values(['wd_account', 'datetime'])
    .reset_index(drop=True)
)

In [None]:
def sliding_window_count(group, window):
    """Count, for every row, the earlier rows of the group that lie within `window`.

    The current row is NOT included in its own count (the result for the first row
    of a group is 0). Rows are expected to be ordered by 'datetime' ascending; the
    two-pointer scan below relies on that order.

    Parameters
    ----------
    group : DataFrame with a 'datetime' column (one account's transactions).
    window : time span accepted by ``np.timedelta64`` (callers pass ``pd.Timedelta``).

    Returns
    -------
    pd.Series of int counts, indexed like `group`.
    """
    timestamps = group['datetime'].values
    span = np.timedelta64(window)
    prior_counts = np.zeros(len(timestamps), dtype=int)

    left = 0  # index of the oldest row still inside the window
    for pos, now in enumerate(timestamps):
        # Advance the left edge past rows that are older than `window`.
        while now - timestamps[left] > span:
            left += 1
        # Rows left..pos-1 are inside the window; the current row is excluded.
        prior_counts[pos] = pos - left

    return pd.Series(prior_counts, index=group.index)

In [None]:
from collections import deque

def sliding_window_total(group, window):
    """Running sum of 'tran_amt' over the trailing `window`, per row.

    Unlike ``sliding_window_count``, the current row's amount IS included in its
    own total. Rows are expected to be ordered by 'datetime' ascending.

    Parameters
    ----------
    group : DataFrame with 'datetime' and 'tran_amt' columns.
    window : time span comparable with differences of the 'datetime' values.

    Returns
    -------
    pd.Series of totals, indexed like `group`.
    """
    stamps = list(group['datetime'])
    amounts = list(group['tran_amt'])

    running = 0
    oldest = 0  # index of the oldest transaction still counted in `running`
    totals = []

    for pos in range(len(stamps)):
        # Add the new transaction first ...
        running += amounts[pos]

        # ... then evict transactions that are older than `window`.
        while oldest <= pos and (stamps[pos] - stamps[oldest]) > window:
            running -= amounts[oldest]
            oldest += 1

        totals.append(running)

    return pd.Series(totals, index=group.index)

In [None]:
# 1. Feature: number of earlier outgoing transfers per sender account within each
#    trailing window (1h, 3h, 12h, 1d, 7d, 30d, 90d).
send_count_windows = [
    ('최근1시간', pd.Timedelta(hours=1)),
    ('최근3시간', pd.Timedelta(hours=3)),
    ('최근12시간', pd.Timedelta(hours=12)),
    ('최근1일', pd.Timedelta(days=1)),
    ('최근7일', pd.Timedelta(days=7)),
    ('최근30일', pd.Timedelta(days=30)),
    ('최근90일', pd.Timedelta(days=90)),
]

for window_label, window_span in send_count_windows:
    # One column per window, named '송금횟수_<window label>'.
    hf_basic_stats_df[f'송금횟수_{window_label}'] = (
        hf_basic_stats_df
        .groupby('wd_account', group_keys=False)
        .apply(sliding_window_count, window=window_span)
    )

In [None]:
# 2. Feature: total outgoing amount per sender account within each trailing window
#    (1h, 3h, 12h, 1d, 7d, 30d, 90d).
send_total_windows = [
    ('최근1시간', pd.Timedelta(hours=1)),
    ('최근3시간', pd.Timedelta(hours=3)),
    ('최근12시간', pd.Timedelta(hours=12)),
    ('최근1일', pd.Timedelta(days=1)),
    ('최근7일', pd.Timedelta(days=7)),
    ('최근30일', pd.Timedelta(days=30)),
    ('최근90일', pd.Timedelta(days=90)),
]

for window_label, window_span in send_total_windows:
    # One column per window, named '송금총액_<window label>'.
    hf_basic_stats_df[f'송금총액_{window_label}'] = (
        hf_basic_stats_df
        .groupby('wd_account', group_keys=False)
        .apply(sliding_window_total, window=window_span)
    )

In [None]:
# Re-order by receiver account and time so the sliding-window helpers see each
# receiver's transactions in chronological order.
hf_basic_stats_df = hf_basic_stats_df.sort_values(['dps_account', 'datetime']).reset_index(drop=True)

# 3. Feature: number of earlier incoming transfers per receiver account within each
#    trailing window (1h, 3h, 12h, 1d, 7d, 30d, 90d).
deposit_count_windows = [
    ('최근1시간', pd.Timedelta(hours=1)),
    ('최근3시간', pd.Timedelta(hours=3)),
    ('최근12시간', pd.Timedelta(hours=12)),
    ('최근1일', pd.Timedelta(days=1)),
    ('최근7일', pd.Timedelta(days=7)),
    ('최근30일', pd.Timedelta(days=30)),
    ('최근90일', pd.Timedelta(days=90)),
]

for window_label, window_span in deposit_count_windows:
    # One column per window, named '입금횟수_<window label>'.
    hf_basic_stats_df[f'입금횟수_{window_label}'] = (
        hf_basic_stats_df
        .groupby('dps_account', group_keys=False)
        .apply(sliding_window_count, window=window_span)
    )

In [None]:
# 4. Feature: total incoming amount per receiver account within each trailing window
#    (1h, 3h, 12h, 1d, 7d, 30d, 90d).
deposit_total_windows = [
    ('최근1시간', pd.Timedelta(hours=1)),
    ('최근3시간', pd.Timedelta(hours=3)),
    ('최근12시간', pd.Timedelta(hours=12)),
    ('최근1일', pd.Timedelta(days=1)),
    ('최근7일', pd.Timedelta(days=7)),
    ('최근30일', pd.Timedelta(days=30)),
    ('최근90일', pd.Timedelta(days=90)),
]

for window_label, window_span in deposit_total_windows:
    # One column per window, named '입금총액_<window label>'.
    hf_basic_stats_df[f'입금총액_{window_label}'] = (
        hf_basic_stats_df
        .groupby('dps_account', group_keys=False)
        .apply(sliding_window_total, window=window_span)
    )

In [None]:
# 5. Feature: is this the first transaction between the sender and this counterparty?
# The rank is taken within each (sender, receiver) pair; the previous version ranked
# within the sender only, which flagged the sender's first transaction overall rather
# than the first transaction with the counterparty.
# txn_rank: chronological position of the row within its (wd_account, dps_account) pair
# is_first_txn: 1 when the row is the earliest of its pair, else 0
hf_basic_stats_df['txn_rank'] = (
    hf_basic_stats_df
    .groupby(['wd_account', 'dps_account'])['datetime']
    .rank(method='first')
)
hf_basic_stats_df['is_first_txn'] = (hf_basic_stats_df['txn_rank'] == 1).astype(int)

# txn_rank was only a helper column.
hf_basic_stats_df.drop(['txn_rank'], axis=1, inplace=True)

# 6. Feature: share of outgoing / incoming transfer counts in the combined count,
#    for the 30-day and 90-day windows. 1e-10 keeps the denominator non-zero.
# NOTE(review): on a given row the outgoing count belongs to the sender account and
# the incoming count to the receiver account, so the ratio mixes two accounts — confirm
# this is intended.
for direction in ('송금', '입금'):
    for window_label in ('최근30일', '최근90일'):
        combined_count = (
            hf_basic_stats_df[f'송금횟수_{window_label}']
            + hf_basic_stats_df[f'입금횟수_{window_label}']
            + 1e-10
        )
        ratio = hf_basic_stats_df[f'{direction}횟수_{window_label}'] / combined_count
        # Any remaining nulls (e.g. from null counts) are treated as a ratio of 0.
        hf_basic_stats_df[f'{direction}거래량비율_{window_label}'] = ratio.fillna(0)

In [None]:
# 7. Feature: per-media-type outgoing / incoming transfer counts (3h and 12h windows).
media_codes = list(sorted(hf_basic_stats_df['md_type'].unique()))  # original note: [1,2,3,4,5,6,7]
media_windows = [
    ('최근3시간', pd.Timedelta(hours=3)),
    ('최근12시간', pd.Timedelta(hours=12)),
]

# Outgoing counts are grouped by sender, incoming counts by receiver. The frame is
# sorted once per direction (instead of once per media code); it ends up ordered by
# receiver account and time, as before.
for account_col, count_prefix in (('wd_account', '송금횟수'), ('dps_account', '입금횟수')):
    hf_basic_stats_df = hf_basic_stats_df.sort_values([account_col, 'datetime']).reset_index(drop=True)

    for md in media_codes:
        # Only rows of this media type take part in the window count.
        md_rows = hf_basic_stats_df[hf_basic_stats_df['md_type'] == md]

        for window_label, window_span in media_windows:
            md_counts = (
                md_rows
                .groupby(account_col, group_keys=False)
                .apply(sliding_window_count, window=window_span)
            )
            # Rows of other media types have no count; fill them with 0 for EVERY
            # media code. Previously the fillna ran after the loop and therefore only
            # touched the last media code, leaving NaN in all the other columns.
            hf_basic_stats_df[f'{count_prefix}_매체구분{md}_{window_label}'] = (
                md_counts.reindex(hf_basic_stats_df.index).fillna(0).astype(int)
            )



In [None]:
# Remove the raw date/hour and account-part columns; they are already folded into
# 'datetime', 'wd_account' and 'dps_account'.
raw_source_columns = ['tran_dt', 'tran_tmrg', 'wd_fc_sn', 'wd_ac_sn', 'dps_fc_sn', 'dps_ac_sn']
hf_basic_stats_df = hf_basic_stats_df.drop(columns=raw_source_columns)

# Show the remaining column names.
hf_basic_stats_df.columns

In [None]:
# Keep only the columns used downstream and fix their order: transaction keys and raw
# attributes, the four families of window features, is_first_txn, the count ratios,
# the per-media counts, and finally the label source column 'ff_sp_ai'.
# The per-media column names are written out for media codes 1..7; a media code
# outside that range would not be selected here.
hf_basic_stats_df = hf_basic_stats_df[['datetime', 'wd_account',
    'dps_account', 'tran_amt', 'md_type', 'fnd_type', '송금횟수_최근1시간', '송금횟수_최근3시간', '송금횟수_최근12시간', '송금횟수_최근1일', '송금횟수_최근7일',
    '송금횟수_최근30일', '송금횟수_최근90일', '송금총액_최근1시간', '송금총액_최근3시간', '송금총액_최근12시간', '송금총액_최근1일', '송금총액_최근7일', '송금총액_최근30일', '송금총액_최근90일',
    '입금횟수_최근1시간', '입금횟수_최근3시간', '입금횟수_최근12시간', '입금횟수_최근1일', '입금횟수_최근7일', '입금횟수_최근30일', '입금횟수_최근90일',
    '입금총액_최근1시간', '입금총액_최근3시간', '입금총액_최근12시간', '입금총액_최근1일', '입금총액_최근7일', '입금총액_최근30일', '입금총액_최근90일', 'is_first_txn',
    '송금거래량비율_최근30일', '송금거래량비율_최근90일', '입금거래량비율_최근30일', '입금거래량비율_최근90일',
    '송금횟수_매체구분1_최근3시간', '송금횟수_매체구분1_최근12시간', '입금횟수_매체구분1_최근3시간', '입금횟수_매체구분1_최근12시간',
    '송금횟수_매체구분2_최근3시간', '송금횟수_매체구분2_최근12시간', '입금횟수_매체구분2_최근3시간', '입금횟수_매체구분2_최근12시간',
    '송금횟수_매체구분3_최근3시간', '송금횟수_매체구분3_최근12시간', '입금횟수_매체구분3_최근3시간', '입금횟수_매체구분3_최근12시간',
    '송금횟수_매체구분4_최근3시간', '송금횟수_매체구분4_최근12시간', '입금횟수_매체구분4_최근3시간', '입금횟수_매체구분4_최근12시간',
    '송금횟수_매체구분5_최근3시간', '송금횟수_매체구분5_최근12시간', '입금횟수_매체구분5_최근3시간', '입금횟수_매체구분5_최근12시간',
    '송금횟수_매체구분6_최근3시간', '송금횟수_매체구분6_최근12시간', '입금횟수_매체구분6_최근3시간', '입금횟수_매체구분6_최근12시간',
    '송금횟수_매체구분7_최근3시간', '송금횟수_매체구분7_최근12시간', '입금횟수_매체구분7_최근3시간', '입금횟수_매체구분7_최근12시간',
    'ff_sp_ai']]

# Print dtypes, non-null counts and memory usage of the selected frame.
hf_basic_stats_df.info()

In [None]:
# Cast both account keys (built as strings) to 64-bit integers so they match the
# dtype used by the graph-feature frame when the two are merged later.
for account_col in ('wd_account', 'dps_account'):
    hf_basic_stats_df[account_col] = hf_basic_stats_df[account_col].astype('int64')

hf_basic_stats_df

# 그래프 피쳐

In [None]:
# Separate frame for the graph features: sampled transactions ordered by sender and
# time, without the raw date/hour/account-part columns, reduced to the columns the
# graph needs (in this order).
hf_graph_df = (
    hf_sample_df
    .sort_values(['wd_account', 'datetime'])
    .reset_index(drop=True)
    .drop(columns=['tran_dt', 'tran_tmrg', 'wd_fc_sn', 'wd_ac_sn', 'dps_fc_sn', 'dps_ac_sn'])
)
hf_graph_df = hf_graph_df[['datetime', 'wd_account', 'dps_account', 'tran_amt', 'md_type', 'fnd_type', 'ff_sp_ai']]

In [None]:
## Build the transaction MultiDiGraph
def build_multidigraph(df):
    """Turn a transaction table into a directed multigraph.

    Every row becomes one edge from 'wd_account' (sender) to 'dps_account'
    (receiver); parallel edges are kept, so repeated transfers between the same
    pair stay distinct. Each edge carries the row's 'datetime', 'tran_amt',
    'md_type', 'fnd_type' and 'ff_sp_ai' values under the attribute names
    'datetime', 'amount', 'channel', 'fund_type' and 'fraud_flag'.

    Parameters
    ----------
    df : DataFrame containing the seven columns listed above.

    Returns
    -------
    networkx.MultiDiGraph
    """
    graph = nx.MultiDiGraph()

    for _, txn in df.iterrows():
        graph.add_edge(
            txn['wd_account'],
            txn['dps_account'],
            datetime=txn['datetime'],
            amount=txn['tran_amt'],
            channel=txn['md_type'],
            fund_type=txn['fnd_type'],
            fraud_flag=txn['ff_sp_ai'],
        )

    return graph

G = build_multidigraph(hf_graph_df)

In [None]:
# ========= 1. Centrality ======== #
# Each measure is returned by networkx as a {node: value} mapping; wrapping it in a
# Series keys the values by node so the DataFrame constructor can align them.
centrality_nodes = list(G.nodes())
degree_centrality_s = pd.Series(nx.degree_centrality(G))
betweenness_s = pd.Series(nx.betweenness_centrality(G))
closeness_s = pd.Series(nx.closeness_centrality(G))
eigenvector_s = pd.Series(nx.eigenvector_centrality_numpy(G))
pagerank_s = pd.Series(nx.pagerank(G))

centrality_df = pd.DataFrame({
    'node': centrality_nodes,
    'degree_centrality': degree_centrality_s,
    'betweenness': betweenness_s,
    'closeness': closeness_s,
    'eigenvector': eigenvector_s,
    'pagerank': pagerank_s,
}).reset_index(drop=True)

centrality_df

In [None]:
# ======== 2. Degree ======== #
# In-/out-degree per node (parallel edges count individually in a multigraph view),
# plus their sum.
in_degree_s = pd.Series(dict(G.in_degree()))
out_degree_s = pd.Series(dict(G.out_degree()))

degree_df = pd.DataFrame({
    'node': list(G.nodes()),
    'in_degree': in_degree_s,
    'out_degree': out_degree_s,
}).reset_index(drop=True)

degree_df["degree"] = degree_df["in_degree"].add(degree_df["out_degree"])

degree_df

In [None]:
# ======== 3. Counterparty diversity ======== #
# Number of distinct accounts a node is connected to in either direction
# (union of its predecessors and successors).
partner_counts = []
for account in G.nodes():
    counterparties = set(G.predecessors(account)) | set(G.successors(account))
    partner_counts.append(len(counterparties))

partner_div_df = pd.DataFrame({
    "node": list(G.nodes()),
    "partner_div": partner_counts,
})

partner_div_df

In [None]:
# ======== 4. Louvain community detection (disabled) ======== #
# import community as community_louvain
# partition = community_louvain.best_partition(G.to_undirected())
# comm_df = pd.DataFrame(list(partition.items()), columns=["계좌", "community_id"])
# comm_size = comm_df["community_id"].value_counts().to_dict()
# comm_df["community_size"] = comm_df["community_id"].map(comm_size).astype(int)
# comm_df.set_index("계좌", inplace=True)


# ======== 5. Round-trip flag ======== #
def _has_round_trip(account):
    """True when some successor of `account` also has an edge back to `account`."""
    return any(
        G.has_edge(account, neighbor) and G.has_edge(neighbor, account)
        for neighbor in G.successors(account)
    )

round_trip_df = pd.DataFrame({
    "node": list(G.nodes()),
    "round_trip": [_has_round_trip(account) for account in G.nodes()],
})

round_trip_df

In [None]:
# ======== 6. Proximity to suspicious accounts ======== #
# An account is suspicious when it is an endpoint of an edge whose fraud_flag is one
# of the flagged codes. fraud_flag holds the raw 'ff_sp_ai' value, which is compared
# against the strings '01' / '02' / 'SP' when the data is sampled, so the previous
# test `== 1` never matched and the feature was always empty.
suspect_flag_codes = {'01', '02', 'SP'}
suspicious_nodes = list(dict.fromkeys(
    endpoint
    for u, v, d in G.edges(data=True)
    if d.get("fraud_flag") in suspect_flag_codes
    for endpoint in (u, v)
))

# Multi-source breadth-first search that walks edges backwards from all suspicious
# accounts at once. hops_to_suspect[n] is the length of the shortest directed path
# from n to its nearest suspicious account (0 for suspicious accounts themselves).
# This replaces one has_path + shortest_path_length call per (node, suspect) pair and
# no longer raises on min([]) when a node cannot reach any suspicious account.
hops_to_suspect = {suspect: 0 for suspect in suspicious_nodes}
frontier = deque(suspicious_nodes)
while frontier:
    current = frontier.popleft()
    for upstream in G.predecessors(current):
        if upstream not in hops_to_suspect:
            hops_to_suspect[upstream] = hops_to_suspect[current] + 1
            frontier.append(upstream)

# Accounts with no path to a suspicious account get None (shown as NaN).
suspect_proximity_df = pd.DataFrame({
    "node": list(G.nodes()),
    "suspect_proximity": [hops_to_suspect.get(account) for account in G.nodes()]
})

suspect_proximity_df

In [None]:
# ======== 7. gather_scatter: in-degree minus out-degree per node ========
in_degree = dict(G.in_degree())
out_degree = dict(G.out_degree())
flow_nodes = list(G.nodes())

net_incoming = [in_degree.get(account, 0) - out_degree.get(account, 0) for account in flow_nodes]
gather_scatter_df = pd.DataFrame({
    "node": flow_nodes,
    "gather_scatter": net_incoming,
})

# ======== 8. scatter_gather: out-degree minus in-degree per node ========
net_outgoing = [out_degree.get(account, 0) - in_degree.get(account, 0) for account in flow_nodes]
scatter_gather_df = pd.DataFrame({
    "node": flow_nodes,
    "scatter_gather": net_outgoing,
})

# ======== 9. Bipartite marker (sender vs receiver) ========
# 1 when the account appears as a sender in hf_graph_df, otherwise 0.
bipartite_set1 = set(hf_graph_df["wd_account"])
sender_marker = []
for account in flow_nodes:
    sender_marker.append(1 if account in bipartite_set1 else 0)
bipartite_df = pd.DataFrame({
    "node": flow_nodes,
    "bipartite": sender_marker,
})

In [None]:
# ======== 10. Simple-cycle membership ======== #
# Collect the accounts that appear in at least one simple cycle. The cycles are
# consumed one at a time into a set, so membership is a constant-time lookup and the
# full list of cycles never has to be held in memory (the previous version scanned
# every cycle for every node).
cycle_nodes = set()
for cycle in nx.simple_cycles(G):
    cycle_nodes.update(cycle)

simple_cycle_df = pd.DataFrame({
    "node": list(G.nodes()),
    "simple_cycle": [account in cycle_nodes for account in G.nodes()]
})

# ======== 11. stack: sends to more than one distinct account ======== #
stack_df = pd.DataFrame({
    "node": list(G.nodes()),
    "stack": [1 if len(set(G.successors(account))) > 1 else 0 for account in G.nodes()]
})

In [None]:
# ======== 12. Random walk ======== #
def random_walk_probability(G, start, steps=100, walk_length=10):
    """Estimate how often each node is visited by random walks leaving `start`.

    Runs `steps` independent walks. Each walk begins at `start` and takes up to
    `walk_length` hops, choosing uniformly among the entries yielded by
    ``G.successors(current)``; a walk ends early at a node without successors.
    Only the nodes arrived at are counted (the start position itself is not).

    Parameters
    ----------
    G : graph exposing ``nodes()`` and ``successors(node)``.
    start : node every walk starts from.
    steps : number of walks.
    walk_length : maximum hops per walk.

    Returns
    -------
    dict mapping every node to its share of all recorded visits, or to 0 for
    every node when no visit was recorded.
    """
    visit_counts = dict.fromkeys(G.nodes(), 0)

    for _walk in range(steps):
        here = start
        for _hop in range(walk_length):
            out_neighbors = list(G.successors(here))
            if len(out_neighbors) == 0:
                # Dead end: this walk cannot continue.
                break
            here = random.choice(out_neighbors)
            visit_counts[here] += 1

    n_visits = sum(visit_counts.values())
    if n_visits > 0:
        return {node: visit_counts[node] / n_visits for node in G.nodes()}
    return {node: 0 for node in G.nodes()}

# Seed Python's RNG so that both the randomly chosen start node and the walks
# themselves are reproducible across kernel restarts (previously unseeded).
random.seed(42)

# Visit shares from walks that all start at ONE randomly chosen node of the graph.
random_walk_feat = random_walk_probability(G, start=random.choice(list(G.nodes())))
random_walk_df = pd.DataFrame({
    "node": list(G.nodes()),
    "random_walk": pd.Series(random_walk_feat)
}).reset_index(drop=True)

In [None]:
# Combine all node-level feature tables into one frame keyed by 'node'.
# Every join is an outer join on 'node'; the last one previously relied on merge()'s
# defaults (inner join on the shared columns) and is now explicit like the others.
# Missing values are then replaced by 0.
# NOTE(review): fillna(0) also turns a missing suspect_proximity into 0, the same value
# a suspicious account itself would have — confirm this is acceptable.
graph_features = centrality_df
for node_feature_df in (
    degree_df,
    partner_div_df,
    round_trip_df,
    suspect_proximity_df,
    gather_scatter_df,
    scatter_gather_df,
    bipartite_df,
    simple_cycle_df,
    stack_df,
    random_walk_df,
):
    graph_features = graph_features.merge(node_feature_df, on='node', how='outer')
graph_features = graph_features.fillna(0)

print(graph_features.shape)
print(hf_graph_df.shape)

In [None]:
## Expanding node-level features to a transaction-level frame:
# - join the node features once on the sender account and once on the receiver account
# - prefix the feature columns with src_ / dst_ to tell the two sides apart
## => each transaction row carries the graph features of both of its accounts

# graph_features: node-level graph feature DataFrame (has a 'node' column)
# hf_graph_df: transaction-level frame

# Sender-side view: every column prefixed with src_, node key renamed to wd_account.
src_node_features = graph_features.add_prefix("src_").rename(columns={"src_node": "wd_account"})
hf_graph_df_with_src = hf_graph_df.merge(src_node_features, on="wd_account", how="left")

# Receiver-side view: every column prefixed with dst_, node key renamed to dps_account.
dst_node_features = graph_features.add_prefix("dst_").rename(columns={"dst_node": "dps_account"})
hf_graph_features_merged = hf_graph_df_with_src.merge(dst_node_features, on="dps_account", how="left")

hf_graph_features_merged.head()

In [None]:
# List the columns that currently have a boolean dtype (printed for inspection only).
bool_cols = hf_graph_features_merged.select_dtypes(include='bool').columns.tolist()
print(bool_cols)

# Turn the four flag columns into 0/1 integers.
for flag_col in ('src_round_trip', 'src_simple_cycle', 'dst_round_trip', 'dst_simple_cycle'):
    hf_graph_features_merged[flag_col] = hf_graph_features_merged[flag_col].astype(int)

# Store both account keys as 64-bit integers.
for account_col in ('wd_account', 'dps_account'):
    hf_graph_features_merged[account_col] = hf_graph_features_merged[account_col].astype('int64')

# Print dtypes, non-null counts and memory usage.
hf_graph_features_merged.info()

# 기초 통계량 피쳐 + 그래프 피쳐 병합

In [None]:
# Sanity check before merging: row/column counts of both feature frames and the
# column names of the graph-feature frame.
print(hf_basic_stats_df.shape)
print(hf_graph_features_merged.shape)
print(hf_graph_features_merged.columns)

In [None]:
# Column names of the basic-statistics feature frame.
hf_basic_stats_df.columns

In [None]:
# Column names of the transaction-level graph feature frame.
hf_graph_features_merged.columns

In [None]:
# Merge the two feature frames at transaction (row) level on their shared columns.
# If the same key combination occurs more than once on either side, the merge
# multiplies rows, so both inputs of the merge are de-duplicated on exactly the merge
# keys first. Previously the de-duplication was applied to hf_graph_df, which is not
# the frame being merged, and its key subset lacked 'fnd_type'.
merge_keys = ['datetime', 'wd_account', 'dps_account', 'tran_amt', 'md_type', 'fnd_type', 'ff_sp_ai']

hf_basic_stats_df = hf_basic_stats_df.drop_duplicates(subset=merge_keys)

# Kept so hf_graph_df stays de-duplicated as before; it does not take part in the merge.
hf_graph_df = hf_graph_df.drop_duplicates(subset=merge_keys)

hf_graph_features_merged = hf_graph_features_merged.drop_duplicates(subset=merge_keys)


hf_merged = pd.merge(hf_basic_stats_df, hf_graph_features_merged,
    on=merge_keys,  # shared columns used as the join key
    how='left',
    suffixes=('_basic_stats', '_graph'))  # disambiguates any other overlapping column

hf_merged.head()

In [None]:
# Row/column count of the merged frame.
hf_merged.shape

# 전처리

In [None]:
# The account identifiers are not used as model features.
hf_merged = hf_merged.drop(columns=['wd_account', 'dps_account'])

# Missing values per column: absolute counts, then fractions.
null_counts = hf_merged.isnull().sum()
print(null_counts)
print('----------------------------------------------------')
print(hf_merged.isnull().mean())
print('----------------------------------------------------')

# Only the columns that actually contain missing values.
missing_info = null_counts[null_counts > 0]
print(missing_info)

In [None]:
# Replace a missing 'ff_sp_ai' with the code '00', then re-check the missing fractions.
hf_merged = hf_merged.fillna({'ff_sp_ai': '00'})
print(hf_merged.isnull().mean())

In [None]:
# Distribution of the 'ff_sp_ai' codes after filling missing values.
hf_merged['ff_sp_ai'].value_counts()

In [None]:
# Label-encode 'ff_sp_ai' into integer codes.
le = LabelEncoder()
hf_merged['ff_sp_ai_encoded'] = le.fit_transform(hf_merged['ff_sp_ai'])

# Remove the original column and show how many rows fall into each encoded value.
hf_merged = hf_merged.drop(columns=['ff_sp_ai'])
print(hf_merged['ff_sp_ai_encoded'].value_counts())

In [None]:
# Binary target: 1 for the codes '01', '02' and 'SP', 0 for everything else.
# The encoded values of those codes are looked up from the fitted LabelEncoder rather
# than assumed to be 1, 2 and 3: that assumption only holds when the column contains
# exactly the four codes '00', '01', '02', 'SP', and silently mislabels rows otherwise.
fraud_codes = ('01', '02', 'SP')
fitted_codes = set(le.classes_)
fraud_encoded = le.transform([code for code in fraud_codes if code in fitted_codes])

hf_merged['target'] = hf_merged['ff_sp_ai_encoded'].isin(fraud_encoded).astype(int)

hf_merged.head()

In [None]:
# Order the rows chronologically (ascending 'datetime').
hf_merged = hf_merged.sort_values(by='datetime', ascending=True)

hf_merged.head()

In [None]:
# The encoded label is no longer needed once 'target' exists.
hf_merged = hf_merged.drop(columns=['ff_sp_ai_encoded'])

hf_merged.head()

In [None]:
# Time-based train / validation / test split.

# train set: 2024-01-01 ~ 2024-07-31
# valid set: 2024-08-01 ~ 2024-10-31
# test set: 2024-11-01 ~ 2024-12-31

train_end = '2024-07-31 23:59:59'
valid_end = '2024-10-31 23:59:59'

is_train = hf_merged['datetime'] <= train_end
is_test = hf_merged['datetime'] > valid_end
is_valid = (hf_merged['datetime'] > train_end) & (hf_merged['datetime'] <= valid_end)

train_2 = hf_merged[is_train]
valid_2 = hf_merged[is_valid]
test_2 = hf_merged[is_test]


def _report_split(label, split_df):
    """Print shape, per-class shapes and fraud percentage of one split; return the percentage."""
    normal_shape = split_df[split_df['target'] == 0].shape
    fraud_shape = split_df[split_df['target'] == 1].shape
    print(f"{label} shape: {split_df.shape}, 정상건수: {normal_shape}, 부정건수: {fraud_shape}")
    fraud_pct = fraud_shape[0] / split_df.shape[0] * 100
    print(f"{label} fraud 비율: {fraud_pct:.3f}%")
    return fraud_pct


train_fraud_ratio = _report_split("학습셋", train_2)

print('----------------------------------------------------')

valid_fraud_ratio = _report_split("검증셋", valid_2)

print('----------------------------------------------------')

test_fraud_ratio = _report_split("테스트셋", test_2)

In [None]:
# Column names of the training split ('target' is expected to be the last one).
print(train_2.columns)

In [None]:
# Row/column count of the training split.
print(train_2.shape)

In [None]:
# Separate features (X: every column but the last) and target (y: the last column)
# for each split. The feature frames are explicit copies so the column assignments
# below and in later cells write to independent frames instead of to slices of
# train_2 / valid_2 / test_2.
X_train = train_2.iloc[:, :-1].copy()
y_train = train_2.iloc[:, -1]

X_valid = valid_2.iloc[:, :-1].copy()
y_valid = valid_2.iloc[:, -1]

X_test = test_2.iloc[:, :-1].copy()
y_test = test_2.iloc[:, -1]

# Print the shapes to verify the split.
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"X_valid: {X_valid.shape}, y_valid: {y_valid.shape}")
print(f"X_test:  {X_test.shape},  y_test:  {y_test.shape}")


# Numeric copy of the timestamp (YYYYMMDDHHMMSS as int64) in a NEW column 'datetime_int'.
# The previous version overwrote 'datetime' itself, which the next cell drops, so the
# 'datetime_int' column that later cells move to the front and select never existed.
for feature_frame in (X_train, X_valid, X_test):
    feature_frame['datetime_int'] = (
        feature_frame['datetime'].dt.strftime('%Y%m%d%H%M%S').astype('int64')
    )

In [None]:
# Remove the 'datetime' column from all three feature frames.
X_train = X_train.drop(columns=['datetime'])
X_valid = X_valid.drop(columns=['datetime'])
X_test = X_test.drop(columns=['datetime'])

X_train.head()

In [None]:
# Move the last column of X_train to the first position in all three feature frames.
# (The original note says this column is datetime_int; the code simply takes whichever
# column is last in X_train.)
last_col = X_train.columns[-1]

for feature_frame in (X_train, X_valid, X_test):
    # pop removes the column and returns it; insert puts it back at position 0.
    feature_frame.insert(0, last_col, feature_frame.pop(last_col))

# Class-imbalance handling: oversample the minority class of the training split only.
sm = SMOTE(sampling_strategy='minority', random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

# Min-max scaling: parameters are learned on the resampled training features and then
# applied unchanged to the validation and test features.
scaler = MinMaxScaler().fit(X_train_res)
X_train_res_scaled = scaler.transform(X_train_res)
X_valid_scaled = scaler.transform(X_valid)
X_test_scaled = scaler.transform(X_test)

# 모델 학습

In [None]:
# Model training - Logistic Regression
LR_clf = LogisticRegression(random_state=42)
LR_clf.fit(X_train_res_scaled, y_train_res)
print('--------------------LR_clf fit completed--------------------')

# Model training - Decision Tree
DT_clf = DecisionTreeClassifier(random_state=42)
DT_clf.fit(X_train_res_scaled, y_train_res)
print('--------------------DT_clf fit completed--------------------')

# RandomForestClassifier (disabled)
# RF_clf = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=42)
# RF_clf.fit(X_train_res_scaled, y_train_res)
# print('--------------------RF_clf fit completed--------------------')

# XGBClassifier
XGB_clf = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, subsample=0.8, colsample_bytree=0.8, use_label_encoder=False, eval_metric='aucpr', random_state=42)
XGB_clf.fit(X_train_res_scaled, y_train_res)
print('--------------------XGB_clf fit completed--------------------')

# LGBMClassifier
LGBM_clf = LGBMClassifier(n_estimators=100, learning_rate=0.1, max_depth=-1, random_state=42)
LGBM_clf.fit(X_train_res_scaled, y_train_res)
print('--------------------LGBM_clf fit completed--------------------')

# CatBoostClassifier
# The evaluation set is the VALIDATION split. It previously was the test split, which
# exposes the held-out data to the training run and makes the later test metrics
# optimistic.
# NOTE(review): class_weights=[1, 10] is applied on top of SMOTE-balanced data — confirm
# the double re-weighting is intended.
CB_clf = CatBoostClassifier(iterations=200, learning_rate=0.1, depth=6, loss_function='Logloss', eval_metric='AUC', class_weights=[1, 10], random_state=42, verbose=1)
CB_clf.fit(X_train_res_scaled, y_train_res, eval_set=[(X_valid_scaled, y_valid)], verbose=50)
print('--------------------CB_clf fit completed--------------------')



# 검증세트 성능 평가

In [None]:
# ================ Validation-set evaluation ================

def _print_valid_metrics(tag, y_true, y_pred, y_proba):
    """Print validation metrics for one model.

    ROC-AUC is computed from the positive-class probabilities `y_proba`;
    accuracy, precision, recall and F1 are computed from the hard labels `y_pred`.
    `tag` is the short model name used as the line prefix.
    """
    print(f"{tag} valid roc_auc_score:  {roc_auc_score(y_true, y_proba):.3f}")
    print(f"{tag} valid accuracy_score: {accuracy_score(y_true, y_pred):.3f}")
    print(f"{tag} valid precision_score:{precision_score(y_true, y_pred):.3f}")
    print(f"{tag} valid recall_score:   {recall_score(y_true, y_pred):.3f}")
    print(f"{tag} valid f1_score:       {f1_score(y_true, y_pred):.3f}")


# Each *_y_valid_proba is the predicted probability of class 1. These variables were
# previously used without ever being defined, so the cell failed with NameError.

# Logistic Regression
LR_y_valid_pred = LR_clf.predict(X_valid_scaled)
LR_y_valid_proba = LR_clf.predict_proba(X_valid_scaled)[:, 1]
_print_valid_metrics("LR", y_valid, LR_y_valid_pred, LR_y_valid_proba)


# Decision Tree
DT_y_valid_pred = DT_clf.predict(X_valid_scaled)
DT_y_valid_proba = DT_clf.predict_proba(X_valid_scaled)[:, 1]
_print_valid_metrics("DT", y_valid, DT_y_valid_pred, DT_y_valid_proba)


# (Random Forest - disabled, because RF_clf is not trained)
# RF_y_valid_pred = RF_clf.predict(X_valid_scaled)
# RF_y_valid_proba = RF_clf.predict_proba(X_valid_scaled)[:, 1]
# print(f"RF valid roc_auc_score:  {roc_auc_score(y_valid, RF_y_valid_proba):.3f}")
# print(f"RF valid accuracy_score: {accuracy_score(y_valid, RF_y_valid_pred):.3f}")
# print(f"RF valid precision_score:{precision_score(y_valid, RF_y_valid_pred):.3f}")
# print(f"RF valid recall_score:   {recall_score(y_valid, RF_y_valid_pred):.3f}")
# print(f"RF valid f1_score:       {f1_score(y_valid, RF_y_valid_pred):.3f}")


# XGBoost
XGB_y_valid_pred = XGB_clf.predict(X_valid_scaled)
XGB_y_valid_proba = XGB_clf.predict_proba(X_valid_scaled)[:, 1]
_print_valid_metrics("XGB", y_valid, XGB_y_valid_pred, XGB_y_valid_proba)


# LightGBM
LGBM_y_valid_pred = LGBM_clf.predict(X_valid_scaled)
LGBM_y_valid_proba = LGBM_clf.predict_proba(X_valid_scaled)[:, 1]
_print_valid_metrics("LGBM", y_valid, LGBM_y_valid_pred, LGBM_y_valid_proba)


# CatBoost
CB_y_valid_pred = CB_clf.predict(X_valid_scaled)
CB_y_valid_proba = CB_clf.predict_proba(X_valid_scaled)[:, 1]
_print_valid_metrics("CB", y_valid, CB_y_valid_pred, CB_y_valid_proba)

# Test Set 성능 평가

In [None]:
# # Logistic Regression 모델 학습의 경우
# LR_y_test_prob = LR_clf.predict_proba(X_test_scaled)[:,1]
# # y_test_true = y_test.values

# # X_test['y_test_prob_score'] = LR_y_test_prob
# # X_test['y_test_true'] = y_test_true

# # test_pred_df = X_test[['datetime_int', 'y_test_prob_score', 'y_true']]

# # DecisionTree 모델 학습의 경우
# DT_y_test_prob = DT_clf.predict_proba(X_test_scaled)[:,1]
# # y_test_true = y_test.values

# # X_test['y_test_prob_score'] = DT_y_test_prob
# # X_test['y_test_true'] = y_test_true

# # test_pred_df = X_test[['datetime_int', 'y_test_prob_score', 'y_true']]

# # # RandomForest 모델 학습의 경우
# # RF_y_test_prob = RF_clf.predict_proba(X_test_scaled)[:,1]
# # y_test_true = y_test.values

# # X_test['y_test_prob_score'] = RF_y_test_prob
# # X_test['y_test_true'] = y_test_true

# # test_pred_df = X_test[['datetime_int', 'y_test_prob_score', 'y_true']]

In [None]:
# # XGBoost 모델 학습의 경우
# XGB_y_test_prob = XGB_clf.predict_proba(X_test_scaled)[:,1]
# # y_test_true = y_test.values

# # X_test['y_test_prob_score'] = XGB_y_test_prob
# # X_test['y_true'] = y_test_true

# # test_pred_df = X_test[['datetime_int', 'y_test_prob_score', 'y_true']]


# # # LightGBM 모델 학습의 경우
# # LGBM_y_test_prob = LGBM_clf.predict_proba(X_test_scaled)[:,1]
# # y_test_true = y_test.values

# # X_test['y_test_prob_score'] = LGBM_y_test_prob
# # X_test['y_true'] = y_test_true

# # test_pred_df = X_test[['datetime_int', 'y_test_prob_score', 'y_true']]


# Test-set scoring with the CatBoost model.
# Positive-class probability for every test row, plus the true labels.
CB_y_test_prob = CB_clf.predict_proba(X_test_scaled)[:,1]
y_test_true = y_test.values

# The score and the true label are attached to X_test (as before) ...
X_test['y_test_prob_score'] = CB_y_test_prob
X_test['y_true'] = y_test_true

# ... and the three columns needed for evaluation are taken as an independent copy,
# so that later cells can add columns to test_pred_df without writing into a slice
# of X_test.
test_pred_df = X_test[['datetime_int', 'y_test_prob_score', 'y_true']].copy()

In [None]:
# Top-K per day evaluation.
# Only the K highest-scored transactions of each day can be reviewed (e.g. 10); the
# daily selections are pooled and Precision / Recall / F1 are computed on the pool.
K = 10

# datetime_int is YYYYMMDDHHMMSS, so integer division by 1,000,000 leaves YYYYMMDD.
# Grouping on datetime_int itself selected the top K per distinct timestamp instead of
# per day.
test_day = test_pred_df['datetime_int'] // 1_000_000

# Top K scores within each day.
topk_by_day = (
    test_pred_df.assign(test_day=test_day)
    .sort_values(['test_day', 'y_test_prob_score'], ascending=[True, False])
    .groupby('test_day', group_keys=False)
    .head(K)
)

tp = topk_by_day['y_true'].sum()
pred_total = len(topk_by_day)          # at most K per day
pos_total = test_pred_df['y_true'].sum()

precision_atK = (tp / pred_total) if pred_total else 0
recall_atK = (tp / pos_total) if pos_total else 0
f1_atK = (2*precision_atK*recall_atK/(precision_atK+recall_atK)) if (precision_atK+recall_atK) > 0 else 0

# Review budget = K per day times the number of days actually present in the test set
# (the label previously assumed exactly 31 days).
review_budget = K * test_day.nunique()
print(f"Precision@{review_budget}: {precision_atK:.3f}, Recall@{review_budget}: {recall_atK:.3f}, F1@{review_budget}: {f1_atK:.3f}")

In [None]:
# Scorecard computation.

# Work on an independent copy so the column assignments below never write into a
# slice of another frame.
test_pred_df = test_pred_df.copy()

# Scale the probability to an integer score in 0..1000 (truncating).
test_pred_df['y_test_prob_score_scaled'] = (test_pred_df['y_test_prob_score']*1000).astype(int)

# Highest scores first.
test_pred_df = test_pred_df.sort_values('y_test_prob_score_scaled', ascending=False).reset_index(drop=True)

# Bins of width 10, closed on the left and open on the right. The edges run up to 1010
# so that a score of exactly 1000 (probability 1.0) lands in '[1000-1010)'; with a last
# edge of 1000 such rows fell outside every bin and were dropped from the scorecard.
bins = list(range(0, 1011, 10))
labels = [f'[{b}-{b+10})' for b in bins[:-1]]
test_pred_df['bin'] = pd.cut(test_pred_df['y_test_prob_score_scaled'], bins=bins, right=False, labels=labels, include_lowest=True)

# Normal / fraud counts per bin (only bins that contain rows).
grp = test_pred_df.groupby('bin', observed=True).agg(
    normal_cnt = ('y_true', lambda x: (x==0).sum()),
    fraud_cnt = ('y_true', lambda x: (x==1).sum()),
    count = ('y_true', 'size'),
    max_y_test_prob_score = ('y_test_prob_score_scaled', 'max')
).reset_index()

# Cumulative figures must run from the highest score downwards, so order the bins by
# their maximum score, descending.
grp = grp.sort_values('max_y_test_prob_score', ascending=False).reset_index(drop=True)

# Cumulative counts.
grp['cum_fraud'] = grp['fraud_cnt'].cumsum() # A: frauds captured so far
grp['cum_total'] = grp['count'].cumsum()    # B: transactions reviewed so far
total_fraud = test_pred_df['y_true'].sum()

In [None]:
# Cumulative precision and recall as ratios, walking from the highest-score bin down.
P = grp["cum_fraud"] / grp["cum_total"]
R = grp["cum_fraud"] / total_fraud

# Percent versions, rounded to three decimals.
grp['cum_precision_pct'] = (P * 100).round(3)
grp['cum_recall_pct'] = (R * 100).round(3)

# Cumulative F1 in percent. A zero denominator is swapped for NaN so the division
# yields NaN, which is then reported as 0.
f1_denominator = (P + R).replace(0, np.nan)
grp["cum_f1_pct"] = (2 * P * R / f1_denominator * 100).fillna(0).round(3)

# Columns shown in the final report.
report_columns = [
    "bin", "normal_cnt", "fraud_cnt",
    "cum_fraud", "cum_total",
    "cum_precision_pct", "cum_recall_pct", "cum_f1_pct",
]
report = grp[report_columns]

In [None]:
# Display the final scorecard report (one row per non-empty score bin, highest scores first).
report

# Feature Importance

In [None]:
# Feature importance from the CatBoost model.
from catboost import Pool

# Importances are computed on the (resampled, scaled) training data wrapped in a Pool.
train_pool = Pool(X_train_res_scaled, y_train_res)
importances = CB_clf.get_feature_importance(train_pool)

# Pair every importance with its feature name and keep the ten largest.
importance_table = pd.DataFrame({
    "feature": X_train.columns,
    "importance": importances,
})
feature_importance = importance_table.sort_values(by="importance", ascending=False).head(10)

print(feature_importance)

In [None]:
import matplotlib.pyplot as plt
from matplotlib import rc

# Korean-capable font (Windows: Malgun Gothic); keep the minus sign renderable.
rc('font', family='Malgun Gothic')
plt.rcParams['axes.unicode_minus'] = False

# Horizontal bar chart of the top-10 CatBoost importances.
fig, ax = plt.subplots()
bars = ax.barh(
    feature_importance['feature'],
    feature_importance['importance'],
    height=0.5,
    color='lightcoral',
)

# Value label at the end of each bar, three decimals.
ax.bar_label(bars, fmt='%.3f', label_type="edge", padding=1)

# Axis label and title.
ax.set_xlabel("Top 10 피처 중요도 by CatBoost")
ax.set_title("피처 중요도 by CatBoost")
plt.show()

In [None]:
# LightGBM feature importances as a table, largest first.
lgbm_importance_table = pd.DataFrame({
    'feature': X_train.columns,
    'importance': LGBM_clf.feature_importances_,
})
df_importance = lgbm_importance_table.sort_values(by='importance', ascending=False)

# print(df_importance)

# Horizontal bar chart of the ten most important features.
top10 = df_importance.head(10)
fig, ax = plt.subplots()
bars = ax.barh(top10['feature'], top10['importance'], height=0.5, color='lightcoral')
ax.bar_label(bars, fmt="%.f", label_type="edge", padding=1)
ax.set_xlabel("Top 10 피처 중요도 by LightGBM")
plt.show()