In [8]:
import numpy as np
import pandas as pd
from faker import Faker
from scipy.stats import skewnorm, lognorm

fake = Faker()

# 설정: 통계 정보
num_samples = 5078345

bank_ids_min = 1
bank_ids_max = 356303

# 실제 통계 정보 설정
amount_received_mean = 5988726.070072798
amount_received_std = 1037183108.8919889
amount_received_skewness = 649.3389172676475
amount_paid_mean = 4509273.3677414
amount_paid_std = 869772830.9198645
amount_paid_skewness = 858.747783057833

is_laundering_0 = 0.10194266045335636
is_laundering_1 = 99.89805733954664

# 1. 가상의 데이터를 생성하는 함수 작성
def generate_data(mean, std, n_samples, skewness=0):
    a = skewness
    # skew-normal distribution
    data = skewnorm.rvs(a, loc=mean, scale=std, size=n_samples)
    return np.clip(data, a_min=0, a_max=None)  # 클리핑 추가: 최소값을 0으로 설정

# 2. 데이터 생성
amount_received = generate_data(amount_received_mean, amount_received_std, num_samples, skewness=amount_received_skewness)
amount_paid = generate_data(amount_paid_mean, amount_paid_std, num_samples, skewness=amount_paid_skewness)

# 3. 기타 데이터 생성
bank_ids = list(range(bank_ids_min, bank_ids_max)) # sorted(df['From Bank'].unique())
from_banks = np.random.choice(bank_ids, size=num_samples)
to_banks = np.random.choice(bank_ids, size=num_samples)
accounts_from = [fake.iban() for _ in range(num_samples)]
accounts_to = [fake.iban() for _ in range(num_samples)]
currencies =  ['Australian Dollar', 'Bitcoin', 'Brazil Real', 'Canadian Dollar', 'Euro', 'Mexican Peso', 'Ruble', 'Rupee', 'Saudi Riyal', 'Shekel', 'Swiss Franc', 'UK Pound', 'US Dollar', 'Yen', 'Yuan']
payment_formats =  ['ACH', 'Bitcoin', 'Cash', 'Cheque', 'Credit Card', 'Reinvestment', 'Wire']
receiving_currency = np.random.choice(currencies, size=num_samples)
payment_currency = np.random.choice(currencies, size=num_samples)
payment_format = np.random.choice(payment_formats, size=num_samples)
is_laundering = np.random.choice([0, 1], size=num_samples, p=[is_laundering_0/100, is_laundering_1/100])
timestamps = [fake.date_time_between(start_date="-1y", end_date="now").strftime("%Y/%m/%d %H:%M") for _ in range(num_samples)]

# 4. 데이터프레임 생성
df = pd.DataFrame({
    "Timestamp": timestamps,
    "From Bank": from_banks,
    "Account": accounts_from,
    "To Bank": to_banks,
    "Account.1": accounts_to,
    "Amount Received": amount_received,
    "Receiving Currency": receiving_currency,
    "Amount Paid": amount_paid,
    "Payment Currency": payment_currency,
    "Payment Format": payment_format,
    "Is Laundering": is_laundering
})

# 5. 로그 변환
log_cols = ['Amount Received', 'Amount Paid']
for col in log_cols:
    df[col] = np.log1p(df[col])

# 최종 데이터 확인
print(df.head())

# 6. CSV 파일로 저장
df.to_csv("fake_transaction_data.csv", index=False)
print("CSV 파일이 성공적으로 저장되었습니다.")

          Timestamp  From Bank                 Account  To Bank  \
0  2024/07/12 06:11      24269  GB03YBIQ82824322188924    10366   
1  2024/03/15 05:25        743  GB48ZOUV28576466349095    23835   
2  2024/06/09 23:52      15642  GB22NVCF08985447995301     7345   
3  2024/09/16 14:11      27506  GB30VVMT16842308744570    10258   
4  2024/09/29 21:13      20197  GB29TTNF71713666901395    11059   

                Account.1  Amount Received Receiving Currency  Amount Paid  \
0  GB37DKAP36982161906437        19.247729            Bitcoin    20.027653   
1  GB89DJTU18617873730908        20.338451  Australian Dollar    20.592196   
2  GB76PVRT65851614655155        20.820667       Mexican Peso    17.674758   
3  GB25LIMJ04478098877952        20.487611           UK Pound    18.765700   
4  GB73XXIF46675723677295        20.850379           UK Pound    20.762284   

  Payment Currency Payment Format  Is Laundering  
0             Euro    Credit Card              1  
1     Mexican Peso        

In [10]:
import os
import pandas as pd

pd.set_option('display.max_columns', None)
path = './fake_transaction_data.csv'
df = pd.read_csv(path)

df.shape

(5078345, 11)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5078345 entries, 0 to 5078344
Data columns (total 11 columns):
 #   Column              Dtype  
---  ------              -----  
 0   Timestamp           object 
 1   From Bank           int64  
 2   Account             object 
 3   To Bank             int64  
 4   Account.1           object 
 5   Amount Received     float64
 6   Receiving Currency  object 
 7   Amount Paid         float64
 8   Payment Currency    object 
 9   Payment Format      object 
 10  Is Laundering       int64  
dtypes: float64(2), int64(3), object(6)
memory usage: 426.2+ MB


In [12]:
df.describe()

Unnamed: 0,From Bank,To Bank,Amount Received,Amount Paid,Is Laundering
count,5078345.0,5078345.0,5078345.0,5078345.0,5078345.0
mean,14717.47,14717.16,20.15332,19.97464,0.9989701
std,8499.459,8500.177,1.045762,1.049467,0.03207495
min,1.0,1.0,12.72497,14.41705,0.0
25%,7356.0,7352.0,19.63439,19.45592,1.0
50%,14717.0,14721.0,20.37496,20.19751,1.0
75%,22080.0,22078.0,20.9052,20.72796,1.0
max,29434.0,29434.0,22.37807,22.1899,1.0


In [13]:
df.head()

Unnamed: 0,Timestamp,From Bank,Account,To Bank,Account.1,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,Is Laundering
0,2024/07/12 06:11,24269,GB03YBIQ82824322188924,10366,GB37DKAP36982161906437,19.247729,Bitcoin,20.027653,Euro,Credit Card,1
1,2024/03/15 05:25,743,GB48ZOUV28576466349095,23835,GB89DJTU18617873730908,20.338451,Australian Dollar,20.592196,Mexican Peso,Wire,1
2,2024/06/09 23:52,15642,GB22NVCF08985447995301,7345,GB76PVRT65851614655155,20.820667,Mexican Peso,17.674758,Saudi Riyal,Bitcoin,1
3,2024/09/16 14:11,27506,GB30VVMT16842308744570,10258,GB25LIMJ04478098877952,20.487611,UK Pound,18.7657,US Dollar,Cheque,1
4,2024/09/29 21:13,20197,GB29TTNF71713666901395,11059,GB73XXIF46675723677295,20.850379,UK Pound,20.762284,UK Pound,Bitcoin,1
