In [1]:
import pandas as pd
import numpy as np
from scipy.stats import chisquare
from scipy.spatial import distance
from scipy.spatial.distance import euclidean
from scipy.stats import pearsonr
from collections import Counter
from google.colab import drive
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from joblib import dump
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import confusion_matrix
from joblib import load

In [2]:
# Menghubungkan ke Google Drive
drive.mount('/content/drive')

# Menentukan path file CSV
file_path = '/content/drive/MyDrive/PPATK/aml_data.csv'

Mounted at /content/drive


In [3]:
# Membaca file CSV ke dalam DataFrame
data = pd.read_csv(file_path)

In [4]:
time = 'Timestamp'
sender = 'Account'
amount = 'Amount Paid'

In [5]:
data = data[[time, sender, amount, 'Is Laundering']]

In [6]:
# Mengubah format Timestamp menjadi tipe data datetime
data[time] = pd.to_datetime(data[time]).dt.date

In [7]:
aggregated_data = data.groupby([sender, time]).agg({
    amount: 'sum',
    'Is Laundering': 'max'
}).reset_index()

In [8]:
# Step 2: Extract 'transaction' count by 'Account' and add it to aggregated data
transaction_count = data.groupby([sender, time]).size().reset_index(name='transaction')
aggregated_data = pd.merge(aggregated_data, transaction_count, on=['Account', time])

In [9]:
def calculate_digit_distribution(value):
    first_digit = str(value)[0]
    return int(first_digit)

data['First Digit'] = data[amount].apply(calculate_digit_distribution)
digit_distribution = data.groupby([sender, time, 'First Digit']).size().unstack(fill_value=0)
digit_distribution = digit_distribution.div(digit_distribution.sum(axis=1), axis=0)

In [10]:
# Add the digit distributions as new variables in aggregated data
for digit in range(1, 10):
    variable_name = f'{digit}_dist'
    aggregated_data[variable_name] = digit_distribution.loc[:, digit].values

In [None]:
# Step 4: Extract 'limit' variable based on 'Amount Paid' threshold
aggregated_data['limit'] = (aggregated_data[amount] > 33500).astype(int)

In [11]:
# 5. Calculate the Euclidean distance between the first digit distributions and Benford's Law
benfords_law  = [0.301, 0.176, 0.125, 0.097, 0.079, 0.067, 0.058, 0.051, 0.046]

# 5. Extract Euclidean distance from first digit distribution to Benford's Law
for digit in range(1, 10):
    dist_col = f'{digit}_dist'
    benford_prob = benfords_law[digit - 1]
    aggregated_data[f'{digit}_ed'] = np.sqrt((aggregated_data[dist_col] - benford_prob) ** 2)

In [12]:
# Step 6: Calculate standard deviation for each row
for digit in range(1, 10):
    aggregated_data[f'{digit}_sd'] = np.abs(aggregated_data[f'{digit}_dist'].fillna(0) - benfords_law[digit - 1]) / np.sqrt(benfords_law[digit - 1] * (1 - benfords_law[digit - 1]))

In [15]:
# Select input and output variables
input_vars = [amount, 'transaction', '1_dist', '2_dist', '3_dist', '4_dist', '5_dist', '6_dist',
              '7_dist', '8_dist', '9_dist', '1_ed', '2_ed', '3_ed', '4_ed', '5_ed', '6_ed', '7_ed', '8_ed', '9_ed',
              '1_sd', '2_sd', '3_sd', '4_sd', '5_sd', '6_sd', '7_sd', '8_sd', '9_sd']
output_var = 'Is Laundering'

In [16]:
X = aggregated_data[input_vars]
y = aggregated_data[output_var]

# LEWATI INI

In [None]:
# Perform aggregated_data balancing using SMOTE
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X, y)

In [None]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.2, random_state=42)

In [None]:
# Define the ensemble of models
logreg = LogisticRegression()
rf = RandomForestClassifier()
xgb = XGBClassifier()

In [None]:
ensemble = VotingClassifier(estimators=[('logreg', logreg), ('rf', rf), ('xgb', xgb)])

In [None]:
# Train the ensemble model
ensemble.fit(X_train, y_train)

In [None]:
# Make predictions on the test set
y_pred = ensemble.predict(X_test)

In [None]:
# Evaluate the ensemble model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

In [None]:
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

Accuracy: 0.9259
Precision: 0.8888
Recall: 0.9740
F1-score: 0.9295


In [None]:
save_path = '/content/drive/MyDrive/PPATK/ensemble_model.joblib'
dump(ensemble, save_path)

# NAH SINI AJA!

In [17]:
# Reduce the feature space using SelectKBest feature selection
selector = SelectKBest(chi2, k=10)  # Adjust the value of 'k' as needed
X_reduced = selector.fit_transform(X, y)

In [18]:
# Perform data balancing using SMOTE on the reduced feature space
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X_reduced, y)

In [19]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.2, random_state=42)

# LANJUTKEN

In [20]:
# Define the ensemble of models
logreg = LogisticRegression()
rf = RandomForestClassifier(n_jobs=-1)  # Utilize parallel processing with multiple CPU cores
xgb = XGBClassifier(n_jobs=-1)  # Utilize parallel processing with multiple CPU cores

In [21]:
ensemble = VotingClassifier(estimators=[('logreg', logreg), ('rf', rf), ('xgb', xgb)])

In [22]:
# Train the ensemble model
ensemble.fit(X_train, y_train)

In [23]:
# Make predictions on the test set
y_pred = ensemble.predict(X_test)

In [24]:
# Evaluate the ensemble model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

In [25]:
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

Accuracy: 0.9323
Precision: 0.8984
Recall: 0.9753
F1-score: 0.9353


In [26]:
# Generate the confusion matrix
cm = confusion_matrix(y_test, y_pred)

In [27]:
cm

array([[375071,  46825],
       [ 10461, 413825]])

In [28]:
save_path = '/content/drive/MyDrive/PPATK/ensemble_model_5.joblib'
dump(ensemble, save_path)

['/content/drive/MyDrive/PPATK/ensemble_model_5.joblib']

In [29]:
# Assuming you have the trained selector object named 'selector'
dump(selector, '/content/drive/MyDrive/PPATK/selectkbest_model_3.joblib')

['/content/drive/MyDrive/PPATK/selectkbest_model_3.joblib']

# DATA BARU INI

In [30]:
# Membaca file CSV ke dalam DataFrame
data_sampel = pd.read_csv('/content/drive/MyDrive/PPATK/sampel.csv')

In [31]:
selector_filename = '/content/drive/MyDrive/PPATK/selectkbest_model_3.joblib'
selector = load(selector_filename)

In [32]:
# Load the ensemble model from the file
model_filename = '/content/drive/MyDrive/PPATK/ensemble_model_5.joblib'
ensemble = load(model_filename)

In [33]:
time = 'Timestamp'
sender = 'Account'
amount = 'Amount Paid'

In [36]:
def preprocess_data(data, time_variable, account_variable, amount_variable):
    time = time_variable
    sender = account_variable
    amount = amount_variable
    
    # Mengubah format Timestamp menjadi tipe data datetime
    data[time] = pd.to_datetime(data[time]).dt.date
    
    # Aggregated data by 'Account' and day 'Timestamp' variables
    aggregated_data = data.groupby([sender, time]).agg({
        amount: 'sum',
        'Is Laundering': 'max'
    }).reset_index()
    
    # Extract 'transaction' count by 'Account' and add it to aggregated data
    transaction_count = data.groupby([sender, time]).size().reset_index(name='transaction')
    aggregated_data = pd.merge(aggregated_data, transaction_count, on=[sender, time])
    
    def calculate_digit_distribution(value):
        first_digit = str(value)[0]
        return int(first_digit)
    
    data['First Digit'] = data[amount].apply(calculate_digit_distribution)
    digit_distribution = data.groupby([sender, time, 'First Digit']).size().unstack(fill_value=0)
    digit_distribution = digit_distribution.div(digit_distribution.sum(axis=1), axis=0)
    
    # Add the digit distributions as new variables in aggregated data
    for digit in range(1, 10):
        variable_name = f'{digit}_dist'
        aggregated_data[variable_name] = digit_distribution.loc[:, digit].values
    
    # Calculate the Euclidean distance between the first digit distributions and Benford's Law
    benfords_law  = [0.301, 0.176, 0.125, 0.097, 0.079, 0.067, 0.058, 0.051, 0.046]
    
    # Extract Euclidean distance from first digit distribution to Benford's Law
    for digit in range(1, 10):
        dist_col = f'{digit}_dist'
        benford_prob = benfords_law[digit - 1]
        aggregated_data[f'{digit}_ed'] = np.sqrt((aggregated_data[dist_col] - benford_prob) ** 2)
    
    # Calculate standard deviation for each row
    for digit in range(1, 10):
        aggregated_data[f'{digit}_sd'] = np.abs(aggregated_data[f'{digit}_dist'].fillna(0) - benfords_law[digit - 1]) / np.sqrt(benfords_law[digit - 1] * (1 - benfords_law[digit - 1]))
    
    input_vars = [amount, 'transaction', '1_dist', '2_dist', '3_dist', '4_dist', '5_dist', '6_dist',
                  '7_dist', '8_dist', '9_dist', '1_ed', '2_ed', '3_ed', '4_ed', '5_ed', '6_ed', '7_ed', '8_ed', '9_ed',
                  '1_sd', '2_sd', '3_sd', '4_sd', '5_sd', '6_sd', '7_sd', '8_sd', '9_sd']
    X = aggregated_data[input_vars]
    
    # Reduce the feature space using SelectKBest feature selection
    X_reduced = selector.transform(X)
    
    return X_reduced, aggregated_data

In [37]:
preprocessed_data, aggregated_data = preprocess_data(data_sampel, time, sender, amount)

In [38]:
y_pred = ensemble.predict(preprocessed_data)

In [39]:
merged_result = pd.concat([aggregated_data, pd.DataFrame({'y_pred': y_pred})], axis=1)
filtered_result = merged_result[merged_result['y_pred'] == 1]

In [40]:
result = filtered_result.groupby(sender).agg({'Timestamp': list}).reset_index()
result.columns = ['Account', 'Tanggal Pencucian Uang']

In [None]:
result