In [1]:
import pandas as pd
import numpy as np

In [None]:
# Read the source CSV into a DataFrame and preview its first rows.
source_csv = './Big_Black_Money_Dataset.csv'
df = pd.read_csv(source_csv)
df.head()

In [None]:
# Rebalance "Source of Money" to 10/90 illegal/legal by relabelling a random
# subset of the 'Illegal' rows as 'Legal'.
# The number of rows to relabel is derived from the data instead of being
# hard-coded, so the target share holds for any dataset size and sample()
# is never asked for more rows than exist (which raises ValueError).
# NOTE(review): no random seed is set in the visible cells, so the selected
# rows differ from run to run.
target_illegal_share = 0.10
is_illegal = df['Source of Money'] == 'Illegal'
n_illegal_to_keep = int(round(target_illegal_share * len(df)))
# Clamp at 0: if the data is already at or below the target share, nothing is relabelled.
n_to_relabel = max(int(is_illegal.sum()) - n_illegal_to_keep, 0)
idx = df[is_illegal].sample(n=n_to_relabel).index
df.loc[idx, 'Source of Money'] = 'Legal'


In [None]:
# Redistribute "Amount (USD)" so that it follows a rescaled lognormal distribution.

# Parameters passed to np.random.lognormal: these are the mean and standard
# deviation of the underlying normal distribution (i.e. of log(X)), not of X.
mean = 0
sigma = 1

# Draw exactly one value per row. Using len(df) instead of a fixed 10000 keeps
# the column assignment below from failing with a length mismatch when the
# dataset has a different number of rows.
num_samples = len(df)
lognormal_values = np.random.lognormal(mean, sigma, num_samples)

# Min-max scale the draws to the range [1, 5250000]. The smallest draw maps to
# the lower bound and the largest draw maps to (approximately) the upper bound.
amount_low = 1
amount_high = 5250000
draw_min = lognormal_values.min()
draw_max = lognormal_values.max()
scaled_values = amount_low + (lognormal_values - draw_min) * (amount_high - amount_low) / (draw_max - draw_min)

# Convert to integer (astype(int) truncates the fractional part).
scaled_values = scaled_values.astype(int)

# Reassign back to the dataset.
df['Amount (USD)'] = scaled_values

In [None]:
# Redistribute 'Money Laundering Risk Score' using truncated normal draws.
# Step 1: every row gets a score drawn around 5 and restricted to [1, 10].
# Step 2: rows whose "Source of Money" is 'Illegal' are re-drawn around 7 and
#         restricted to [3, 10], so illegal transactions have a higher risk
#         score on average than legal ones.


def _sample_truncated_normal(mean, std_dev, size, low, high):
    """Draw `size` values from a normal distribution restricted to [low, high].

    Rejection sampling: batches of `size` draws from N(mean, std_dev) are
    generated, values outside [low, high] are discarded, and batches keep being
    drawn until at least `size` values have been accepted.

    Parameters:
        mean: centre of the normal distribution.
        std_dev: standard deviation of the normal distribution.
        size: number of values to return.
        low: inclusive lower bound for accepted values.
        high: inclusive upper bound for accepted values.

    Returns:
        A list of exactly `size` accepted values (empty when `size` is 0).
    """
    accepted = []
    while len(accepted) < size:
        batch = np.random.normal(mean, std_dev, size)
        accepted.extend(batch[(batch >= low) & (batch <= high)])
    # The final batch may overshoot; keep only the first `size` values.
    return accepted[:size]


# Step 1: baseline score for every row.
df['Money Laundering Risk Score'] = _sample_truncated_normal(
    mean=5, std_dev=1, size=len(df), low=1, high=10
)

# Step 2: overwrite the score of illegal transactions with higher-centred draws.
idx = df[df['Source of Money'] == 'Illegal'].index
if len(idx):  # nothing to reassign when there are no 'Illegal' rows
    df.loc[idx, 'Money Laundering Risk Score'] = _sample_truncated_normal(
        mean=7, std_dev=1, size=len(idx), low=3, high=10
    )

In [None]:
# Increase the amount of illegal activity originating from the countries in
# `options` by reassigning 'Country' on a random subset of illegal transactions.
# NOTE(review): the previous comment here named UAE, Switzerland, and Russia,
# but the code assigns Brazil, UK, and UAE — confirm which list is intended.
illegal_rows = df[df['Source of Money'] == 'Illegal']
# Cap the sample size at the number of illegal rows available: sample() raises
# ValueError when asked for more rows than exist (without replacement).
count = min(200, len(illegal_rows))
idx = illegal_rows.sample(n=count).index

options = ['Brazil', 'UK', 'UAE']
# One country per selected row, drawn uniformly with replacement.
random_choices = np.random.choice(options, size=count, replace=True)

if count:  # nothing to reassign when there are no 'Illegal' rows
    df.loc[idx, 'Country'] = random_choices

In [None]:
# Write the modified dataset to a new CSV file.
# index=False keeps the DataFrame's row index out of the output; the dataset
# was loaded with read_csv without an index column, so the index is only an
# auto-generated row counter and would otherwise be written as an extra
# unnamed first column.
df.to_csv('Big_Black_Money_Dataset_V2.csv', index=False)