In [20]:
import kaggle
import pandas as pd
import os

filepath = "data/Big_Black_Money_Dataset.csv"

In [21]:
    ### With Kaggle, we have to download the entire dataset first
kaggle.api.dataset_download_files('waqi786/global-black-money-transactions-dataset', path='data/', unzip=True)

Dataset URL: https://www.kaggle.com/datasets/waqi786/global-black-money-transactions-dataset


In [19]:
df = pd.read_csv(filepath)
df.head()

Unnamed: 0,Transaction ID,Country,Amount (USD),Transaction Type,Date of Transaction,Person Involved,Industry,Destination Country,Reported by Authority,Source of Money,Money Laundering Risk Score,Shell Companies Involved,Financial Institution,Tax Haven Country
0,TX0000000001,Brazil,3267530.0,Offshore Transfer,2013-01-01 00:00:00,Person_1101,Construction,USA,True,Illegal,6,1,Bank_40,Singapore
1,TX0000000002,China,4965767.0,Stocks Transfer,2013-01-01 01:00:00,Person_7484,Luxury Goods,South Africa,False,Illegal,9,0,Bank_461,Bahamas
2,TX0000000003,UK,94167.5,Stocks Transfer,2013-01-01 02:00:00,Person_3655,Construction,Switzerland,True,Illegal,1,3,Bank_387,Switzerland
3,TX0000000004,UAE,386420.1,Cash Withdrawal,2013-01-01 03:00:00,Person_3226,Oil & Gas,Russia,False,Illegal,7,2,Bank_353,Panama
4,TX0000000005,South Africa,643378.4,Cryptocurrency,2013-01-01 04:00:00,Person_7975,Real Estate,USA,True,Illegal,1,9,Bank_57,Luxembourg


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Transaction ID               10000 non-null  object 
 1   Country                      10000 non-null  object 
 2   Amount (USD)                 10000 non-null  float64
 3   Transaction Type             10000 non-null  object 
 4   Date of Transaction          10000 non-null  object 
 5   Person Involved              10000 non-null  object 
 6   Industry                     10000 non-null  object 
 7   Destination Country          10000 non-null  object 
 8   Reported by Authority        10000 non-null  bool   
 9   Source of Money              10000 non-null  object 
 10  Money Laundering Risk Score  10000 non-null  int64  
 11  Shell Companies Involved     10000 non-null  int64  
 12  Financial Institution        10000 non-null  object 
 13  Tax Haven Country

In [13]:
df = pd.read_csv(filepath, parse_dates=['Date of Transaction'])
df.head()

Unnamed: 0,Transaction ID,Country,Amount (USD),Transaction Type,Date of Transaction,Person Involved,Industry,Destination Country,Reported by Authority,Source of Money,Money Laundering Risk Score,Shell Companies Involved,Financial Institution,Tax Haven Country
0,TX0000000001,Brazil,3267530.0,Offshore Transfer,2013-01-01 00:00:00,Person_1101,Construction,USA,True,Illegal,6,1,Bank_40,Singapore
1,TX0000000002,China,4965767.0,Stocks Transfer,2013-01-01 01:00:00,Person_7484,Luxury Goods,South Africa,False,Illegal,9,0,Bank_461,Bahamas
2,TX0000000003,UK,94167.5,Stocks Transfer,2013-01-01 02:00:00,Person_3655,Construction,Switzerland,True,Illegal,1,3,Bank_387,Switzerland
3,TX0000000004,UAE,386420.1,Cash Withdrawal,2013-01-01 03:00:00,Person_3226,Oil & Gas,Russia,False,Illegal,7,2,Bank_353,Panama
4,TX0000000005,South Africa,643378.4,Cryptocurrency,2013-01-01 04:00:00,Person_7975,Real Estate,USA,True,Illegal,1,9,Bank_57,Luxembourg


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   Transaction ID               10000 non-null  object        
 1   Country                      10000 non-null  object        
 2   Amount (USD)                 10000 non-null  float64       
 3   Transaction Type             10000 non-null  object        
 4   Date of Transaction          10000 non-null  datetime64[ns]
 5   Person Involved              10000 non-null  object        
 6   Industry                     10000 non-null  object        
 7   Destination Country          10000 non-null  object        
 8   Reported by Authority        10000 non-null  bool          
 9   Source of Money              10000 non-null  object        
 10  Money Laundering Risk Score  10000 non-null  int64         
 11  Shell Companies Involved     10000 non-nul

In [5]:
df.describe()

Unnamed: 0,Amount (USD),Date of Transaction,Money Laundering Risk Score,Shell Companies Involved
count,10000.0,10000,10000.0,10000.0
mean,2501818.0,2013-07-28 07:30:00,5.5264,4.4694
min,10031.8,2013-01-01 00:00:00,1.0,0.0
25%,1279005.0,2013-04-15 03:45:00,3.0,2.0
50%,2501310.0,2013-07-28 07:30:00,6.0,4.0
75%,3722416.0,2013-11-09 11:15:00,8.0,7.0
max,4999812.0,2014-02-21 15:00:00,10.0,9.0
std,1424364.0,,2.893603,2.879773


In [15]:
def load_and_clean_data(filepath):
    df = pd.read_csv(filepath, parse_dates=['Date of Transaction'])

    # Drop rows with missing critical values
    df = df.dropna(subset=['Amount (USD)', 'Country', 'Source of Money'])

    # Remove transaction ID (non-informative)
    df = df.drop(columns=['Transaction ID'])

    # Encode date into features (optional)
    df['Transaction Month'] = df['Date of Transaction'].dt.month
    df['Transaction Day'] = df['Date of Transaction'].dt.day
    df['Transaction Weekday'] = df['Date of Transaction'].dt.weekday
    df = df.drop(columns=['Date of Transaction'])

    # Encode categorical variables
    cat_cols = [
        'Country', 'Transaction Type', 'Person Involved', 'Industry',
        'Destination Country', 'Reported by Authority', 'Financial Institution',
        'Tax Haven Country'
    ]
    for col in cat_cols:
        df[col] = df[col].astype('category').cat.codes

    # Define features and target
    X = df.drop(columns=['Source of Money'])
    y = df['Source of Money'].apply(lambda x: 1 if x == 'Illegal' else 0)

    return X, y


In [16]:
df = load_and_clean_data(filepath)
df.head()

Unnamed: 0,Country,Amount (USD),Transaction Type,Person Involved,Industry,Destination Country,Reported by Authority,Source of Money,Money Laundering Risk Score,Shell Companies Involved,Financial Institution,Tax Haven Country,Transaction Month,Transaction Day,Transaction Weekday
0,0,3267530.0,2,69,2,9,1,Illegal,6,1,334,4,1,1,1
1,1,4965767.0,4,4569,4,5,0,Illegal,9,0,402,0,1,1,1
2,8,94167.5,4,1858,2,6,1,Illegal,1,3,319,5,1,1,1
3,7,386420.1,0,1546,5,3,0,Illegal,7,2,282,3,1,1,1
4,5,643378.4,1,4909,6,9,1,Illegal,1,9,452,2,1,1,1


In [14]:
df['Source of Money'].unique()

array(['Illegal', 'Legal'], dtype=object)