In [1]:
import pandas as pd
import numpy as np

# Load data

In [2]:
# Read the raw transactions export into a DataFrame.
df = pd.read_csv('../dataset/HI-Small_Trans.csv')

# Normalise the headers: lower-case every name and swap spaces for underscores.
new_columns = df.columns.str.lower().str.replace(' ', '_', regex=False).tolist()
df.columns = new_columns

In [3]:
# Show the first five rows (head() defaults to n=5) to inspect the renamed columns.
df.head()

Unnamed: 0,timestamp,from_bank,account,to_bank,account.1,amount_received,receiving_currency,amount_paid,payment_currency,payment_format,is_laundering
0,2022/09/01 00:20,10,8000EBD30,10,8000EBD30,3697.34,US Dollar,3697.34,US Dollar,Reinvestment,0
1,2022/09/01 00:20,3208,8000F4580,1,8000F5340,0.01,US Dollar,0.01,US Dollar,Cheque,0
2,2022/09/01 00:00,3209,8000F4670,3209,8000F4670,14675.57,US Dollar,14675.57,US Dollar,Reinvestment,0
3,2022/09/01 00:02,12,8000F5030,12,8000F5030,2806.97,US Dollar,2806.97,US Dollar,Reinvestment,0
4,2022/09/01 00:06,10,8000F5200,10,8000F5200,36682.97,US Dollar,36682.97,US Dollar,Reinvestment,0


### Add possibly useful columns

In [4]:
# Each new boolean column records whether a pair of existing columns holds the
# same value on that row: new column name -> (left column, right column).
comparison_pairs = {
    'same_account': ('account', 'account.1'),
    'same_bank': ('from_bank', 'to_bank'),
    'same_currency': ('receiving_currency', 'payment_currency'),
    'same_amount': ('amount_received', 'amount_paid'),
}
for flag_name, (left_col, right_col) in comparison_pairs.items():
    df[flag_name] = df[left_col] == df[right_col]

### Remove useless columns

In [5]:
# Columns removed from the frame in place; the same_* flags derived from the
# bank/account columns were already computed in the previous cell.
dropped_columns = ['timestamp', 'from_bank', 'to_bank', 'account', 'account.1']
df.drop(columns=dropped_columns, inplace=True)

In [6]:
# Move is_laundering to the last position: pop() removes the column and
# assigning it back under the same key appends it at the end of the frame.
df["is_laundering"] = df.pop("is_laundering")

# Preview the first five rows with the new column order.
df.head()

Unnamed: 0,amount_received,receiving_currency,amount_paid,payment_currency,payment_format,same_account,same_bank,same_currency,same_amount,is_laundering
0,3697.34,US Dollar,3697.34,US Dollar,Reinvestment,True,True,True,True,0
1,0.01,US Dollar,0.01,US Dollar,Cheque,False,False,True,True,0
2,14675.57,US Dollar,14675.57,US Dollar,Reinvestment,True,True,True,True,0
3,2806.97,US Dollar,2806.97,US Dollar,Reinvestment,True,True,True,True,0
4,36682.97,US Dollar,36682.97,US Dollar,Reinvestment,True,True,True,True,0


### Label encoding 

In [7]:
from sklearn.preprocessing import LabelEncoder

# Both currency columns share ONE encoder, fitted on the union of their values,
# so a given currency maps to the same integer code in either column.
columns_to_encode = ['receiving_currency', 'payment_currency']
currency_encoder = LabelEncoder()
currency_encoder.fit(pd.unique(df[columns_to_encode].values.ravel()))
for column in columns_to_encode:
    df[column] = currency_encoder.transform(df[column])

# payment_format uses its own encoder. Refitting the currency encoder here
# would overwrite its classes_, leaving the currency codes impossible to
# decode later (e.g. with inverse_transform).
payment_format_encoder = LabelEncoder()
df['payment_format'] = payment_format_encoder.fit_transform(df['payment_format'])

# Backward-compatible alias: `encoder` used to finish this cell fitted on payment_format.
encoder = payment_format_encoder

# Cast the same_* flags and is_laundering to integers.
columns_to_convert = ['same_account', 'same_bank', 'same_currency', 'same_amount', 'is_laundering']
df[columns_to_convert] = df[columns_to_convert].astype(int)

df.head(5)

Unnamed: 0,amount_received,receiving_currency,amount_paid,payment_currency,payment_format,same_account,same_bank,same_currency,same_amount,is_laundering
0,3697.34,12,3697.34,12,5,1,1,1,1,0
1,0.01,12,0.01,12,3,0,0,1,1,0
2,14675.57,12,14675.57,12,5,1,1,1,1,0
3,2806.97,12,2806.97,12,5,1,1,1,1,0
4,36682.97,12,36682.97,12,5,1,1,1,1,0


### Save dataframe

In [8]:
from pathlib import Path

# Write the frame to CSV without the index column.
output_path = Path('./preprocessed_data/preprocessed_data_small.csv')
# to_csv does not create missing folders, so make sure the target directory
# exists first (no-op when it is already there).
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)