In [None]:
import pandas as pd
import numpy as np
import dask.dataframe as dd

In [None]:
# Open the transactions CSV as a Dask DataFrame and preview its first rows.
transactions_csv = "../datasets/HI-Small_Trans.csv"
dask_df = dd.read_csv(transactions_csv)
dask_df.head()

# Proportion of laundering and non-laundering transactions

In [None]:
# Count both classes with a single pass over the data: each separate
# .compute() call re-executes the task graph (including the CSV read).
class_counts = dask_df['Is Laundering'].value_counts().compute()
not_laundering = int(class_counts.get(0, 0))
laundering = int(class_counts.get(1, 0))
print(f"Total transactions: {not_laundering + laundering}")
print(f"Not laundering transactions: {not_laundering}")
print(f"Laundering transactions: {laundering}")

# Display payment format in relation to laundering transactions

In [None]:
import matplotlib.pyplot as plt

# Count the transactions for every ("Payment Format", "Is Laundering") combination.
count_values = dask_df.groupby(['Payment Format', 'Is Laundering']).size().compute()

# Pivot the laundering flag into columns (0 / 1). Combinations that never occur
# are reported as 0 instead of NaN so the table stays integer-valued.
count_values_payment = count_values.unstack(fill_value=0)

print(count_values_payment)

# Draw the same grouped bar chart twice: linear y-axis on the left, logarithmic
# on the right (rare laundering counts are invisible on the linear scale).
fig, axs = plt.subplots(1, 2, figsize=(15, 6))
bar_width = 0.35
bar_positions = range(len(count_values_payment.index))
shifted_positions = [p + bar_width for p in bar_positions]
panels = [('linear', 'Bar chart in arithmetic scale'),
          ('log', 'Bar chart in logarithmic scale')]
for ax, (y_scale, title) in zip(axs, panels):
    ax.bar(bar_positions, count_values_payment[0], bar_width, label='Is Laundering = 0')
    ax.bar(shifted_positions, count_values_payment[1], bar_width, label='Is Laundering = 1')
    ax.set_xticks(bar_positions)
    ax.set_xticklabels(count_values_payment.index, rotation='vertical')
    ax.set_xlabel('Payment Format')
    ax.set_ylabel('Number of corresponding values')
    ax.set_title(title)
    ax.legend()
    ax.set_yscale(y_scale)

# Show the chart
plt.show()

# Display payment currency in relation to laundering transactions

In [None]:
# Count the transactions for every ("Payment Currency", "Is Laundering") combination.
currency_counts = dask_df.groupby(['Payment Currency', 'Is Laundering']).size().compute()

# Pivot the laundering flag into columns (0 / 1). Combinations that never occur
# are reported as 0 instead of NaN so the table stays integer-valued.
count_values = currency_counts.unstack(fill_value=0)

print(count_values.sort_values(1, ascending=False))

# Draw the same grouped bar chart twice: linear y-axis on the left, logarithmic
# on the right (rare laundering counts are invisible on the linear scale).
fig, axs = plt.subplots(1, 2, figsize=(15, 6))
bar_width = 0.35
bar_positions = range(len(count_values.index))
shifted_positions = [p + bar_width for p in bar_positions]
panels = [('linear', 'Bar chart in arithmetic scale'),
          ('log', 'Bar chart in logarithmic scale')]
for ax, (y_scale, title) in zip(axs, panels):
    ax.bar(bar_positions, count_values[0], bar_width, label='Is Laundering = 0')
    ax.bar(shifted_positions, count_values[1], bar_width, label='Is Laundering = 1')
    ax.set_xticks(bar_positions)
    ax.set_xticklabels(count_values.index, rotation='vertical')
    ax.set_xlabel('Payment Currency')
    ax.set_ylabel('Number of corresponding values')
    ax.set_title(title)
    ax.legend()
    ax.set_yscale(y_scale)

# Show the chart
plt.show()

# Display top 10 accounts for fraudulent transactions

In [None]:
# Laundering-flagged transactions per account, once as sender ('Account') and
# once as receiver ('Account.1').
account_df = dask_df.groupby('Account')['Is Laundering'].sum().reset_index()
account1_df = (
    dask_df.groupby('Account.1')['Is Laundering'].sum().reset_index()
    .rename(columns={'Account.1': 'Account'})
)

# Add up the two roles. Stacking the per-role totals and grouping again keeps
# accounts that appear on only one side of a transaction; an inner merge on the
# account id would silently drop them.
merged_df = (
    dd.concat([account_df, account1_df])
    .groupby('Account')['Is Laundering']
    .sum()
    .reset_index()
)

# Keep only the needed columns, highest totals first.
merged_df = merged_df[['Account', 'Is Laundering']]
merged_df = merged_df.sort_values('Is Laundering', ascending=False)
merged_df.head(10)

# Label Encoding values

In [None]:
# Parse the 'Timestamp' strings into datetime objects.
parsed_timestamp = dd.to_datetime(dask_df['Timestamp'], format='%Y/%m/%d %H:%M')
# Convert the datetimes to floats: the int64 view divided by 1e9
# (presumably nanoseconds -> seconds since the Unix epoch; TODO confirm).
timestamp_seconds = parsed_timestamp.astype('int64') / 10**9

# Build the pandas frame from a derived Dask frame instead of overwriting
# dask_df['Timestamp'] in place: the cell can then be re-run safely, because
# the source column still holds the original strings.
df = dask_df.assign(Timestamp=timestamp_seconds).compute()


In [None]:
from sklearn.preprocessing import LabelEncoder

# One encoder per categorical domain, so every mapping stays available for a
# later inverse_transform (a single shared encoder forgets the previous
# mapping each time it is refit).

# Sender and receiver accounts share one id space.
account_encoder = LabelEncoder()
account_encoder.fit(pd.concat([df['Account'], df['Account.1']]).unique())
df[['Account', 'Account.1']] = df[['Account', 'Account.1']].apply(account_encoder.transform)

# Receiving and payment currencies share one id space.
currency_encoder = LabelEncoder()
currency_encoder.fit(pd.concat([df['Receiving Currency'], df['Payment Currency']]).unique())
df[['Receiving Currency', 'Payment Currency']] = (
    df[['Receiving Currency', 'Payment Currency']].apply(currency_encoder.transform)
)

# `le` ends up fitted on 'Payment Format', exactly as before; later cells use it.
le = LabelEncoder()
df[['Payment Format']] = df[['Payment Format']].apply(le.fit_transform)

In [None]:
# Preview the encoded frame (DataFrame.head returns the first 5 rows by default).
df.head()

# Display correlation matrices for fraudulent and non-fraudulent transactions

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Inside each subset 'Is Laundering' is constant, so its correlation with every
# other column is undefined (NaN); leave it out of the matrices.
feature_columns = df.drop(columns='Is Laundering')
cm_laundering = feature_columns[df['Is Laundering'] == 1].corr()
cm_not_laundering = feature_columns[df['Is Laundering'] == 0].corr()

# One heatmap per class, side by side. vmin/vmax pin both panels to the full
# correlation range so that equal colours mean equal values in both plots.
fig, axs = plt.subplots(ncols=2, figsize=(20, 5))
panels = [(cm_laundering, 'Is Laundering = 1'),
          (cm_not_laundering, 'Is Laundering = 0')]
for ax, (corr_matrix, title) in zip(axs, panels):
    sns.heatmap(corr_matrix, cmap='coolwarm', annot=False, vmin=-1, vmax=1, ax=ax)
    ax.set_title(title)

# display the plot
plt.show()


In [None]:
# For each class, compare how many transactions have equal (or different)
# received/paid amounts with how many have equal (or different) currencies.
# The masks are computed once and combined, instead of materialising a
# filtered copy of the frame for every count.
amount_matches = df['Amount Received'] == df['Amount Paid']
currency_matches = df['Receiving Currency'] == df['Payment Currency']

report_lines = []
for heading, flag in (("Laundering:", 1), ("Not Laundering:", 0)):
    in_class = df['Is Laundering'] == flag
    report_lines.append(heading)
    for word, amount_mask, currency_mask in (
        ("Same", amount_matches, currency_matches),
        ("Different", ~amount_matches, ~currency_matches),
    ):
        count_amount = int((amount_mask & in_class).sum())
        count_currency = int((currency_mask & in_class).sum())
        report_lines.append(f"    {word} amount: {count_amount}")
        report_lines.append(f"    {word} currency: {count_currency}")
        # The trailing newline leaves a blank line between stanzas.
        report_lines.append(f"    Difference: {np.abs(count_amount - count_currency)}\n")

# The last stanza is not followed by a blank line.
print("\n".join(report_lines).rstrip("\n"))

# More correlation among Accounts

Find, for each Account, how many distinct Account.1 recipients received both laundering and non-laundering transactions from it.

In [None]:
# Number of distinct 'Is Laundering' values seen for every (sender, receiver) pair.
df_temp = df[['Account', 'Account.1', 'Is Laundering']]
df_grouped = (
    df_temp
    .groupby(['Account', 'Account.1'])['Is Laundering']
    .nunique()
    .reset_index()
)

# Pairs with more than one distinct value carried both flagged and unflagged
# transactions. Count such pairs per sender and show the ten largest counts.
mixed_pairs = df_grouped[df_grouped['Is Laundering'] > 1]
(
    mixed_pairs
    .groupby('Account')
    .count()
    .reset_index()
    .rename(columns={'Is Laundering': 'times'})
    .drop('Account.1', axis=1)
    .sort_values(by='times', ascending=False)
    .head(10)
)


Find all transactions that are sent from A to B with a certain value and then from B to C with the same value.

For this it is necessary to use something like graphframes

In [None]:
import os
import sys

# Extra directory to put on the import path (a local "jars" folder).
# It can be overridden with the GRAPHFRAMES_JARS_DIR environment variable so
# the notebook is not tied to one machine; the default is the previous path.
JARS_DIR = os.environ.get("GRAPHFRAMES_JARS_DIR", "/Users/fabio/jars")

# Guarded so that re-running the cell does not add duplicate entries.
if JARS_DIR not in sys.path:
    sys.path.append(JARS_DIR)

In [None]:
from graphframes import GraphFrame
from pyspark import SparkContext
from pyspark.sql import SparkSession
# Explicit imports instead of `import *`: the star import of
# pyspark.sql.functions rebinds Python builtins such as sum, max, min, abs and
# round to Spark column functions for every cell that runs afterwards.
from pyspark.sql.functions import col
from pyspark.sql.types import FloatType, IntegerType, StructField, StructType

# Memory settings handed to the Spark session builder.
spark_driver_memory = "10g"
spark_executor_memory = "6g"

# Local session using all available cores; an already running session is reused.
spark = (
    SparkSession.builder
    .config("spark.driver.memory", spark_driver_memory)
    .config("spark.executor.memory", spark_executor_memory)
    .master("local[*]")
    .getOrCreate()
)
print("Spark session created")
sc = spark.sparkContext
print("Spark context created")

In [None]:
# Spark schema for the encoded transaction table: (column name, Spark type)
# pairs, every field nullable.
# NOTE(review): no visible cell passes `schema` to a reader, and the names
# 'from_account' / 'to_account' differ from the 'account1' / 'account2'
# columns written to parquet below - confirm whether it is still needed.
_schema_columns = [
    ('timestamp', FloatType),
    ('from_bank', IntegerType),
    ('from_account', IntegerType),
    ('to_bank', IntegerType),
    ('to_account', IntegerType),
    ('amount_received', FloatType),
    ('receiving_currency', IntegerType),
    ('amount_paid', FloatType),
    ('payment_currency', IntegerType),
    ('payment_format', IntegerType),
    ('is_laundering', IntegerType),
]
schema = StructType(
    [StructField(column_name, spark_type(), True) for column_name, spark_type in _schema_columns]
)


In [None]:

# Switch to snake_case column names before handing the data to Spark.
COLUMN_RENAMES = {
    'Timestamp': 'timestamp',
    'From Bank': 'from_bank',
    'Account': 'account1',
    'To Bank': 'to_bank',
    'Account.1': 'account2',
    'Amount Received': 'amount_received',
    'Receiving Currency': 'receiving_currency',
    'Amount Paid': 'amount_paid',
    'Payment Currency': 'payment_currency',
    'Payment Format': 'payment_format',
    'Is Laundering': 'is_laundering',
}
df = df.rename(columns=COLUMN_RENAMES)

# index=False: do not serialise the pandas index. It is presumably just the
# row numbering inherited from the CSV partitions (TODO confirm) and would
# otherwise surface in Spark as an extra, meaningless column.
df.to_parquet('df.parquet', index=False)

In [None]:
# Load the parquet file into a Spark DataFrame and print its first five rows.
parquet_path = 'df.parquet'
spark_df = spark.read.parquet(parquet_path)
spark_df.show(n=5)

In [None]:
# Columns are taken from spark_df directly rather than through the global
# `col` helper: a later plotting cell rebinds the name `col` to an integer,
# which would break this cell when it is re-run afterwards.

# Vertices: every account id that appears as sender or as receiver.
sender_ids = spark_df.select(spark_df["account1"].alias("id"))
receiver_ids = spark_df.select(spark_df["account2"].alias("id"))
verteces = sender_ids.union(receiver_ids).distinct()

# Edges: one per transaction, sender -> receiver, with the paid amount and the
# attributes used by the motif queries.
edges = spark_df.select(
    spark_df["account1"].alias("src"),
    spark_df["account2"].alias("dst"),
    spark_df["amount_paid"].alias("amount"),
    spark_df["timestamp"],
    spark_df["payment_format"],
    spark_df["is_laundering"],
)
g = GraphFrame(verteces, edges)

In [None]:
# Two-hop motif a -> b -> c: keep the matches where the endpoints of each hop
# differ, both hops carry the same amount, and the first hop has the earlier
# timestamp.
two_hop_motif = "(a)-[c1]->(b); (b)-[c2]->(c) "
same_amount_forwarded = """
                                              a != b and
                                              b != c and

                                              c1.amount == c2.amount and
                                              c1.timestamp < c2.timestamp
                                            """
pattern = g.find(two_hop_motif).filter(same_amount_forwarded)
pattern.show(5, truncate=False)


In [None]:
# Bring the matched (c1, c2) edge pairs to the driver as an integer array of
# shape (n_matches, 2, n_edge_fields); dtype=int truncates the float fields.
edge_pairs = pattern.select('c1', 'c2').collect()
n_edge_fields = len(pattern.schema['c1'].dataType.fields)
# An explicit reshape instead of squeeze(): squeeze() would drop the leading
# axis when there is exactly one match, and an empty result would be 1-D;
# either case breaks the per-match iteration that follows.
array_features = np.array(edge_pairs, dtype=int).reshape(-1, 2, n_edge_fields)

In [None]:
from collections import defaultdict, Counter

# Group the matches by the pair (field 4 of the first edge, field 4 of the
# second edge) and collect, for each group, the pairs of field 5 values.
# Presumably field 4 is payment_format and field 5 is is_laundering, following
# the column order of the edges frame - verify against the graph-building cell.
dictionary = defaultdict(list)
for first_hop, second_hop in array_features:
    group_key = (first_hop[4], second_hop[4])
    flag_pair = (first_hop[5], second_hop[5])
    dictionary[group_key].append(flag_pair)

In [None]:
# Refit `le` on the payment-format names so the integer codes can be mapped
# back to names.
le.fit(list(count_values_payment.reset_index()['Payment Format']))

# For each (first-hop format, second-hop format) pair build a 2x2 count matrix:
# row = flag of the first hop, column = flag of the second hop.
data = {}
axis_labels = {}
for (first_code, second_code), flag_pairs in dictionary.items():
    tally = Counter(flag_pairs)
    first_name, second_name = le.inverse_transform([first_code, second_code])
    string = f"{first_name}-{second_name}"
    data[string] = np.array([[tally[(0, 0)], tally[(0, 1)]], [tally[(1, 0)], tally[(1, 1)]]])
    # Names are stored separately so a '-' inside a name cannot break the labels.
    axis_labels[string] = (str(first_name), str(second_name))

# Size the grid from the number of pairs (a fixed 2x6 grid raises IndexError
# above 12 pairs): six panels per row, at least one row.
n_cols = 6
n_rows = -(-len(data) // n_cols) or 1  # ceiling division
fig, axs = plt.subplots(n_rows, n_cols, figsize=(15, 2.5 * n_rows), squeeze=False)

# Hide the panels that have no matrix to show.
for unused_ax in axs.flat[len(data):]:
    unused_ax.set_visible(False)

# Draw one matrix per panel and annotate every cell with its count.
for ax, (payment, counts) in zip(axs.flat, data.items()):
    first_label, second_label = axis_labels[payment]

    ax.imshow(counts, cmap='Greens')
    ax.set_xticks([0, 1])
    ax.set_yticks([0, 1])
    ax.set_xticklabels(['0', '1'])
    ax.set_yticklabels(['0', '1'])
    ax.set_xlabel(second_label)
    ax.set_ylabel(first_label)
    ax.xaxis.set_label_position('top')
    ax.yaxis.set_label_position('left')
    ax.xaxis.set_ticks_position('top')
    ax.yaxis.set_ticks_position('left')

    for r in range(2):
        for k in range(2):
            ax.annotate(str(counts[r, k]), xy=(k, r), ha='center', va='center', color='grey')


# Adjust the spacing between subplots
plt.subplots_adjust(wspace=0.8, hspace=0.5)

# Show the plot
plt.show()

In [None]:
# Fan-in motif: three senders a, c, d each pay b, and b pays e an amount equal
# to the sum of the three incoming amounts.
fan_in_motif = "(a)-[c1]->(b); (c)-[c2]->(b); (d)-[c3]->(b); (b)-[c4]->(e) "

# The three senders must be pairwise distinct. Without `d != a`, d could be
# bound to the same vertex as a, so c1 and c3 could match the same edge and
# its amount would be counted twice in the sum.
fan_in_filter = """ 
        a != b and
        c != a and
        d != c and
        d != a and
        b != e and

        (c1.amount + c2.amount + c3.amount) == c4.amount
"""
pattern = g.find(fan_in_motif).filter(fan_in_filter)
pattern.show(5, truncate=False)