In [None]:
import pandas as pd
import numpy as np
import dask.dataframe as dd
import tqdm
from pyspark.sql.window import Window
from sklearn.preprocessing import LabelEncoder
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
from pyspark import SparkContext
from graphframes import GraphFrame
from pyspark.sql.types import *
import multiprocessing
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation
import seaborn as sns
import matplotlib.pyplot as plt
from pyspark.sql import functions as F

In [None]:
# Memory budgets passed to the Spark driver and executors.
spark_driver_memory = "8g"
spark_executor_memory = "4g"

# Create (or reuse) a session running on the local master with all cores ("local[*]").
spark = (
    SparkSession.builder
    .config("spark.driver.memory", spark_driver_memory)
    .config("spark.executor.memory", spark_executor_memory)
    .master("local[*]")
    .getOrCreate()
)
print("Spark session created")

sc = spark.sparkContext
print("Spark context created")

In [None]:
# Column name and Spark type of every CSV field, in file order.
transaction_columns = [
    ('timestamp', StringType()),
    ('from_bank', IntegerType()),
    ('from_account', StringType()),
    ('to_bank', IntegerType()),
    ('to_account', StringType()),
    ('amount_received', FloatType()),
    ('receiving_currency', StringType()),
    ('amount_paid', FloatType()),
    ('payment_currency', StringType()),
    ('payment_format', StringType()),
    ('is_laundering', IntegerType()),
]

# Every field is declared nullable.
schema = StructType([StructField(name, dtype, True) for name, dtype in transaction_columns])

# The file is read with the explicit schema and without treating the first line as a header.
spark_df = spark.read.csv("../dataset/HI-Small_Trans.csv", header=False, schema=schema)

In [None]:
# Attach a generated id to every row and keep only the rows whose id is above 0.
# NOTE(review): the CSV is read with header=False, so this presumably discards
# the header line that was loaded as the first data row — confirm.
spark_df = (
    spark_df
    .withColumn("index", monotonically_increasing_id())
    .filter(col('index') > 0)
)
spark_df.show(5)

# Proportion Laundering and not Laundering

In [None]:
total_count = spark_df.count()

# Row count and share of the whole dataset for each is_laundering value.
(
    spark_df
    .select('is_laundering')
    .groupBy('is_laundering')
    .agg(count('*').alias('count'))
    .withColumn("proportion", col('count') / total_count)
    .show(5, truncate=False)
)

# Display payment format in relation to laundering transaction

In [None]:


# Laundering ('1') and non-laundering ('0') totals per payment format, most laundering first.
(
    spark_df
    .select('payment_format', 'is_laundering')
    .groupBy('payment_format')
    .agg(
        sum(col('is_laundering').cast('int')).alias('1'),
        sum((1 - col('is_laundering')).cast('int')).alias('0'),
    )
    .orderBy('1', ascending=False)
    .show(truncate=False)
)

# Number of rows for every (payment_format, is_laundering) combination.
grouped_df = spark_df.groupBy("payment_format", "is_laundering").count()

# Bring the small aggregated result to pandas for plotting.
count_values = grouped_df.toPandas()

# Reshape to one row per payment format and one column per is_laundering value.
count_values_payment = count_values.pivot(index='payment_format', columns='is_laundering', values='count')

# Same grouped bar chart drawn twice: linear scale on the left, logarithmic on the right.
fig, axs = plt.subplots(1, 2, figsize=(15, 6))
bar_width = 0.35
bar_positions = range(len(count_values_payment.index))
shifted_positions = [p + bar_width for p in bar_positions]

for ax, chart_title in zip(axs, ('Bar chart in arithmetic scale', 'Bar chart in logarithmic scale')):
    ax.bar(bar_positions, count_values_payment[0], bar_width, label='Is Laundering = 0')
    ax.bar(shifted_positions, count_values_payment[1], bar_width, label='Is Laundering = 1')
    ax.set_xticks(bar_positions)
    ax.set_xticklabels(count_values_payment.index, rotation='vertical')
    ax.set_xlabel('Payment Format')
    ax.set_ylabel('Number of corresponding values')
    ax.set_title(chart_title)
    ax.legend()

# Only the right-hand panel uses the logarithmic y axis.
axs[1].set_yscale('log')

plt.show()


# Display payment currency in relation to laundering transaction

In [None]:
# Laundering ('1') and non-laundering ('0') totals per payment currency, most laundering first.
spark_df.select('payment_currency', 'is_laundering') \
    .groupBy('payment_currency') \
    .agg(
        sum(col('is_laundering').cast('int')).alias('1'),
        sum((1 - col('is_laundering')).cast('int')).alias('0')
    ).orderBy('1', ascending=False).show(truncate=False)

# Number of rows for every (payment_currency, is_laundering) combination.
grouped_df = spark_df.groupBy("payment_currency", "is_laundering").count()

# Bring the small aggregated result to pandas for plotting.
count_values = grouped_df.toPandas()

# Reshape to one row per currency and one column per is_laundering value.
count_values_currency = count_values.pivot(index='payment_currency', columns='is_laundering', values='count')

# Sort the currencies by their number of laundering transactions, descending.
count_values_currency = count_values_currency.sort_values(1, ascending=False)

# Same grouped bar chart drawn twice: linear scale on the left, logarithmic on the right.
fig, axs = plt.subplots(1, 2, figsize=(15, 6))
bar_width = 0.35
bar_positions = range(len(count_values_currency.index))
shifted_positions = [p + bar_width for p in bar_positions]

for ax, chart_title in zip(axs, ('Bar chart in arithmetic scale', 'Bar chart in logarithmic scale')):
    ax.bar(bar_positions, count_values_currency[0], bar_width, label='Is Laundering = 0')
    ax.bar(shifted_positions, count_values_currency[1], bar_width, label='Is Laundering = 1')
    ax.set_xticks(bar_positions)
    # The tick labels are set once, with the vertical rotation; the previous
    # second call without `rotation` was redundant.
    ax.set_xticklabels(count_values_currency.index, rotation='vertical')
    ax.set_xlabel('Payment Currency')
    ax.set_ylabel('Number of corresponding values')
    ax.set_title(chart_title)
    ax.legend()

# Only the right-hand panel uses the logarithmic y axis.
axs[1].set_yscale('log')

plt.show()

# Display the top 10 accounts by number of fraudulent transactions

In [None]:
# Sending accounts ranked by how many laundering transactions they issued.
(
    spark_df
    .select(col('from_account').alias('account'), col('is_laundering'))
    .filter(col('is_laundering') == 1)
    .groupBy('account')
    .agg(count('*').alias('count_laundering'))
    .orderBy('count_laundering', ascending=False)
    .show(10)
)

# Display top 20 accounts for transactions

In [None]:
# Sending accounts ranked by their total number of transactions.
(
    spark_df
    .select(col('from_account').alias('account'))
    .groupBy('account')
    .agg(count('*').alias('count_transactions'))
    .orderBy('count_transactions', ascending=False)
    .show(20)
)

# Display the relationship between amount paid and laundering transactions

In [None]:
# Minimum, maximum and mean paid amount for each is_laundering value.
grouped_stats = spark_df.groupBy('is_laundering').agg(
    min(col('amount_paid')).alias('min'),
    max(col('amount_paid')).alias('max'),
    mean(col('amount_paid')).alias('mean')
)

# Show the statistics.
grouped_stats.show()


# Collect only the two plotted columns: calling toPandas() on the full
# DataFrame would materialize every column of every row in driver memory.
df_pd = spark_df.select('is_laundering', 'amount_paid').toPandas()

# Logarithmic y axis with thousands separators and two decimals on the tick labels.
plt.yscale("log")
plt.gca().yaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: format(x, ',.2f')))

# Scatter plot of the paid amount against the laundering flag.
plt.scatter(df_pd['is_laundering'], df_pd['amount_paid'], alpha=0.5)
plt.title("Relationship between Is Laundering and Amount Paid")
plt.xlabel("Is Laundering")
plt.ylabel("Amount Paid")
plt.xticks([0, 1])
plt.grid(True)
plt.show()

# Analyze timestamp

In [None]:
# Parse the raw "yyyy/MM/dd HH:mm" string into a timestamp column.
spark_df = spark_df.withColumn("timestamp", to_timestamp("timestamp", "yyyy/MM/dd HH:mm"))

# Add one column per calendar component extracted from the timestamp.
timestamp_parts = [
    ("year", year),
    ("month", month),
    ("day", dayofmonth),
    ("hour", hour),
    ("minute", minute),
]
for part_name, extract_part in timestamp_parts:
    spark_df = spark_df.withColumn(part_name, extract_part("timestamp"))

spark_df.persist()
spark_df.show(5)

In [None]:
def laundering_for(col_name: str):
    """Print laundering statistics of the global `spark_df` grouped by `col_name`.

    For every distinct value of `col_name` the table shows the number of
    laundering rows ("count(1)"), the number of non-laundering rows
    ("count(0)") and their ratio as a Decimal(20,6), sorted by ratio descending.

    Args:
        col_name: name of the `spark_df` column to group by.
    """
    print(f"Laundering for {col_name}")

    # `sum` here is the Spark aggregate brought in by the pyspark.sql.functions star import.
    laundering_total = sum(col('is_laundering').cast('int')).alias('count(1)')
    clean_total = sum((1 - col('is_laundering')).cast('int')).alias('count(0)')
    ratio = (col('count(1)') / col('count(0)')).cast('Decimal(20,6)')

    per_value = (
        spark_df
        .select(col_name, 'is_laundering')
        .groupBy(col_name)
        .agg(laundering_total, clean_total)
        .withColumn("ratio", ratio)
        .orderBy(col('ratio').desc())
    )
    per_value.show(truncate=False)

In [None]:
# Laundering ratio for each calendar component, from coarsest to finest.
for time_component in ('year', 'month', 'day', 'hour', 'minute'):
    laundering_for(time_component)

# Count the number of transactions an account receives in different periods of time

In [None]:
# Number of transactions each account receives within the same calendar day,
# hour and minute. Every window also partitions by the coarser time components:
# partitioning by 'hour' (hour of day) or 'minute' (minute of hour) alone would
# merge transactions from unrelated days and hours into one count.
received_windows = {
    "transaction_received_per_day": ['year', 'month', 'day'],
    "transaction_received_per_hour": ['year', 'month', 'day', 'hour'],
    "transaction_received_per_minute": ['year', 'month', 'day', 'hour', 'minute'],
}
for feature_name, time_columns in received_windows.items():
    window = Window.partitionBy('to_account', *time_columns)
    spark_df = spark_df.withColumn(feature_name, count('*').over(window))


# Count the number of transactions an account sends in different periods of time

In [None]:
# Number of transactions each account sends within the same calendar day,
# hour and minute. Every window also partitions by the coarser time components:
# partitioning by 'hour' (hour of day) or 'minute' (minute of hour) alone would
# merge transactions from unrelated days and hours into one count.
sent_windows = {
    "transaction_send_per_day": ['year', 'month', 'day'],
    "transaction_send_per_hour": ['year', 'month', 'day', 'hour'],
    "transaction_send_per_minute": ['year', 'month', 'day', 'hour', 'minute'],
}
for feature_name, time_columns in sent_windows.items():
    window = Window.partitionBy('from_account', *time_columns)
    spark_df = spark_df.withColumn(feature_name, count('*').over(window))

In [None]:
# Mark the enriched DataFrame for caching and print a preview of it.
spark_df.cache().show()

# Label Encoding values

In [None]:
# Columns that hold the same kind of value share one vocabulary, so that a given
# account (or currency) receives the same numeric code in both columns. The code
# further down compares these codes directly (graph vertices are built from
# from_account/to_account, and payment_currency is compared to receiving_currency),
# which is only meaningful when both columns are indexed by the same model.
column_groups = [
    ['from_account', 'to_account'],
    ['receiving_currency', 'payment_currency'],
    ['payment_format'],
]
columns_to_encode = [column for group in column_groups for column in group]

indexers = []
for group in column_groups:
    # Stack every column of the group into a single 'value' column to fit on.
    vocabulary = spark_df.select(F.col(group[0]).alias('value'))
    for column in group[1:]:
        vocabulary = vocabulary.union(spark_df.select(F.col(column).alias('value')))
    shared_model = StringIndexer(inputCol='value', outputCol='value_index').fit(vocabulary)

    # One copy of the fitted model per column, re-targeted at that column.
    for column in group:
        indexers.append(
            shared_model.copy().setInputCol(column).setOutputCol(column + "_index")
        )

# Apply the string indexing to each column.
indexed_df = spark_df
for indexer in indexers:
    indexed_df = indexer.transform(indexed_df)

In [None]:
# Remove the original string columns, then give each "<name>_index" column the original name.
indexed_df = indexed_df.drop(*columns_to_encode)
for column in columns_to_encode:
    indexed_df = indexed_df.withColumnRenamed(column + "_index", column)

In [None]:
columns_to_cast = ['from_account', 'to_account', 'receiving_currency', 'payment_currency', 'payment_format']

# Cast every encoded column to an integer type.
for column in columns_to_cast:
    as_integer = col(column).cast("integer")
    indexed_df = indexed_df.withColumn(column, as_integer)

# From here on the encoded DataFrame is referred to as `df`.
df = indexed_df
df.show(5)

# More correlation among Accounts


For each sending account, count the receiving accounts (`Account.1`) to which it sent both laundered and non-laundered money.

In [None]:
df_temp = df.select('from_account', 'to_account', 'is_laundering')

# For every (sender, receiver) pair, collect the distinct is_laundering values seen.
grouped_df = df_temp.groupBy('from_account', 'to_account').agg(collect_set('is_laundering').alias('unique_values'))

# Keep the pairs with more than one distinct value, i.e. both laundering and
# non-laundering transactions. Checking the set size avoids indexing element 1
# of a one-element array, which only yields null when ANSI mode is disabled.
filtered_df = grouped_df.filter(F.size(F.col('unique_values')) > 1)

# Number of such receivers for every sending account, largest first.
result_df = filtered_df.groupBy('from_account').count().orderBy(F.col('count').desc())

# Show the first 10 results.
result_df.show(10)

# Save first part of analysis

In [None]:
# Overwrite any previous export so that re-running the notebook does not fail
# because the output path already exists (the default save mode raises in that case).
df.write.mode('overwrite').parquet('df.parquet3')

In [None]:
# NOTE(review): this reads 'df.parquet1' while the previous cell writes
# 'df.parquet3' — confirm which file is the intended input.
df = spark.read.parquet('df.parquet1').drop("__index_level_0__")

# Re-encode the timestamp as the digits of year, month, day, hour and minute
# concatenated (month to minute zero-padded to two characters), stored as a long.
two_digit_parts = [
    lpad(extract_part("timestamp"), 2, "0")
    for extract_part in (month, dayofmonth, hour, minute)
]
compact_timestamp = concat_ws("", year("timestamp"), *two_digit_parts)
df = df.withColumn("timestamp", compact_timestamp.cast(LongType()))

df.orderBy('index').show(5)

# Using graph frame to work with graph

Find all transactions that are sent from A to B with a certain value and then from B to C with the same value.

In [None]:
# Distinct payment formats present in df.
values = list(df.select("payment_format").distinct().collect())
n_formats = len(values)

# Give every ordered pair (i, k) of positions in `values` its own code,
# numbered row by row: (0, 0) -> 0, (0, 1) -> 1, ...
combinations = {
    (i, k): i * n_formats + k
    for i in range(n_formats)
    for k in range(n_formats)
}

In [None]:
def add_money_send_to_send(df):
    """Add a "payment_payment" column describing A -> B -> C pass-through transfers.

    A pattern is a pair of transactions c1 (a -> b) and c2 (b -> c) with
    a != b, b != c, the same amount, and c1 strictly earlier than c2. Both
    transactions of a pattern get the code stored in the global `combinations`
    dict for (payment_format of c1, payment_format of c2). When a transaction
    takes part in several patterns a single code is kept (dropDuplicates on
    "index"). Transactions in no pattern get -1.

    Relies on the globals `spark` and `combinations`.

    Args:
        df: Spark DataFrame with the columns from_account, to_account, index,
            amount_paid, timestamp, payment_format and is_laundering.

    Returns:
        `df` with the extra "payment_payment" column.
    """
    train_vertices = (
        df.select(F.col("from_account").alias("id"))
        .union(df.select(F.col("to_account").alias("id")))
        .distinct()
    )
    train_edges = df.select(
        F.col("from_account").alias("src"),
        F.col("to_account").alias("dst"),
        F.col("index"),
        F.col("amount_paid").alias("amount"),
        F.col("timestamp"),
        F.col("payment_format"),
        F.col("is_laundering"),
    )
    g = GraphFrame(train_vertices, train_edges)

    schema = StructType([
        StructField("index", LongType(), False),
        StructField("timestamp", DoubleType(), False),
        StructField("from", IntegerType(), False),
        StructField("to", IntegerType(), False),
        StructField("payment_format", IntegerType(), False),
        StructField("is_laundering", IntegerType(), False),
        StructField("payment_payment", IntegerType(), False)
    ])

    motif = "(a)-[c1]->(b); (b)-[c2]->(c)"
    filter_string = "a != b and b != c and c1.amount == c2.amount and c1.timestamp < c2.timestamp"
    graph = g.find(motif).filter(filter_string).distinct()

    # The matched edges are read straight from the collected Rows. The previous
    # numpy squeeze() changed the array shape when exactly one pattern matched
    # and then fell into a branch that referenced an undefined variable.
    total_rows = []
    for c1, c2 in graph.select("c1", "c2").collect():
        pair_code = combinations[(int(c1["payment_format"]), int(c2["payment_format"]))]
        for edge in (c1, c2):
            # Fields are read by key: Row is a tuple, so `edge.index` would be
            # the tuple method rather than the "index" field.
            # index | timestamp | from | to | payment_format | is_laundering | payment_payment
            total_rows.append([
                int(edge["index"]),
                float(edge["timestamp"]),
                int(edge["src"]),
                int(edge["dst"]),
                int(edge["payment_format"]),
                int(edge["is_laundering"]),
                pair_code,
            ])

    temp_df = spark.createDataFrame(total_rows, schema)
    temp_df = temp_df.dropDuplicates(['index'])

    pattern_codes = temp_df.select("index", F.col("payment_payment").alias("payment_payment_B"))
    joined_df = df.join(pattern_codes, on="index", how="left")

    # Use the code found for the transaction when there is one, otherwise -1.
    df = joined_df.withColumn(
        "payment_payment",
        F.coalesce(F.col("payment_payment_B"), F.lit(-1)),
    ).drop("payment_payment_B")

    return df

In [None]:
# Rebind df to the result of add_money_send_to_send, which adds the "payment_payment" column.
df = add_money_send_to_send(df)

## Find circular patterns

In [None]:
def find_cyclesr(df):
    """Add hop_2 .. hop_13 flags marking transactions that belong to a directed cycle.

    Only transactions whose payment and receiving currency are equal and whose
    payment_format is 0 are searched. For every cycle length `hop` from 2 to 13,
    a cycle is a chain of `hop` transactions over pairwise-distinct accounts
    that returns to its first account, with strictly increasing timestamps.
    Column hop_<n> is 1 for a transaction found in at least one cycle of
    length n and 0 otherwise.

    Only the transaction index and the hop flags are materialized. Carrying
    the other edge attributes along (they were discarded before the join
    anyway) forced integer timestamps into a DoubleType field, which Spark's
    schema verification can reject.

    Relies on the global `spark`.

    Args:
        df: Spark DataFrame with the columns index, from_account, to_account,
            amount_paid, timestamp, payment_format, payment_currency,
            receiving_currency and is_laundering.

    Returns:
        `df` with the twelve extra integer columns hop_2 .. hop_13.
    """
    hops = list(range(2, 14))
    hop_columns = [f"hop_{hop}" for hop in hops]
    schema = StructType(
        [StructField("index", LongType(), False)]
        + [StructField(name, IntegerType(), False) for name in hop_columns]
    )

    filtered_spark = df.filter(F.col("payment_currency") == F.col("receiving_currency"))
    filtered_spark.persist()

    # transaction index -> set of cycle lengths the transaction takes part in
    cycle_hops = {}
    for payment_format in range(1):
        by_format = filtered_spark.filter(F.col("payment_format") == payment_format)
        verteces = (
            by_format.select(F.col("from_account").alias("id"))
            .union(by_format.select(F.col("to_account").alias("id")))
            .distinct()
        )
        edges = by_format.select(
            F.col("from_account").alias("src"),
            F.col("to_account").alias("dst"),
            F.col("index"),
            F.col("amount_paid").alias("amount"),
            F.col("timestamp"),
            F.col("payment_format"),
            F.col("is_laundering"),
        )

        g = GraphFrame(verteces, edges)
        g = g.dropIsolatedVertices()
        g.persist()
        for hop in tqdm.tqdm(hops):
            # (n0)-[c1]->(n1); (n1)-[c2]->(n2); ...; (n<hop-1>)-[c<hop>]->(n0)
            motif = "; ".join(
                f"(n{i})-[c{i + 1}]->(n{(i + 1) % hop})" for i in range(hop)
            )
            # All nodes pairwise distinct and edge timestamps strictly increasing.
            conditions = [f"n{a} != n{b}" for a in range(hop) for b in range(a + 1, hop)]
            conditions += [f"c{k}.timestamp < c{k + 1}.timestamp" for k in range(1, hop)]
            graph = g.find(motif).filter(" and ".join(conditions))

            index_columns = [F.col(f"c{k}.index").alias(f"index_{k}") for k in range(1, hop + 1)]
            for cycle in graph.select(*index_columns).collect():
                for transaction_index in cycle:
                    cycle_hops.setdefault(int(transaction_index), set()).add(hop)
        g.unpersist()

    flag_rows = [
        [transaction_index] + [int(hop in found) for hop in hops]
        for transaction_index, found in cycle_hops.items()
    ]
    temp_df = spark.createDataFrame(flag_rows, schema)

    # Step 1: rename the flags so they cannot clash with columns already in df.
    temp_df_selected = temp_df.select(
        "index", *[F.col(name).alias(f"{name}_B") for name in hop_columns]
    )

    # Step 2: left join on the transaction index.
    joined_df = df.join(temp_df_selected, on="index", how="left")

    # Step 3: use the joined flag when present, otherwise 0.
    for name in hop_columns:
        joined_df = joined_df.withColumn(name, F.coalesce(F.col(f"{name}_B"), F.lit(0)))

    # Step 4: remove the helper columns.
    df_result = joined_df.drop(*[f"{name}_B" for name in hop_columns])

    filtered_spark.unpersist()

    return df_result

In [None]:
def _cycle_motif(hop):
    """Return the GraphFrames motif of a directed cycle with edges c1..c<hop> over nodes n0..n<hop-1>."""
    return "; ".join(f"(n{i})-[c{i + 1}]->(n{(i + 1) % hop})" for i in range(hop))


def _cycle_filter(hop):
    """Return the filter requiring pairwise-distinct nodes and strictly increasing edge timestamps."""
    distinct_nodes = [f"n{a} != n{b}" for a in range(hop) for b in range(a + 1, hop)]
    increasing_time = [f"c{k}.timestamp < c{k + 1}.timestamp" for k in range(1, hop)]
    return " and ".join(distinct_nodes + increasing_time)


def find_cycles(df):
    """Add hop_2 .. hop_13 flags marking transactions that belong to a directed cycle.

    Only transactions whose payment and receiving currency are equal and whose
    payment_format is 0 are searched. For every cycle length `hop` from 2 to 13,
    a cycle is a chain of `hop` transactions over pairwise-distinct accounts
    that returns to its first account, with strictly increasing timestamps.
    Column hop_<n> is 1 for a transaction found in at least one cycle of
    length n and 0 otherwise.

    Only the transaction index and the hop flags are materialized. The other
    edge attributes were discarded before the join anyway, and passing them
    through a float numpy array into a LongType field can be rejected by
    Spark's schema verification.

    Relies on the global `spark`.

    Args:
        df: Spark DataFrame with the columns index, from_account, to_account,
            amount_paid, timestamp, payment_format, payment_currency,
            receiving_currency and is_laundering.

    Returns:
        `df` with the twelve extra integer columns hop_2 .. hop_13.
    """
    hops = list(range(2, 14))
    hop_columns = [f"hop_{hop}" for hop in hops]
    schema = StructType(
        [StructField("index", LongType(), False)]
        + [StructField(name, IntegerType(), False) for name in hop_columns]
    )

    filtered_spark = df.filter(F.col("payment_currency") == F.col("receiving_currency"))
    filtered_spark.cache()

    # transaction index -> set of cycle lengths the transaction takes part in
    hops_by_index = {}
    for payment_format in range(1):
        by_format = filtered_spark.filter(F.col("payment_format") == payment_format)
        verteces = (
            by_format.select(F.col("from_account").alias("id"))
            .union(by_format.select(F.col("to_account").alias("id")))
            .distinct()
        )
        edges = by_format.select(
            F.col("from_account").alias("src"),
            F.col("to_account").alias("dst"),
            F.col("index"),
            F.col("amount_paid").alias("amount"),
            F.col("timestamp"),
            F.col("payment_format"),
            F.col("is_laundering"),
        )
        g = GraphFrame(verteces, edges)
        g = g.dropIsolatedVertices()
        g.cache()
        for hop in tqdm.tqdm(hops):
            cycles = g.find(_cycle_motif(hop)).filter(_cycle_filter(hop))
            index_columns = [F.col(f"c{k}.index").alias(f"index_{k}") for k in range(1, hop + 1)]
            for cycle in cycles.select(*index_columns).collect():
                for transaction_index in cycle:
                    hops_by_index.setdefault(int(transaction_index), set()).add(hop)
        # Release the cached graph once all cycle lengths have been searched.
        g.unpersist()

    # The cached subset is only needed for the searches above.
    filtered_spark.unpersist()

    flag_rows = [
        [transaction_index] + [int(hop in found) for hop in hops]
        for transaction_index, found in hops_by_index.items()
    ]
    temp_df = spark.createDataFrame(flag_rows, schema)

    # Step 1: rename the flags so they cannot clash with columns already in df.
    temp_df_selected = temp_df.select(
        "index", *[F.col(name).alias(f"{name}_B") for name in hop_columns]
    )

    # Step 2: left join on the transaction index.
    joined_df = df.join(temp_df_selected, on="index", how="left")

    # Step 3: use the joined flag when present, otherwise 0.
    for name in hop_columns:
        joined_df = joined_df.withColumn(name, F.coalesce(F.col(f"{name}_B"), F.lit(0)))

    # Step 4: remove the helper columns.
    df_result = joined_df.drop(*[f"{name}_B" for name in hop_columns])

    return df_result

In [None]:
# Print a preview of df with the hop_2 .. hop_13 cycle flags added by find_cycles.
# NOTE(review): the result is only displayed, not assigned back to df, so the
# hop columns are not available to the following cells — confirm this is intended.
find_cycles(df).show()

                                                                                

# Find fan in

In [None]:
def find_fanin(g: GraphFrame):
    """Compute a fan-in degree for the edges of `g`.

    Two distinct edges c1 and c2 are paired when they point to the same
    destination vertex, their timestamps differ by at most 40000 (in the units
    of the edge `timestamp` column) and they have the same payment_currency.

    Args:
        g: GraphFrame whose edges carry the attributes index, timestamp and
            payment_currency.

    Returns:
        DataFrame with the columns "transaction" (the c1 edge struct) and
        "fan_in_degree" (number of distinct edges paired with it). Edges with
        no partner are absent.
    """
    motif = "(a)-[c1]->(b); (c)-[c2]->(b)"
    filter_motif = "(abs(c1.timestamp - c2.timestamp)) <= 40000 and c1.index != c2.index and c1.payment_currency == c2.payment_currency"

    edge_pairs = g.find(motif).filter(filter_motif).select("c1", "c2").distinct()
    degree_per_edge = edge_pairs.groupBy(F.col("c1")).agg(F.count("*").alias("fan_in_degree"))
    return degree_per_edge.select(F.col("c1").alias("transaction"), F.col("fan_in_degree"))

def add_fan_in(df):
    """Add a "fan_in_degree" column computed per payment format with find_fanin.

    Only transactions whose payment and receiving currency are equal are
    searched. For each distinct payment_format a graph is built from the
    transactions with that format and find_fanin is run on it. Transactions
    for which no fan-in partner was found get 0.

    Args:
        df: Spark DataFrame with the columns index, from_account, to_account,
            timestamp, payment_currency, receiving_currency, payment_format
            and is_laundering.

    Returns:
        `df` with the extra "fan_in_degree" column.
    """
    filtered_spark = df.filter(F.col("payment_currency") == F.col("receiving_currency"))
    filtered_spark = filtered_spark.coalesce(12)
    filtered_spark.cache()
    payment_formats = [
        row[0] for row in filtered_spark.select("payment_format").distinct().collect()
    ]

    total_fan_in = None
    for payment_format in payment_formats:
        print(f"Find fan in payment_format: {payment_format}")
        filtered_by_format = filtered_spark.filter(F.col("payment_format") == payment_format)
        # Both endpoints come from the same filtered subset. Taking receivers
        # from the unfiltered df only added vertices without any edge.
        verteces = (
            filtered_by_format.select(F.col("from_account").alias("id"))
            .union(filtered_by_format.select(F.col("to_account").alias("id")))
            .distinct()
        )
        edges = filtered_by_format.select(
            F.col("from_account").alias("src"),
            F.col("to_account").alias("dst"),
            F.col("index"),
            F.col("timestamp"),
            F.col("payment_currency"),
            F.col("payment_format"),
            F.col("is_laundering"),
        )
        fan_in = find_fanin(GraphFrame(verteces, edges))
        total_fan_in = fan_in if total_fan_in is None else total_fan_in.union(fan_in)

    if total_fan_in is None:
        # No payment format to search: every transaction has fan-in degree 0.
        return df.withColumn("fan_in_degree", F.lit(0).cast("long"))

    # Read the transaction index straight from the edge struct. The previous
    # Python UDF re-declared "index" as a 32-bit integer, which cannot hold the
    # 64-bit ids generated by monotonically_increasing_id.
    fan_in_by_index = total_fan_in.select(
        F.col("transaction.index").alias("index"),
        F.col("fan_in_degree").alias("fan_in_degree_B"),
    )
    joined_df = df.join(fan_in_by_index, on="index", how="left")

    # Use the computed degree when present, otherwise 0.
    df = joined_df.withColumn(
        "fan_in_degree",
        F.coalesce(F.col("fan_in_degree_B"), F.lit(0)),
    ).drop("fan_in_degree_B")

    return df

In [None]:
# Rebind df to the result of add_fan_in, which adds the "fan_in_degree" column.
df = add_fan_in(df)

In [None]:
# Print a preview of df.
df.show()

# Find fan out

In [None]:
from pyspark.storagelevel import StorageLevel

def find_fanout(g: GraphFrame, motif: str, filter_motif: str):
    """Compute a fan-out degree for the edges of `g`.

    Args:
        g: GraphFrame to search.
        motif: GraphFrames motif; it must name two edges c1 and c2.
        filter_motif: filter expression applied to the motif matches.

    Returns:
        DataFrame, persisted with MEMORY_ONLY, with the columns "transaction"
        (the c1 edge struct) and "fan_out_degree" (number of distinct (c1, c2)
        matches for that c1).
    """
    edge_pairs = g.find(motif).filter(filter_motif).select("c1", "c2").distinct()
    degree_per_edge = edge_pairs.groupBy(F.col("c1")).agg(F.count("*").alias("fan_out_degree"))
    fan_out_trans = degree_per_edge.select(F.col("c1").alias("transaction"), F.col("fan_out_degree"))
    # Keep the result in memory for the caller.
    fan_out_trans.persist(StorageLevel.MEMORY_ONLY)
    return fan_out_trans

def add_fan_out(df, num_partitions, total_fan_out=None):
    """Attach a ``fan_out_degree`` column to ``df``.

    Only transactions whose payment and receiving currency match take part in
    the search. Two transactions sent by the same account count towards each
    other's fan-out when their timestamps differ by at most 40000 and their
    ``index`` values differ.

    Args:
        df: Spark DataFrame with at least ``index``, ``timestamp``,
            ``from_account``, ``to_account``, ``payment_currency``,
            ``receiving_currency``, ``payment_format`` and ``is_laundering``.
        num_partitions: number of partitions for the same-currency subset.
        total_fan_out: optional result of a previous ``find_fanout`` call
            (columns ``transaction`` and ``fan_out_degree``); the newly found
            rows are appended to it.

    Returns:
        ``df`` plus ``fan_out_degree`` (0 for transactions with no match).
    """
    filtered_spark = df.filter(F.col("payment_currency") == F.col("receiving_currency")).repartition(num_partitions)
    filtered_spark.persist(StorageLevel.MEMORY_ONLY)  # reused for both vertices and edges

    motif = "(a)-[c1]->(b); (a)-[c2]->(c)"
    filter_motif = "(abs(c1.timestamp - c2.timestamp)) <= 40000 and c1.index != c2.index"

    # Every edge endpoint must exist as a vertex: the motif names a, b and c,
    # so receiver-only accounts are included alongside the senders.
    vertices = (
        filtered_spark.select(F.col("from_account").alias("id"))
        .union(filtered_spark.select(F.col("to_account").alias("id")))
        .distinct()
    )
    edges = filtered_spark.select(
        F.col("from_account").alias("src"),
        F.col("to_account").alias("dst"),
        "index", "timestamp", "payment_format", "is_laundering",
    )

    # The graph is built once; the optional accumulator only changes how the result is combined.
    new_fan_out = find_fanout(GraphFrame(vertices, edges), motif, filter_motif)
    if total_fan_out is None:
        total_fan_out = new_fan_out
    else:
        total_fan_out = total_fan_out.unionAll(new_fan_out)

    # Read the struct field directly instead of round-tripping through a Python UDF:
    # this keeps the original column type of ``index`` (no narrowing to a 32-bit integer).
    degree_by_index = total_fan_out.select(
        F.col("transaction.index").alias("index"),
        F.col("fan_out_degree").alias("fan_out_degree_B"),
    )
    joined_df = df.join(degree_by_index, on="index", how="left")

    # Transactions without a match get a degree of 0.
    return (
        joined_df
        .withColumn("fan_out_degree", F.coalesce(F.col("fan_out_degree_B"), F.lit(0)))
        .drop("fan_out_degree_B")
    )

In [None]:
# Add fan_out_degree; the same-currency subset is repartitioned into 12 partitions for the motif search.
df = add_fan_out(df, 12)

In [None]:
# Preview the first rows to check the newly added fan_out_degree column.
df.show()

# Display correlation matrix for fraudulent transactions

In [None]:
# Columns included in the correlation matrix.
# NOTE(review): all of them are expected to be numeric at this point — confirm against the encoding cells.
numeric_columns = ['from_bank', 'to_bank', 'from_account', 'to_account','receiving_currency','payment_currency','payment_format', 'amount_received', 'amount_paid',  'day', 'hour', 'minute', 'transaction_received_per_day',  'transaction_received_per_hour',  'transaction_received_per_minute', 'transaction_send_per_day',  'transaction_send_per_hour',  'transaction_send_per_minute','payment_payment',  'fan_in_degree','hop_2', 'hop_3', 'hop_4', 'hop_5', 'hop_6', 'hop_7', 'hop_8', 'hop_9', 'hop_10', 'hop_11', 'hop_12', 'hop_13','is_laundering']

# Bring the selected columns to the driver and let pandas compute the pairwise correlation.
# (The VectorAssembler that used to be built here was never consumed, so it has been dropped.)
laundering_corr_matrix = df.select(numeric_columns).toPandas().corr()

# Render the matrix as a heatmap.
fig, ax = plt.subplots(figsize=(20, 5))
sns.heatmap(laundering_corr_matrix, cmap='coolwarm', annot=False, ax=ax)
ax.set_title('Correlation matrix')

plt.show()


In [None]:
# Compare how often the amounts match with how often the currencies match
# (and the complementary "different" cases), separately for each laundering label.
amounts_match = df['Amount Received'] == df['Amount Paid']
amounts_differ = df['Amount Received'] != df['Amount Paid']
currencies_match = df['Receiving Currency'] == df['Payment Currency']
currencies_differ = df['Receiving Currency'] != df['Payment Currency']
laundering_rows = df['Is Laundering'] == 1
regular_rows = df['Is Laundering'] == 0

print("Laundering:")
count_amount = len(df[amounts_match & laundering_rows])
count_currency = len(df[currencies_match & laundering_rows])
print(f"    Same amount: {count_amount}")
print(f"    Same currency: {count_currency}")
print(f"    Difference: {np.abs(count_amount - count_currency)}\n")

count_amount = len(df[amounts_differ & laundering_rows])
count_currency = len(df[currencies_differ & laundering_rows])
print(f"    Different amount: {count_amount}")
print(f"    Different currency: {count_currency}")
print(f"    Difference: {np.abs(count_amount - count_currency)}\n")

print("Not Laundering:")
count_amount = len(df[amounts_match & regular_rows])
count_currency = len(df[currencies_match & regular_rows])
print(f"    Same amount: {count_amount}")
print(f"    Same currency: {count_currency}")
print(f"    Difference: {np.abs(count_amount - count_currency)}\n")

count_amount = len(df[amounts_differ & regular_rows])
count_currency = len(df[currencies_differ & regular_rows])
print(f"    Different amount: {count_amount}")
print(f"    Different currency: {count_currency}")
print(f"    Difference: {np.abs(count_amount - count_currency)}")

In [None]:
import sys
# NOTE(review): hard-coded, machine-specific absolute path; presumably where extra jars
# (e.g. GraphFrames) live — confirm it is still needed and make it configurable.
sys.path.append("/Users/fabio/jars")

In [None]:
# Schema of the transaction table in which timestamp and the amounts are floats and
# every other column (banks, accounts, currencies, payment format, label) is an integer.
# All fields are nullable.
schema = StructType([
    StructField(field_name, field_type(), True)
    for field_name, field_type in [
        ('timestamp', FloatType),
        ('from_bank', IntegerType),
        ('from_account', IntegerType),
        ('to_bank', IntegerType),
        ('to_account', IntegerType),
        ('amount_received', FloatType),
        ('receiving_currency', IntegerType),
        ('amount_paid', FloatType),
        ('payment_currency', IntegerType),
        ('payment_format', IntegerType),
        ('is_laundering', IntegerType),
    ]
])


In [None]:
# Rename the original CSV headers to the snake_case names used by the Spark schema,
# then store the frame as Parquet.
df = df.rename(columns={'Timestamp': 'timestamp', 'From Bank': 'from_bank', 'Account': 'from_account',
                        'To Bank': 'to_bank', 'Account.1': 'to_account', 'Amount Received': 'amount_received',
                        'Receiving Currency': 'receiving_currency', 'Amount Paid': 'amount_paid', 'Payment Currency': 'payment_currency',
                        'Payment Format': 'payment_format', 'Is Laundering': 'is_laundering'})
df.to_parquet('df.parquet')

In [None]:
# Vertices: every account that appears as sender or receiver.
verteces = (
    spark_df.select(col("from_account").alias("id"))
    .union(spark_df.select(col("to_account").alias("id")))
    .distinct()
)
# Edges: one per transaction, carrying the attributes used by the motif filters.
edges = spark_df.select(
    col("from_account").alias("src"),
    col("to_account").alias("dst"),
    col("index"),
    col("amount_paid").alias("amount"),
    col("timestamp"),
    col("payment_format"),
    col("is_laundering"),
)
g = GraphFrame(verteces, edges)

In [None]:
# Find two-hop chains a -> b -> c in which the same amount is forwarded:
# the two edges carry an equal amount, the first transaction happens strictly
# before the second, and neither edge is a self-loop (a != b, b != c).
pattern = g.find("(a)-[c1]->(b); (b)-[c2]->(c) ").filter("""
                                              a != b and
                                              b != c and

                                              c1.amount == c2.amount and
                                              c1.timestamp < c2.timestamp
                                            """)
# Show the first five matches without truncating the struct columns.
pattern.show(5, truncate=False)


In [None]:
# Collect every matched (c1, c2) edge pair to the driver as an integer NumPy array.
# NOTE(review): dtype=int assumes every edge field is numeric — confirm; squeeze() also
# drops the leading axis when only a single pair matched.
array_features =  np.array(pattern.select('c1','c2').collect(), dtype=int).squeeze()

In [None]:
from collections import defaultdict, Counter
# Group the laundering labels of each two-hop chain by the payment formats of its two edges.
# Edge fields are addressed by position: 5 is payment_format, 6 is is_laundering.
dictionary = defaultdict(list)
for first_edge, second_edge in array_features:
    format_pair = (first_edge[5], second_edge[5])
    label_pair = (first_edge[6], second_edge[6])
    dictionary[format_pair].append(label_pair)

In [None]:
# Fit an encoder on the payment-format names so the integer codes can be mapped back to labels.
le = LabelEncoder()
le.fit(list(count_values_payment.reset_index()['Payment Format']))

# For each (format of c1, format of c2) pair build a 2x2 matrix whose cell [r, c]
# counts the chains with is_laundering == r on the first edge and == c on the second.
data = {}
for key, items in dictionary.items():
    label_counts = Counter(items)
    string = f"{le.inverse_transform([key[0]])[0]}-{le.inverse_transform([key[1]])[0]}"
    data[string] = np.array([[label_counts[(0,0)], label_counts[(0,1)]], [label_counts[(1,0)], label_counts[(1,1)]]])

# Size the grid from the number of pairs: six columns and at least two rows (the former
# fixed layout), growing with the data so more than twelve pairs no longer overflow it.
n_cols = 6
n_rows = max(2, (len(data) + n_cols - 1) // n_cols)
fig, axs = plt.subplots(n_rows, n_cols, figsize=(15, 2.5 * n_rows))

# One subplot per payment-format pair.
for plot_idx, (payment, matrix) in enumerate(data.items()):
    row = plot_idx // n_cols
    column = plot_idx % n_cols
    ax = axs[row, column]

    ax.imshow(matrix, cmap='Greens')
    ax.set_xticks([0, 1])
    ax.set_yticks([0, 1])
    ax.set_xticklabels(['0', '1'])
    ax.set_yticklabels(['0', '1'])
    # x axis is labelled with the second edge's format, y axis with the first edge's format.
    ax.set_xlabel(str(payment.split("-")[1]))
    ax.set_ylabel(str(payment.split("-")[0]))
    ax.xaxis.set_label_position('top')
    ax.yaxis.set_label_position('left')
    ax.xaxis.set_ticks_position('top')
    ax.yaxis.set_ticks_position('left')

    # Write each count inside its cell (distinct loop names: the plot index is not shadowed).
    for r in range(2):
        for c in range(2):
            ax.annotate(str(matrix[r, c]), xy=(c, r), ha='center', va='center', color='grey')

# Adjust the spacing between subplots
plt.subplots_adjust(wspace=0.8, hspace=0.5)

plt.show()

# Find circular patterns

In [None]:
def find_cycles(g: "GraphFrame", hop: int = 2):
    """Find directed cycles of exactly ``hop`` edges whose timestamps strictly increase.

    The motif is ``n0 -> n1 -> ... -> n(hop-1) -> n0`` with edges ``c1 .. c<hop>``.
    All vertices must be pairwise different and every edge must happen before
    the next one along the cycle.

    Args:
        g: graph whose edges expose the fields ``src``, ``dst``, ``index``,
            ``timestamp``, ``payment_format`` and ``is_laundering``.
        hop: number of edges in the cycle (at least 2).

    Returns:
        A pandas DataFrame with one row per distinct edge taking part in a
        cycle, with columns ``index``, ``timestamp``, ``from``, ``to``,
        ``payment_format``, ``is_laundering`` and ``hop``.

    Raises:
        ValueError: if ``hop`` is smaller than 2 (no valid filter can be built).
    """
    if hop < 2:
        raise ValueError("hop must be at least 2, got {}".format(hop))

    # (n0)-[c1]->(n1); (n1)-[c2]->(n2); ...; (n<hop-1>)-[c<hop>]->(n0)
    motif = "; ".join(
        "(n{})-[c{}]->(n{})".format(i, i + 1, (i + 1) % hop) for i in range(hop)
    )

    # Pairwise-distinct vertices followed by strictly increasing edge timestamps.
    distinct_nodes = [
        "n{} != n{}".format(i, j) for i in range(hop) for j in range(i + 1, hop)
    ]
    increasing_time = [
        "c{}.timestamp < c{}.timestamp".format(j, j + 1) for j in range(1, hop)
    ]
    filter_string = " and ".join(distinct_nodes + increasing_time)

    select_col = ["c{}".format(i + 1) for i in range(hop)]
    matches = g.find(motif).filter(filter_string).select(*select_col).collect()

    # Flatten every matched cycle into one record per edge. Fields are read by
    # name, so the result does not depend on the column order of the edge struct
    # nor on how many cycles were found.
    total_rows = [
        [
            int(edge["index"]),
            float(edge["timestamp"]),
            int(edge["src"]),
            int(edge["dst"]),
            int(edge["payment_format"]),
            int(edge["is_laundering"]),
            hop,
        ]
        for match in matches
        for edge in match
    ]

    dataframe = pd.DataFrame(total_rows, columns=['index', 'timestamp', 'from', 'to', 'payment_format', 'is_laundering', 'hop'])

    # An edge can belong to several cycles of the same length; keep it once.
    return dataframe.drop_duplicates()

In [None]:
# NOTE(review): the following cells still build Spark DataFrames and GraphFrames, so stopping
# the session here looks like it breaks a top-to-bottom run — confirm whether this belongs at the end.
spark.stop()

In [None]:
# Search cycles of 2 to 13 hops among the transactions whose two currencies match.
all_df = []
filtered_spark = df.filter(col("payment_currency") == col("receiving_currency"))
filtered_spark.cache()
payment_formats = filtered_spark.select("payment_format").distinct().rdd.flatMap(lambda x: x).collect()

# range(1): only payment_format 0 is analysed; payment_formats is collected but not iterated.
for j in range(1):
    same_format = filtered_spark.filter(col("payment_format") == j)
    verteces = (
        same_format.select(col("from_account").alias("id"))
        .union(df.select(col("to_account").alias("id")))
        .distinct()
    )
    edges = same_format.select(
        col("from_account").alias("src"),
        col("to_account").alias("dst"),
        col("index"),
        col("amount_paid").alias("amount"),
        col("timestamp"),
        col("payment_format"),
        col("is_laundering"),
    )
    # Vertices come from the unfiltered receivers too, so drop those left without any edge.
    g = GraphFrame(verteces, edges).dropIsolatedVertices()
    for i in tqdm.tqdm(range(2,14)):
        all_df.append(find_cycles(g, i))

In [None]:
# Stack the per-hop results into a single frame.
merged_df = pd.concat(all_df, ignore_index=True)

# One-hot encode the cycle length so each transaction carries one flag per hop count.
one_hot_encoded_df = pd.get_dummies(merged_df, columns=['hop'], prefix='hop')

# Take the hop_* columns from the data instead of hard-coding hop_2..hop_12:
# hop lengths without any cycle produce no column, and longer ones (e.g. hop_13) are kept.
columns_to_encode = [name for name in one_hot_encoded_df.columns if name.startswith('hop_')]

# One row per transaction: descriptive fields are identical across duplicates ('first'),
# while a hop flag is set when the transaction appears in any cycle of that length.
aggregations = {
    'timestamp': 'first',
    'from': 'first',
    'to': 'first',
    'payment_format': 'first',
    'is_laundering': 'first',
}
aggregations.update({hop_column: 'any' for hop_column in columns_to_encode})
grouped_df = one_hot_encoded_df.groupby('index').agg(aggregations).reset_index()
grouped_df[columns_to_encode] = grouped_df[columns_to_encode].astype(int)

In [None]:
# For every hop_* column count the transactions flagged for that cycle length,
# split by laundering label.
hop_columns = grouped_df.columns[grouped_df.columns.str.startswith('hop_')].tolist()
counts = {}
for hop_column in hop_columns:
    count_0 = grouped_df[(grouped_df[hop_column] == 1) & (grouped_df['is_laundering'] == 0)].shape[0]
    count_1 = grouped_df[(grouped_df[hop_column] == 1) & (grouped_df['is_laundering'] == 1)].shape[0]
    counts[hop_column] = {'0': count_0, '1': count_1}

# Turn the results into a DataFrame with one row per hop column.
counts_df = pd.DataFrame.from_dict(counts, orient='index').reset_index()
counts_df = counts_df.rename(columns={'index': 'hop'})

color_0 = 'green'
color_1 = 'orange'

# Plot the bar chart: two bars per hop value, side by side.
fig, ax = plt.subplots(figsize=(10, 6))
width = 0.35
x = counts_df.index
labels = counts_df['hop']
bar_0 = ax.bar(x - width / 2, counts_df['0'], width,  label='is_laundering = 0', color=color_0)
bar_1 = ax.bar(x + width / 2, counts_df['1'], width,  label='is_laundering = 1', color=color_1)

ax.set_xlabel('Hop Values')
ax.set_ylabel('Number of Rows')
ax.set_title('Count of Rows with Hop Values and is_laundering')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()

# Add count labels above each bar
def autolabel(bars):
    """Annotate every bar in ``bars`` with its height, slightly above the bar top."""
    for bar in bars:
        height = bar.get_height()
        ax.annotate('{}'.format(height),
                    xy=(bar.get_x() + bar.get_width() / 2, height),
                    xytext=(0, 3),  # 3 points vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom')
        
autolabel(bar_0)
autolabel(bar_1)

plt.tight_layout()
plt.show()

# Find Fan In 

In [None]:
def find_fanin(g: GraphFrame):
    """Count, for every edge, how many other edges reach the same destination close in time.

    Two edges ``c1`` and ``c2`` into the same vertex are paired when their
    timestamps differ by at most 40000, their ``index`` values differ and they
    share the same ``payment_currency`` (the payment format is not compared).

    Returns:
        A cached DataFrame with columns ``transaction`` (the ``c1`` edge struct)
        and ``fan_in_degree`` (number of distinct ``c2`` edges paired with it).
    """
    motif = "(a)-[c1]->(b); (c)-[c2]->(b)"
    filter_motif = "(abs(c1.timestamp - c2.timestamp)) <= 40000 and c1.index != c2.index and c1.payment_currency == c2.payment_currency"

    edge_pairs = g.find(motif).filter(filter_motif).select("c1", "c2").distinct()
    degree_per_edge = edge_pairs.groupBy(col("c1")).agg(count("*").alias("fan_in_degree"))
    fan_in_trans = degree_per_edge.select(col("c1").alias("transaction"), col("fan_in_degree"))
    fan_in_trans.cache()
    return fan_in_trans

In [None]:
# Keep only the transactions whose payment and receiving currency match.
filtered_spark = spark_df.filter(col("payment_currency") == col("receiving_currency"))
filtered_spark.cache()
payment_formats = filtered_spark.select("payment_format").distinct().rdd.flatMap(lambda x: x).collect()

total_fan_in = None

# Run the fan-in search separately for each payment format and stack the results.
for payment_format in payment_formats:
    print(f"Find fan in payment_format: {payment_format}")
    filtered_by_format = filtered_spark.filter(col("payment_format") == payment_format)
    # Vertices: senders of this format plus every receiver of the full dataset.
    verteces = (
        filtered_by_format.select(col("from_account").alias("id"))
        .union(spark_df.select(col("to_account").alias("id")))
        .distinct()
    )
    edges = (
        filtered_by_format.select(
            col("from_account").alias("src"),
            col("to_account").alias("dst"),
            col("index"),
            col("timestamp"),
            col("payment_currency"),
            col("payment_format"),
            col("is_laundering")
        )
    )
    g = GraphFrame(verteces, edges)
    if total_fan_in is None:
        total_fan_in = find_fanin(g)
    else:
        total_fan_in = total_fan_in.unionAll(find_fanin(g))

In [None]:
# Definisci la funzione UDF per estrarre i valori dalla colonna "transaction" e creare una struttura
def extract_values(transaction):
    """Unpack the seven-field fan-in edge struct and return its fields as a tuple."""
    (src, dst, index, timestamp,
     payment_currency, payment_format, is_laundering) = transaction
    fields = (src, dst, index, timestamp, payment_currency, payment_format, is_laundering)
    return fields

# Return type of the UDF: the edge fields, all nullable.
schema = StructType([
    StructField(field_name, field_type(), True)
    for field_name, field_type in [
        ("src", IntegerType),
        ("dst", IntegerType),
        ("index", IntegerType),
        ("timestamp", FloatType),
        ("payment_currency", IntegerType),
        ("payment_format", IntegerType),
        ("is_laundering", IntegerType),
    ]
])

In [None]:
# Applica la funzione UDF per estrarre i valori dalla colonna "transaction" e crea un nuovo DataFrame
# Wrap extract_values as a UDF and use it to expand the "transaction" struct into an "extracted" column.
extract_udf = udf(extract_values, schema)
new_spark_df = total_fan_in.withColumn("extracted", extract_udf("transaction"))

# Flatten the extracted fields next to fan_in_degree and bring the result to the driver as a pandas DataFrame.
pandas_df = new_spark_df.select("extracted.*", "fan_in_degree").toPandas()


In [None]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.cm as cm

# Expects `pandas_df` to hold the fan-in results (fan_in_degree, is_laundering, payment_format).

# Filter out rows where timestamp is NaN (if needed)
# pandas_df = pandas_df.dropna(subset=['timestamp'])

# One colour per payment format. plt.get_cmap is used because matplotlib.cm.get_cmap
# is no longer available in recent matplotlib releases.
num_unique_payment_formats = pandas_df['payment_format'].nunique()
color_map = plt.get_cmap('tab20', num_unique_payment_formats)  # 'tab20' gives visually distinct colours

# Mapping dictionary for payment_format names
payment_format_names = {
    0: 'ACH',
    1: 'Bitcoin',
    2: 'Cash',
    3: 'Cheque',
    4: 'Credit Card',
    5: 'Reinvestment',
    6: 'Wire',
    # Add more mappings as needed
}

# Add a readable name next to the numeric payment_format code
pandas_df['payment_format_name'] = pandas_df['payment_format'].map(payment_format_names)

# Create a 3D scatter plot
fig = plt.figure(figsize=(25, 8))
ax = fig.add_subplot(111, projection='3d')

# Plot the points with different colors based on payment_format
for i, (payment_format, group_df) in enumerate(pandas_df.groupby('payment_format_name')):
    color = color_map(i / num_unique_payment_formats)  # Map payment_format to a color from the colormap
    ax.scatter(group_df['fan_in_degree'], group_df['is_laundering'], group_df['payment_format'],
               alpha=0.5, marker='o', color=color, label=f'Payment_format={payment_format}')

# Set axis labels
ax.set_xlabel('Fan_in_degree')
ax.set_ylabel('Is_laundering')
ax.set_zlabel('Payment_format')

# Set the title
ax.set_title('Correlation between Fan_in_degree, Is_laundering, and Payment_format')

# Create a custom legend outside of the 3D plot
ax.legend(loc='center left', bbox_to_anchor=(1, 1))

plt.show()

# Find fan out

In [None]:

def find_fanout(g: GraphFrame,
                motif: str = "(a)-[c1]->(b); (a)-[c2]->(c)",
                filter_motif: str = "(abs(c1.timestamp - c2.timestamp)) <= 40000 and c1.index != c2.index"):
    """Count, for every edge, how many other edges leave the same source close in time.

    This definition replaces the earlier three-argument ``find_fanout``. Keeping
    ``motif`` and ``filter_motif`` as optional parameters lets both call styles
    (``find_fanout(g)`` and ``find_fanout(g, motif, filter_motif)``) keep working
    after this cell has run.

    Args:
        g: graph to search.
        motif: GraphFrames motif naming the edges ``c1`` and ``c2``; defaults to
            two edges leaving the same vertex.
        filter_motif: SQL condition on the matches; by default the timestamps
            must differ by at most 40000 and the ``index`` values must differ.

    Returns:
        A cached DataFrame with columns ``transaction`` (the ``c1`` edge struct)
        and ``fan_out_degree`` (number of distinct ``c2`` edges paired with it).
    """
    pattern = g.find(motif).filter(filter_motif).select("c1", "c2").distinct()
    fan_out_trans = pattern.groupBy(col("c1")).agg(count("*").alias("fan_out_degree")).select(col("c1").alias("transaction"), col("fan_out_degree"))
    fan_out_trans.cache()

    return fan_out_trans

In [None]:
# Keep only the transactions whose payment and receiving currency match.
filtered_spark = spark_df.filter(col("payment_currency") == col("receiving_currency"))
filtered_spark.cache()
payment_formats = filtered_spark.select("payment_format").distinct().rdd.flatMap(lambda x: x).collect()

total_fan_out = None

# Only payment formats 0 and 1 are analysed here (payment_formats is collected but not iterated).
for payment_format in range(2):
    print(f"Find fan out payment_format: {payment_format}")
    filtered_by_format = filtered_spark.filter(col("payment_format") == payment_format)
    # Vertices: senders of this format plus every receiver of the full dataset.
    verteces = (
        filtered_by_format.select(col("from_account").alias("id"))
        .union(spark_df.select(col("to_account").alias("id")))
        .distinct()
    )
    edges = (
        filtered_by_format.select(
            col("from_account").alias("src"),
            col("to_account").alias("dst"),
            col("index"),
            col("timestamp"),
            col("payment_format"),
            col("is_laundering")
        )
    )
    g = GraphFrame(verteces, edges)
    if total_fan_out is None:
        total_fan_out = find_fanout(g)
    else:
        total_fan_out = total_fan_out.unionAll(find_fanout(g))

In [None]:
# Definisci la funzione UDF per estrarre i valori dalla colonna "transaction" e creare una struttura
def extract_values(transaction):
    """Unpack the six-field fan-out edge struct and return its fields as a tuple."""
    (src, dst, index,
     timestamp, payment_format, is_laundering) = transaction
    fields = (src, dst, index, timestamp, payment_format, is_laundering)
    return fields

# Return type of the UDF: the edge fields, all nullable.
schema = StructType([
    StructField(field_name, field_type(), True)
    for field_name, field_type in [
        ("src", IntegerType),
        ("dst", IntegerType),
        ("index", IntegerType),
        ("timestamp", FloatType),
        ("payment_format", IntegerType),
        ("is_laundering", IntegerType),
    ]
])

In [None]:
# Applica la funzione UDF per estrarre i valori dalla colonna "transaction" e crea un nuovo DataFrame
# Wrap extract_values as a UDF and use it to expand the "transaction" struct into an "extracted" column.
extract_udf = udf(extract_values, schema)
new_spark_df = total_fan_out.withColumn("extracted", extract_udf("transaction"))

# Flatten the extracted fields next to fan_out_degree and bring the result to the driver as a pandas DataFrame.
pandas_df = new_spark_df.select("extracted.*", "fan_out_degree").toPandas()

In [None]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.cm as cm


# One colour per payment format. plt.get_cmap is used because matplotlib.cm.get_cmap
# is no longer available in recent matplotlib releases.
num_unique_payment_formats = pandas_df['payment_format'].nunique()
color_map = plt.get_cmap('tab20', num_unique_payment_formats)

# Mapping dictionary for payment_format names (only the two formats analysed for fan-out)
payment_format_names = {
    0: 'ACH',
    1: 'Bitcoin'
}

# Add a readable name next to the numeric payment_format code
pandas_df['payment_format_name'] = pandas_df['payment_format'].map(payment_format_names)

# Create a 3D scatter plot
fig = plt.figure(figsize=(25, 8))
ax = fig.add_subplot(111, projection='3d')

# Plot the points with different colors based on payment_format
for i, (payment_format, group_df) in enumerate(pandas_df.groupby('payment_format_name')):
    color = color_map(i / num_unique_payment_formats)  # Map payment_format to a color from the colormap
    ax.scatter(group_df['fan_out_degree'], group_df['is_laundering'], group_df['payment_format'],
               alpha=0.5, marker='o', color=color, label=f'Payment_format={payment_format}')

# Set axis labels
ax.set_xlabel('Fan_out_degree')
ax.set_ylabel('Is_laundering')
ax.set_zlabel('Payment_format')

# Set the title
ax.set_title('Correlation between Fan_out_degree, Is_laundering, and Payment_format')

# Create a custom legend outside of the 3D plot
ax.legend(loc='center left', bbox_to_anchor=(1, 1))

plt.show()

# End analysis