# Tentando aprender sobre as GNNs usando o dataset do TCC

In [1]:
import pandas as pd
from datetime import datetime

def get_dict_val(name, collection):
    if name in collection:
        val = collection[name]
    else:
        val = len(collection)
        collection[name] = val
    return val

def format_timestamp(timestamp):
    firstTs = -1
    timestamps = []
    for i in timestamp:
        dt_ts = datetime.strptime(i, '%Y/%m/%d %H:%M')
        ts = dt_ts.timestamp()
        if firstTs == -1:
            day = dt_ts.day
            month = dt_ts.month
            year = dt_ts.year
            startTime = datetime(year, month, day)
            firstTs = startTime.timestamp() - 10
        ts = ts - firstTs
        timestamps.append(ts)

    return timestamps


df_edges = pd.read_csv("./data/HI-Small_Trans.csv")

currency = dict()
payment_format = dict()
fromAccIdStr = dict()
toAccIdStr = dict()

df_edges["Timestamp"] = format_timestamp(df_edges["Timestamp"])
df_edges["Received Currency"] = df_edges['Receiving Currency'].apply(lambda x: get_dict_val(x, currency))
df_edges["Sent Currency"] = df_edges['Payment Currency'].apply(lambda x: get_dict_val(x, currency))
df_edges["Payment Format"] = df_edges['Payment Format'].apply(lambda x: get_dict_val(x, payment_format))
df_edges["temp"] = df_edges["From Bank"].astype(str) + df_edges["Account"].astype(str)
df_edges["from_id"] = df_edges["temp"].apply(lambda x: get_dict_val(x, fromAccIdStr))
df_edges["temp"] = df_edges["To Bank"].astype(str) + df_edges["Account.1"].astype(str)
df_edges["to_id"] = df_edges["temp"].apply(lambda x: get_dict_val(x, toAccIdStr))

df_edges.reset_index(drop=True, inplace=True)
df_edges["EdgeID"] = df_edges.index

df_edges.rename(columns={"Amount Paid":"Amount Sent"}, inplace=True)

df_edges.drop(columns=["temp", "From Bank", "Account",
                       "To Bank", "Account.1", "Receiving Currency",
                       "Payment Currency"], inplace=True)

df_edges = df_edges.reindex(columns=["EdgeID","from_id","to_id","Timestamp",
                                     "Amount Sent","Sent Currency","Amount Received",
                                     "Received Currency","Payment Format","Is Laundering"])

df_edges["Timestamp"] = df_edges["Timestamp"] - df_edges["Timestamp"].min()
df_edges = df_edges.sort_values(by="Timestamp")

In [2]:
df_edges[:10]

Unnamed: 0,EdgeID,from_id,to_id,Timestamp,Amount Sent,Sent Currency,Amount Received,Received Currency,Payment Format,Is Laundering
316720,316720,236468,228822,0.0,47.64,13,47.64,13,0,0
261688,261688,195398,189349,0.0,3917.42,10,3917.42,10,0,0
261696,261696,195434,185780,0.0,97.49,10,97.49,10,2,0
126680,126680,94506,91981,0.0,13939.05,2,13939.05,2,5,0
126564,126564,94425,91897,0.0,10.37,2,10.37,2,0,0
126541,126541,94408,91880,0.0,6667.54,2,6667.54,2,0,0
323272,323272,241245,233501,0.0,0.001058,1,0.001058,1,6,0
126527,126527,94395,91867,0.0,257.89,2,257.89,2,2,0
126524,126524,94379,91851,0.0,10.94,2,10.94,2,0,0
126505,126505,94375,91847,0.0,1186351.0,2,1186351.0,2,0,0


Agora que já temos o DataFrame das arestas, vamos criar o DataFrame dos nós:

In [3]:
import torch
import numpy as np

max_n_id = df_edges.loc[:, ['from_id', 'to_id']].to_numpy().max() + 1
df_nodes = pd.DataFrame({'NodeID': np.arange(max_n_id), 'Feature': np.ones(max_n_id)})
timestamps = torch.Tensor(df_edges['Timestamp'].to_numpy())
y = torch.LongTensor(df_edges['Is Laundering'].to_numpy())

edge_features = ['Timestamp', 'Amount Received', 'Received Currency', 'Payment Format']
node_features = ['Feature']

X = torch.tensor(df_nodes.loc[:, node_features].to_numpy()).float()
edge_index = torch.LongTensor(df_edges.loc[:, ['from_id', 'to_id']].to_numpy().T)
edge_attr = torch.tensor(df_edges.loc[:, edge_features].to_numpy()).float()

n_days = int(timestamps.max() / (3600 * 24) + 1)
n_samples = y.shape[0]

Encontrando o ponto ótimo de separação do dataset, respeitando o critério:
* Treino: 60%
* Validação: 20%
* Teste: 20%

In [4]:
import itertools

#data splitting
daily_irs, weighted_daily_irs, daily_inds, daily_trans = [], [], [], [] #irs = illicit_transactions

for day in range(n_days):
        l = day * 24 * 3600
        r = (day + 1) * 24 * 3600
        day_inds = torch.where((timestamps >= l) & (timestamps < r))[0]
        daily_irs.append(y[day_inds].float().mean())
        weighted_daily_irs.append(y[day_inds].float().mean() * day_inds.shape[0] / n_samples)
        daily_inds.append(day_inds)
        daily_trans.append(day_inds.shape[0])

# Recommended split_percentages for train, validation and test. 
split_per = [0.6, 0.2, 0.2]
daily_totals = np.array(daily_trans)
d_ts = daily_totals
I = list(range(len(d_ts)))
split_scores = dict()

# Iterates over all days combination ranges and stores the score at split_scores
for i,j in itertools.combinations(I, 2):
    if j >= i:
        split_totals = [d_ts[:i].sum(), d_ts[i:j].sum(), d_ts[j:].sum()]
        split_totals_sum = np.sum(split_totals)
        split_props = [v/split_totals_sum for v in split_totals] # proportion of each split compared to the total transactions
        split_error = [abs(v-t)/t for v,t in zip(split_props, split_per)] 
        score = max(split_error) #- (split_totals_sum/total) + 1
        split_scores[(i,j)] = score
    else:
        continue
i,j = min(split_scores, key=split_scores.get) # get the best i,j from split_scores

# split contains a list for each split (train, validation and test) and each list contains the days that are part of the respective split
split = [list(range(i)), list(range(i, j)), list(range(j, len(daily_totals)))]

# seperate the transactions based on their indices in the timestamp array
split_inds = {k: [] for k in range(3)}
for i in range(3):
    for day in split[i]:
        split_inds[i].append(daily_inds[day]) #split_inds contains a list for each split (tr,val,te) which contains the indices of each day seperately

tr_inds = torch.cat(split_inds[0])
val_inds = torch.cat(split_inds[1])
te_inds = torch.cat(split_inds[2])

tr_x, val_x, te_x = X, X, X # sets the placeholder (ones) to the variables

Analisando o Output do modelo:

In [5]:
print(f"Total train samples: {tr_inds.shape[0] / y.shape[0] * 100 :.2f}% || IR: "
        f"{y[tr_inds].float().mean() * 100 :.2f}% || Train days: {split[0][:]}")
print(f"Total val samples: {val_inds.shape[0] / y.shape[0] * 100 :.2f}% || IR: "
    f"{y[val_inds].float().mean() * 100:.2f}% || Val days: {split[1][:]}")
print(f"Total test samples: {te_inds.shape[0] / y.shape[0] * 100 :.2f}% || IR: "
    f"{y[te_inds].float().mean() * 100:.2f}% || Test days: {split[2][:]}")

# IR stants for Illicit Ratio!

Total train samples: 63.98% || IR: 0.08% || Train days: [0, 1, 2, 3, 4, 5]
Total val samples: 19.01% || IR: 0.11% || Val days: [6, 7]
Total test samples: 17.01% || IR: 0.19% || Test days: [8, 9, 10, 11, 12, 13, 14, 15, 16, 17]


In [7]:
e_tr = tr_inds.numpy() # Edge train array
e_val = np.concatenate([tr_inds, val_inds]) # Edge validation (train + val) array

# Train
tr_edge_index, tr_edge_attr, tr_y, tr_edge_times = edge_index[:,e_tr],  edge_attr[e_tr],  y[e_tr],  timestamps[e_tr]

# Validation (tr + val)
val_edge_index, val_edge_attr, val_y, val_edge_times = edge_index[:,e_val], edge_attr[e_val], y[e_val], timestamps[e_val]

# Test (tr + val + te)
te_edge_index, te_edge_attr, te_y, te_edge_times = edge_index, edge_attr, y, timestamps

In [8]:
import data_util

tr_data = data_util.GraphData(x=tr_x, y=tr_y, edge_index=tr_edge_index, edge_attr=tr_edge_attr, timestamps=tr_edge_times)
val_data = data_util.GraphData(x=val_x, y=val_y, edge_index=val_edge_index, edge_attr=val_edge_attr, timestamps=val_edge_times)
te_data = data_util.GraphData(x=te_x, y=te_y, edge_index=te_edge_index, edge_attr=te_edge_attr, timestamps=te_edge_times)

Para lembrar:

tr_edge_attr = df_edges.loc[:, edge_features]

In [12]:
df_edges.loc[:, edge_features]

Unnamed: 0,Timestamp,Amount Received,Received Currency,Payment Format
316720,0.0,47.64,13,0
261688,0.0,3917.42,10,0
261696,0.0,97.49,10,2
126680,0.0,13939.05,2,5
126564,0.0,10.37,2,0
...,...,...,...,...
4962230,1504920.0,3749.14,0,3
4962231,1509480.0,1785.27,2,3
4962232,1509480.0,1785.27,2,3
4962233,1515480.0,2154.54,0,3


In [8]:
tr_edge_attr

tensor([[0.0000e+00, 4.7640e+01, 1.3000e+01, 0.0000e+00],
        [0.0000e+00, 3.9174e+03, 1.0000e+01, 0.0000e+00],
        [0.0000e+00, 9.7490e+01, 1.0000e+01, 2.0000e+00],
        ...,
        [5.1834e+05, 4.1111e+02, 0.0000e+00, 1.0000e+00],
        [5.1834e+05, 2.2221e+06, 9.0000e+00, 3.0000e+00],
        [5.1834e+05, 1.3590e+01, 0.0000e+00, 2.0000e+00]])

In [13]:
import data_util

adj_list_in, adj_list_out = data_util.to_adj_nodes_with_times(tr_data)

In [14]:
adj_list_in

{0: [(0, 1200),
  (70990, 87480),
  (70981, 87600),
  (465688, 145500),
  (56296, 173040),
  (79111, 174000),
  (36048, 345840),
  (51392, 432060),
  (58569, 433560),
  (78696, 433560)],
 1: [(47302, 300),
  (51392, 660),
  (1, 1200),
  (90263, 1440),
  (399662, 79560),
  (62665, 87300),
  (35698, 173640),
  (70990, 259380),
  (31800, 260640),
  (70981, 347160),
  (36048, 432060),
  (28527, 432420)],
 2: [(2, 0)],
 3: [(3, 0),
  (3, 120),
  (36048, 900),
  (36044, 1260),
  (3, 69120),
  (12186, 87240),
  (53291, 87660),
  (70988, 172980),
  (88956, 260580),
  (5, 424800),
  (399662, 424920),
  (250231, 424980),
  (6, 424980),
  (7, 425100),
  (4, 425400),
  (11, 425820),
  (0, 426540),
  (79938, 432720),
  (76576, 433740)],
 4: [(4, 360),
  (4, 78540),
  (432632, 103200),
  (58573, 172920),
  (22815, 173460),
  (78696, 174540),
  (78696, 260820)],
 5: [(22601, 60),
  (5, 180),
  (144, 86580),
  (1, 106800),
  (47716, 172920),
  (55879, 259200),
  (36047, 259800),
  (36044, 260160),
  (

In [27]:
in_ports = data_util.ports(tr_edge_index, adj_list_in)
print(in_ports[:20])

tensor([[0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.]])


In [26]:
reverse_ports = True

out_ports = [data_util.ports(tr_edge_index.flipud(), adj_list_out)] if reverse_ports else []

In [32]:
tmp_edge_attr = tr_edge_attr.clone()
tmp_edge_attr

tensor([[0.0000e+00, 4.7640e+01, 1.3000e+01, 0.0000e+00],
        [0.0000e+00, 3.9174e+03, 1.0000e+01, 0.0000e+00],
        [0.0000e+00, 9.7490e+01, 1.0000e+01, 2.0000e+00],
        ...,
        [5.1834e+05, 4.1111e+02, 0.0000e+00, 1.0000e+00],
        [5.1834e+05, 2.2221e+06, 9.0000e+00, 3.0000e+00],
        [5.1834e+05, 1.3590e+01, 0.0000e+00, 2.0000e+00]])

In [33]:
tmp_edge_attr = torch.cat([tmp_edge_attr, in_ports] + out_ports, dim=1)
tmp_edge_attr

tensor([[0.0000e+00, 4.7640e+01, 1.3000e+01, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 3.9174e+03, 1.0000e+01, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 9.7490e+01, 1.0000e+01, 2.0000e+00, 0.0000e+00, 0.0000e+00],
        ...,
        [5.1834e+05, 4.1111e+02, 0.0000e+00, 1.0000e+00, 1.0000e+00, 3.0000e+00],
        [5.1834e+05, 2.2221e+06, 9.0000e+00, 3.0000e+00, 0.0000e+00, 1.0000e+00],
        [5.1834e+05, 1.3590e+01, 0.0000e+00, 2.0000e+00, 2.0000e+00, 1.0342e+04]])