In [56]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

%matplotlib inline

sns.set(style="darkgrid")

np.random.seed(0)
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [57]:
wallets_data = pd.read_csv("../dataset/custom/wallets_features_aggregated.csv")

In [58]:
wallets_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 822942 entries, 0 to 822941
Data columns (total 46 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   class                                822942 non-null  int64  
 1   num_timesteps_appeared_in            822942 non-null  float64
 2   fees_median                          822942 non-null  float64
 3   num_addr_transacted_multiple         822942 non-null  float64
 4   addrId                               822942 non-null  object 
 5   userId                               822942 non-null  int64  
 6   user_ts_fees_share_mean              822942 non-null  float64
 7   user_ts_fees_share_min               822942 non-null  float64
 8   user_ts_fees_share_max               822942 non-null  float64
 9   user_addr_cnt                        822942 non-null  int64  
 10  user_outcoming_tx_cnt                822942 non-null  float64
 11  user_incoming

In [59]:
wallets_data["addrId"]

0          11N1nXt3xohkDrUQLSupi5aG69f8CmkYC
1         121qMEKXMfFg94DcWy2GwyN7hJximJj2F8
2         122U79JdvMcaSbLbXdD5cTi67jQC4Ube3W
3         123KBbJSbQZzBpp9ugyKVSHJhRABmhcYqV
4         123oKmBW2d2SQGYRrvfzmp6whsPpJ6P268
                         ...                
822937    3LQxUzikM2rWBhUFrLFNiuWPbPLLJn84DB
822938    19iVyH1qUxgywY8LJSbpV4VavjZmyuEyxV
822939    1GUkazUBpXWdSJ9HbgTapAH7uybpi3Cs6K
822940    18rdKmjrg1EawxgiVT3ikLExj6GWS2MNCk
822941    1HtqDMWgn6186e8t3EesZQiw7gNbaPJfJH
Name: addrId, Length: 822942, dtype: object

In [60]:
# addresses = wallets_data[wallets_data["addrId"] <= "133oKmBW2d2SQGYRrvfzmp6whsPpJ6P268"]["addrId"]
addresses = wallets_data["addrId"]
addresses.shape

(822942,)

In [61]:
valid_addresses_data = wallets_data[wallets_data["addrId"].isin(addresses)].sort_values(by="addrId")

In [62]:
addr_edges = pd.read_csv("../dataset/Elliptic++ Dataset/AddrAddr_edgelist.csv")
addr_edges = addr_edges[
    addr_edges["input_address"].isin(addresses) &
    addr_edges["output_address"].isin(addresses)
]
addr_edges.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2868964 entries, 0 to 2868963
Data columns (total 2 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   input_address   object
 1   output_address  object
dtypes: object(2)
memory usage: 43.8+ MB


In [63]:
import torch
from torch_geometric.data import Data


In [64]:
node_feats_prepared = valid_addresses_data.sort_values(by="addrId").drop("addrId", axis=1)
feats_ts = torch.tensor(np.array(node_feats_prepared))

In [65]:
addr2id = valid_addresses_data.sort_values(by="addrId").reset_index().drop("index", axis=1).reset_index()[["addrId", "index"]]
addr2id

Unnamed: 0,addrId,index
0,111112TykSw72ztDN2WJger4cynzWYC5w,0
1,1111DAYXhoxZx2tsRnzimfozo783x1yC2,1
2,1111VHuXEzHaRCgXbVwojtaP7Co3QABb,2
3,111218KKkh1JJFRHbwM16AwCiVCc4m7he1,3
4,1115LWW3xsD9jT9VRY7viCN9S34RVAAuA,4
...,...,...
822937,3R2Uw5MRdSSigp8AjfT7K5es6Hupm4qLSq,822937
822938,3R2VBFbqHGC4bQ7b4ixN4jZTdv7RMbEYtf,822938
822939,3R2WFmRwbDeo3rMVVu5J3jjMxAuQYYWAid,822939
822940,3R2WTZGYLmbJQyoDSBftJsPRvF1mSEtkh6,822940


In [66]:
feats_ts.shape

torch.Size([822942, 45])

### Edge index

In [80]:
edges_feats = pd.read_csv("../dataset/custom/AddrAddr_EdgeFeatures.csv")
edges_feats = edges_feats[
    edges_feats["input_address"].isin(addresses) &
    edges_feats["output_address"].isin(addresses)
]
edges_feats.head()

Unnamed: 0,input_address,output_address,in_BTC_total,out_BTC_total,active_ts_cnt,tx_cnt
0,111218KKkh1JJFRHbwM16AwCiVCc4m7he1,1A2vTkKSsmVLN2EPEJT3KZR4q1Rvv6c6Xs,0.056681,0.056435,1,1
1,111218KKkh1JJFRHbwM16AwCiVCc4m7he1,1KWbPoFkzadegdff9rCK1wBFu3mD8M17Wp,0.056681,0.056435,1,1
2,1117wASFaYgJJP6MiY8cPD5DMdQda8gDZ,1K7o3aMfiddvUgMGagdNE5GkiykPPyGj32,1.214191,1.214091,1,1
3,1117wASFaYgJJP6MiY8cPD5DMdQda8gDZ,1Po4J4SNyJuGnMGYJfGTXLEvGgAZKiddr7,1.214191,1.214091,1,1
4,111HRAJxnoxqyKRVnjqBmwqneUrHc1chi,12RoZAgmZMFHMMrvaqrYZrLMPpAFEFGyWU,3.016258,3.013936,1,1


In [82]:
addr_edges_with_feat = addr_edges \
    .merge(
        edges_feats,
        how="left",
        on=("input_address", "output_address"),
    )
addr_edges_with_feat.head()

Unnamed: 0,input_address,output_address,in_BTC_total,out_BTC_total,active_ts_cnt,tx_cnt
0,14YRXHHof4BY1TVxN5FqYPcEdpmXiYT78a,1GASxu5nMntiRKdVtTVRvEbP965G51bhHH,7.000303,6.999303,1,1
1,14YRXHHof4BY1TVxN5FqYPcEdpmXiYT78a,14YRXHHof4BY1TVxN5FqYPcEdpmXiYT78a,7.000303,6.999303,1,1
2,13Lhad3SAmu2vqYg2dxbNcxH7LE77kJu2w,1GFdrdgtG34GChM8SMpMwcXFc4nYbH1A5G,5.525902,5.525802,1,1
3,1MAQQZn7EHP6J3erXByCciFiVcgS8ZhWqz,19q57SeCEzTnWrWVXA43nZzhSiXkYggh7c,11.811274,11.811174,1,1
4,1MAQQZn7EHP6J3erXByCciFiVcgS8ZhWqz,1Kk1NVYnCE8ALXDhgMM6HqTt1jDSvi6QBA,11.811274,11.811174,1,1


In [83]:
addr_edges_ixs = addr_edges_with_feat \
    .merge(
        addr2id,
        how="left",
        left_on="input_address",
        right_on="addrId"
    ) \
    .drop(["input_address", "addrId"], axis=1) \
    .rename(columns={"index": "input_index"}) \
    .merge(
        addr2id,
        how="left",
        left_on="output_address",
        right_on="addrId"
    ) \
    .drop(["output_address", "addrId"], axis=1) \
    .rename(columns={"index": "output_index"})


In [85]:
addr_edges_ixs

Unnamed: 0,in_BTC_total,out_BTC_total,active_ts_cnt,tx_cnt,input_index,output_index
0,7.000303,6.999303,1,1,80301,438386
1,7.000303,6.999303,1,1,80301,80301
2,5.525902,5.525802,1,1,43616,441088
3,11.811274,11.811174,1,1,589947,243568
4,11.811274,11.811174,1,1,589947,547238
...,...,...,...,...,...,...
2868959,2106.965204,2106.905760,27,206,805802,805802
2868960,0.056750,0.056735,1,1,766805,739698
2868961,0.008051,0.008035,1,1,720571,777875
2868962,0.003356,0.003306,1,1,501046,501046


In [87]:
edge_index = torch.tensor(np.array(addr_edges_ixs[["input_index", "output_index"]]).T)
edge_index

tensor([[ 80301,  80301,  43616,  ..., 720571, 501046, 501046],
        [438386,  80301, 441088,  ..., 777875, 501046, 108362]])

In [99]:
edge_features= torch.tensor(np.array(addr_edges_ixs.drop(["input_index", "output_index"], axis=1)))
edge_features.shape

torch.Size([2868964, 4])

In [102]:
edge_index.shape, edge_features.shape

(torch.Size([2, 2868964]), torch.Size([2868964, 4]))

In [None]:
# def generate_edge_index(edge_cnt, edges_list):
#     res = torch.zeros((2, edge_cnt))
#     ind = 0
#     for edge in tqdm(edges_list, desc=f"total: {edge_cnt}"):
#         l = edge[1]["input_address"]
#         r = edge[1]["output_address"]
#         # print(addr2id[l], addr2id[r])
#         res[0][ind] = addr2id[addr2id["addrId"] == l]["index"].item()
#         res[1][ind] = addr2id[addr2id["addrId"] == r]["index"].item()
#         ind += 1
        
#     return res

In [79]:
# edge_index = generate_edge_index(addr_edges.shape[0], addr_edges.iterrows())
# edge_index = edge_index.type(torch.long)

### Edge features



Unnamed: 0,input_address,output_address,in_BTC_total,out_BTC_total,active_ts_cnt,tx_cnt
4,111HRAJxnoxqyKRVnjqBmwqneUrHc1chi,12RoZAgmZMFHMMrvaqrYZrLMPpAFEFGyWU,3.016258,3.013936,1,1
8,111Y3BkUGLSWQjHCb2Mg8oMNiwc1jxMwe,12eweZVTQx8MktqkEodiss1z2RqCbYjDqE,29.67172,29.642631,1,1
25,1121SNiVgJ6esyxhmEK45E8Q1YxWs6htNz,124d7pxH9esG6TjYDn3xHoaoihp3jxzLKh,74.097317,74.065961,1,1
26,1121SNiVgJ6esyxhmEK45E8Q1YxWs6htNz,12DaEKUiYyMzKabQJrNNmwfKD1nrArbUYx,74.097317,74.065961,1,1
27,1121SNiVgJ6esyxhmEK45E8Q1YxWs6htNz,12UJqvec8EHHnvghx8rgV2aMgECHewuArZ,74.097317,74.065961,1,1


In [None]:
# def generate_edge_features(edges_feats, feats_list, edge_cnt, edges_list):
#     res = torch.zeros((edge_cnt, len(feats_list)))
#     ind = 0
#     for edge in edges_list:
#         l = edge[1]["input_address"]
#         r = edge[1]["output_address"]
#         for i in range(len(feats_list)):
#             feat = feats_list[i]
#             data = edges_feats[(edges_feats["input_address"] == l) & (edges_feats["input_address"] == r)][:1]
#             if data.shape[0] == 0:
#                 res[ind][i] = 0
#             else:
#                 res[ind][i] = data[feat].item()
#         ind += 1
        
#     return res

In [None]:
# edge_features = generate_edge_features(
#     edges_feats,
#     ["in_BTC_total", "out_BTC_total", "active_ts_cnt", "tx_cnt"],
#     addr_edges.shape[0],
#     addr_edges.iterrows()
# )

### Create graph

In [107]:
graph = Data(
    x=feats_ts.type(torch.float),
    edge_index=edge_index,
    y=torch.tensor(np.array(valid_addresses_data["class"] == 1)).type(torch.long),
    edge_attr=edge_features.type(torch.float)
)
graph


Data(x=[822942, 45], edge_index=[2, 2868964], edge_attr=[2868964, 4], y=[822942])

In [108]:
torch.save(graph, "../dataset/graph/graph_full.pth")

In [105]:
edge_index.shape

torch.Size([2, 2868964])

In [107]:
feats_ts

tensor([[2.0000e+00, 1.0000e+00, 0.0000e+00,  ..., 7.0423e-03, 4.3959e+05,
         1.0000e+00],
        [3.0000e+00, 6.0000e+00, 0.0000e+00,  ..., 5.7984e-04, 4.8596e+05,
         1.0000e+00],
        [2.0000e+00, 1.0000e+00, 0.0000e+00,  ..., 2.7600e-02, 4.3152e+05,
         1.0000e+00],
        ...,
        [2.0000e+00, 1.0000e+00, 0.0000e+00,  ..., 1.8074e-03, 4.5975e+05,
         1.0000e+00],
        [3.0000e+00, 1.0000e+00, 1.0000e-04,  ..., 1.0000e-04, 4.0935e+05,
         1.0000e+00],
        [3.0000e+00, 1.0000e+00, 0.0000e+00,  ..., 1.2430e-04, 4.2346e+05,
         1.0000e+00]], dtype=torch.float64)

In [94]:
(valid_addresses_data["class"] == 1).value_counts()

class
False    808676
True      14266
Name: count, dtype: int64