# Кластеризация адресов по пользователям

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Apply seaborn's "darkgrid" style to subsequent plots.
sns.set(style="darkgrid")

# Seed NumPy's global random generator so NumPy-based randomness is repeatable.
np.random.seed(0)

Построим отображение txId -> список входных адресов (кошельков) транзакции и соберём список всех адресов

In [50]:
from collections import defaultdict

# Raw Elliptic++ edge lists: (input_address, txId) and (txId, output_address).
addr_tx_df = pd.read_csv("../dataset/Elliptic++ Dataset/AddrTx_edgelist.csv")
tx_addr_df = pd.read_csv("../dataset/Elliptic++ Dataset/TxAddr_edgelist.csv")

# txId -> list of its input addresses, in file order.
# A vectorised groupby replaces the row-by-row iterrows() loop; sort=False
# keeps transactions in first-appearance order, as the loop did.
tx_inputs = defaultdict(
    list,
    addr_tx_df.groupby("txId", sort=False)["input_address"].agg(list).to_dict(),
)

# Every address that occurs as an input or an output, de-duplicated in
# first-seen order. Going through a set() here would make the iteration order
# (and therefore the component / user ids assigned later) depend on Python's
# per-process string hash randomisation, i.e. change on every kernel restart.
addr_list = (
    pd.concat(
        [addr_tx_df["input_address"], tx_addr_df["output_address"]],
        ignore_index=True,
    )
    .drop_duplicates()
    .tolist()
)



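Построим граф связей между адресами (соседние входные адреса одной транзакции соединяем ребром), запустим DFS и разобьём граф на компоненты связности (КС); каждая компонента считается одним пользователем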

In [51]:
# Undirected address graph: consecutive input addresses of the same
# transaction are linked in both directions.
addr_to_neighbors = defaultdict(list)
for input_addrs in tx_inputs.values():
    # zip() yields no pairs for transactions with fewer than two inputs.
    for prev_addr, next_addr in zip(input_addrs, input_addrs[1:]):
        addr_to_neighbors[prev_addr].append(next_addr)
        addr_to_neighbors[next_addr].append(prev_addr)

In [52]:
from copy import deepcopy

# Traversal state shared with dfs():
#   used              -- address -> visited flag (missing keys read as False)
#   addr_to_component -- address -> component id (missing keys read as 0)
used = defaultdict(bool)
addr_to_component = defaultdict(int)

def dfs(node, comp_id):
    """Label every address reachable from ``node`` with ``comp_id``.

    Iterative depth-first traversal over the module-level ``addr_to_neighbors``
    adjacency lists. Each reached address is marked in ``used`` and recorded in
    ``addr_to_component``.

    Args:
        node: Start address of the traversal.
        comp_id: Component id written for every reached address.

    Returns:
        None. The results are the updates to ``used`` and ``addr_to_component``.
    """
    used[node] = True
    addr_to_component[node] = comp_id

    # Shallow copy so popping never mutates the adjacency list itself.
    stack = list(addr_to_neighbors[node])
    # Addresses that have ever been put on the stack. A set gives an O(1)
    # "already queued?" check; scanning the stack list for every edge was
    # quadratic on large components.
    queued = set(stack)

    while stack:
        current = stack.pop()
        used[current] = True
        addr_to_component[current] = comp_id

        for neighbor in addr_to_neighbors[current]:
            if used[neighbor] or neighbor in queued:
                continue
            queued.add(neighbor)
            stack.append(neighbor)

# Walk all addresses; each one not reached by an earlier traversal starts a
# new component with the next id (ids start at 1).
current_comp = 1
for addr in addr_list:
    if used[addr]:
        continue
    dfs(addr, current_comp)
    current_comp += 1

In [53]:
# Peek at the first 1000 (address, component id) pairs in insertion order.
list(addr_to_component.items())[:1000]

[('1CgzeXJFcStceNX4q6YRfFS3FN2xaepEg9', 1),
 ('1Chqwg76F1jvvgdfEWg1VwcNxogCbLF9PB', 1),
 ('19rCgDfUxnBXWzgAWrXtVXsnYWYHXWYihT', 1),
 ('1KxVQdjFVgzEEZCqBKXxz58YN23SGyUk9c', 1),
 ('19wE8yd3VmFKexAEpuXsUuQZmrJqQ9Qjfv', 1),
 ('1LZJyWJQHhvpfChjUaJj7Pukr82zimN1n5', 1),
 ('1BLGUrjbyfyjMxUsMhtrFFgjwR5dqoFgBA', 1),
 ('1BrbCquFR4EwBk2yDPDE9FZR5aM8pEGxKr', 1),
 ('1ATQuaiwh4BLUFCQgLESMK7NA9uDXea8nD', 1),
 ('1Lkn2dYfCEP2kA9nUA2CBpWdQfVBeYz9Ps', 1),
 ('1A5VbutDgnQm1Sk6LKLx6PWCVt55iQEV8Q', 2),
 ('1MVWpKq8XMKdmPbHYAeGs5bbu58ySH4Ste', 3),
 ('1KNvjhAB3GeWFFKnT1zcibgXAQyDzfCHD4', 4),
 ('1K9CeoSZ3qm7vsVL12XAWUiVacfwRUe6Mp', 5),
 ('1ESkCRzwxi6AdZDJzYsbxJXdtJ5vm5HBSE', 6),
 ('1EBZjxTX7X6WQTLzMB9pM4oDGCZTy1ZURy', 7),
 ('1D9TLHghZm8Q9qdmZZqFjxt7hvxmjkuAt9', 8),
 ('1EBTtYA7EJQQEnwXPxwB6KWFgxVHN1yocp', 9),
 ('16WU5dZRqstwQbFtZ8XGsCFjA5DxsBSwrC', 9),
 ('1BhxKSjLPL15GnVhcJrviQjP1CQX5MNeVt', 9),
 ('1NX8bUFckSyxSVukpKuk9j32Rt53BFeExx', 9),
 ('1GodSyw8s3vVuYmhNhE56KSjUCQGThbvr9', 9),
 ('14iMFBtXUm3jisXDsS1XYaXiUX1Kg

In [54]:
# Spot check: look up the component ids of two specific addresses.
addr_to_component["1PQBBWY435CTSnir4wrfD8HXZ4E92v9Xd6"], \
addr_to_component["1AueTrCc9oK7ovbtfYeVSJe4UQEaxcRMFy"]

(16, 16)

# Подготовим синтетические датасеты графа
- addrId <-> userId (AddrUser.csv)
- userId -> txId (UserTx.csv)
- txId -> userId (TxUser.csv)
- userId -> userId (UserUser.csv)

In [55]:
# Flatten the address -> component mapping into a two-column table
# (addrId, userId) and save it.
addr_to_user_df = pd.DataFrame(
    list(addr_to_component.items()),
    columns=["addrId", "userId"],
)
addr_to_user_df.to_csv("../dataset/custom/AddrUser.csv", index=False)

In [56]:
# Replace each input address of the (input_address, txId) edge list with its
# user id, collapse repeated rows and save the result.
tx_with_users = (
    addr_tx_df
    .merge(addr_to_user_df, left_on="input_address", right_on="addrId", how="left")
    .drop(columns=["input_address", "addrId"])
    .drop_duplicates()
)
tx_with_users.to_csv("../dataset/custom/UserTx.csv", index=False)

In [57]:
# Replace each output address of the (txId, output_address) edge list with
# its user id and save the result.
tx_addr_df = pd.read_csv("../dataset/Elliptic++ Dataset/TxAddr_edgelist.csv")
tx_with_users = (
    tx_addr_df
    .merge(addr_to_user_df, left_on="output_address", right_on="addrId", how="left")
    .drop(columns=["output_address", "addrId"])
)

# Repeated rows are not removed before saving (unlike UserTx.csv).
tx_with_users.to_csv("../dataset/custom/TxUser.csv", index=False)

In [64]:
# Turn the address -> address edge list into a user -> user edge list.
addr_addr_df = pd.read_csv("../dataset/Elliptic++ Dataset/AddrAddr_edgelist.csv")

# The same address -> user lookup, renamed once per side of the edge so each
# merge directly produces the final column name.
input_user_lookup = addr_to_user_df.rename(
    columns={"addrId": "input_address", "userId": "inputUserId"}
)
output_user_lookup = addr_to_user_df.rename(
    columns={"addrId": "output_address", "userId": "outputUserId"}
)

addr_with_users = (
    addr_addr_df
    .merge(input_user_lookup, on="input_address", how="left")
    .merge(output_user_lookup, on="output_address", how="left")
    .drop(columns=["input_address", "output_address"])
    .drop_duplicates()
)
addr_with_users.to_csv("../dataset/custom/UserUser.csv", index=False)

# Проверка классов легальных-нелегальных по пользователям

In [36]:
# Reload the saved address -> user mapping together with the raw Elliptic++
# wallet classes and both address/transaction edge lists.
addr_to_user = pd.read_csv("../dataset/custom/AddrUser.csv")
addr_to_class = pd.read_csv("../dataset/Elliptic++ Dataset/wallets_classes.csv")
addr_to_tx = pd.read_csv("../dataset/Elliptic++ Dataset/AddrTx_edgelist.csv")
tx_to_addr = pd.read_csv("../dataset/Elliptic++ Dataset/TxAddr_edgelist.csv")

# Preview the wallet class table.
addr_to_class.head()

Unnamed: 0,address,class
0,111112TykSw72ztDN2WJger4cynzWYC5w,2
1,1111DAYXhoxZx2tsRnzimfozo783x1yC2,3
2,1111VHuXEzHaRCgXbVwojtaP7Co3QABb,2
3,111218KKkh1JJFRHbwM16AwCiVCc4m7he1,3
4,1115LWW3xsD9jT9VRY7viCN9S34RVAAuA,2


In [7]:
# One row per address, holding the user it belongs to and the class of that
# address; the address columns themselves are dropped.
users_with_classes = (
    addr_to_user
    .merge(addr_to_class, left_on="addrId", right_on="address", how="left")
    .drop(columns=["address", "addrId"])
)

users_with_classes

Unnamed: 0,userId,class
0,1,3
1,1,3
2,1,3
3,1,3
4,1,3
...,...,...
822937,569509,3
822938,569510,3
822939,569511,2
822940,569512,2


In [27]:
# Number of distinct address classes per user; after nunique() the "class"
# column holds that count, not a class label.
user_id_to_classes_cnt = (
    users_with_classes
    .groupby("userId")["class"]
    .nunique()
    .reset_index()
)
# Users whose addresses carry three distinct class values.
user_id_to_classes_cnt.loc[user_id_to_classes_cnt["class"].eq(3)]

Unnamed: 0,userId,class
15,16,3
25,26,3
26,27,3
31,32,3
40,41,3
...,...,...
48538,48539,3
64427,64428,3
64985,64986,3
106925,106926,3


In [22]:
# Addresses assigned to user 518392.
# NOTE(review): the id is hard-coded — presumably taken from an earlier run of
# the clustering; confirm it still refers to the same user after re-running.
wallets = addr_to_user[addr_to_user["userId"] == 518392]["addrId"]
wallets

771643    18mYdEtn6SAQP7QDn7iHdtW2mNk9tPvSbQ
771644    17jEGHeYqjrGkseSHuSwWiPKMq6Sc9ZKBg
Name: addrId, dtype: object

In [23]:
# Class labels of the addresses currently held in `wallets`.
addr_to_class[addr_to_class["address"].isin(wallets)]

Unnamed: 0,address,class
178415,17jEGHeYqjrGkseSHuSwWiPKMq6Sc9ZKBg,2
210668,18mYdEtn6SAQP7QDn7iHdtW2mNk9tPvSbQ,3


In [26]:
# Transactions in which the addresses in `wallets` appear as inputs.
addr_to_tx[addr_to_tx["input_address"].isin(wallets)]

Unnamed: 0,input_address,txId
17309,18mYdEtn6SAQP7QDn7iHdtW2mNk9tPvSbQ,232916687
17310,17jEGHeYqjrGkseSHuSwWiPKMq6Sc9ZKBg,232916687
398107,18mYdEtn6SAQP7QDn7iHdtW2mNk9tPvSbQ,1876630
447109,18mYdEtn6SAQP7QDn7iHdtW2mNk9tPvSbQ,195437282


In [34]:
# Same inspection for user 48539: collect its addresses and show their classes.
# NOTE(review): the id is hard-coded — presumably from an earlier run; confirm.
wallets = addr_to_user[addr_to_user["userId"] == 48539]["addrId"]
addr_to_class[addr_to_class["address"].isin(wallets)]


Unnamed: 0,address,class
51766,13c5bD3yay8QaJGbHrW8WjGYg8ekCCxz5q,1
183017,17sp1vCt7gfFCwJ5UecaCMnn56F1X1JbfW,2
198339,18NtWvNaaZbitYpqsrhAutehZcdghqBcya,2
256688,1AFHGRgnJY7HBeFSno6X6bRkiZU4VLkeR3,3
351795,1DKgBtq3qJZiBFaDwi8xu58mN4e5RB4L9k,2
366043,1DnmZZfyzLKzxLXwG2eLSrhz9uZiA3ALNx,2
442900,1GKDpoVnmzRE4JrsC3hHN6QudPmP9RE7WR,2
444032,1GMY2dzKWBbLTEyHCGJiNxMNDT3MK9zpbF,3
470122,1HDMBc7CUAuuwxySB8MjUFZVy1d9H3zUGg,3
479717,1HXqzCUAhMeYMQifZpJeU4UY44qV32hWAJ,2


In [31]:
# Transactions in which the addresses in `wallets` appear as inputs.
addr_to_tx[addr_to_tx["input_address"].isin(wallets)]

Unnamed: 0,input_address,txId
64154,17sp1vCt7gfFCwJ5UecaCMnn56F1X1JbfW,224254822
64155,1HXqzCUAhMeYMQifZpJeU4UY44qV32hWAJ,224254822
64156,13c5bD3yay8QaJGbHrW8WjGYg8ekCCxz5q,224254822
64157,1GMY2dzKWBbLTEyHCGJiNxMNDT3MK9zpbF,224254822
64158,1GKDpoVnmzRE4JrsC3hHN6QudPmP9RE7WR,224254822
64159,1LFgxURGeNLQ4U6yWx5RMKwax8frbgGzzm,224254822
64160,1DKgBtq3qJZiBFaDwi8xu58mN4e5RB4L9k,224254822
64161,1HDMBc7CUAuuwxySB8MjUFZVy1d9H3zUGg,224254822
64162,1AFHGRgnJY7HBeFSno6X6bRkiZU4VLkeR3,224254822
64163,1DnmZZfyzLKzxLXwG2eLSrhz9uZiA3ALNx,224254822


In [35]:
# Transactions that use this single address as an input.
addr_to_tx[addr_to_tx["input_address"].isin(["13c5bD3yay8QaJGbHrW8WjGYg8ekCCxz5q"])]

Unnamed: 0,input_address,txId
64156,13c5bD3yay8QaJGbHrW8WjGYg8ekCCxz5q,224254822


In [37]:
# Transactions that have this single address as an output.
tx_to_addr[tx_to_addr["output_address"].isin(["13c5bD3yay8QaJGbHrW8WjGYg8ekCCxz5q"])]

Unnamed: 0,txId,output_address
101458,223909678,13c5bD3yay8QaJGbHrW8WjGYg8ekCCxz5q


In [42]:
# Give every user the smallest class value found among its addresses.
# NOTE(review): presumably 1 = illicit, 2 = licit, 3 = unknown, so the minimum
# prefers illicit over licit over unknown — confirm against the dataset docs.
user_to_min_class = (
    users_with_classes
    .groupby("userId")["class"]
    .min()
    .reset_index()
)
user_to_min_class.to_csv("../dataset/custom/UserClasses.csv", index=False)
user_to_min_class["class"].value_counts()

class
3    408435
2    155083
1      5995
Name: count, dtype: int64

In [44]:
# Class distribution at the address level, for comparison with the per-user one.
addr_to_class["class"].value_counts()

class
3    557588
2    251088
1     14266
Name: count, dtype: int64

# Соберем полную таблицу фичей пользователей

In [51]:
# Number of distinct addresses per user.
user_to_addr_cnt = (
    pd.read_csv("../dataset/custom/AddrUser.csv")
    .groupby("userId")["addrId"]
    .nunique()
    .reset_index()
    .rename(columns={"addrId": "addr_cnt"})
)
user_to_addr_cnt.head()

Unnamed: 0,userId,addr_cnt
0,1,10
1,2,1
2,3,1
3,4,1
4,5,1


In [50]:
# Number of distinct transactions per user in UserTx.csv (the user's
# addresses are inputs of these transactions).
user_to_outcoming_tx_cnt = (
    pd.read_csv("../dataset/custom/UserTx.csv")
    .groupby("userId")["txId"]
    .nunique()
    .reset_index()
    .rename(columns={"txId": "outcoming_tx_cnt"})
)
user_to_outcoming_tx_cnt.head()

Unnamed: 0,userId,outcoming_tx_cnt
0,1,2
1,2,1
2,9,3
3,10,1
4,13,1


In [52]:
# Number of distinct transactions per user in TxUser.csv (the user's
# addresses are outputs of these transactions).
user_to_incoming_tx_cnt = (
    pd.read_csv("../dataset/custom/TxUser.csv")
    .groupby("userId")["txId"]
    .nunique()
    .reset_index()
    .rename(columns={"txId": "incoming_tx_cnt"})
)
user_to_incoming_tx_cnt.head()

Unnamed: 0,userId,incoming_tx_cnt
0,1,12
1,2,3
2,3,1
3,4,1
4,5,1


In [56]:
# For every receiving user: number of distinct users on the input side of its
# user -> user edges.
user_to_incoming_users_cnt = (
    pd.read_csv("../dataset/custom/UserUser.csv")
    .groupby("outputUserId")["inputUserId"]
    .nunique()
    .reset_index()
    .rename(columns={"outputUserId": "userId", "inputUserId": "input_users_cnt"})
)
user_to_incoming_users_cnt.head()

Unnamed: 0,userId,input_users_cnt
0,1,3
1,2,2
2,3,1
3,4,1
4,5,1


In [57]:
# For every sending user: number of distinct users on the output side of its
# user -> user edges.
user_to_outcoming_users_cnt = (
    pd.read_csv("../dataset/custom/UserUser.csv")
    .groupby("inputUserId")["outputUserId"]
    .nunique()
    .reset_index()
    .rename(columns={"inputUserId": "userId", "outputUserId": "output_users_cnt"})
)
user_to_outcoming_users_cnt.head()

Unnamed: 0,userId,output_users_cnt
0,1,3
1,2,2
2,9,6
3,10,2
4,13,8


In [68]:
def merge_dfs(l, r, field_to_coalesce=None):
    """Left-join ``r`` onto ``l`` by ``userId``, optionally zero-filling a column.

    Args:
        l: Left frame; all of its rows are kept.
        r: Right frame providing the extra columns.
        field_to_coalesce: Column label whose missing values (rows of ``l``
            without a match in ``r``) are replaced with 0. When falsy, nothing
            is filled.

    Returns:
        The merged frame; ``l`` and ``r`` are not modified.
    """
    merged = l.merge(r, on="userId", how="left")
    if not field_to_coalesce:
        return merged
    merged[field_to_coalesce] = merged[field_to_coalesce].fillna(0)
    return merged

# Assemble the compact per-user table: start from the address counts, attach
# each count feature (0 where a user has no rows for it), then the user class.
user_data = (
    user_to_addr_cnt
    .pipe(merge_dfs, user_to_outcoming_tx_cnt, field_to_coalesce="outcoming_tx_cnt")
    .pipe(merge_dfs, user_to_incoming_tx_cnt, field_to_coalesce="incoming_tx_cnt")
    .pipe(merge_dfs, user_to_incoming_users_cnt, field_to_coalesce="input_users_cnt")
    .pipe(merge_dfs, user_to_outcoming_users_cnt, field_to_coalesce="output_users_cnt")
    .pipe(merge_dfs, pd.read_csv("../dataset/custom/UserClasses.csv"))
)

user_data.to_csv("../dataset/custom/UserData_Small.csv", index=False)

In [None]:
# Load the per-wallet feature table and tag every row with the user that owns
# the address.
wallets_features = (
    pd.read_csv("../dataset/Elliptic++ Dataset/wallets_features.csv")
    .merge(addr_to_user, left_on="address", right_on="addrId", how="left")
)
wallets_features

In [70]:
# Roll the wallet-level features up to one row per user.
# NOTE(review): if wallets_features.csv holds several rows per address (e.g.
# one per time step), the sums below count an address more than once — confirm.
wallets_features_agg = (
    wallets_features
    .groupby("userId")
    .agg(
        active_time_steps_cnt=("Time step", "nunique"),
        btc_transacted_total=("btc_transacted_total", "sum"),
        btc_sent_total=("btc_sent_total", "sum"),
        btc_received_total=("btc_received_total", "sum"),
        btc_sent_median=("btc_sent_median", "median"),
        btc_received_median=("btc_received_median", "median"),
    )
    .reset_index()
)
wallets_features_agg.head()

Unnamed: 0,userId,active_time_steps_cnt,btc_transacted_total,btc_sent_total,btc_received_total,btc_sent_median,btc_received_median
0,1,2,344.591752,210.131752,134.46,0.765,1.275
1,2,2,0.468014,0.143051,0.324963,0.0,0.022739
2,3,1,0.045772,0.0,0.045772,0.0,0.045772
3,4,1,0.01048,0.0,0.01048,0.0,0.01048
4,5,1,0.00163,0.0,0.00163,0.0,0.00163


In [75]:
# Distinct output addresses of the transactions listed for each user in
# UserTx.csv.
user_to_outcoming_addr_cnt = (
    pd.read_csv("../dataset/custom/UserTx.csv")
    .merge(
        pd.read_csv("../dataset/Elliptic++ Dataset/TxAddr_edgelist.csv"),
        how="left",
        on="txId",
    )
    .groupby("userId")["output_address"]
    .nunique()
    .reset_index()
    .rename(columns={"output_address": "interracted_output_address_cnt"})
)

# Distinct input addresses of the transactions listed for each user in
# TxUser.csv.
user_to_incoming_addr_cnt = (
    pd.read_csv("../dataset/custom/TxUser.csv")
    .merge(
        pd.read_csv("../dataset/Elliptic++ Dataset/AddrTx_edgelist.csv"),
        how="left",
        on="txId",
    )
    .groupby("userId")["input_address"]
    .nunique()
    .reset_index()
    .rename(columns={"input_address": "interracted_input_address_cnt"})
)


user_to_incoming_addr_cnt

Unnamed: 0,userId,interracted_input_address_cnt
0,1,21
1,2,60
2,3,1
3,4,2
4,5,1
...,...,...
531662,569509,1
531663,569510,1
531664,569511,1
531665,569512,97


In [78]:
# Extend the compact user table with the aggregated wallet features and the
# two interacted-address counts (0 where a user has none), then save it.
user_data_populated = (
    user_data
    .pipe(merge_dfs, wallets_features_agg)
    .pipe(merge_dfs, user_to_outcoming_addr_cnt, "interracted_output_address_cnt")
    .pipe(merge_dfs, user_to_incoming_addr_cnt, "interracted_input_address_cnt")
)

user_data_populated.to_csv("../dataset/custom/UserData.csv", index=False)

**Add fees info**

In [None]:
user_to_tx = pd.read_csv("../dataset/custom/UserTx.csv")

# Per-transaction fee statistics (sum / mean / min of `fees` for each txId).
tx_to_fee = (
    pd.read_csv("../dataset/Elliptic++ Dataset/txs_features.csv")[["txId", "fees"]]
    .groupby("txId")["fees"]
    .agg(fees_total="sum", fees_mean="mean", fees_min="min")
    .reset_index()
)

# Per-user fee statistics over the transactions listed in UserTx.csv:
# total of the totals, mean of the per-transaction means, smallest minimum.
user_to_fees = (
    user_to_tx
    .merge(tx_to_fee, how="left", on="txId")
    .groupby("userId")
    .agg(
        fees_total=("fees_total", "sum"),
        fees_mean=("fees_mean", "mean"),
        fees_min=("fees_min", "min"),
    )
    .reset_index()
)
user_to_fees


Unnamed: 0,userId,fees_total,fees_mean,fees_min
0,1,0.002039,0.001020,0.000776
1,2,0.000452,0.000452,0.000452
2,9,0.035568,0.011856,0.000204
3,10,0.000777,0.000777,0.000777
4,13,0.003040,0.003040,0.003040
...,...,...,...,...
146778,569495,0.000100,0.000100,0.000100
146779,569496,0.000100,0.000100,0.000100
146780,569500,0.000136,0.000136,0.000136
146781,569501,0.000816,0.000816,0.000816


In [92]:
# Attach the fee statistics; users without fee rows get 0 in all three fee
# columns. The result is written to the same UserData.csv path as the table
# without fees, replacing it.
user_data_populated_with_fees = (
    user_data_populated
    .pipe(merge_dfs, user_to_fees)
    .fillna({"fees_total": 0, "fees_mean": 0, "fees_min": 0})
)
user_data_populated_with_fees.to_csv("../dataset/custom/UserData.csv", index=False)