# Кластеризация на пользователей

In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

sns.set(style="darkgrid")

np.random.seed(0)

Unnamed: 0,addrId,txId,Time step
0,14YRXHHof4BY1TVxN5FqYPcEdpmXiYT78a,230325127,1
1,13Lhad3SAmu2vqYg2dxbNcxH7LE77kJu2w,230325139,1
2,1MAQQZn7EHP6J3erXByCciFiVcgS8ZhWqz,86875675,1
3,16zs5SVSyADh5WrLNbZbpRLsBsN5uEzgeK,230325147,1
4,1QJpwtUorBKPGUJkSyrRcBKTAHq4CXrdYh,230325154,1
...,...,...,...
477112,1HdnGvuc21Y4QfBEHUc3NFRJhGywdSFUb,157659046,49
477113,3MfN5to5K5be2RupWE8rjJHQ6V9L8ypWeh,157659306,49
477114,3DzbpEogZ1mn9FgCHcmzYPLDbV9GuxYHpi,157668825,49
477115,34yD1sQg6C16aANCtibYXRj5NsX6tt4v5R,125788182,49


((323503, 3), 263869, 400212)

((146321, 3), 136343, 400212)

In [15]:
146321/323503

0.4523018333678513

Сделаем маппинг tx <-> список входящих кошельков

In [4]:
from collections import defaultdict

addr_tx_df = pd.read_csv("../dataset/Elliptic++ Dataset/AddrTx_edgelist.csv")

tx_inputs = defaultdict(lambda : [])
addr_list = set()
for index, row in addr_tx_df.iterrows():
    tx_inputs[row["txId"]].append(row["input_address"])
    addr_list.add(row["input_address"])
tx_addr_df = pd.read_csv("../dataset/Elliptic++ Dataset/TxAddr_edgelist.csv")
for index, row in tx_addr_df.iterrows():
    addr_list.add(row["output_address"])

addr_list = list(addr_list)


Сделаем граф зависимостей между адресами, запустим DFS и разобьем на обычные КС

In [5]:
addr_to_neighbors = defaultdict(lambda : [])
for tx, addrs in tx_inputs.items():
    if len(addrs) <= 1:
        continue
    for i in range(1, len(addrs)):
        addr_to_neighbors[addrs[i - 1]].append(addrs[i])
        addr_to_neighbors[addrs[i]].append(addrs[i - 1])

In [6]:
from copy import deepcopy

used = defaultdict(lambda : False)
addr_to_component = defaultdict(lambda : 0)

def dfs(node, comp_id):
    used[node] = True
    addr_to_component[node] = comp_id

    nodes_to_visit = deepcopy(addr_to_neighbors[node])

    while len(nodes_to_visit):
        nei = nodes_to_visit.pop()
        used[nei] = True
        addr_to_component[nei] = comp_id

        for nei1 in addr_to_neighbors[nei]:
            if used[nei1] or nei1 in nodes_to_visit:
                continue
            nodes_to_visit.append(nei1)

current_comp = 1
for addr in addr_list:
    if not used[addr]:
        dfs(addr, current_comp)
        current_comp += 1

In [7]:
list(addr_to_component.items())[:1000]

[('3ERqtBuTkehxuKaMjRghMgkZbL6QTGsAgk', 1),
 ('1MvrNMPgAqc9M5uRedjGTHaKQ4yWFSSBb2', 2),
 ('1Nyga8W7K8AWtCECGYnJgTpEiikd8gxtZU', 3),
 ('127YEf8HFGb26bk7Dk8HAX9AywevCuYNjV', 4),
 ('15c8pksxYhRwKYQDUkYveTyGNvpJ6MNVAX', 4),
 ('16W7H1u5oajJVCPoKCm1boQHSrv6yANVZr', 4),
 ('1GhW8xuMt5nZ2LPptYkJBRpxLBvq6YLwUH', 4),
 ('14PJ3ZdE5GigS2jxn3FM3BiTBxrbJZoRko', 4),
 ('1JyqaPFquMib3te2iFmAxxSoXUpKr5cjAn', 4),
 ('1Eowd1eWu9mca2e4YFmjQQqA3HHpsn3geG', 4),
 ('1GLK8HoHmfCJe9cWEw8JvxFytfcMZA2zuL', 4),
 ('1CnxggyLkQjsxeKe2sKrPXv516acznY3Eh', 4),
 ('114Vpp1rW5VVqjDaHCokA3WBFGQZisQHLY', 4),
 ('18LSE19xt8mNbk3B5RXFQaYmE9Cbo8f4Xd', 4),
 ('168tQmk875m6S5XUdf1sgD1GU9L2SstGqM', 4),
 ('1LJ1rfY8E3aUVi4m7QpFN6kehF2cv77bA8', 4),
 ('14tbpvCUTXV9D44aahJU9fanmLP3roiDNN', 4),
 ('19dRrQMHpsgTni4gUAdTs3d8mTqoJUsWF3', 4),
 ('1AYbvgKBcJtj5FpHegVoH97qWbcB1xGEb5', 4),
 ('1G3HhfcymCrm8wFMSmR1wrBZZsjwYqHaWm', 4),
 ('1Mm7HCGYV1m4uvS2QycZkMEcx87iWZFxKW', 4),
 ('1K7PJQwSB4JQbxf7FL778f99r6aRzm7M4n', 4),
 ('19LM8Qvu5TUdJdyLvAGhBSzDiuSrk

In [8]:
addr_to_component["1PQBBWY435CTSnir4wrfD8HXZ4E92v9Xd6"], \
addr_to_component["1AueTrCc9oK7ovbtfYeVSJe4UQEaxcRMFy"]

(4, 4)

In [10]:
tx_with_users = pd.merge(
    left=addr_tx_df,
    right=addr_to_user_df,
    left_on="input_address",
    right_on="addrId",
    how="left"
)
tx_with_users.drop("input_address", axis=1, inplace=True)
tx_with_users.drop("addrId", axis=1, inplace=True)

tx_with_users = tx_with_users.drop_duplicates()
tx_with_users.to_csv("../dataset/custom/UserTx.csv", index=False)

In [11]:
tx_addr_df = pd.read_csv("../dataset/Elliptic++ Dataset/TxAddr_edgelist.csv")
tx_with_users = pd.merge(
    left=tx_addr_df,
    right=addr_to_user_df,
    left_on="output_address",
    right_on="addrId",
    how="left"
)
tx_with_users.drop("output_address", axis=1, inplace=True)
tx_with_users.drop("addrId", axis=1, inplace=True)
tx_with_users

# tx_with_users = tx_with_users.drop_duplicates()
tx_with_users.to_csv("../dataset/custom/TxUser.csv", index=False)

In [12]:
addr_addr_df = pd.read_csv("../dataset/Elliptic++ Dataset/AddrAddr_edgelist.csv")
addr_with_users = pd.merge(
    left=addr_addr_df,
    right=addr_to_user_df,
    left_on="input_address",
    right_on="addrId",
    how="left"
)
addr_with_users.drop("input_address", axis=1, inplace=True)
addr_with_users.drop("addrId", axis=1, inplace=True)
addr_with_users["inputUserId"] = addr_with_users["userId"]
addr_with_users.drop("userId", axis=1, inplace=True)

addr_with_users = pd.merge(
    left=addr_with_users,
    right=addr_to_user_df,
    left_on="output_address",
    right_on="addrId",
    how="left"
)
addr_with_users.drop("output_address", axis=1, inplace=True)
addr_with_users.drop("addrId", axis=1, inplace=True)
addr_with_users["outputUserId"] = addr_with_users["userId"]
addr_with_users.drop("userId", axis=1, inplace=True)

addr_with_users

addr_with_users = addr_with_users.drop_duplicates()
addr_with_users.to_csv("../dataset/custom/UserUser.csv", index=False)

# Проверка классов легальных-нелегальных по пользователям

In [13]:
addr_to_user = pd.read_csv("../dataset/custom/AddrUser.csv")
addr_to_class = pd.read_csv("../dataset/Elliptic++ Dataset/wallets_classes.csv")
addr_to_tx = pd.read_csv("../dataset/Elliptic++ Dataset/AddrTx_edgelist.csv")
tx_to_addr = pd.read_csv("../dataset/Elliptic++ Dataset/TxAddr_edgelist.csv")

addr_to_class.head()

Unnamed: 0,address,class
0,111112TykSw72ztDN2WJger4cynzWYC5w,2
1,1111DAYXhoxZx2tsRnzimfozo783x1yC2,3
2,1111VHuXEzHaRCgXbVwojtaP7Co3QABb,2
3,111218KKkh1JJFRHbwM16AwCiVCc4m7he1,3
4,1115LWW3xsD9jT9VRY7viCN9S34RVAAuA,2


In [14]:
users_with_classes = pd.merge(
    left=addr_to_user,
    right=addr_to_class,
    left_on="addrId",
    right_on="address",
    how="left"
)
users_with_classes.drop("address", axis=1, inplace=True)
users_with_classes.drop("addrId", axis=1, inplace=True)

users_with_classes

Unnamed: 0,userId,class
0,1,3
1,2,3
2,3,3
3,4,2
4,4,3
...,...,...
822937,569509,3
822938,569510,3
822939,569511,2
822940,569512,3


In [15]:
user_id_to_classes_cnt = users_with_classes.groupby("userId")["class"].nunique().reset_index()
user_id_to_classes_cnt[user_id_to_classes_cnt["class"] == 3]

Unnamed: 0,userId,class
3,4,3
31,32,3
35,36,3
110,111,3
127,128,3
...,...,...
93412,93413,3
183347,183348,3
205323,205324,3
227113,227114,3


In [16]:
wallets = addr_to_user[addr_to_user["userId"] == 518392]["addrId"]
wallets

771673    17Lr3WNwJPJN5es7pP5RnftNGskcPP24L8
Name: addrId, dtype: object

In [17]:
addr_to_class[addr_to_class["address"].isin(wallets)]

Unnamed: 0,address,class
166532,17Lr3WNwJPJN5es7pP5RnftNGskcPP24L8,3


In [18]:
addr_to_tx[addr_to_tx["input_address"].isin(wallets)]

Unnamed: 0,input_address,txId
243417,17Lr3WNwJPJN5es7pP5RnftNGskcPP24L8,5559864


In [19]:
wallets = addr_to_user[addr_to_user["userId"] == 48539]["addrId"]
addr_to_class[addr_to_class["address"].isin(wallets)]


Unnamed: 0,address,class
218700,192WpD5RLTWARzWT9wuMoeQ5JABHXWnwZw,2


In [20]:
addr_to_tx[addr_to_tx["input_address"].isin(wallets)]

Unnamed: 0,input_address,txId


In [21]:
addr_to_tx[addr_to_tx["input_address"].isin(["13c5bD3yay8QaJGbHrW8WjGYg8ekCCxz5q"])]

Unnamed: 0,input_address,txId
64156,13c5bD3yay8QaJGbHrW8WjGYg8ekCCxz5q,224254822


In [22]:
tx_to_addr[tx_to_addr["output_address"].isin(["13c5bD3yay8QaJGbHrW8WjGYg8ekCCxz5q"])]

Unnamed: 0,txId,output_address
101458,223909678,13c5bD3yay8QaJGbHrW8WjGYg8ekCCxz5q


In [23]:
user_to_min_class = users_with_classes.groupby("userId")["class"].min().reset_index()
user_to_min_class.to_csv("../dataset/custom/UserClasses.csv", index=False)
user_to_min_class["class"].value_counts()

class
3    408435
2    155083
1      5995
Name: count, dtype: int64

In [24]:
addr_to_class["class"].value_counts()

class
3    557588
2    251088
1     14266
Name: count, dtype: int64

# Соберем полную таблицу фичей пользователей

In [25]:
user_to_addr_cnt = pd.read_csv("../dataset/custom/AddrUser.csv") \
    .groupby("userId")["addrId"] \
    .nunique() \
    .reset_index() \
    .rename({"addrId": "addr_cnt"}, axis=1)
user_to_addr_cnt.head()

Unnamed: 0,userId,addr_cnt
0,1,1
1,2,1
2,3,1
3,4,5656
4,5,1


In [26]:
user_to_outcoming_tx_cnt = pd.read_csv("../dataset/custom/UserTx.csv") \
    .groupby("userId")["txId"] \
    .nunique() \
    .reset_index() \
    .rename({"txId": "outcoming_tx_cnt"}, axis=1)
user_to_outcoming_tx_cnt.head()

Unnamed: 0,userId,outcoming_tx_cnt
0,2,1
1,4,167
2,11,1
3,12,1
4,13,1


In [27]:
user_to_incoming_tx_cnt = pd.read_csv("../dataset/custom/TxUser.csv") \
    .groupby("userId")["txId"] \
    .nunique() \
    .reset_index() \
    .rename({"txId": "incoming_tx_cnt"}, axis=1)
user_to_incoming_tx_cnt.head()

Unnamed: 0,userId,incoming_tx_cnt
0,1,1
1,2,5
2,3,1
3,4,4110
4,5,1


In [28]:
user_to_incoming_users_cnt = pd.read_csv("../dataset/custom/UserUser.csv") \
    .groupby("outputUserId")["inputUserId"] \
    .nunique() \
    .reset_index() \
    .rename({"outputUserId": "userId", "inputUserId": "input_users_cnt"}, axis=1)
user_to_incoming_users_cnt.head()

Unnamed: 0,userId,input_users_cnt
0,1,1
1,2,5
2,3,1
3,4,3546
4,5,1


In [29]:
user_to_outcoming_users_cnt = pd.read_csv("../dataset/custom/UserUser.csv") \
    .groupby("inputUserId")["outputUserId"] \
    .nunique() \
    .reset_index() \
    .rename({"inputUserId": "userId", "outputUserId": "output_users_cnt"}, axis=1)
user_to_outcoming_users_cnt.head()

Unnamed: 0,userId,output_users_cnt
0,2,1
1,4,215
2,11,2
3,12,1
4,13,2


In [30]:
def merge_dfs(l, r, field_to_coalesce=None):
    res = pd.merge(
        left=l,
        right=r,
        on="userId",
        how="left",
    )
    if field_to_coalesce:
        res[field_to_coalesce] = res[field_to_coalesce].fillna(0)
    return res

user_data = merge_dfs(user_to_addr_cnt, user_to_outcoming_tx_cnt, field_to_coalesce="outcoming_tx_cnt")
user_data = merge_dfs(user_data, user_to_incoming_tx_cnt, field_to_coalesce="incoming_tx_cnt")
user_data = merge_dfs(user_data, user_to_incoming_users_cnt, field_to_coalesce="input_users_cnt")
user_data = merge_dfs(user_data, user_to_outcoming_users_cnt, field_to_coalesce="output_users_cnt")
user_data = merge_dfs(user_data, pd.read_csv("../dataset/custom/UserClasses.csv"))



user_data.to_csv("../dataset/custom/UserData_Small.csv", index=False)

In [31]:
wallets_features = pd.read_csv("../dataset/Elliptic++ Dataset/wallets_features.csv")
wallets_features = pd.merge(
    left=wallets_features,
    right=addr_to_user,
    left_on="address",
    right_on="addrId",
    how="left",
)
wallets_features

Unnamed: 0,address,Time step,num_txs_as_sender,num_txs_as receiver,first_block_appeared_in,last_block_appeared_in,lifetime_in_blocks,total_txs,first_sent_block,first_received_block,...,blocks_btwn_output_txs_mean,blocks_btwn_output_txs_median,num_addr_transacted_multiple,transacted_w_address_total,transacted_w_address_min,transacted_w_address_max,transacted_w_address_mean,transacted_w_address_median,addrId,userId
0,111112TykSw72ztDN2WJger4cynzWYC5w,25,0.0,1.0,439586.0,439586.0,0.0,1.0,0.0,439586.0,...,0.000000,0.0,0.0,24.0,1.0,1.0,1.0,1.0,111112TykSw72ztDN2WJger4cynzWYC5w,244669
1,1111DAYXhoxZx2tsRnzimfozo783x1yC2,25,0.0,8.0,439589.0,485959.0,46370.0,8.0,0.0,439589.0,...,6624.285714,8060.0,0.0,8.0,1.0,1.0,1.0,1.0,1111DAYXhoxZx2tsRnzimfozo783x1yC2,8814
2,1111DAYXhoxZx2tsRnzimfozo783x1yC2,29,0.0,8.0,439589.0,485959.0,46370.0,8.0,0.0,439589.0,...,6624.285714,8060.0,0.0,8.0,1.0,1.0,1.0,1.0,1111DAYXhoxZx2tsRnzimfozo783x1yC2,8814
3,1111DAYXhoxZx2tsRnzimfozo783x1yC2,39,0.0,8.0,439589.0,485959.0,46370.0,8.0,0.0,439589.0,...,6624.285714,8060.0,0.0,8.0,1.0,1.0,1.0,1.0,1111DAYXhoxZx2tsRnzimfozo783x1yC2,8814
4,1111DAYXhoxZx2tsRnzimfozo783x1yC2,39,0.0,8.0,439589.0,485959.0,46370.0,8.0,0.0,439589.0,...,6624.285714,8060.0,0.0,8.0,1.0,1.0,1.0,1.0,1111DAYXhoxZx2tsRnzimfozo783x1yC2,8814
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1268255,3R2Uw5MRdSSigp8AjfT7K5es6Hupm4qLSq,31,1.0,1.0,451684.0,451685.0,1.0,2.0,451685.0,451684.0,...,0.000000,0.0,0.0,3.0,1.0,1.0,1.0,1.0,3R2Uw5MRdSSigp8AjfT7K5es6Hupm4qLSq,28030
1268256,3R2VBFbqHGC4bQ7b4ixN4jZTdv7RMbEYtf,44,0.0,1.0,477895.0,477895.0,0.0,1.0,0.0,477895.0,...,0.000000,0.0,0.0,1.0,1.0,1.0,1.0,1.0,3R2VBFbqHGC4bQ7b4ixN4jZTdv7RMbEYtf,191828
1268257,3R2WFmRwbDeo3rMVVu5J3jjMxAuQYYWAid,9,0.0,1.0,407342.0,407342.0,0.0,1.0,0.0,407342.0,...,0.000000,0.0,0.0,1.0,1.0,1.0,1.0,1.0,3R2WFmRwbDeo3rMVVu5J3jjMxAuQYYWAid,566274
1268258,3R2WTZGYLmbJQyoDSBftJsPRvF1mSEtkh6,3,0.0,1.0,395235.0,395235.0,0.0,1.0,0.0,395235.0,...,0.000000,0.0,0.0,1.0,1.0,1.0,1.0,1.0,3R2WTZGYLmbJQyoDSBftJsPRvF1mSEtkh6,222919


In [32]:
wallets_features_agg = wallets_features \
    .groupby("userId") \
    .agg({
        "Time step": "nunique",
        "btc_transacted_total": "sum",
        "btc_sent_total": "sum",
        "btc_received_total": "sum",
        "btc_sent_median": "median",
        "btc_received_median": "median",
    }) \
    .reset_index() \
    .rename(columns={
        "Time step": "active_time_steps_cnt",
        "txId": "incoming_tx_cnt",
    })
wallets_features_agg.head()

Unnamed: 0,userId,active_time_steps_cnt,btc_transacted_total,btc_sent_total,btc_received_total,btc_sent_median,btc_received_median
0,1,1,0.004675,0.0,0.004675,0.0,0.004675
1,2,1,0.06,0.03,0.03,0.0,0.0008
2,3,1,0.0049,0.0,0.0049,0.0,0.0049
3,4,26,23970.064106,12838.606215,11131.457891,0.03835,0.015823
4,5,1,0.007039,0.0,0.007039,0.0,0.007039


In [33]:
user_to_outcoming_addr_cnt = pd.read_csv("../dataset/custom/UserTx.csv") \
    .merge(
        pd.read_csv("../dataset/Elliptic++ Dataset/TxAddr_edgelist.csv"),
        how="left",
        left_on="txId",
        right_on="txId",
    ) \
    .groupby("userId")["output_address"] \
    .nunique() \
    .reset_index() \
    .rename({"userId": "userId", "output_address": "interracted_output_address_cnt"}, axis=1)

user_to_incoming_addr_cnt = pd.read_csv("../dataset/custom/TxUser.csv") \
    .merge(
        pd.read_csv("../dataset/Elliptic++ Dataset/AddrTx_edgelist.csv"),
        how="left",
        left_on="txId",
        right_on="txId",
    ) \
    .groupby("userId")["input_address"] \
    .nunique() \
    .reset_index() \
    .rename({"userId": "userId", "input_address": "interracted_input_address_cnt"}, axis=1)


user_to_incoming_addr_cnt

Unnamed: 0,userId,interracted_input_address_cnt
0,1,1
1,2,5
2,3,1
3,4,11097
4,5,1
...,...,...
531662,569509,8
531663,569510,1
531664,569511,2
531665,569512,2


In [34]:
user_data_populated = merge_dfs(user_data, wallets_features_agg)
user_data_populated = merge_dfs(user_data_populated, user_to_outcoming_addr_cnt, "interracted_output_address_cnt")
user_data_populated = merge_dfs(user_data_populated, user_to_incoming_addr_cnt, "interracted_input_address_cnt")

user_data_populated.to_csv("../dataset/custom/UserData.csv", index=False)

**Add fees info**

In [35]:
tx_to_fee = pd.read_csv("../dataset/Elliptic++ Dataset/txs_features.csv")[["txId", "fees"]]

user_to_tx = pd.read_csv("../dataset/custom/UserTx.csv")

tx_to_fee = tx_to_fee.groupby('txId')['fees'].agg(['sum', 'mean', "min"]).reset_index() \
    .rename(columns={"sum": "fees_total", "mean": "fees_mean", "min": "fees_min"})

user_to_fees = user_to_tx \
    .merge(
        tx_to_fee,
        how="left",
        left_on="txId",
        right_on="txId",
    ) \
    .groupby("userId") \
    .agg({
        "fees_total": "sum",
        "fees_mean": "mean",
        "fees_min": "min",
    }) \
    .reset_index()
user_to_fees


Unnamed: 0,userId,fees_total,fees_mean,fees_min
0,2,0.000200,0.000200,0.000200
1,4,1.838772,0.011011,0.000182
2,11,0.000100,0.000100,0.000100
3,12,0.002932,0.002932,0.002932
4,13,0.000147,0.000147,0.000147
...,...,...,...,...
146778,569484,0.000147,0.000147,0.000147
146779,569488,0.001095,0.001095,0.001095
146780,569489,0.000200,0.000200,0.000200
146781,569495,0.000136,0.000136,0.000136


In [36]:
user_data_populated_with_fees = merge_dfs(user_data_populated, user_to_fees)
user_data_populated_with_fees["fees_total"] = user_data_populated_with_fees["fees_total"].fillna(0)
user_data_populated_with_fees["fees_mean"] = user_data_populated_with_fees["fees_mean"].fillna(0)
user_data_populated_with_fees["fees_min"] = user_data_populated_with_fees["fees_min"].fillna(0)
user_data_populated_with_fees.to_csv("../dataset/custom/UserData.csv", index=False)

In [65]:
def gini_coefficient(values):
    sorted_values = sorted(values)
    n = len(values)
    cumsum = sum([(i+1) * val for i, val in enumerate(sorted_values)])
    return (2 * cumsum) / (n * sum(values)) - (n + 1) / n

def safe_gini_coefficient(values):
    if len(values) < 2:
        return 0
    if sum(values) == 0:
        return np.nan
    try:
        return gini_coefficient(values)
    except:
        return np.nan
user_to_ts_gini = wallets_features.groupby('userId')['Time step'].apply(safe_gini_coefficient)
user_to_ts_gini = user_to_ts_gini.reset_index().rename(columns={"Time step": "overall_activity_coef"})

In [66]:
user_to_ts_gini.value_counts()

userId  overall_activity_coef
1       0.000000                 1
379678  0.000000                 1
379672  0.000000                 1
379673  0.009434                 1
379674  0.023810                 1
                                ..
189834  0.000000                 1
189833  0.000000                 1
189832  0.000000                 1
189831  0.000000                 1
569513  0.000000                 1
Name: count, Length: 569513, dtype: int64

In [67]:
user_data_populated_with_gini = merge_dfs(user_data_populated, user_to_ts_gini)


In [86]:
# wallets_features = pd.read_csv("../dataset/Elliptic++ Dataset/wallets_features.csv")
tx_to_fee = pd.read_csv("../dataset/Elliptic++ Dataset/txs_features.csv")[["txId", "fees", "Time step"]]
ts_to_fees = tx_to_fee.groupby('Time step')['fees'].agg(['median', 'mean']).reset_index() \
    .rename(columns={"median": "fees_in_ts_median", "mean": "fees_in_ts_mean"})

tx_to_fee_sum = tx_to_fee.groupby(['txId', "Time step"])['fees'].agg(['sum']).reset_index() \
    .rename(columns={"sum": "tx_fees_sum"})
tx_to_fee_sum
user_to_fees_by_ts = user_to_tx \
    .merge(
        tx_to_fee_sum,
        how="left",
        left_on="txId",
        right_on="txId",
    ) \
    .groupby(["userId", "Time step"])["tx_fees_sum"] \
    .agg(['mean', "median"]).reset_index() \
    .rename(columns={"mean": "user_tx_fees_mean", "median": "user_tx_sum_median"})
user_to_fees_by_ts = user_to_fees_by_ts \
    .merge(
        ts_to_fees,
        how="left",
        left_on="Time step",
        right_on="Time step",
    )
user_to_fees_by_ts["user_mean_fees_to_ts_fees_share"] = user_to_fees_by_ts["user_tx_fees_mean"] / user_to_fees_by_ts["fees_in_ts_mean"]
user_to_fees_by_ts

Unnamed: 0,userId,Time step,user_tx_fees_mean,user_tx_sum_median,fees_in_ts_median,fees_in_ts_mean,user_mean_fees_to_ts_fees_share
0,2,4,0.000200,0.000200,0.000100,0.000205,0.977704
1,4,23,0.001932,0.001923,0.000200,0.000376,5.140958
2,4,24,0.001315,0.001215,0.000204,0.000424,3.099915
3,4,25,0.002931,0.002801,0.000175,0.000480,6.105331
4,4,26,0.007804,0.006590,0.000237,0.000613,12.729608
...,...,...,...,...,...,...,...
151059,569484,32,0.000147,0.000147,0.000358,0.000564,0.260580
151060,569488,40,0.001095,0.001095,0.000887,0.002286,0.478945
151061,569489,11,0.000200,0.000200,0.000100,0.000228,0.877566
151062,569495,8,0.000136,0.000136,0.000200,0.000464,0.292087


In [88]:
user_to_fees_info = user_to_fees_by_ts \
    .groupby(["userId"])["user_mean_fees_to_ts_fees_share"] \
    .agg(['mean', "min", "max"]).reset_index() \
    .rename(columns={
        "mean": "user_ts_fees_share_mean",
        "min": "user_ts_fees_share_min",
        "max": "user_ts_fees_share_max",
    })
user_to_fees_info

Unnamed: 0,userId,user_ts_fees_share_mean,user_ts_fees_share_min,user_ts_fees_share_max
0,2,0.977704,0.977704,0.977704
1,4,9.835605,3.099915,19.605217
2,11,0.552181,0.552181,0.552181
3,12,16.190508,16.190508,16.190508
4,13,0.310192,0.310192,0.310192
...,...,...,...,...
146778,569484,0.260580,0.260580,0.260580
146779,569488,0.478945,0.478945,0.478945
146780,569489,0.877566,0.877566,0.877566
146781,569495,0.292087,0.292087,0.292087


In [None]:
user_data_populated_with_gini_and_ts_fees = merge_dfs(user_data_populated_with_gini, user_to_fees_info)
# user_data_populated_with_gini_and_ts_fees.head(15)
user_data_populated_with_gini_and_ts_fees["user_ts_fees_share_mean"] = user_data_populated_with_gini_and_ts_fees["user_ts_fees_share_mean"].fillna(0)
user_data_populated_with_gini_and_ts_fees["user_ts_fees_share_min"] = user_data_populated_with_gini_and_ts_fees["user_ts_fees_share_min"].fillna(0)
user_data_populated_with_gini_and_ts_fees["user_ts_fees_share_max"] = user_data_populated_with_gini_and_ts_fees["user_ts_fees_share_max"].fillna(0)

In [109]:
user_data_populated_with_gini_and_ts_fees.to_csv("../dataset/custom/UserData.csv", index=False)

In [111]:
user_to_whole_fee_cnt = user_to_tx \
    .merge(
        tx_to_fee_sum,
        how="left",
        left_on="txId",
        right_on="txId",
    ) \
    .drop(["txId", "Time step"], axis=1)
user_to_whole_fee_cnt["whole_fee_4"] = (
    np.floor(user_to_whole_fee_cnt["tx_fees_sum"] * (10**4)) == 
    user_to_whole_fee_cnt["tx_fees_sum"] * (10**4)
).astype(int)
user_to_whole_fee_cnt["whole_fee_5"] = (
    np.floor(user_to_whole_fee_cnt["tx_fees_sum"] * (10**4)) == 
    user_to_whole_fee_cnt["tx_fees_sum"] * (10**4)
).astype(int)
user_to_whole_fee_cnt["whole_fee_6"] = (
    np.floor(user_to_whole_fee_cnt["tx_fees_sum"] * (10**4)) == 
    user_to_whole_fee_cnt["tx_fees_sum"] * (10**4)
).astype(int)
user_to_whole_fee_cnt = user_to_whole_fee_cnt.groupby("userId") \
    .agg({
        "whole_fee_4": "sum",
        "whole_fee_5": "sum",
        "whole_fee_6": "sum",
    }) \
    .reset_index()
user_data_with_whole_fees = merge_dfs(user_data_populated_with_gini_and_ts_fees, user_to_whole_fee_cnt)
user_data_with_whole_fees["whole_fee_4"] = user_data_with_whole_fees["whole_fee_4"].fillna(0)
user_data_with_whole_fees["whole_fee_5"] = user_data_with_whole_fees["whole_fee_5"].fillna(0)
user_data_with_whole_fees["whole_fee_6"] = user_data_with_whole_fees["whole_fee_6"].fillna(0)

user_data_with_whole_fees.to_csv("../dataset/custom/UserData.csv", index=False)