In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Apply seaborn's dark-grid theme to every figure drawn in this notebook.
sns.set(style="darkgrid")

# Seed NumPy's legacy global random generator.
np.random.seed(0)
import warnings
# Both calls install a blanket "ignore" filter, so every warning category is
# silenced for the rest of the session (including pandas deprecation notices).
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [9]:
# Last time step (inclusive) assigned to the training period; every later
# time step belongs to the test period.
CUTOFF = 40

addr_user_df = pd.read_csv("../dataset/custom/AddrUser.csv")

# User <-> transaction edge lists. Later cells aggregate `addr_tx_df` into the
# sending-side features and `tx_addr_df` into the receiving-side features.
addr_tx_df = pd.read_csv("../dataset/custom/UserTx.csv")
tx_addr_df = pd.read_csv("../dataset/custom/TxUser.csv")

# Only the transaction id and its time step are needed for the split, so let
# the parser skip every other feature column instead of loading and discarding
# them. The trailing selection pins the column order.
tx_info = pd.read_csv(
    "../dataset/Elliptic++ Dataset/txs_features.csv",
    usecols=["txId", "Time step"],
)[["txId", "Time step"]]

# Attach the time step of each transaction to both edge lists.
addr_to_ts = addr_tx_df.merge(tx_info, how="left", on="txId")
addr_to_ts_inp = tx_addr_df.merge(tx_info, how="left", on="txId")

# All (user, transaction) pairs regardless of direction.
addr_to_ts_full = pd.concat([addr_to_ts_inp, addr_to_ts])

# NOTE: a transaction missing from txs_features.csv gets a NaN time step after
# the left merge; NaN fails both comparisons below, so such rows end up in
# neither split.
train_split = addr_to_ts_full[
    addr_to_ts_full["Time step"] <= CUTOFF
].drop_duplicates(["userId", "txId"])

test_split = addr_to_ts_full[
    addr_to_ts_full["Time step"] > CUTOFF
].drop_duplicates(["userId", "txId"])


In [10]:
# Preview the first (userId, txId, time step) rows of the training-period split.
train_split.head()

Unnamed: 0,txId,userId,Time step
0,230325127,152011,1
1,230325127,477626,1
2,230325139,111,1
3,86875675,165,1
4,86875675,93608,1


In [4]:
# Static per-user attributes joined onto the aggregated activity features.
# `usecols` stops pandas from parsing the remaining columns of UserData.csv,
# which were previously loaded and immediately thrown away; the trailing
# selection keeps the column order explicit.
users_feats_full = pd.read_csv(
    "../dataset/custom/UserData.csv",
    usecols=["userId", "addr_cnt", "class", "overall_activity_coef"],
)[[
    "userId",
    "addr_cnt",
    "class",
    "overall_activity_coef",
]]

In [5]:
# Check dtypes and non-null counts of the selected user attributes.
users_feats_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569513 entries, 0 to 569512
Data columns (total 4 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   userId                 569513 non-null  int64  
 1   addr_cnt               569513 non-null  int64  
 2   class                  569513 non-null  int64  
 3   overall_activity_coef  569513 non-null  float64
dtypes: float64(1), int64(3)
memory usage: 17.4 MB


In [8]:
# Inspect the complete schema of UserData.csv; the file is re-read from disk
# because `users_feats_full` only keeps a four-column subset.
pd.read_csv("../dataset/custom/UserData.csv").info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569513 entries, 0 to 569512
Data columns (total 22 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   userId                          569513 non-null  int64  
 1   addr_cnt                        569513 non-null  int64  
 2   outcoming_tx_cnt                569513 non-null  float64
 3   incoming_tx_cnt                 569513 non-null  float64
 4   input_users_cnt                 569513 non-null  float64
 5   output_users_cnt                569513 non-null  float64
 6   class                           569513 non-null  int64  
 7   active_time_steps_cnt           569513 non-null  int64  
 8   btc_transacted_total            569513 non-null  float64
 9   btc_sent_total                  569513 non-null  float64
 10  btc_received_total              569513 non-null  float64
 11  btc_sent_median                 569513 non-null  float64
 12  btc_received_med

In [16]:
# Transaction-level columns used by the per-user aggregations.
tx_feats_prep = pd.read_csv("../dataset/Elliptic++ Dataset/txs_features.csv")[[
    "txId", "out_BTC_total", "in_BTC_total", "fees", "Time step",
]]

# Sending-side activity inside the training period: for each user, the number
# of distinct time steps with a sent transaction, the summed `in_BTC_total` of
# those transactions, and the number of distinct transactions.
user_to_tx_train = (
    addr_tx_df
    .loc[
        lambda edges: edges["userId"].isin(train_split["userId"])
        & edges["txId"].isin(train_split["txId"])
    ]
    .merge(tx_feats_prep, how="left", on="txId")
    .groupby("userId")
    .agg(
        sending_time_steps_cnt=("Time step", "nunique"),
        btc_sent_total=("in_BTC_total", "sum"),
        outcoming_tx_cnt=("txId", "nunique"),
    )
    .reset_index()
)

user_to_tx_train

Unnamed: 0,userId,sending_time_steps_cnt,btc_sent_total,outcoming_tx_cnt
0,2,1,0.005000,1
1,4,17,5120.494578,157
2,11,1,0.395700,1
3,12,1,4.002932,1
4,13,1,0.296973,1
...,...,...,...,...
113326,569484,1,0.012000,1
113327,569488,1,0.059000,1
113328,569489,1,0.124524,1
113329,569495,1,37.399729,1


In [18]:
# Receiving-side activity inside the training period: for each user, the
# number of distinct time steps with a received transaction, the summed
# `out_BTC_total` of those transactions, and the number of distinct
# transactions.
tx_to_user_train = (
    tx_addr_df
    .loc[
        lambda edges: edges["userId"].isin(train_split["userId"])
        & edges["txId"].isin(train_split["txId"])
    ]
    .merge(tx_feats_prep, how="left", on="txId")
    .groupby("userId")
    .agg(
        receiving_time_steps_cnt=("Time step", "nunique"),
        btc_received_total=("out_BTC_total", "sum"),
        incoming_tx_cnt=("txId", "nunique"),
    )
    .reset_index()
)

tx_to_user_train.head()

Unnamed: 0,userId,receiving_time_steps_cnt,btc_received_total,incoming_tx_cnt
0,2,1,5.715931,5
1,3,1,1.918851,1
2,4,18,142156.362019,3894
3,5,1,31.128083,1
4,8,1,33.608093,1


In [None]:
# Assemble the training feature table: receiving-side aggregates first, then
# sending-side aggregates and the static user attributes joined on `userId`.
# Users without any sending activity get NaN in the sending columns, which
# `fillna(0)` turns into zeros (those columns therefore end up as floats).
# NOTE(review): both joins are left joins anchored on `tx_to_user_train`, so a
# user with sending-side rows only is absent from the result - confirm that
# this is intended.
user_data_train_full = (
    tx_to_user_train
    .merge(user_to_tx_train, how="left", on="userId")
    .merge(users_feats_full, how="left", on="userId")
    .fillna(0)
)

user_data_train_full.head()

Unnamed: 0,userId,receiving_time_steps_cnt,btc_received_total,incoming_tx_cnt,sending_time_steps_cnt,btc_sent_total,outcoming_tx_cnt,addr_cnt,class,overall_activity_coef
0,2,1,5.715931,5,1.0,0.005,1.0,1,3,0.0
1,3,1,1.918851,1,0.0,0.0,0.0,1,3,0.0
2,4,18,142156.362019,3894,17.0,5120.494578,157.0,5656,1,0.084105
3,5,1,31.128083,1,0.0,0.0,0.0,1,3,0.0
4,8,1,33.608093,1,0.0,0.0,0.0,1,2,0.0


In [24]:
def _aggregate_user_activity(
    edges: pd.DataFrame,
    split: pd.DataFrame,
    tx_feats: pd.DataFrame,
    btc_col: str,
    steps_name: str,
    btc_name: str,
    tx_cnt_name: str,
) -> pd.DataFrame:
    """Aggregate one direction of user activity for a single period.

    Parameters
    ----------
    edges : DataFrame with `userId` and `txId` columns (one row per link).
    split : DataFrame with `userId` and `txId` columns describing the period;
        an edge is kept when its user and its transaction both occur in it.
    tx_feats : transaction features with `txId`, `Time step` and `btc_col`.
    btc_col : transaction column that is summed per user.
    steps_name, btc_name, tx_cnt_name : names of the resulting columns holding
        the distinct time-step count, the BTC sum and the distinct
        transaction count.

    Returns
    -------
    DataFrame with columns `userId`, `steps_name`, `btc_name`, `tx_cnt_name`
    (in that order), one row per user.
    """
    in_period = edges["userId"].isin(split["userId"]) & edges["txId"].isin(split["txId"])
    return (
        edges[in_period]
        .merge(tx_feats, how="left", on="txId")
        .groupby("userId")
        .agg(**{
            steps_name: ("Time step", "nunique"),
            btc_name: (btc_col, "sum"),
            tx_cnt_name: ("txId", "nunique"),
        })
        .reset_index()
    )


# Transaction-level columns used by the aggregations; `usecols` avoids parsing
# the many unused feature columns and the trailing selection pins the order.
tx_feats_prep = pd.read_csv(
    "../dataset/Elliptic++ Dataset/txs_features.csv",
    usecols=["txId", "out_BTC_total", "in_BTC_total", "fees", "Time step"],
)[[
    "txId",
    "out_BTC_total",
    "in_BTC_total",
    "fees",
    "Time step",
]]

# Sending-side activity in the test period.
user_to_tx_test = _aggregate_user_activity(
    addr_tx_df,
    test_split,
    tx_feats_prep,
    btc_col="in_BTC_total",
    steps_name="sending_time_steps_cnt",
    btc_name="btc_sent_total",
    tx_cnt_name="outcoming_tx_cnt",
)

# Receiving-side activity in the test period.
tx_to_user_test = _aggregate_user_activity(
    tx_addr_df,
    test_split,
    tx_feats_prep,
    btc_col="out_BTC_total",
    steps_name="receiving_time_steps_cnt",
    btc_name="btc_received_total",
    tx_cnt_name="incoming_tx_cnt",
)

# Same assembly as for the training table: left joins anchored on the
# receiving-side aggregates, with missing sending-side values set to zero.
# (The stray `tx_to_user_train.head()` that was copy-pasted into this cell and
# the mid-cell `.head()` call, neither of which displayed anything, are gone.)
user_data_test_full = (
    tx_to_user_test
    .merge(user_to_tx_test, how="left", on="userId")
    .merge(users_feats_full, how="left", on="userId")
    .fillna(0)
)

user_data_test_full.head()


Unnamed: 0,userId,receiving_time_steps_cnt,btc_received_total,incoming_tx_cnt,sending_time_steps_cnt,btc_sent_total,outcoming_tx_cnt,addr_cnt,class,overall_activity_coef
0,1,1,17.024997,1,0.0,0.0,0.0,1,3,0.0
1,4,8,26615.474226,216,6.0,78.802683,10.0,5656,1,0.084105
2,6,1,92.662008,1,0.0,0.0,0.0,1,3,0.0
3,7,2,155.328833,2,0.0,0.0,0.0,1,3,0.006024
4,15,1,0.651488,4,1.0,0.036179,1.0,4,2,0.0


In [25]:
# Persist the per-user feature tables. `index=False` keeps the auto-generated
# row index out of the files; otherwise it is written as an unnamed first
# column and comes back as a spurious "Unnamed: 0" column on `pd.read_csv`.
user_data_train_full.to_csv(
    "../dataset/custom/train_test_split/users_only/users_features__train.csv",
    index=False,
)
user_data_test_full.to_csv(
    "../dataset/custom/train_test_split/users_only/users_features__test.csv",
    index=False,
)
