In [1]:
import pandas as pd
import numpy as np
import time
from datetime import datetime

pd.set_option('display.max_columns', None)

# <center> Data Preparation </center>

- read the data

### <u>BTC</u>
- drop '% of coins' column
- fill 'Outs' with 0
- derive 'Total' = Ins + Outs
- convert date columns to the right format
- derive the address lenght
- derive percentage of the current supply column
- derive difference between ins and outs transactions
- derive the percentage of ins, outs, and diff from the total number of transactions
- derive the interval between the first and last in transactions (accumulation interval)
- derive the interval between the first and last out transactions (offloading interval)
- derive the difference between the accumulation and offloading intervals
- derive the interval between the first in and first out transactions (time it took until the first out transaction)
- convert the intervals in days
- derive the average accumulation and offloading per day
- derive the difference between the average accumulation and offloading
- derive the interval in days from the last transaction to the snaphot time (12.07.2022 19:00 CEST -> 17:00 UTC)
- derive binary column if last transaction planned for the future
- derive the interval in days from genesis to the first in transaction


### <u>DOGE</u>
- drop '% of coins' column
- fill 'Outs' with 0
- add top DOGE holder
- correct small copy mistake
- derive 'Total' = Ins + Outs
- convert date columns to the right format
- derive the address lenght
- derive percentage of the current supply column
- derive difference between ins and outs transactions
- derive the percentage of ins, outs, and diff from the total number of transactions
- derive the interval between the first and last in transactions (accumulation interval)
- derive the interval between the first and last out transactions (offloading interval)
- derive the difference between the accumulation and offloading intervals
- derive the interval between the first in and first out transactions (time it took until the first out transaction)
- convert the intervals in days
- derive the average accumulation and offloading per day
- derive the difference between the average accumulation and offloading
- derive the interval in days from the last transaction to the snaphot time (12.07.2022 19:00 CEST -> 17:00 UTC)
- derive binary column if last transaction planned for the future
- derive the interval in days from genesis to the first in transaction


### <u>ETH</u>
- convert date columns to the right format
- derive the address lenght
- derive percentage of the current supply column
- derive difference between ins and outs transactions
- derive the percentage of ins, outs, and diff from the total number of transactions
- derive the interval between the first and last in transactions (accumulation interval)
- derive the interval between the first and last out transactions (offloading interval)
- derive the difference between the accumulation and offloading intervals
- derive the interval between the first in and first out transactions (time it took until the first out transaction)
- convert the intervals in days
- derive the average accumulation and offloading per day
- derive the difference between the average accumulation and offloading
- derive the interval in days from the last transaction to the snaphot time (12.07.2022 19:00 CEST -> 17:00 UTC)
- derive binary column if last transaction planned for the future
- derive the interval in days from genesis to the first in transaction


In [2]:
BTC_SUPPLY = 19091931
DOGE_SUPPLY = 132670000000
ETH_SUPPLY = 121516840

In [3]:
genesis_B = datetime(2009, 1, 3) # 3 January 2009
genesis_D = datetime(2013, 12, 6) # 6 December 2013
genesis_E = datetime(2015, 7, 30) # 30 July 2015

In [4]:
# read the data
BTC = pd.read_csv("Data/Preprocessed/P_F100BTC.csv", index_col=0)
DOGE = pd.read_csv("Data/Preprocessed/P_F100DOGE.csv", index_col=0)
ETH = pd.read_csv("Data/Preprocessed/P_F100ETH.csv", index_col=0)

In [5]:
BTC = BTC.drop("% of coins", axis=1)
DOGE = DOGE.drop("% of coins", axis=1)

In [6]:
BTC['Outs'] = BTC['Outs'].fillna(value=0)
DOGE['Outs'] = DOGE['Outs'].fillna(value=0)

In [7]:
# add the top DOGE holder
DOGE.loc[0] = ['DBs4WcRE7eysKwRxHNX88XZVCQ9M6QSUSz', '06/02/2021 20:20', '3/06/2022 20:36', 313.0, '12/04/2021 09:14', '23/06/2022 20:36', 72.0, 2.916194e+10, 'Exchange', 42620987112.22499847, 13459043074] 
DOGE.index = DOGE.index + 1
DOGE.sort_index(inplace=True) 

In [8]:
# correct small copy mistake
DOGE['Out_amount'][3] = 837629257.7746412

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  DOGE['Out_amount'][3] = 837629257.7746412


In [9]:
BTC['Total'] = BTC['Ins'] + BTC['Outs']
DOGE['Total'] = DOGE['Ins'] + DOGE['Outs']

In [10]:
print(len(BTC.columns))
print(len(DOGE.columns))
print(len(ETH.columns))
print((BTC.columns))
print((DOGE.columns))
print((ETH.columns))

12
12
12
Index(['Address', 'First In', 'Last In', 'Ins', 'First Out', 'Last Out',
       'Outs', 'Balance', 'Label', 'In_amount', 'Out_amount', 'Total'],
      dtype='object')
Index(['Address', 'First In', 'Last In', 'Ins', 'First Out', 'Last Out',
       'Outs', 'Balance', 'Label', 'In_amount', 'Out_amount', 'Total'],
      dtype='object')
Index(['Address', 'Total', 'Balance', 'Label', 'Outs', 'Last Out', 'First Out',
       'Ins', 'Last In', 'First In', 'In_amount', 'Out_amount'],
      dtype='object')


In [11]:
print(BTC.dtypes)
print()
print(DOGE.dtypes)
print()
print(ETH.dtypes)

Address        object
First In       object
Last In        object
Ins           float64
First Out      object
Last Out       object
Outs          float64
Balance         int64
Label          object
In_amount     float64
Out_amount    float64
Total         float64
dtype: object

Address        object
First In       object
Last In        object
Ins           float64
First Out      object
Last Out       object
Outs          float64
Balance       float64
Label          object
In_amount     float64
Out_amount    float64
Total         float64
dtype: object

Address        object
Total           int64
Balance       float64
Label          object
Outs            int64
Last Out       object
First Out      object
Ins             int64
Last In        object
First In       object
In_amount     float64
Out_amount    float64
dtype: object


In [12]:
# convert date columns to the right format
BTC['First_in'] = pd.to_datetime(BTC['First In'], format="%d/%m/%Y %H:%M")
BTC['Last_in'] = pd.to_datetime(BTC['Last In'], format="%d/%m/%Y %H:%M")
BTC['First_out'] = pd.to_datetime(BTC['First Out'], format="%d/%m/%Y %H:%M")
BTC['Last_out'] = pd.to_datetime(BTC['Last Out'], format="%d/%m/%Y %H:%M")
BTC = BTC.drop(['First In', 'Last In', 'First Out', 'Last Out'], axis=1)

DOGE['First_in'] = pd.to_datetime(DOGE['First In'], format="%d/%m/%Y %H:%M")
DOGE['Last_in'] = pd.to_datetime(DOGE['Last In'], format="%d/%m/%Y %H:%M")
DOGE['First_out'] = pd.to_datetime(DOGE['First Out'], format="%d/%m/%Y %H:%M")
DOGE['Last_out'] = pd.to_datetime(DOGE['Last Out'], format="%d/%m/%Y %H:%M")
DOGE = DOGE.drop(['First In', 'Last In', 'First Out', 'Last Out'], axis=1)

ETH['First_in'] = pd.to_datetime(ETH['First In'], format="%d/%m/%Y %H:%M")
ETH['Last_in'] = pd.to_datetime(ETH['Last In'], format="%d/%m/%Y %H:%M")
ETH['First_out'] = pd.to_datetime(ETH['First Out'], format="%d/%m/%Y %H:%M")
ETH['Last_out'] = pd.to_datetime(ETH['Last Out'], format="%d/%m/%Y %H:%M")
ETH = ETH.drop(['First In', 'Last In', 'First Out', 'Last Out'], axis=1)

In [13]:
# derive the address lenght
def get_len(address):
    return len(address)
BTC['Address_length'] = BTC['Address'].apply(lambda x: get_len(x))
DOGE['Address_length'] = DOGE['Address'].apply(lambda x: get_len(x))
ETH['Address_length'] = ETH['Address'].apply(lambda x: get_len(x))

In [14]:
# derive percentage of the current supply column
BTC['Percentage'] = BTC['Balance']/BTC_SUPPLY*100
DOGE['Percentage'] = DOGE['Balance']/DOGE_SUPPLY*100
ETH['Percentage'] = ETH['Balance']/ETH_SUPPLY*100

In [15]:
# derive difference between ins and outs transactions
BTC['Diff'] = BTC['Ins'] - BTC['Outs']
DOGE['Diff'] = DOGE['Ins'] - DOGE['Outs']
ETH['Diff'] = ETH['Ins'] - ETH['Outs']

In [16]:
# derive the percentage of ins, outs, and diff from the total number of transactions
BTC['Percentage_ins'] = BTC['Ins']/(BTC['Total'])*100
BTC['Percentage_outs'] = BTC['Outs']/(BTC['Total'])*100
BTC['Percentage_diff'] = BTC['Diff']/(BTC['Total'])*100

DOGE['Percentage_ins'] = DOGE['Ins']/(DOGE['Total'])*100
DOGE['Percentage_outs'] = DOGE['Outs']/(DOGE['Total'])*100
DOGE['Percentage_diff'] = DOGE['Diff']/(DOGE['Total'])*100

ETH['Percentage_ins'] = ETH['Ins']/(ETH['Total'])*100
ETH['Percentage_outs'] = ETH['Outs']/(ETH['Total'])*100
ETH['Percentage_diff'] = ETH['Diff']/(ETH['Total'])*100

In [17]:
# derive the interval between the first and last in transactions (accumulation interval)
BTC['Interval_in'] = BTC['Last_in'] - BTC['First_in']
DOGE['Interval_in'] = DOGE['Last_in'] - DOGE['First_in']
ETH['Interval_in'] = ETH['Last_in'] - ETH['First_in']

In [18]:
# derive the interval between the first and last out transactions (offloading interval)
BTC['Interval_out'] = BTC['Last_out'] - BTC['First_out']
DOGE['Interval_out'] = DOGE['Last_out'] - DOGE['First_out']
ETH['Interval_out'] = ETH['Last_out'] - ETH['First_out']

In [19]:
# derive the difference between the accumulation and offloading intervals
BTC['Interval_diff'] = BTC['Interval_in'] - BTC['Interval_out'] 
DOGE['Interval_diff'] = DOGE['Interval_in'] - DOGE['Interval_out'] 
ETH['Interval_diff'] = ETH['Interval_in'] - ETH['Interval_out'] 

In [20]:
# derive the interval between the first in and first out transactions (time it took until the first out transaction)
BTC['Interval_first'] = BTC['First_out'] - BTC['First_in']
DOGE['Interval_first'] = DOGE['First_out'] - DOGE['First_in']
ETH['Interval_first'] = ETH['First_out'] - ETH['First_in']

In [21]:
# convert the intervals in days
BTC['Interval_in_days'] = BTC['Interval_in'].astype('timedelta64[D]')
BTC['Interval_out_days'] = BTC['Interval_out'].astype('timedelta64[D]')
BTC['Interval_first_days'] = BTC['Interval_first'].astype('timedelta64[D]')

DOGE['Interval_in_days'] = DOGE['Interval_in'].astype('timedelta64[D]')
DOGE['Interval_out_days'] = DOGE['Interval_out'].astype('timedelta64[D]')
DOGE['Interval_first_days'] = DOGE['Interval_first'].astype('timedelta64[D]')

ETH['Interval_in_days'] = ETH['Interval_in'].astype('timedelta64[D]')
ETH['Interval_out_days'] = ETH['Interval_out'].astype('timedelta64[D]')
ETH['Interval_first_days'] = ETH['Interval_first'].astype('timedelta64[D]')

In [22]:
# derive the average accumulation and offloading per day
BTC["Avg_acc_in"] = BTC['In_amount']/BTC['Interval_in_days']
BTC["Avg_acc_out"] = BTC['Out_amount']/BTC['Interval_out_days']

DOGE["Avg_acc_in"] = DOGE['In_amount']/DOGE['Interval_in_days']
DOGE["Avg_acc_out"] = DOGE['Out_amount']/DOGE['Interval_out_days']

ETH["Avg_acc_in"] = ETH['In_amount']/ETH['Interval_in_days']
ETH["Avg_acc_out"] = ETH['Out_amount']/ETH['Interval_out_days']

In [23]:
# derive the difference between the average accumulation and offloading
BTC["Diff_avg_acc"] = BTC["Avg_acc_in"]-BTC["Avg_acc_out"]
DOGE["Diff_avg_acc"] = DOGE["Avg_acc_in"]-DOGE["Avg_acc_out"]
ETH["Diff_avg_acc"] = ETH["Avg_acc_in"]-ETH["Avg_acc_out"]

In [24]:
# derive the interval in days from the last transaction to the snaphot time (12.07.2022 19:00 CEST -> 17:00 UTC)
snapshot = datetime(2022, 7, 12, 17, 0, 0)

BTC["Last_transaction"] = BTC[["First_in", "First_out", "Last_in", "Last_out"]].max(1)
BTC["Interval_from_last_tx"] = snapshot - BTC["Last_transaction"]
BTC["Interval_from_last_tx_days"] = BTC["Interval_from_last_tx"].astype('timedelta64[D]')

DOGE["Last_transaction"] = DOGE[["First_in", "First_out", "Last_in", "Last_out"]].max(1)
DOGE["Interval_from_last_tx"] = snapshot - DOGE["Last_transaction"]
DOGE["Interval_from_last_tx_days"] = DOGE["Interval_from_last_tx"].astype('timedelta64[D]')

ETH["Last_transaction"] = ETH[["First_in", "First_out", "Last_in", "Last_out"]].max(1)
ETH["Interval_from_last_tx"] = snapshot - ETH["Last_transaction"]
ETH["Interval_from_last_tx_days"] = ETH["Interval_from_last_tx"].astype('timedelta64[D]')

In [25]:
# derive binary column if last transaction planned for the future
BTC["Last_future"] = BTC['Interval_from_last_tx_days']<0
DOGE["Last_future"] = DOGE['Interval_from_last_tx_days']<0
ETH["Last_future"] = ETH['Interval_from_last_tx_days']<0

In [26]:
# derive the interval in days from genesis to the first in transaction
BTC["genesis"] = genesis_B
BTC['First in'] = pd.to_datetime(BTC['First_in'], format="%Y-%m-%d %H:%M:%S")
BTC["genesis_to_first_in"] = BTC['First in']-BTC["genesis"]
BTC["genesis_to_first_in_days"] = BTC["genesis_to_first_in"].astype('timedelta64[D]')

DOGE["genesis"] = genesis_D
DOGE['First in'] = pd.to_datetime(DOGE['First_in'], format="%Y-%m-%d %H:%M:%S")
DOGE["genesis_to_first_in"] = DOGE['First in']-DOGE["genesis"]
DOGE["genesis_to_first_in_days"] = DOGE["genesis_to_first_in"].astype('timedelta64[D]')

ETH["genesis"] = genesis_E
ETH['First in'] = pd.to_datetime(ETH['First_in'], format="%Y-%m-%d %H:%M:%S")
ETH["genesis_to_first_in"] = ETH['First in']-ETH["genesis"]
ETH["genesis_to_first_in_days"] = ETH["genesis_to_first_in"].astype('timedelta64[D]')

# DATA CLEANING

### COMMON SENSE CHECKS:
1) In_amount > Out_amount
    - drop 3 ETH addresses where the in amount is less than the out amount
2) Last Out before Last In (Interval_out > 0)

3) First In before First Out, Last Out, Last In (Interval_in, Interval_first > 0)
    - drop 3 ETH addresses where the first transaction is not First In
    
* drop 3 ETH addresses where exact number of in and out transactions could not be determined


In [27]:
btc = BTC.copy()
doge = DOGE.copy()
eth = ETH.copy()

### 1) In_amount > Out_amount

In [28]:
btc['1'] = (btc['In_amount'] - btc['Out_amount'])<0
doge['1'] = (doge['In_amount'] - doge['Out_amount'])<0
eth['1'] = (eth['In_amount'] - eth['Out_amount'])<0

In [29]:
print(sum(btc['1']), sum(doge['1']), sum(eth['1']))

0 0 3


In [30]:
eth[eth['1']==True] # 1 is wETH, tried to fill them in manually, but failed

Unnamed: 0,Address,Total,Balance,Label,Outs,Ins,In_amount,Out_amount,First_in,Last_in,First_out,Last_out,Address_length,Percentage,Diff,Percentage_ins,Percentage_outs,Percentage_diff,Interval_in,Interval_out,Interval_diff,Interval_first,Interval_in_days,Interval_out_days,Interval_first_days,Avg_acc_in,Avg_acc_out,Diff_avg_acc,Last_transaction,Interval_from_last_tx,Interval_from_last_tx_days,Last_future,genesis,First in,genesis_to_first_in,genesis_to_first_in_days,1
1,0xc02aaa39b223fe8d0a0e5c4f27ead9083c756cc2,8723819,4682719.0,,0,8723819,16991780.0,18165950.0,2017-01-18 15:31:00,2022-07-12 15:31:00,NaT,NaT,42,3.853555,8723819,100.0,0.0,100.0,2001 days 00:00:00,NaT,NaT,NaT,2001.0,,,8491.645048,,,2022-07-12 15:31:00,0 days 01:29:00,0.0,False,2015-07-30,2017-01-18 15:31:00,538 days 15:31:00,538.0,True
16,0x1b3cb81e51011b549d78bf720b0d924ac763a7c2,193,347300.0,,3,190,0.009175,212700.0,2015-07-30 15:26:00,2022-07-04 06:41:00,2022-04-25 03:36:00,2022-06-18 19:58:00,42,0.285804,187,98.445596,1.554404,96.891192,2530 days 15:15:00,54 days 16:22:00,2475 days 22:53:00,2460 days 12:10:00,2530.0,54.0,2460.0,4e-06,3938.888889,-3938.888885,2022-07-04 06:41:00,8 days 10:19:00,8.0,False,2015-07-30,2015-07-30 15:26:00,0 days 15:26:00,0.0,True
21,0xc098b2a3aa256d2140208c3de6543aaef5cd3a94,3240056,298324.1,Exchange,100000,100000,4905204.0,5060937.0,2019-06-14 15:58:00,2022-07-12 15:58:00,2019-06-14 15:58:00,2022-07-12 15:59:00,42,0.2455,0,3.086366,3.086366,0.0,1124 days 00:00:00,1124 days 00:01:00,-1 days +23:59:00,0 days 00:00:00,1124.0,1124.0,0.0,4364.060166,4502.612802,-138.552636,2022-07-12 15:59:00,0 days 01:01:00,0.0,False,2015-07-30,2019-06-14 15:58:00,1415 days 15:58:00,1415.0,True


In [31]:
# drop 3 ETH addresses where the in amount is less than the out amount
ETH = ETH.drop([1, 16, 21])
eth = eth.drop([1, 16, 21])

### 2) Last Out before Last In (Interval_out > 0)

In [32]:
btc['21'] = btc['Interval_out'].dt.total_seconds() >= 0
btc['22'] = btc['Interval_out'].isna()
btc['2'] = btc['21']|btc['22']

doge['21'] = doge['Interval_out'].dt.total_seconds() >= 0
doge['22'] = doge['Interval_out'].isna()
doge['2'] = doge['21']|doge['22']

eth['21'] = eth['Interval_out'].dt.total_seconds() >= 0
eth['22'] = eth['Interval_out'].isna()
eth['2'] = eth['21']|eth['22']

In [33]:
print(sum(btc['2']), sum(doge['2']), sum(eth['2']))

100 100 97


### 3) First In before First Out, Last Out, Last In (Interval_in, Interval_first > 0)

In [34]:
btc["First_transaction"] = btc[["First_in", "First_out", "Last_in", "Last_out"]].min(1)
btc["3"] = (btc["First_in"] != btc["First_transaction"])

doge["First_transaction"] = doge[["First_in", "First_out", "Last_in", "Last_out"]].min(1)
doge["3"] = (doge["First_in"] != doge["First_transaction"])

eth["First_transaction"] = eth[["First_in", "First_out", "Last_in", "Last_out"]].min(1)
eth["3"] = (eth["First_in"] != eth["First_transaction"])

In [35]:
print(sum(btc['3']), sum(doge['3']), sum(eth['3']))

0 0 3


In [36]:
eth[eth['3']==True] 

Unnamed: 0,Address,Total,Balance,Label,Outs,Ins,In_amount,Out_amount,First_in,Last_in,First_out,Last_out,Address_length,Percentage,Diff,Percentage_ins,Percentage_outs,Percentage_diff,Interval_in,Interval_out,Interval_diff,Interval_first,Interval_in_days,Interval_out_days,Interval_first_days,Avg_acc_in,Avg_acc_out,Diff_avg_acc,Last_transaction,Interval_from_last_tx,Interval_from_last_tx_days,Last_future,genesis,First in,genesis_to_first_in,genesis_to_first_in_days,1,21,22,2,First_transaction,3
14,0xe92d1a43df510f82c66382592a047d288f85226f,82,450000.050096,,1,81,450000.05054,0.0,2019-02-24 14:48:00,2022-07-04 06:47:00,2019-02-21 05:57:00,2019-02-21 05:57:00,42,0.370319,80,98.780488,1.219512,97.560976,1225 days 15:59:00,0 days,1225 days 15:59:00,-4 days +15:09:00,1225.0,0.0,-4.0,367.34698,,,2022-07-04 06:47:00,8 days 10:13:00,8.0,False,2015-07-30,2019-02-24 14:48:00,1305 days 14:48:00,1305.0,False,True,False,True,2019-02-21 05:57:00,True
18,0xca8fa8f0b631ecdb18cda619c4fc9d197c8affca,75,325000.481649,,1,74,325000.482315,0.0,2019-02-24 14:51:00,2022-07-04 06:48:00,2015-08-08 15:44:00,2015-08-08 15:44:00,42,0.267453,73,98.666667,1.333333,97.333333,1225 days 15:57:00,0 days,1225 days 15:57:00,-1296 days +00:53:00,1225.0,0.0,-1296.0,265.306516,,,2022-07-04 06:48:00,8 days 10:12:00,8.0,False,2015-07-30,2019-02-24 14:51:00,1305 days 14:51:00,1305.0,False,True,False,True,2015-08-08 15:44:00,True
25,0x8103683202aa8da10536036edef04cdd865c225e,33,275000.02743,,1,32,275000.027985,0.0,2019-02-24 14:51:00,2022-07-04 06:49:00,2019-02-21 09:20:00,2019-02-21 09:20:00,42,0.226306,31,96.969697,3.030303,93.939394,1225 days 15:58:00,0 days,1225 days 15:58:00,-4 days +18:29:00,1225.0,0.0,-4.0,224.489819,,,2022-07-04 06:49:00,8 days 10:11:00,8.0,False,2015-07-30,2019-02-24 14:51:00,1305 days 14:51:00,1305.0,False,True,False,True,2019-02-21 09:20:00,True


In [37]:
# drop 3 ETH addresses where the first transaction is not First In
ETH = ETH.drop([14, 18, 25])
eth = eth.drop([14, 18, 25])

In [38]:
eth[eth['Ins']==10000] 

Unnamed: 0,Address,Total,Balance,Label,Outs,Ins,In_amount,Out_amount,First_in,Last_in,First_out,Last_out,Address_length,Percentage,Diff,Percentage_ins,Percentage_outs,Percentage_diff,Interval_in,Interval_out,Interval_diff,Interval_first,Interval_in_days,Interval_out_days,Interval_first_days,Avg_acc_in,Avg_acc_out,Diff_avg_acc,Last_transaction,Interval_from_last_tx,Interval_from_last_tx_days,Last_future,genesis,First in,genesis_to_first_in,genesis_to_first_in_days,1,21,22,2,First_transaction,3
38,0x25eaff5b179f209cf186b1cdcbfa463a69df4c45,24605,209616.60078,Exchange,10000,10000,2047950.0,1837788.0,2020-04-14 16:13:00,2022-07-12 16:13:00,2020-04-14 16:13:00,2022-07-12 16:13:00,42,0.1725,0,40.642146,40.642146,0.0,819 days,819 days,0 days,0 days,819.0,819.0,0.0,2500.550039,2243.941134,256.608904,2022-07-12 16:13:00,00:47:00,0.0,False,2015-07-30,2020-04-14 16:13:00,1720 days 16:13:00,1720.0,False,True,False,True,2020-04-14 16:13:00,False


In [39]:
eth[eth['Ins']==100000] 

Unnamed: 0,Address,Total,Balance,Label,Outs,Ins,In_amount,Out_amount,First_in,Last_in,First_out,Last_out,Address_length,Percentage,Diff,Percentage_ins,Percentage_outs,Percentage_diff,Interval_in,Interval_out,Interval_diff,Interval_first,Interval_in_days,Interval_out_days,Interval_first_days,Avg_acc_in,Avg_acc_out,Diff_avg_acc,Last_transaction,Interval_from_last_tx,Interval_from_last_tx_days,Last_future,genesis,First in,genesis_to_first_in,genesis_to_first_in_days,1,21,22,2,First_transaction,3
10,0x28c6c06298d514db089934071355e5743bf21d60,7176663,535758.913885,Exchange,100000,100000,50751750.0,50183780.0,2021-04-22 15:45:00,2022-07-12 15:01:00,2021-04-22 15:45:00,2022-07-12 15:45:00,42,0.440893,0,1.393405,1.393405,0.0,445 days 23:16:00,446 days,-1 days +23:16:00,0 days,445.0,446.0,0.0,114048.86924,112519.690252,1529.178988,2022-07-12 15:45:00,01:15:00,0.0,False,2015-07-30,2021-04-22 15:45:00,2093 days 15:45:00,2093.0,False,True,False,True,2021-04-22 15:45:00,False
11,0xa7efae728d2936e78bda97dc267687568dd593f3,1550394,523442.860616,Exchange,100000,100000,28210950.0,27686450.0,2019-10-12 15:48:00,2022-07-12 15:14:00,2019-10-12 15:48:00,2022-07-12 15:48:00,42,0.430757,0,6.449973,6.449973,0.0,1003 days 23:26:00,1004 days,-1 days +23:26:00,0 days,1003.0,1004.0,0.0,28126.568783,27576.148145,550.420638,2022-07-12 15:48:00,01:12:00,0.0,False,2015-07-30,2019-10-12 15:48:00,1535 days 15:48:00,1535.0,False,True,False,True,2019-10-12 15:48:00,False


### *) drop 3 ETH addresses where exact number of in and out transactions could not be determined


In [40]:
ETH = ETH.drop([10, 11, 38])
eth = eth.drop([10, 11, 38])

## Preliminary Stats

In [41]:
btc.describe()

Unnamed: 0,Ins,Outs,Balance,In_amount,Out_amount,Total,Address_length,Percentage,Diff,Percentage_ins,Percentage_outs,Percentage_diff,Interval_in,Interval_out,Interval_diff,Interval_first,Interval_in_days,Interval_out_days,Interval_first_days,Avg_acc_in,Avg_acc_out,Diff_avg_acc,Interval_from_last_tx,Interval_from_last_tx_days,genesis_to_first_in,genesis_to_first_in_days
count,100.0,100.0,100.0,91.0,91.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100,44,44,44,100.0,44.0,44.0,91.0,35.0,34.0,100,100.0,100,100.0
mean,4403.68,4269.16,29637.9,587460.4,557234.1,8672.84,38.48,0.155238,134.52,85.557285,14.442715,71.11457,1203 days 07:29:24,671 days 16:19:55.909090,401 days 11:00:06.818181,117 days 14:48:08.181818,1202.84,671.318182,117.227273,inf,inf,-inf,32 days 12:57:03.600000,31.97,3702 days 11:43:14.399999,3701.95
std,40575.803325,40286.594713,36226.687049,3235026.0,3230321.0,80861.006108,7.47586,0.189749,555.652142,20.105353,20.105353,40.210707,1314 days 22:11:14.757956,626 days 07:13:59.535021,842 days 11:01:23.974201,261 days 14:15:58.846154,1314.850629,626.185565,261.606918,,,,72 days 16:28:02.850060,72.688689,1313 days 10:33:57.491840,1313.336067
min,1.0,0.0,9904.0,9904.341,0.0,1.0,33.0,0.051875,1.0,50.023645,0.0,0.04729,0 days 00:00:00,0 days 00:00:00,-11 days +13:27:00,0 days 00:00:00,0.0,0.0,0.0,2.443198,1.214392e-05,-inf,0 days 03:40:00,0.0,458 days 00:47:00,458.0
25%,19.0,0.0,10449.25,10398.26,0.0,19.0,34.0,0.054731,9.0,62.496847,0.0,24.993693,269 days 23:37:45,123 days 00:40:15,3 days 22:30:00,0 days 04:19:00,269.75,122.5,0.0,16.417169,123.2315,-1743.746438,8 days 03:29:45,7.5,3402 days 04:10:15,3401.25
50%,50.5,0.0,15373.0,24495.35,0.0,53.0,34.0,0.080521,29.0,100.0,0.0,100.0,721 days 11:37:00,599 days 20:45:00,37 days 06:41:30,4 days 09:44:30,721.0,599.0,3.5,66.557333,800.8653,-1.338341,25 days 13:30:00,25.0,4179 days 12:35:30,4179.0
75%,136.5,44.0,31812.5,66835.09,21203.76,179.5,42.0,0.166628,64.0,100.0,37.503153,100.0,1417 days 02:09:30,1054 days 15:06:45,419 days 21:33:30,71 days 22:33:45,1416.75,1054.5,71.75,282.513306,69803.69,22.068733,25 days 14:05:00,25.0,4620 days 01:16:15,4619.75
max,405985.0,403015.0,252597.0,30133800.0,30086440.0,809000.0,62.0,1.323056,4721.0,100.0,49.976355,100.0,4455 days 00:05:00,2511 days 18:53:00,4378 days 10:04:00,1325 days 00:26:00,4455.0,2511.0,1325.0,inf,inf,104.307435,477 days 20:28:00,477.0,4938 days 00:16:00,4938.0


In [42]:
doge.describe()

Unnamed: 0,Ins,Outs,Balance,In_amount,Out_amount,Total,Address_length,Percentage,Diff,Percentage_ins,Percentage_outs,Percentage_diff,Interval_in,Interval_out,Interval_diff,Interval_first,Interval_in_days,Interval_out_days,Interval_first_days,Avg_acc_in,Avg_acc_out,Diff_avg_acc,Interval_from_last_tx,Interval_from_last_tx_days,genesis_to_first_in,genesis_to_first_in_days
count,100.0,100.0,100.0,99.0,99.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100,55,55,55,100.0,55.0,55.0,99.0,54.0,54.0,100,100.0,100,100.0
mean,9258.98,8520.56,917915800.0,4586579000.0,3688590000.0,17779.54,34.0,0.691879,738.42,81.03887,18.96113,62.07774,689 days 02:54:38.400000,421 days 19:04:58.909090,388 days 13:11:26.181818,90 days 17:10:36,688.71,421.327273,90.363636,inf,inf,-inf,91 days 03:24:43.800000,90.53,2355 days 05:03:40.199999,2354.7
std,43268.636943,42506.744019,3108210000.0,13228470000.0,12417090000.0,85703.257078,0.0,2.342813,3598.358818,20.566877,20.566877,41.133755,807 days 12:38:21.736758,379 days 22:31:09.931611,751 days 21:38:24.707899,336 days 19:05:33.009418,807.452451,379.835932,336.789657,,,,80 days 06:35:14.829934,80.17474,829 days 17:27:15.652000,829.708133
min,1.0,0.0,104141700.0,5.041212,0.0,1.0,34.0,0.078497,1.0,50.014881,0.0,0.029762,0 days 00:00:00,0 days 00:00:00,-99 days +18:11:00,0 days 00:03:00,0.0,0.0,0.0,0.01200289,0.0,-inf,0 days 03:14:00,0.0,36 days 10:15:00,36.0
25%,14.0,0.0,120335300.0,146510200.0,0.0,14.75,34.0,0.090703,8.0,56.359208,0.0,12.718415,234 days 23:52:00,173 days 10:10:30,6 days 15:55:30,0 days 05:58:00,234.0,173.0,0.0,420285.5,945282.3,-2400463.0,11 days 01:58:15,10.75,2032 days 16:03:00,2032.0
50%,49.0,4.5,191574200.0,481000000.0,5.041212,50.0,34.0,0.144399,24.0,89.727741,10.272259,79.455482,300 days 16:42:30,344 days 00:53:00,58 days 06:40:00,7 days 20:26:00,300.5,344.0,7.0,1159801.0,9473272.0,42801.91,77 days 19:24:00,77.0,2711 days 04:32:00,2710.5
75%,245.25,97.25,529968600.0,1991028000.0,1034534000.0,404.5,34.0,0.399464,64.25,100.0,43.640792,100.0,1002 days 22:52:30,542 days 16:44:30,269 days 15:10:30,39 days 02:39:00,1002.5,542.0,38.5,15761570.0,46199030.0,498904.6,191 days 20:36:15,191.0,2819 days 12:15:45,2818.75
max,259988.0,259377.0,29161940000.0,92233720000.0,92132150000.0,519365.0,34.0,21.98081,29263.0,100.0,49.985119,100.0,3062 days 16:48:00,1542 days 11:55:00,3062 days 16:48:00,2017 days 13:17:00,3062.0,1542.0,2017.0,inf,inf,57626560.0,195 days 11:39:00,195.0,3138 days 03:52:00,3138.0


In [43]:
eth.describe()

Unnamed: 0,Total,Balance,Outs,Ins,In_amount,Out_amount,Address_length,Percentage,Diff,Percentage_ins,Percentage_outs,Percentage_diff,Interval_in,Interval_out,Interval_diff,Interval_first,Interval_in_days,Interval_out_days,Interval_first_days,Avg_acc_in,Avg_acc_out,Diff_avg_acc,Interval_from_last_tx,Interval_from_last_tx_days,genesis_to_first_in,genesis_to_first_in_days
count,91.0,91.0,91.0,91.0,91.0,91.0,91.0,91.0,91.0,91.0,91.0,91.0,91,28,28,28,91.0,28.0,28.0,91.0,28.0,26.0,91,91.0,91,91.0
mean,14825.087912,435190.1,417.934066,14407.153846,2269360.0,1843844.0,42.0,0.358132,13989.21978,88.10276,11.89724,76.205521,823 days 23:35:53.406593,684 days 09:21:55.714285,371 days 17:22:53.571428,16 days 20:11:06.428571,823.593407,684.035714,16.642857,inf,inf,,275 days 18:22:25.714285,275.351648,1431 days 15:27:04.615384,1431.296703
std,86352.206061,1387805.0,1637.72409,85508.29618,9398955.0,9261792.0,0.0,1.142068,84687.650929,26.307431,26.307431,52.614861,732 days 04:58:17.985326,593 days 06:53:51.839649,746 days 07:37:02.084988,55 days 13:12:05.036788,732.102785,593.11622,55.499893,,,,482 days 09:34:32.334394,482.288511,590 days 09:41:47.967763,590.363061
min,3.0,120000.0,0.0,1.0,0.01705597,0.0,42.0,0.098752,-8468.0,1.310948,0.0,-97.378105,0 days 00:00:00,0 days 00:00:00,-651 days +18:37:00,0 days 00:02:00,0.0,0.0,0.0,6.741491e-06,1.230769,-inf,0 days 00:34:00,0.0,0 days 15:26:00,0.0
25%,7.0,150000.0,0.0,6.0,150000.0,0.0,42.0,0.12344,4.0,97.01426,0.0,94.02852,30 days 10:46:00,3 days 06:36:00,-1 days +21:42:00,0 days 00:08:00,30.0,2.75,0.0,153.7663,380.908,11.326625,4 days 22:49:00,4.5,1220 days 00:19:00,1220.0
50%,48.0,169861.7,0.0,40.0,215810.1,0.05,42.0,0.139785,31.0,100.0,0.0,100.0,636 days 06:03:00,639 days 04:11:30,10 days 17:28:00,0 days 00:27:30,636.0,639.0,0.0,1316.27,1842.075,198.201992,13 days 01:44:00,13.0,1230 days 20:59:00,1230.0
75%,288.0,247543.0,1.0,156.5,721141.9,296371.3,42.0,0.203711,66.5,100.0,2.98574,100.0,1311 days 22:16:00,1218 days 13:04:00,493 days 06:24:15,5 days 19:34:45,1311.5,1218.0,5.0,14636.5,58629.28,384.788163,190 days 15:08:30,190.5,1904 days 06:48:00,1904.0
max,767737.0,13022810.0,9070.0,758667.0,85473180.0,84949600.0,42.0,10.716873,749597.0,100.0,98.689052,100.0,2530 days 15:25:00,1810 days 20:54:00,2438 days 11:39:00,288 days 02:45:00,2530.0,1810.0,288.0,inf,inf,inf,1319 days 16:45:00,1319.0,2526 days 13:45:00,2526.0


In [44]:
BTC.to_csv("Data/Cleaned/FinalBTC.csv")
DOGE.to_csv("Data/Cleaned/FinalDOGE.csv")
ETH.to_csv("Data/Cleaned/FinalETH.csv")