## Data Pruning

In [1]:
import numpy as np
import pandas as pd

## Attributes

tx_hash: Hash of the bitcoin transaction.

indegree: Number of transactions that are inputs of tx_hash.

outdegree: Number of transactions that are outputs of tx_hash.

in_btc: Total number of bitcoins across all incoming edges to tx_hash.

out_btc: Total number of bitcoins across all outgoing edges from tx_hash.

total_btc: Total number of bitcoins flowing in and out of tx_hash (in_btc + out_btc).

mean_in_btc: Average number of bitcoins per incoming edge of tx_hash.

mean_out_btc: Average number of bitcoins per outgoing edge of tx_hash.

in_malicious: Will be 1 if the tx_hash is an input of a malicious transaction.

out_malicious: Will be 1 if the tx_hash is an output of a malicious transaction.

is_malicious: Will be 1 if the tx_hash is a malicious transaction.

out_and_tx_malicious: Will be 1 if the tx_hash is a malicious transaction or an output of a malicious transaction.

all_malicious: Will be 1 if the tx_hash is a malicious transaction, an output of a malicious transaction, or an input of a malicious transaction.

In [2]:
# Read the transaction-level feature table from disk and preview its first rows.
source_csv = '../Data/DG_out.csv'
data = pd.read_csv(source_csv)
data.head(5)

Unnamed: 0,tx_hash,indegree,outdegree,in_btc,out_btc,total_btc,mean_in_btc,mean_out_btc,in_malicious,out_malicious,is_malicious,out_and_tx_malicious,all_malicious
0,0437cd7f8525ceed2324359c2d0ba26006d92d856a9c20...,0,1,0.0,50.0,50.0,0.0,50.0,0,0,0,0,0
1,f4184fc596403b9d638783cf57adfe4c75c605f6356fbc...,1,2,50.0,50.0,100.0,50.0,25.0,0,0,0,0,0
2,ea44e97271691990157559d0bdd9959e02790c34db6c00...,1,1,10.0,10.0,20.0,10.0,10.0,0,0,0,0,0
3,a16f3ce4dd5deb92d98ef5cf8afeaf0775ebca408f708b...,1,1,40.0,30.0,70.0,40.0,30.0,0,0,0,0,0
4,591e91f809d716912ca1d4a9295e70c3e78bab077683f7...,1,2,30.0,30.0,60.0,30.0,15.0,0,0,0,0,0


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
summary_stats = data.describe()
summary_stats

Unnamed: 0,indegree,outdegree,in_btc,out_btc,total_btc,mean_in_btc,mean_out_btc,in_malicious,out_malicious,is_malicious,out_and_tx_malicious,all_malicious
count,30248130.0,30248130.0,30248130.0,30248130.0,30248130.0,30248130.0,30248130.0,30248130.0,30248130.0,30248130.0,30248130.0,30248130.0
mean,2.148699,2.148699,54.8415,54.8415,109.683,49.2398,28.77087,4.039919e-05,2.148893e-06,1.553815e-06,3.570468e-06,4.383741e-05
std,7.421447,4.512048,1303.303,1301.473,2602.527,972.921,712.5567,0.006355907,0.001465909,0.00124652,0.001889565,0.006620838
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,2.0,0.1,0.0995,0.205872,0.064,0.05025776,0.0,0.0,0.0,0.0,0.0
50%,1.0,2.0,0.9325,0.959,1.964759,0.5350261,0.49,0.0,0.0,0.0,0.0,0.0
75%,2.0,2.0,8.591912,9.063657,18.4,5.614287,4.519481,0.0,0.0,0.0,0.0,0.0
max,1932.0,1322.0,550000.0,500020.7,1050000.0,499259.6,500000.0,1.0,1.0,1.0,1.0,1.0


In [4]:
# Remove the per-edge mean columns and the two combined label columns
# (presumably because they can be re-derived from the columns that remain -- TODO confirm).
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell safe to re-run: a second execution is a no-op rather than a KeyError.
data = data.drop(
    columns=['mean_in_btc', 'mean_out_btc', 'out_and_tx_malicious', 'all_malicious'],
    errors='ignore',
)
data.head()

Unnamed: 0,tx_hash,indegree,outdegree,in_btc,out_btc,total_btc,in_malicious,out_malicious,is_malicious
0,0437cd7f8525ceed2324359c2d0ba26006d92d856a9c20...,0,1,0.0,50.0,50.0,0,0,0
1,f4184fc596403b9d638783cf57adfe4c75c605f6356fbc...,1,2,50.0,50.0,100.0,0,0,0
2,ea44e97271691990157559d0bdd9959e02790c34db6c00...,1,1,10.0,10.0,20.0,0,0,0
3,a16f3ce4dd5deb92d98ef5cf8afeaf0775ebca408f708b...,1,1,40.0,30.0,70.0,0,0,0
4,591e91f809d716912ca1d4a9295e70c3e78bab077683f7...,1,2,30.0,30.0,60.0,0,0,0


In [5]:
# Remove the transaction hash and the total_btc column before exporting
# (presumably tx_hash is only an identifier and total_btc duplicates
# in_btc + out_btc -- TODO confirm).
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell safe to re-run: a second execution is a no-op rather than a KeyError.
data = data.drop(columns=['tx_hash', 'total_btc'], errors='ignore')

In [6]:
# Write the pruned table to disk; index=False leaves the row index out of the file.
output_csv = '../Data/main_data.csv'
data.to_csv(output_csv, index=False)

In [7]:
# Show only the rows whose is_malicious flag equals 1.
malicious_mask = data['is_malicious'] == 1
data.loc[malicious_mask]

Unnamed: 0,indegree,outdegree,in_btc,out_btc,in_malicious,out_malicious,is_malicious
385224,478,2,25000.62,25000.01,0,0,1
2242631,191,2,1999.042164,1999.024664,0,0,1
2252738,6,2,2000.01,2000.01,0,0,1
2256885,7,2,20555.01,20555.01,0,0,1
2264158,4,2,3000.010028,3000.010028,0,0,1
2280816,4,2,10000.01,10000.01,0,0,1
2300186,11,2,3000.010022,3000.010022,0,0,1
2300888,10,2,3000.010026,3000.010026,0,0,1
2419568,34,2,3094.458251,3094.458251,0,0,1
2497660,2,1,4.9885,4.988,0,0,1
