In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

base: str = "./datasets/"

# Load Transactions dataset

In [2]:
transactions = pd.read_csv(
    base + "transactions.csv",
    names=['timestamp', 'blockId', 'txId', 'isCoinbase', 'fee'],
    dtype={'blockId': np.int32, 'txId': np.int32, 'isCoinbase': np.int8},
    parse_dates=True
)

#print(transactions.info())
#print(pd.concat([transactions.head(), transactions.tail()]))
#print(transactions.head())
#print(transactions.tail())

'''
max vals
timestamp: 1356997957 -> int64
blockId: 214562 -> int32
txId: 10572826 -> int32
isCoinbase -> np.bool_
fee: 17179869184 -> int64
'''

#print(transactions.nlargest(5, 'fee'))


'\nmax vals\ntimestamp: 1356997957 -> int64\nblockId: 214562 -> int32\ntxId: 10572826 -> int32\nisCoinbase -> np.bool_\nfee: 17179869184 -> int64\n'

# Load Inputs dataset

In [3]:
inputs = pd.read_csv(
    base + "inputs.csv",
    names=['txId', 'prevTxId', 'prevTxPos'],
    dtype={'txId': np.int32, 'prevTxId': np.int32, 'prevTxPos': np.int16},
)

# Load Outputs dataset 

In [4]:
outputs = pd.read_csv(
    base + "outputs.csv",
    names=['txId', 'txPos', 'addressId', 'amount', 'scriptType'],
    dtype={'txId': np.int32, 'txPos': np.int16, 'addressId': np.int32, 'amount': np.int64, 'scriptType': np.int8}
)

#print(outputs.info())

# Load Mappings dataset

In [None]:
mappings = pd.read_csv(
    base + "mappings.csv",
    names=['addressId', 'hash'],
    dtype={'addressId': str, 'hash': np.int32}
)

# Script table

In [None]:
data = {'scriptCode': [0,1,2,3,4,5,6,7], 
        'scriptType':["Unknown", "P2PK", "P2KH", "P2SH", "RETURN", "EMPTY", "P2WPKH", "P2WSH"],
        'scriptSize': [0, 153, 180, 291, 0, 0, 0, 0]
       }

scripts = pd.DataFrame(
    data,
    #dtypes={'scriptCode': np.int8, 'scriptType': str, 'scriptSize': np.int8}
                  
)
print(scripts.info())
print(scripts)


# Size of transactions

In [None]:
#Df to have the num of transactions for every timestamp
transCount = transactions \
    .groupby('timestamp')['txId'].count() \
    .reset_index() \
    .rename(columns={'txId':'txCount'})
print(transCount)

transCount.plot.line(title='Numero di transazioni nel tempo', x='timestamp', y='txCount')

# Utils

In [22]:
def getTransactionInputs(inputsDf: pd.DataFrame, txId: int) -> pd.DataFrame:
    ''' Get all inputs of a transaction '''
    cond = inputsDf['txId'] == txId
    return inputsDf[cond]

def getTransactionOutputs(outputsDf: pd.DataFrame, txId: int) -> pd.DataFrame:
    ''' Get all outputs of a transaction '''
    
    cond = outputsDf['txId'] == txId
    return outputsDf[cond]

def getTransactionSize(transactionsDf: pd.DataFrame, inputsDf: pd.DataFrame, outputsDf: pd.DataFrame, txId: int) -> float:
    ''' Ottieni la dimensione di una transazione '''
    
    INPUT_SIZE = 40
    OUTPUT_SIZE = 9

    num_inputs = len(getTransactionInputs(inputsDf, txId))
    num_outputs = len(getTransactionOutputs(outputsDf, txId))
    script_size = 0

    # TODO: num_outputs > 1 ==> script diversi; quale contare?
    return INPUT_SIZE * num_inputs + OUTPUT_SIZE * num_outputs + script_size

def getTransactionsAtTimestamp(transactionsDf: pd.DataFrame, timestamp: int) -> pd.DataFrame:
    """ Get all transaction at a given timestamp 
    
    Returns:
    DF<timestamp, blockId, txId, isCoinbase, fee>"""

    cond = transactionsDf['timestamp'] == timestamp
    return transactionsDf[cond]


def getCongestionAtTs(timestamp: int, transactionsDf: pd.DataFrame, inputsDf: pd.DataFrame, outputsDf: pd.DataFrame) -> int:
    """ Get the congestion at a given timestamp """
    
    transactions_at_ts = getTransactionsAtTimestamp(transactionsDf, timestamp).drop(columns=["timestamp", "blockId", "isCoinbase", "fee"])

    #tx_sizes = [getTransactionSize(transactionsDf, inputsDf, outputsDf, txId) for txId in transactions_at_ts['txId']]
    #return sum(tx_sizes)    
    transactions_at_ts['size'] = getTransactionSize(transactions_at_ts['timestamp'])
    return transactions_at_ts['size'].sum()


def getAverageFeeAtTimestamp(transactionsDf: pd.DataFrame, timestamp: int) -> float:
    tx_ts: pd.DataFrame = getTransactionsAtTimestamp(transactionsDf, timestamp)

    return tx_ts['fee'].mean()


def getTsCongestionDf(transactionsDf: pd.DataFrame, inputsDf: pd.DataFrame, outputsDf: pd.DataFrame) -> pd.DataFrame:
    ''' STEPS:
    Filter transactionsDf (remove coinbases)
    '''
    
    not_coinbase = transactionsDf['isCoinbase'] == 0
    transactionsDf = transactionsDf[not_coinbase]

    n = 572828
    
    timestamps = transactionsDf['timestamp'][n:n+500]
    #congs = timestamps.apply(getCongestionAtTs, args=(transactionsDf, inputsDf, outputsDf))
    # Vectorize congs
    congs = getCongestionAtTs(timestamps, transactionsDf, inputsDf, outputsDf)
    fees = timestamps.apply(lambda ts: getAverageFeeAtTimestamp(transactionsDf, ts))
    
    return pd.DataFrame({'Timestamp': timestamps, 'Congestion': congs, 'Fee': fee})

In [None]:
# test getTransactionInputs, getTransactionOutputs
TX_ID = 10888

txInputs = getTransactionInputs(inputs, TX_ID)
print(f'#tx_inputs: {len(txInputs)}')
print(txInputs)

txOutputs = getTransactionOutputs(outputs, TX_ID)
print(f'#tx_outputs: {len(txOutputs)}')
print(txOutputs)

In [None]:
#get all transactions with more then 1 output
'''
for id in transactions['txId']:
    n_outputs = len(getTransactionOutputs(outputs, id))

    if n_outputs != 1:
        print(f'txId:{id:}, num_outputs: {n_outputs}')

print("end")'''

In [23]:
# test getTransactionsAtTimestamp
timestamp = 1356997591
transactions_ncb = transactions[transactions['isCoinbase'] == 0]

tx_ts = getTransactionsAtTimestamp(transactions_ncb, timestamp)

print(tx_ts)

           timestamp  blockId      txId  isCoinbase    fee
10572428  1356997591   214561  10572426           0      0
10572429  1356997591   214561  10572427           0      0
10572430  1356997591   214561  10572428           0      0
10572431  1356997591   214561  10572429           0  20000
10572432  1356997591   214561  10572430           0      0
...              ...      ...       ...         ...    ...
10572564  1356997591   214561  10572562           0      0
10572565  1356997591   214561  10572563           0      0
10572566  1356997591   214561  10572564           0      0
10572567  1356997591   214561  10572565           0      0
10572568  1356997591   214561  10572566           0      0

[141 rows x 5 columns]


In [19]:
# test getCongestionAtTs
congestion = getCongestionAtTs(timestamp, transactions_ncb, inputs, outputs)
print(f'congestion @{timestamp}: {congestion}')

fee = getAverageFeeAtTimestamp(transactions_ncb, timestamp)
print(f'avg fee @{timestamp}: {fee}')

KeyError: 'timestamp'

In [None]:
# test getAverageFeeAtTimestamp
fee = getAverageFeeAtTimestamp(transactions_ncb, timestamp)
print(f'avg fee @{timestamp}: {fee}')

In [None]:
# fee test
from datetime import datetime

ts_start = 1231006505
ts_end = 1356997957

random_ts = transactions.sample(n=5)['timestamp'].to_list()
print(random_ts)
#ts = random.randint(ts_start, ts_end)

for ts in random_ts:
    fee = getAverageFeeAtTimestamp(transactions, ts)
    print(f'avg fee @{ts}: {fee}')


In [None]:
#congDf: pd.DataFrame = getTsCongestionDf(transactions, inputs, outputs)

#print(congDf.info())

'''print(congDf.head())
print(congDf.tail())'''

#congDf['Fee'].plot(kind='hist')


In [None]:
# plots
fig, axs = plt.subplots(1, 3, figsize=(20, 16))  # 1 rows, 3 column

congDf.plot(kind='line', x='Congestion', y='Fee', title='Andamento fee rispetto congestione', ax=axs[0])
congDf.plot(kind='line', x='Timestamp', y='Congestion', title='Andamento congestione nel tempo', ax=axs[1])
congDf.plot(kind='line', x='Timestamp', y='Fee', title='Andamento fee nel tempo', ax=axs[2])


In [8]:
fullI = transactions.merge(inputs, on='txId')
print(fullI.info())

print(fullI.head())
print(fullI.tail())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21378770 entries, 0 to 21378769
Data columns (total 7 columns):
 #   Column      Dtype
---  ------      -----
 0   timestamp   int64
 1   blockId     int32
 2   txId        int32
 3   isCoinbase  int8 
 4   fee         int64
 5   prevTxId    int32
 6   prevTxPos   int16
dtypes: int16(1), int32(3), int64(2), int8(1)
memory usage: 632.0 MB
None
    timestamp  blockId  txId  isCoinbase  fee  prevTxId  prevTxPos
0  1231731025      170   171           0    0         9          0
1  1231740133      181   183           0    0       171          1
2  1231740736      182   185           0    0       183          1
3  1231742062      183   187           0    0       185          1
4  1231744600      187   192           0    0       187          0
           timestamp  blockId      txId  isCoinbase     fee  prevTxId  \
21378765  1356997957   214562  10572823           0  100000  10572820   
21378766  1356997957   214562  10572824           0  1000

In [7]:
fullO = transactions.merge(outputs, on='txId')
print(fullO.info())
print(fullO.head(), fullI.tail())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24613803 entries, 0 to 24613802
Data columns (total 9 columns):
 #   Column      Dtype
---  ------      -----
 0   timestamp   int64
 1   blockId     int32
 2   txId        int32
 3   isCoinbase  int8 
 4   fee         int64
 5   txPos       int16
 6   addressId   int32
 7   amount      int64
 8   scriptType  int8 
dtypes: int16(1), int32(3), int64(3), int8(2)
memory usage: 938.9 MB
None
    timestamp  blockId  txId  isCoinbase  fee  txPos  addressId      amount  \
0  1231006505        0     0           1    0      0          0  5000000000   
1  1231469665        1     1           1    0      0          1  5000000000   
2  1231469744        2     2           1    0      0          2  5000000000   
3  1231470173        3     3           1    0      0          3  5000000000   
4  1231470988        4     4           1    0      0          4  5000000000   

   scriptType  
0           1  
1           1  
2           1  
3           1  
4   