### Important:
The data used in this exercise has already been cleaned. However, I included this notebook with a basic data cleaning step, as the process may vary depending on the context. For example, if the token is a stablecoin, it might be necessary to remove very small or very large values. Zero-value transactions could be relevant for identifying network activity, or they might indicate failed transactions or contract issues. Repeated values in short time spans may suggest arbitrage or testing behaviour. Therefore, the cleaning strategy should be adapted to the analysis goal.

In [8]:
import pandas as pd
import numpy as np
import os

# Locations of the raw input and of the cleaned output written at the end
# of the notebook. Defined before use so each path lives in exactly one place.
input_file = '../data/token_transfers.csv'

output_filename = 'token_transfers_cleaned.csv'
output_path = '../data/cleaned'
os.makedirs(output_path, exist_ok=True)  # no-op when the folder already exists

# Load the raw transfers from the configured path (previously the literal
# path was repeated here and `input_file` was never used).
df = pd.read_csv(input_file)

# Column dtypes / memory footprint, then a preview of the first rows.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5280131 entries, 0 to 5280130
Data columns (total 7 columns):
 #   Column             Dtype  
---  ------             -----  
 0   block_number       int64  
 1   transaction_index  int64  
 2   from_address       object 
 3   to_address         object 
 4   time_stamp         int64  
 5   contract_address   object 
 6   value              float64
dtypes: float64(1), int64(3), object(3)
memory usage: 282.0+ MB


Unnamed: 0,block_number,transaction_index,from_address,to_address,time_stamp,contract_address,value
0,14669683,7,0xd30b438df65f4f788563b2b3611bd6059bff4ad9,0xda816e2122a8a39b0926bfa84edd3d42477e9efd,1651105815,0xdac17f958d2ee523a2206206994597c13d831ec7,18.67
1,14669683,45,0x4941834ed1428089ee76252f6f9d767e800499b0,0x28c6c06298d514db089934071355e5743bf21d60,1651105815,0xdac17f958d2ee523a2206206994597c13d831ec7,10000.0
2,14669683,46,0x2c1f9a20711e14f8484a41123e20d1b06858ebea,0x28c6c06298d514db089934071355e5743bf21d60,1651105815,0xdac17f958d2ee523a2206206994597c13d831ec7,9942.313005
3,14669683,47,0x7784c4f53aa5f03bd6d3ca670c9d9c887cc38cb0,0x3cd751e6b0078be393132286c442345e5dc49699,1651105815,0xdac17f958d2ee523a2206206994597c13d831ec7,234.364097
4,14669683,48,0x66589b8278470d58d4112f3d065de0f75734312e,0x28c6c06298d514db089934071355e5743bf21d60,1651105815,0xdac17f958d2ee523a2206206994597c13d831ec7,9800.0


In [9]:
import re

# Rows rejected by any cleaning step are accumulated in this frame.
df_excluded = pd.DataFrame()

# Lowercase every address column so the same address always compares equal
# (the original note marks this standardisation as valid for Ethereum).
for address_col in ('from_address', 'to_address', 'contract_address'):
    df[address_col] = df[address_col].str.lower()

def is_valid_eth_address(addr):
    """Return True when `addr` is a lowercase, 0x-prefixed, 40-hex-digit string.

    Anything that is not a `str` (None, NaN, numbers, ...) is reported as
    invalid. Uppercase hex digits do not match, so callers are expected to
    lowercase addresses first.
    """
    if not isinstance(addr, str):
        return False
    return bool(re.fullmatch(r"0x[a-f0-9]{40}", addr))

# Remove rows with missing data in essential columns
essential_cols = ['block_number', 'transaction_index', 'from_address', 'to_address', 'time_stamp', 'contract_address', 'value']
missing_rows = df[df[essential_cols].isnull().any(axis=1)]
df_excluded = pd.concat([df_excluded, missing_rows])
df = df.drop(index=missing_rows.index)

# Remove transactions with value < 0 (zero-value transfers are kept)
negative_value_rows = df[df['value'] < 0.0]
df_excluded = pd.concat([df_excluded, negative_value_rows])
df = df.drop(index=negative_value_rows.index)

# Remove rows where any address column is not a valid address.
# One boolean mask over the index is used instead of concat + drop_duplicates():
# drop_duplicates() compares column values, so two *different* invalid rows
# with identical contents would be counted only once in df_excluded.
invalid_mask = pd.Series(False, index=df.index)
for address_col in ('from_address', 'to_address', 'contract_address'):
    invalid_mask |= ~df[address_col].apply(is_valid_eth_address).astype(bool)
invalid_rows = df[invalid_mask]
df_excluded = pd.concat([df_excluded, invalid_rows])
df = df.drop(index=invalid_rows.index)

# Convert timestamp to datetime
df = df.rename(columns={'time_stamp': 'timestamp'})
df['timestamp'] = df['timestamp'].astype(int)  # keep the original epoch value as an integer column
df['datetime'] = pd.to_datetime(df['timestamp'], unit='s')  # epoch seconds -> datetime64
df['date'] = df['datetime'].dt.date  # calendar day (Python date objects)

# Summary
total_rows = len(df) + len(df_excluded)
# Guard the ratio so an empty input prints 0.00% instead of raising ZeroDivisionError.
valid_share = (len(df) / total_rows * 100) if total_rows else 0.0
print(f"Total original: {total_rows}")
print(f"Valids: {len(df)}")
print(f"Excludeds: {len(df_excluded)}")
print(f"Tax of validation: {valid_share:.2f}%")
df_excluded.head()

Total original: 5280131
Valids: 5280131
Excludeds: 0
Tax of validation: 100.00%


Unnamed: 0,block_number,transaction_index,from_address,to_address,time_stamp,contract_address,value


In [11]:
# Basic descriptive statistics
print("Descriptive:")
print(df['value'].describe())
print(f"\nPerccentiles: {df['value'].quantile([0.001, 0.01, 0.05, 0.95, 0.99, 0.999])}")

# Identify impossible values
test_values = df[df['value'] >= 1e15]
print(f"\nExtreme values (>= 1e15): {len(test_values)}")

# Analysis per contract: min/max of `value` for the five most frequent contracts.
# A single groupby pass replaces one full-frame filter per contract.
print("\nAnalisys per contract:")
top_contracts = df['contract_address'].value_counts().head(5).index
value_range = df.groupby('contract_address')['value'].agg(['min', 'max'])
for contract in top_contracts:
    lowest, highest = value_range.loc[contract, ['min', 'max']]
    print(f"{str(contract)[:10]}...: min={lowest:.4f}, max={highest:.4f}")

# Remove outliers: thresholds are the global 0.01% / 99.99% quantiles of `value`
# (computed across all contracts together).
q_high = df['value'].quantile(0.9999) # 99.99%
q_low = df['value'].quantile(0.0001) # 0.01%

outlier_high = df[df['value'] > q_high]
outlier_low = df[df['value'] < q_low]

print(f"\nHighest outliers (> {q_high:.2f}): {len(outlier_high)}")
print(f"Lowest outliers (< {q_low:.2f}): {len(outlier_low)}")

# Remove and update tracking.
# Track the complement of the keep-mask rather than only the two outlier frames:
# a row whose `value` is NaN fails both comparisons and would otherwise be
# dropped from `df` without ever appearing in `df_excluded`.
keep_mask = (df['value'] >= q_low) & (df['value'] <= q_high)
df_excluded = pd.concat([df_excluded, df[~keep_mask]])
df = df[keep_mask]

print(f"\nAfter outliers removal: {len(df)} valid transactions")

Descriptive:
count    5.279677e+06
mean     4.089588e+05
std      9.681637e+06
min      0.000000e+00
25%      3.871390e+02
50%      2.137060e+03
75%      1.717948e+04
max      1.000000e+09
Name: value, dtype: float64

Perccentiles: 0.001    3.000000e-06
0.010    1.787850e+00
0.050    3.000000e+01
0.950    4.281914e+05
0.990    4.408496e+06
0.999    6.498208e+07
Name: value, dtype: float64

Extreme values (>= 1e15): 0

Analisys per contract:
0xdac17f95...: min=0.0000, max=880271921.1312
0xa0b86991...: min=0.0000, max=555098591.9100
0x6b175474...: min=0.0000, max=493186350.9474
0xd2877702...: min=0.0000, max=999999999.0000
0xa47c8bf3...: min=0.0000, max=185562507.5551

Highest outliers (> 423175115.25): 528
Lowest outliers (< 0.00): 0

After outliers removal: 5279149 valid transactions


Direct links for mapping

USDT (Tether)
https://etherscan.io/token/0xdac17f958d2ee523a2206206994597c13d831ec7

USDC (USD Coin)
https://etherscan.io/token/0xa0b86991c6218b36c1d19d4a2e9eb0ce3606eb48

DAI (Multi-Collateral DAI)
https://etherscan.io/token/0x6b175474e89094c44da98b954eedeac495271d0f

UST (TerraClassicUSD)
https://etherscan.io/token/0xa47c8bf37f92abed4a126bda807a7b7498661acd

PAX (Pax Dollar)
https://etherscan.io/token/0x8e870d67f660d95d5be530380d0ec0bd388289e1

WLUNA (Wrapped LUNA Classic)
https://etherscan.io/token/0xd2877702675e6ceb975b4a1dff9fb7baf4c91ea9

In [14]:
# Map contract address to token names
contract_to_token = {
    '0xdac17f958d2ee523a2206206994597c13d831ec7': 'USDT',
    '0xa0b86991c6218b36c1d19d4a2e9eb0ce3606eb48': 'USDC',
    '0x6b175474e89094c44da98b954eedeac495271d0f': 'DAI',
    '0xa47c8bf37f92abed4a126bda807a7b7498661acd': 'UST',
    '0x8e870d67f660d95d5be530380d0ec0bd388289e1': 'PAX',
    '0xd2877702675e6ceb975b4a1dff9fb7baf4c91ea9': 'WLUNA'
}

# Label each transfer with its token symbol and keep only the mapped contracts.
# assign() builds a new frame instead of setting a column on `df`, which at this
# point may be a filtered slice of an earlier frame (SettingWithCopyWarning).
# Contracts missing from the mapping get NaN and are dropped.
df = (
    df.assign(stablecoin=df['contract_address'].map(contract_to_token))
      .dropna(subset=['stablecoin'])
)

print(f"Contract Addresses: {df['contract_address'].unique()}")
print(f"\nStablecoin: {df['stablecoin'].unique()}")

# Persist the cleaned data set.
# NOTE(review): the DataFrame index is written as an unnamed first column;
# confirm whether downstream readers rely on it before switching to index=False.
df.to_csv(f'{output_path}/{output_filename}')


Contract Addresses: ['0xdac17f958d2ee523a2206206994597c13d831ec7'
 '0xa0b86991c6218b36c1d19d4a2e9eb0ce3606eb48'
 '0xa47c8bf37f92abed4a126bda807a7b7498661acd'
 '0x6b175474e89094c44da98b954eedeac495271d0f'
 '0x8e870d67f660d95d5be530380d0ec0bd388289e1'
 '0xd2877702675e6ceb975b4a1dff9fb7baf4c91ea9']

Stablecoin: ['USDT' 'USDC' 'UST' 'DAI' 'PAX' 'WLUNA']


OSError: [Errno 28] No space left on device

In [15]:
# Final check of the cleaned frame: schema and memory footprint first,
# then a preview of the first five rows as the cell's displayed value.
df.info()
df.head(n=5)

<class 'pandas.core.frame.DataFrame'>
Index: 5279149 entries, 0 to 5280130
Data columns (total 10 columns):
 #   Column             Dtype         
---  ------             -----         
 0   block_number       int64         
 1   transaction_index  int64         
 2   from_address       object        
 3   to_address         object        
 4   timestamp          int64         
 5   contract_address   object        
 6   value              float64       
 7   datetime           datetime64[ns]
 8   date               object        
 9   stablecoin         object        
dtypes: datetime64[ns](1), float64(1), int64(3), object(5)
memory usage: 443.0+ MB


The history saving thread hit an unexpected error (OperationalError('database or disk is full')).History will not be written to the database.


Unnamed: 0,block_number,transaction_index,from_address,to_address,timestamp,contract_address,value,datetime,date,stablecoin
0,14669683,7,0xd30b438df65f4f788563b2b3611bd6059bff4ad9,0xda816e2122a8a39b0926bfa84edd3d42477e9efd,1651105815,0xdac17f958d2ee523a2206206994597c13d831ec7,18.67,2022-04-28 00:30:15,2022-04-28,USDT
1,14669683,45,0x4941834ed1428089ee76252f6f9d767e800499b0,0x28c6c06298d514db089934071355e5743bf21d60,1651105815,0xdac17f958d2ee523a2206206994597c13d831ec7,10000.0,2022-04-28 00:30:15,2022-04-28,USDT
2,14669683,46,0x2c1f9a20711e14f8484a41123e20d1b06858ebea,0x28c6c06298d514db089934071355e5743bf21d60,1651105815,0xdac17f958d2ee523a2206206994597c13d831ec7,9942.313005,2022-04-28 00:30:15,2022-04-28,USDT
3,14669683,47,0x7784c4f53aa5f03bd6d3ca670c9d9c887cc38cb0,0x3cd751e6b0078be393132286c442345e5dc49699,1651105815,0xdac17f958d2ee523a2206206994597c13d831ec7,234.364097,2022-04-28 00:30:15,2022-04-28,USDT
4,14669683,48,0x66589b8278470d58d4112f3d065de0f75734312e,0x28c6c06298d514db089934071355e5743bf21d60,1651105815,0xdac17f958d2ee523a2206206994597c13d831ec7,9800.0,2022-04-28 00:30:15,2022-04-28,USDT
