### Important:
The data used in this exercise has already been cleaned. However, I included this notebook with a basic data cleaning step, as the process may vary depending on the context. For example, if the token is a stablecoin, it might be necessary to remove very small or very large values. Zero-value transactions could be relevant for identifying network activity, or they might indicate failed transactions or contract issues. Repeated values in short time spans may suggest arbitrage or testing behaviour. Therefore, the cleaning strategy should be adapted to the analysis goal.

In [None]:
import pandas as pd
import numpy as np

# Load the token-transfer export into the working frame used by every later cell.
df = pd.read_csv(filepath_or_buffer='../data/token_transfers_V3.0.0.csv')

# Print column dtypes / non-null counts, then preview the first rows
# (head() is the cell's last expression, so the notebook renders it).
df.info()
df.head(5)

In [None]:
import re

# Accumulator for every row removed during cleaning, kept for later inspection.
df_excluded = pd.DataFrame()

# Standardization (valid for Ethereum): lower-case all address columns so the
# lowercase-only validation pattern and later comparisons see a single casing.
for address_col in ('from_address', 'to_address', 'contract_address'):
    df[address_col] = df[address_col].str.lower()

# Basic syntactic check for an Ethereum address
def is_valid_eth_address(addr):
    """Return True when *addr* is a str of the form ``0x`` + 40 lowercase hex digits.

    Non-string inputs (e.g. None or NaN) and upper-case hex digits are rejected,
    so addresses must be lower-cased before being checked.
    """
    if not isinstance(addr, str):
        return False
    matched = re.fullmatch(r"0x[a-f0-9]{40}", addr)
    return matched is not None

# Set aside rows that lack a value in any column the analysis depends on.
essential_cols = [
    'block_number', 'transaction_index', 'from_address', 'to_address',
    'time_stamp', 'contract_address', 'value',
]
has_missing = df[essential_cols].isna().any(axis=1)
missing_rows = df.loc[has_missing]
df_excluded = pd.concat([df_excluded, missing_rows])
df = df.drop(index=missing_rows.index)

# Set aside transactions with a negative value (strictly < 0).
# NOTE: despite its name, zero_value_rows holds negative-value rows only;
# zero-value transfers are kept.
is_negative = df['value'] < 0.0
zero_value_rows = df.loc[is_negative]
df_excluded = pd.concat([df_excluded, zero_value_rows])
df = df.drop(index=zero_value_rows.index)

# Remove rows whose sender, receiver or contract address fails
# is_valid_eth_address (0x + 40 lowercase hex digits).
invalid_from = df[~df['from_address'].apply(is_valid_eth_address)]
invalid_to = df[~df['to_address'].apply(is_valid_eth_address)]
invalid_contract = df[~df['contract_address'].apply(is_valid_eth_address)]
invalid_rows = pd.concat([invalid_from, invalid_to, invalid_contract])
# A row failing several checks appears once per failed check, so de-duplicate
# by index label. Comparing column values (drop_duplicates) would also collapse
# *distinct* rows that happen to have identical content, leaving the extra
# copies in df and missing from df_excluded.
# Assumes df's index labels are unique (true for a default read_csv index).
invalid_rows = invalid_rows[~invalid_rows.index.duplicated(keep='first')]
df_excluded = pd.concat([df_excluded, invalid_rows])
df = df.drop(invalid_rows.index.tolist())

# Interpret time_stamp as seconds since the Unix epoch, then derive the
# calendar date of each transfer into its own column.
parsed_ts = pd.to_datetime(df['time_stamp'], unit='s')
df['time_stamp'] = parsed_ts
df['date'] = parsed_ts.dt.date

# Summary of the cleaning pass: kept rows vs. rows moved to df_excluded.
total_rows = len(df) + len(df_excluded)
# Guard the percentage against an empty dataset (0 kept + 0 excluded), which
# would otherwise raise ZeroDivisionError; report 0.00% in that case.
valid_pct = (len(df) / total_rows * 100) if total_rows else 0.0
print(f"Total original: {total_rows}")
print(f"Valids: {len(df)}")
print(f"Excludeds: {len(df_excluded)}")
print(f"Tax of validation: {valid_pct:.2f}%")
# Last expression of the cell: preview the excluded rows.
df_excluded.head()

In [None]:
# Basic descriptive statistics of the transferred value
value_col = df['value']
print("Descriptive:")
print(value_col.describe())
tail_quantiles = value_col.quantile([0.001, 0.01, 0.05, 0.95, 0.99, 0.999])
print(f"\nPerccentiles: {tail_quantiles}")

# Count implausibly large values (threshold: 1e15)
test_values = df[value_col >= 1e15]
print(f"\nExtreme values (>= 1e15): {len(test_values)}")

# Value range for the five contracts with the most transfers
print("\nAnalisys per contract:")
top_contracts = df['contract_address'].value_counts().head(5).index
for contract in top_contracts:
    contract_data = df[df['contract_address'] == contract]
    lowest = contract_data['value'].min()
    highest = contract_data['value'].max()
    print(f"{str(contract)[:10]}...: min={lowest:.4f}, max={highest:.4f}")

# Trim the extreme tails: anything below the 0.01% or above the 99.99%
# quantile of value is treated as an outlier.
q_low = df['value'].quantile(0.0001)   # 0.01%
q_high = df['value'].quantile(0.9999)  # 99.99%

outlier_low = df[df['value'] < q_low]
outlier_high = df[df['value'] > q_high]

print(f"\nHighest outliers (> {q_high:.2f}): {len(outlier_high)}")
print(f"Lowest outliers (< {q_low:.2f}): {len(outlier_low)}")

# Record the removed rows, then keep only values inside [q_low, q_high]
# (both bounds inclusive, matching the strict comparisons used above).
df_excluded = pd.concat([df_excluded, outlier_high, outlier_low])
within_bounds = df['value'].between(q_low, q_high)
df = df[within_bounds]

print(f"\nAfter outliers removal: {len(df)} valid transactions")