<a href="https://colab.research.google.com/github/epowell101/mscGNN-work/blob/main/EDA_and_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Required Libraries
import pandas as pd
import os
from glob import glob
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import pandas as pd
from google.colab import drive

# Attach Google Drive to the Colab runtime so the data file is reachable.
drive.mount('/content/drive')

# Step 1: File Exploration
# -------------------------
# Absolute location of the transaction Parquet file on the mounted Drive.
exact_path = '/content/drive/My Drive/ETH data/tx_data_loki_gr15.parquet'

# Read the Parquet file and keep only rows that have both a sender and a
# receiver address (rows missing either one are dropped).
df = pd.read_parquet(exact_path).dropna(subset=['from_address', 'to_address'])

# Work on a random 10% of the rows (change `frac` to sample more or less).
# No random_state is passed, so each run draws a different sample.
sample_df = df.sample(frac=0.10)

# Build a directed transaction graph from the sampled rows: every address is
# a node and every (from_address, to_address) pair is an edge.
# nx.DiGraph stores a single edge per ordered pair, so when the sample holds
# several transactions between the same two addresses, the attributes of the
# last one processed replace those of the earlier ones.
G = nx.DiGraph()

# Walk the needed columns in lockstep rather than with DataFrame.iterrows(),
# which constructs a full Series object for every row and is far slower on
# large samples. Row order (and therefore which duplicate edge wins) is the
# same as before.
for sender, receiver, timestamp, value, gas, fee in zip(
    sample_df['from_address'],
    sample_df['to_address'],
    sample_df['block_timestamp'],
    sample_df['eth_value'],
    sample_df['gas_used'],
    sample_df['tx_fee'],
):
    G.add_edge(
        sender,
        receiver,
        block_timestamp=timestamp,
        eth_value=value,
        gas_used=gas,
        tx_fee=fee,
    )

# Tag every node with the roles it plays in the sampled data. A node's
# 'type' attribute is a list that may contain 'From', 'To' and/or 'EOA',
# depending on which of the sample_df columns the address appears in.
#
# The membership sets are built once up front: testing
# `node in sample_df[col].values` rescans the whole column for every node,
# which is quadratic in the sample size.
from_addresses = set(sample_df['from_address'])
to_addresses = set(sample_df['to_address'])
eoa_addresses = set(sample_df['EOA'])

for node in G.nodes():
    node_type = []
    if node in from_addresses:
        node_type.append('From')
    if node in to_addresses:
        node_type.append('To')
    if node in eoa_addresses:
        node_type.append('EOA')
    G.nodes[node]['type'] = node_type

# deque gives O(1) pops from the front of the BFS queue; list.pop(0) shifts
# the whole list on every dequeue.
from collections import deque

# Maps each EOA to the size of its outgoing neighbourhood.
# NOTE: despite the name, the stored value is the NUMBER OF NODES reachable
# from the EOA (the EOA itself included) within `max_depth` hops along
# outgoing edges, not the length of the longest path.
depth_dict = {}

# Maximum number of hops to follow away from the starting EOA.
max_depth = 10

# Breadth-first traversal from every unique EOA in the sample.
for eoa in sample_df['EOA'].unique():
    visited = set()
    to_explore = deque([(eoa, 0)])
    while to_explore:
        current_node, current_depth = to_explore.popleft()
        if current_node in visited or current_depth > max_depth:
            continue
        # An EOA from the sample is not necessarily a node of the graph.
        if current_node not in G:
            continue
        visited.add(current_node)
        # Skipping already-visited neighbours keeps the queue small; they
        # would be discarded when popped anyway, so the result is unchanged.
        to_explore.extend(
            (neighbor, current_depth + 1)
            for neighbor in G.successors(current_node)
            if neighbor not in visited
        )
    depth_dict[eoa] = len(visited)

# Collect every edge that leads from a sampled EOA to a node tagged 'EOA'.
# Each entry of eoa_to_eoa_list is a (source_eoa, neighbor) tuple.
eoa_to_eoa_list = []

for eoa in sample_df['EOA'].unique():
    # G.successors() raises NetworkXError for a node that is not in the
    # graph, so skip such EOAs, mirroring the guard used in the depth
    # traversal.
    if eoa not in G:
        continue
    for neighbor in G.successors(eoa):
        # 'type' is a list of role tags; a node without it counts as untagged.
        if 'EOA' in G.nodes[neighbor].get('type', []):
            eoa_to_eoa_list.append((eoa, neighbor))

# One list entry per connection, so the count is simply the list length.
eoa_to_eoa_count = len(eoa_to_eoa_list)

print(f"Number of EOA to EOA connections: {eoa_to_eoa_count}")
print(f"List of EOA to EOA connections: {eoa_to_eoa_list}")

# Tabulate the per-EOA values from depth_dict: one row per EOA, with the
# dictionary key in 'EOA' and the dictionary value in 'Depth'.
depth_df = pd.DataFrame({
    'EOA': list(depth_dict.keys()),
    'Depth': list(depth_dict.values()),
})

# Print describe() output for the 'Depth' column.
print("Depth statistics:")
print(depth_df['Depth'].describe())

# Step 3: Basic EDA
# -----------------
# Peek at the first rows of the filtered (unsampled) DataFrame.
print("First few rows of the DataFrame:")
print(df.head())

# Distinct-value counts for the EOA, sender and receiver columns.
unique_eoas, unique_from, unique_to = (
    df[column].nunique() for column in ('EOA', 'from_address', 'to_address')
)
print(f"Number of unique EOAs: {unique_eoas}")
print(f"Number of unique from addresses: {unique_from}")
print(f"Number of unique to addresses: {unique_to}")

# describe() output for the DataFrame.
print("Summary statistics:")
print(df.describe())

# Measure depth of transactions for each EOA
# depth_dict and max_depth are intentionally NOT re-initialised here. The
# values were computed by the traversal earlier in this cell; resetting
# depth_dict to {} at this point replaced depth_df with an empty frame and
# printed statistics for no data (twice).

# Convert depth dictionary to DataFrame for easier manipulation and plotting
depth_df = pd.DataFrame(list(depth_dict.items()), columns=['EOA', 'Depth'])

# Basic statistics on depth
print("Depth statistics:")
print(depth_df['Depth'].describe())

# Calculate centrality metrics for all nodes. Each call returns a dict keyed
# by node.
degree_centrality = nx.degree_centrality(G)
closeness_centrality = nx.closeness_centrality(G)
betweenness_centrality = nx.betweenness_centrality(G)
# eigenvector_centrality = nx.eigenvector_centrality(G)

# Per-EOA centrality metrics, split into two groups:
#   eoa_to_eoa_centrality - EOAs that are the source of at least one
#                           EOA-to-EOA connection
#   other_eoa_centrality  - every other sampled EOA present in the graph
eoa_to_eoa_centrality = {}
other_eoa_centrality = {}

# Build the set of source EOAs once; rebuilding a list from eoa_to_eoa_list
# for every EOA made this loop quadratic.
eoa_to_eoa_sources = {edge[0] for edge in eoa_to_eoa_list}

# Populate the dictionaries
for eoa in sample_df['EOA'].unique():
    # EOAs that never became graph nodes have no centrality values.
    if eoa not in G:
        continue

    centrality_metrics = {
        'degree': degree_centrality.get(eoa, 0),
        'closeness': closeness_centrality.get(eoa, 0),
        'betweenness': betweenness_centrality.get(eoa, 0),
    }

    if eoa in eoa_to_eoa_sources:
        eoa_to_eoa_centrality[eoa] = centrality_metrics
    else:
        other_eoa_centrality[eoa] = centrality_metrics

# Split the sampled transactions into two groups by their 'EOA' column:
# rows whose EOA is the source of at least one EOA-to-EOA connection, and
# all remaining rows.
eoa_to_eoa_sources = {edge[0] for edge in eoa_to_eoa_list}
is_eoa_to_eoa = sample_df['EOA'].isin(eoa_to_eoa_sources)
eoa_to_eoa_df = sample_df[is_eoa_to_eoa]
other_eoa_df = sample_df[~is_eoa_to_eoa]

# Calculate average transaction sizes and their variance
avg_size_eoa_to_eoa = eoa_to_eoa_df['eth_value'].mean()
var_size_eoa_to_eoa = eoa_to_eoa_df['eth_value'].var()

avg_size_other_eoa = other_eoa_df['eth_value'].mean()
var_size_other_eoa = other_eoa_df['eth_value'].var()

# Calculate mean and standard deviation for each group.
# numeric_only=True is required: the frames also hold non-numeric columns
# (e.g. addresses), and pandas >= 2.0 raises TypeError instead of silently
# dropping them.
eoa_to_eoa_mean = eoa_to_eoa_df.mean(numeric_only=True)
eoa_to_eoa_std = eoa_to_eoa_df.std(numeric_only=True)

other_eoa_mean = other_eoa_df.mean(numeric_only=True)
other_eoa_std = other_eoa_df.std(numeric_only=True)

# Keep only the metrics present in all four Series so the arrays below have
# the same length. common_metrics is an ordered list (DataFrame column
# order) rather than a set: pandas >= 2.0 rejects set indexers, and a list
# guarantees the 'Metric' labels line up with the values.
shared_metric_names = (
    set(eoa_to_eoa_mean.index)
    & set(eoa_to_eoa_std.index)
    & set(other_eoa_mean.index)
    & set(other_eoa_std.index)
)
common_metrics = [name for name in eoa_to_eoa_mean.index if name in shared_metric_names]

# Filter the Series objects to include only the common metrics
eoa_to_eoa_mean = eoa_to_eoa_mean[common_metrics]
eoa_to_eoa_std = eoa_to_eoa_std[common_metrics]
other_eoa_mean = other_eoa_mean[common_metrics]
other_eoa_std = other_eoa_std[common_metrics]

# Convert the centrality dictionaries to DataFrames (one row per EOA, one
# column per centrality metric) for easier manipulation
eoa_to_eoa_centrality_df = pd.DataFrame.from_dict(eoa_to_eoa_centrality, orient='index')
other_eoa_centrality_df = pd.DataFrame.from_dict(other_eoa_centrality, orient='index')

# Create a summary DataFrame: one row per metric, mean/std for each group
summary_df = pd.DataFrame({
    'Metric': list(common_metrics),
    'EOA_to_EOA_Mean': eoa_to_eoa_mean.values,
    'EOA_to_EOA_Std': eoa_to_eoa_std.values,
    'Other_EOA_Mean': other_eoa_mean.values,
    'Other_EOA_Std': other_eoa_std.values
})

# Display the summary DataFrame
print(summary_df)

print(f"EOA to EOA Avg Size: {avg_size_eoa_to_eoa}, Variance: {var_size_eoa_to_eoa}")
print(f"Other EOA Avg Size: {avg_size_other_eoa}, Variance: {var_size_other_eoa}")

# Calculate mean and standard deviation for centrality metrics
eoa_to_eoa_centrality_mean = eoa_to_eoa_centrality_df.mean()
eoa_to_eoa_centrality_std = eoa_to_eoa_centrality_df.std()

other_eoa_centrality_mean = other_eoa_centrality_df.mean()
other_eoa_centrality_std = other_eoa_centrality_df.std()

# Ordered, de-duplicated list of the metric names seen in either group.
# If one group is empty its DataFrame has no columns, so its mean/std Series
# are empty; reindexing every Series to this shared list keeps all columns
# the same length (NaN for the missing group) instead of raising ValueError.
centrality_metric_names = list(dict.fromkeys(
    [*eoa_to_eoa_centrality_mean.index, *other_eoa_centrality_mean.index]
))

# Create a summary DataFrame for centrality metrics
centrality_summary_df = pd.DataFrame({
    'Centrality_Metric': centrality_metric_names,
    'EOA_to_EOA_Mean': eoa_to_eoa_centrality_mean.reindex(centrality_metric_names).values,
    'EOA_to_EOA_Std': eoa_to_eoa_centrality_std.reindex(centrality_metric_names).values,
    'Other_EOA_Mean': other_eoa_centrality_mean.reindex(centrality_metric_names).values,
    'Other_EOA_Std': other_eoa_centrality_std.reindex(centrality_metric_names).values
})

# Display the summary DataFrame for centrality metrics
print("Summary statistics for centrality metrics:")
print(centrality_summary_df)

# Report how many missing entries each column of the filtered DataFrame has.
print("Missing values:")
missing_per_column = df.isna().sum()
print(missing_per_column)

def df_to_markdown(df):
    """Return *df* rendered as a Markdown table, without the index column.

    ``DataFrame.to_markdown`` already emits the header separator line, so no
    extra ``---`` row is inserted: doing so added a bogus data row to the
    table and forced every numeric column to be formatted as text.

    Args:
        df: The DataFrame to render.

    Returns:
        The Markdown table as a string.

    Raises:
        ImportError: If the optional ``tabulate`` package, which pandas uses
            for ``to_markdown``, is not installed.
    """
    return df.to_markdown(index=False)

# Render both summary tables as Markdown: the metric summary first, then the
# centrality summary.
summary_markdown = df_to_markdown(summary_df)
centrality_summary_markdown = df_to_markdown(centrality_summary_df)

# Print the two tables one after the other, each on its own line.
print(summary_markdown, centrality_summary_markdown, sep="\n")


# Data distribution (use histograms or boxplots)
# sns.pairplot(df)
# plt.show()

# Step 4: Data Preprocessing
# ---------------------------
# Perform any required preprocessing steps here. This will be influenced by your script.
# For example, if you want to bucketize a column:
# df['amount_bucket'] = pd.cut(df['amount'], bins=[0, 1000, 10000, 100000, float('inf')])

# Step 5: Save Processed Data
# ---------------------------
# Write the filtered DataFrame (`df`, not the 10% sample) to a new Parquet
# file on the mounted Drive.
processed_file_path = '/content/drive/My Drive/ETH data/Sybildata_first.parquet'
df.to_parquet(path=processed_file_path)


Mounted at /content/drive


In [6]:
# creating a function for a prettier table

from IPython.display import display, HTML

def display_styled_dataframe(df):
    """Render *df* in the notebook with compact float formatting.

    Every ``float64`` column is formatted with ``"{:.2}"`` (two significant
    digits); all other columns keep the default Styler rendering. The styled
    table is shown via ``display``; nothing is returned.
    """
    float_formats = dict.fromkeys(
        df.select_dtypes(include=['float64']).columns, "{:.2}"
    )
    display(df.style.format(float_formats))

# Show the metric summary first, then the centrality summary.
for _summary_table in (summary_df, centrality_summary_df):
    display_styled_dataframe(_summary_table)


Unnamed: 0,Metric,EOA_to_EOA_Mean,EOA_to_EOA_Std,Other_EOA_Mean,Other_EOA_Std
0,eth_value,0.25,2.5,0.25,2.0
1,tx_fee,0.0055,0.01,0.0053,0.031
2,gas_limit,230000.0,690000.0,170000.0,270000.0
3,gas_used,130000.0,320000.0,110000.0,150000.0
4,__row_index,29000.0,21000.0,27000.0,21000.0


Unnamed: 0,Centrality_Metric,EOA_to_EOA_Mean,EOA_to_EOA_Std,Other_EOA_Mean,Other_EOA_Std
0,degree,0.0011,0.0012,0.00045,0.00053
1,closeness,0.00011,0.00032,6.9e-05,0.00026
2,betweenness,1.6e-07,5.7e-07,4.5e-08,3.5e-07


Centrality Metrics
1. Degree Centrality
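Degree Centrality measures the number of edges a node has; NetworkX reports it normalized, i.e. as the fraction of the other nodes in the graph that the node is directly connected to. In the context of Ethereum transactions, it indicates how many different addresses an EOA is interacting with, either as a sender or a receiver. Because the directed graph keeps a single edge per sender–receiver pair, a higher degree centrality signifies that the EOA transacts with more distinct counterparties, not necessarily that it is involved in more individual transactions.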

2. Closeness Centrality
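Closeness Centrality gauges how close a node is to all other nodes in the network, based on the shortest paths. For a given node, you find the shortest path between it and every other node it is connected to, average those lengths, and take the reciprocal, so a lower average length gives a higher closeness score. For a directed graph such as this one, NetworkX measures the distance along incoming paths, so in the Ethereum network a higher score means other addresses can reach the EOA through fewer hops, making it more central in the network.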

3. Betweenness Centrality
Betweenness Centrality quantifies how often a node appears on the shortest paths between other nodes. In this context, a higher betweenness centrality indicates that the EOA acts as a kind of "bridge" within the network, connecting various parts of the Ethereum ecosystem.