# Node embeddings

This notebook generates and evaluates node embeddings using Diff2Vec, Role2Vec, and DeepWalk algorithms.
The performance is measured with ENS address pairs as ground-truth.
Each method is run 10 times and the average rank of the target address is reported.

Additionally, a two-step approach for address clustering is tested, which adds time-of-day and/or normalized gas information to the embeddings.
However, adding this information did not improve the average rank of the target.

More information on the node embedding algorithms can be found here:
<code>https://github.com/benedekrozemberczki/karateclub/tree/master</code>

Including the following methods:
- Diff2Vec: <code>https://github.com/benedekrozemberczki/diff2vec</code>
- Role2Vec: <code>https://github.com/benedekrozemberczki/role2vec</code>
- DeepWalk: <code>https://github.com/benedekrozemberczki/karateclub/blob/master/karateclub/node_embedding/neighbourhood/deepwalk.py</code>

In [4]:
from sklearn.preprocessing import StandardScaler
from utils import *

### Read CSVs

In [6]:
# Transfer table that the network graph is built from (its 'from' / 'to' columns become the edges).
# The first CSV column is used as the index.
# Excluding zero-value transfers does not make a significant difference for the resulting graph (nodes: 51566, edges: 249302)
all_transfers_df = pd.read_csv('../data/intra_all_transfers.csv', index_col=[0], low_memory=False)
# ENS address pairs ('addr1' / 'addr2') that serve as ground truth for the evaluation.
ens_pairs = pd.read_csv('../data/ens_pairs.csv', index_col=[0])

### Create the network graph

In [7]:
## Build the transfer network
# One edge per row of the transfer table, pointing from the 'from' to the 'to' address,
# then handed to clean_graph (project helper from utils).
G = clean_graph(
    nx.from_pandas_edgelist(
        all_transfers_df,
        source='from',
        target='to',
        create_using=nx.MultiDiGraph(),
    )
)

# Restrict the analysis to the largest connected component.
# NOTE(review): nx.connected_components only accepts undirected graphs, so this
# presumably relies on clean_graph returning an undirected graph - confirm in utils.
largest_cc = max(nx.connected_components(G), key=len)
G_cc = G.subgraph(largest_cc)

# Relabel the nodes; node_map translates address -> index
G_cc, node_map = recode_graph(G_cc)

# Inverse lookup (index -> address) and the addresses listed in index order
idx_map = {idx: address for address, idx in node_map.items()}
ordered_addresses = [idx_map[position] for position in range(len(node_map))]

print(f'Nodes: {G_cc.number_of_nodes()}\nEdges: {G_cc.number_of_edges()}')

Nodes: 51566
Edges: 249302


### Run node embedding methods once and store the embeddings as a dataframe

In [32]:
# Each method is fitted once on the largest component and its embedding table is
# written to ../data/embeddings/ so later cells can reload it instead of refitting.

# --- Diff2Vec ---
diff2vec = Diff2Vec(
    diffusion_number=10,
    diffusion_cover=40,
    dimensions=128,
    window_size=5,
    learning_rate=0.025,
)
embeddings_d2v, emb_d2v_df = fit_model(G_cc, diff2vec, ordered_addresses)
emb_d2v_df.to_csv('../data/embeddings/diff2vec.csv')

# --- Role2Vec ---
role2vec = Role2Vec(
    walk_number=10,
    walk_length=40,
    dimensions=128,
    window_size=5,
    learning_rate=0.025,
    epochs=5,
)
embeddings_r2v, emb_r2v_df = fit_model(G_cc, role2vec, ordered_addresses)
emb_r2v_df.to_csv('../data/embeddings/role2vec.csv')

# --- DeepWalk ---
deepWalk = DeepWalk(
    walk_number=10,
    walk_length=40,
    dimensions=128,
    window_size=5,
    learning_rate=0.025,
    epochs=5,
)
embeddings_dw, emb_dw_df = fit_model(G_cc, deepWalk, ordered_addresses)
emb_dw_df.to_csv('../data/embeddings/deepWalk.csv')

### Evaluation - Conduct 10 separate experiments and calculate the average rank for each target address

In [8]:
def calculate_average_rank(model, iterations=5):
    """Average the rank of every ENS target address over repeated embedding fits.

    In each iteration the model is re-fitted on the notebook-level graph ``G_cc``,
    a ``DistCalculation`` index is built from the resulting embeddings, and the
    rank of ``addr2`` relative to ``addr1`` is looked up for every row of the
    notebook-level ``ens_pairs`` frame.

    Args:
        model: Embedding estimator that is handed to ``fit_model``.
        iterations: How many times the model is re-fitted and evaluated.

    Returns:
        1-D float array with one entry per row of ``ens_pairs``: the mean rank
        over all iterations, ignoring iterations that produced no rank. Entries
        are ``nan`` for pairs that never received a rank.
    """
    import warnings

    # The pair list does not change between iterations, so extract it once.
    address_pairs = list(zip(ens_pairs['addr1'], ens_pairs['addr2']))
    rank_rows = []

    for _ in tqdm(range(iterations)):
        embeddings, _ = fit_model(G_cc, model, ordered_addresses)
        faiss_index = DistCalculation(embeddings, node_map)

        ranks = []
        for source_address, target_address in address_pairs:
            # get_rank is the (rank, distance) lookup the per-pair ranking cell
            # of this notebook uses on DistCalculation.
            rank, _ = faiss_index.get_rank(source_address, target_address)
            # A missing rank (None) becomes NaN so nanmean can skip it.
            ranks.append(np.nan if rank is None else rank)

        rank_rows.append(ranks)

    # Shape: (iterations, number of pairs)
    rank_arr = np.array(rank_rows, dtype=float)

    with warnings.catch_warnings():
        # Pairs without any rank give an all-NaN column; the resulting
        # "Mean of empty slice" RuntimeWarning is expected, the NaN is the answer.
        warnings.simplefilter('ignore', category=RuntimeWarning)
        average_rank = np.nanmean(rank_arr, axis=0)

    return average_rank

In [21]:
# Diff2Vec: 10 refits, average rank per ENS pair
diff2vec = Diff2Vec(
    diffusion_number=10,
    diffusion_cover=40,
    dimensions=128,
    window_size=5,
    learning_rate=0.025,
)
average_rank_diff2vec = calculate_average_rank(diff2vec, iterations=10)
average_rank_diff2vec

100%|██████████| 10/10 [1:48:29<00:00, 650.93s/it]
  average_rank = np.nanmean(rank_arr, axis=0)


array([  2.5, 577.3,   2.8, 106.8,   nan,  11.1,  30.7,  44.3,  16.3,
         1. , 129.1,   nan,   1. ,   1.9,  10.4,   4. ,   4.2,   5.7,
        52.3,   9. , 474.2,   1.7,   2.7,   1.3,   1. ,   3.7,  16.9,
         3.6,   1.5,   2.7,   2. ,  64.8,   3. ,  11. ,  52.1,  10.3,
         4.2,   2.3,  34. ])

In [22]:
# Role2Vec: 10 refits, average rank per ENS pair
role2vec = Role2Vec(
    walk_number=10,
    walk_length=40,
    dimensions=128,
    window_size=5,
    learning_rate=0.025,
    epochs=5,
)
average_rank_role2vec = calculate_average_rank(role2vec, iterations=10)
average_rank_role2vec

100%|██████████| 10/10 [1:37:16<00:00, 583.60s/it]
  average_rank = np.nanmean(rank_arr, axis=0)


array([  6.2,  33.3,   2.5,  71.1,   nan,   3. ,   1. ,  91.2,  10.4,
         1. ,  10. ,   nan,   1. ,   1. ,   1. ,   1. ,   1.5,   5.6,
       373.8,   9. ,  86.7,   5. ,   1. ,   3.4,   1. ,   2. ,  14.4,
         2.6,   2. ,   1. ,   2. ,  13.6,   1.1,  19.3,   5.6,  68.9,
         4. ,   2.9, 178.6])

In [23]:
# DeepWalk: 10 refits, average rank per ENS pair
deepWalk = DeepWalk(
    walk_number=10,
    walk_length=40,
    dimensions=128,
    window_size=5,
    learning_rate=0.025,
    epochs=5,
)
average_rank_deepWalk = calculate_average_rank(deepWalk, iterations=10)
average_rank_deepWalk

100%|██████████| 10/10 [41:27<00:00, 248.74s/it]
  average_rank = np.nanmean(rank_arr, axis=0)


array([  2.7,  58.8,   3.3,  20.1,   nan,   1.4,   1. , 545.4,   4.1,
         1. , 605.4,   nan,   1.8,   1. ,   1. ,  18.2,  10.3,  27.2,
       108.5,   9. ,  51.8,   2.4,   1. ,   2.3,   2.4,   2.2, 284. ,
         2.2,   1.7,   1.6,   1.1, 101.2,   1.6,   1.7,   3.4,  47.3,
       227. ,   3.4, 138.6])

### Combine the results and compute mean, median and std for each method

In [24]:
# Put the three per-pair average-rank vectors side by side (one column per method)
average_rank_array = np.column_stack(
    (average_rank_diff2vec, average_rank_role2vec, average_rank_deepWalk)
)
average_rank = pd.DataFrame(
    average_rank_array,
    columns=['Diff2Vec', 'Role2Vec', 'DeepWalk'],
)
average_rank.to_csv('../data/average_rank.csv', index=False)

# Per-method summary statistics over all ENS pairs
mean, median, std = average_rank.mean(), average_rank.median(), average_rank.std()

# Report
print('Mean:\n', mean)
print('\nMedian:\n', median)
print('\nStandard Deviation:\n', std)

Mean:
 Diff2Vec    46.037838
Role2Vec    28.072973
DeepWalk    62.083784
dtype: float64

Median:
 Diff2Vec    4.2
Role2Vec    3.4
DeepWalk    3.3
dtype: float64

Standard Deviation:
 Diff2Vec    120.504083
Role2Vec     68.814003
DeepWalk    139.931865
dtype: float64


# Two-step approach


**Initialize distance calculation classes**
To avoid recalculating the node embeddings every time, they were stored as CSV files. These stored embeddings are used to initialize the distance calculation classes.

In [9]:
# 1) Reload the stored embedding tables (indexed by address)
emb_d2v_df = pd.read_csv('../data/embeddings/diff2vec.csv', index_col='address')
emb_r2v_df = pd.read_csv('../data/embeddings/role2vec.csv', index_col='address')
emb_dw_df = pd.read_csv('../data/embeddings/deepWalk.csv', index_col='address')

# 2) Raw embedding matrices
embeddings_d2v = emb_d2v_df.values
embeddings_r2v = emb_r2v_df.values
embeddings_dw = emb_dw_df.values

# 3) One distance index per method (Diff2Vec, Role2Vec, DeepWalk)
faiss_index_d2v = DistCalculation(embeddings_d2v, node_map)
faiss_index_r2v = DistCalculation(embeddings_r2v, node_map)
faiss_index_deepWalk = DistCalculation(embeddings_dw, node_map)

### Add Time-of-Day-Activity and/or gas price selection vectors

In [73]:
# Load the per-address feature tables (both indexed by the 'from' column)
timeOfDay = pd.read_csv('../data/timeOfDay.csv', index_col='from')
normalizedGas = pd.read_csv('../data/normalizedGas.csv', index_col='from')

# Keep only the addresses that appear in both tables
stats_df = pd.merge(
    timeOfDay,
    normalizedGas,
    how='inner',
    left_index=True,
    right_index=True,
)

# Standardize every feature column, keeping the original labels
scaler = StandardScaler()
stats_normalized = pd.DataFrame(
    scaler.fit_transform(stats_df),
    index=stats_df.index,
    columns=stats_df.columns,
)

def filter_dataframe(df, argument, include_hist=False):
    """Select the time-of-day and/or gas-price feature columns of ``df``.

    Columns are picked by substring match on their names: 'ToD' for the
    time-of-day features, 'normalized_gasPrice' for the gas features and
    'hist_bin_' for histogram-bin columns.

    Args:
        df: Feature DataFrame to select columns from.
        argument: Which feature family to keep: 'ToD', 'gas' or 'both'
            (case-insensitive).
        include_hist: When False (default), columns whose name contains
            'hist_bin_' are dropped from the selection.

    Returns:
        ``df`` restricted to the selected columns, in their original order.

    Raises:
        ValueError: If ``argument`` is not one of the accepted values.
    """
    feature_kind = argument.lower()

    if feature_kind == 'tod':
        selected_columns = [col for col in df.columns if 'ToD' in col]
    elif feature_kind == 'gas':
        selected_columns = [col for col in df.columns if 'normalized_gasPrice' in col]
    elif feature_kind == 'both':
        # 'both' keeps every column of df, not only names matching the two substrings.
        selected_columns = list(df.columns)
    else:
        # The message lists the values that are actually accepted above.
        raise ValueError("Invalid argument. Expected 'ToD', 'gas', or 'both'.")

    if not include_hist:
        selected_columns = [col for col in selected_columns if 'hist_bin_' not in col]

    return df[selected_columns]


def compute_relative_rank(df, source_address, target_address, num_neighbors=10, dist_index=None):
    """Re-rank the target among the source's nearest embedding neighbours using ``df``.

    Step 1 takes the first ``num_neighbors`` indices returned by
    ``dist_index.get_dist_idx`` for the source address. Step 2 orders those
    candidates (plus the source itself) by Euclidean distance to the source in
    the feature space given by ``df`` and reports where the target ends up.

    Args:
        df: Feature DataFrame indexed by address.
        source_address: Address whose neighbourhood is searched.
        target_address: Address whose position in that neighbourhood is wanted.
        num_neighbors: Number of embedding neighbours to consider.
        dist_index: Object providing ``get_dist_idx(node_index)``. Defaults to
            the notebook-level ``faiss_index_d2v`` (the previous hard-wired
            behaviour); pass another index to re-rank the neighbours of a
            different embedding.

    Returns:
        Zero-based position of the target in the distance-sorted candidates
        (the source itself is a candidate at distance 0), or ``None`` when
        either address is unknown to ``node_map``, the target is not among the
        candidates, or either address has no row in ``df``.
    """
    if dist_index is None:
        dist_index = faiss_index_d2v

    if source_address not in node_map or target_address not in node_map:
        return None

    # Only the neighbour indices are needed; the distances are discarded.
    _, neighbor_indices = dist_index.get_dist_idx(node_map[source_address])
    candidate_addresses = [source_address]
    candidate_addresses.extend(idx_map[j] for j in list(neighbor_indices[0:num_neighbors]))

    if target_address not in candidate_addresses:
        print(f'Target address not in nearest {num_neighbors} neighbors')
        return None

    # Feature rows of the candidates only
    df_filtered = df[df.index.isin(candidate_addresses)]

    # Without a feature row for both addresses no rank can be computed;
    # return None instead of failing with a KeyError further down.
    if source_address not in df_filtered.index or target_address not in df_filtered.index:
        return None

    # Euclidean distance between the source's feature vector and every candidate
    distances = np.linalg.norm(df_filtered.loc[source_address] - df_filtered, axis=1)

    df_distances = pd.DataFrame(
        distances, columns=['distance'], index=df_filtered.index
    ).sort_values(by='distance')

    return df_distances.index.get_loc(target_address)

In [74]:
# Test: smoke-check the two-step ranking on a single pair
# (these are the two addresses of the atearnz.eth row in ens_pairs).
# Time-of-day features, histogram-bin columns included.
df = filter_dataframe(stats_normalized, 'tod', include_hist=True)
source_address = '0x5500c014dc83d18c60858195b42f2e61f877323e'
target_address = '0x29d8bf1894fb077edcd5a6ee8ed475b6aed5dbb4'
# Uses compute_relative_rank's default num_neighbors.
rank = compute_relative_rank(df, source_address, target_address)
print(rank)

2


In [75]:
# Distance index over the Role2Vec embeddings
# (embeddings_r2v and node_map come from earlier cells)
calc = DistCalculation(embeddings_r2v, node_map)


def compute_rank_distance(row):
    """Return the (rank, distance) that ``calc.get_rank`` reports for one ENS pair as a Series."""
    pair_rank, pair_distance = calc.get_rank(row['addr1'], row['addr2'])
    return pd.Series([pair_rank, pair_distance])


# Row-wise lookup; the two result columns are stored on ens_pairs itself
ens_pairs[['Rank', 'Distance']] = ens_pairs.apply(compute_rank_distance, axis=1)
ens_pairs

Unnamed: 0_level_0,addr1,addr2,Rank,Distance
ens_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
anisofim.eth,0x85e5472752a6f00775faca4d5179bde1081571b0,0xbf886e3069d0dfd64e384c93da322f775faa8876,5.0,0.435835
arisalzberg.eth,0x94e59547b8c68924380c90e729488f3e79ff8d22,0x8977e750a620607f3db83436360ae1bdacb28b82,21.0,0.978317
atearnz.eth,0x5500c014dc83d18c60858195b42f2e61f877323e,0x29d8bf1894fb077edcd5a6ee8ed475b6aed5dbb4,3.0,0.494003
awedjob.eth,0xc939519869c946e4bdca8fd0b6459048c4aebae2,0xe1eedbd1e08478707c794e7e8b1ee623f5fa6d64,34.5,0.835938
captvicky.eth,0x33e51a1141c44cc8bcb9b50c4a3c7ab59f4cf68c,0x42b21ca9cc96d0f71ed2533b33d659207a88f784,,
disruptor.eth,0xb84914f420b6de809ac1ed3d4427d90ed596b41c,0x50839a40d9d7ea49e2c6fd76ee9d184524e4c072,2.0,0.443547
dragonkiller.eth,0x4fdafeadb7c2f127e3c1719a99c01680cc8ddab1,0xffca46238dd656de99ac616da099dcb6dfb0f2f4,1.0,0.243687
eibriel.eth,0x05c351382db8d770207f319d96ac1184c3717ede,0xcf10cd8b5dc2323b1eb6de6164647756bad4de4d,113.0,1.006223
epdrabbit.eth,0x26846c9083c4ab525de593178e3f2f82afb8ba83,0x0786a24145fef2c60a38237e8671332899ce7c1f,9.0,0.774624
erikarand.eth,0xe005c90eb8b38938cfe722e7069118414145709b,0xd9c0e1af68d08c8c00d418431e8c036662a82e37,1.0,0.550495


In [76]:
# Only evaluate address pairs where the target address is already within the 10 nearest neighbors.
# NOTE(review): 'Rank' was computed with the Role2Vec index (calc), while compute_relative_rank
# looks neighbours up in faiss_index_d2v by default - confirm this mix is intended.
ens_pairs_filtered = ens_pairs[ens_pairs['Rank'] <= 10].copy()


def _relative_ranks(features):
    """Apply compute_relative_rank to every row of ens_pairs_filtered using ``features``."""
    return ens_pairs_filtered.apply(
        lambda pair: compute_relative_rank(features, pair['addr1'], pair['addr2']),
        axis=1,
    )


# Time-of-day features, without / with histogram bins
ToD_noHist = filter_dataframe(stats_normalized, 'ToD', include_hist=False)
ens_pairs_filtered['ToD_noHist'] = _relative_ranks(ToD_noHist)

ToD_Hist = filter_dataframe(stats_normalized, 'ToD', include_hist=True)
ens_pairs_filtered['ToD_Hist'] = _relative_ranks(ToD_Hist)

# Normalized gas features, without / with histogram bins
nGas_noHist = filter_dataframe(stats_normalized, 'gas', include_hist=False)
ens_pairs_filtered['nG_noHist'] = _relative_ranks(nGas_noHist)

nGas_Hist = filter_dataframe(stats_normalized, 'gas', include_hist=True)
ens_pairs_filtered['nG_Hist'] = _relative_ranks(nGas_Hist)

# Both feature families, without / with histogram bins
both_noHist = filter_dataframe(stats_normalized, 'both', include_hist=False)
ens_pairs_filtered['both_noHist'] = _relative_ranks(both_noHist)

both_Hist = filter_dataframe(stats_normalized, 'both', include_hist=True)
ens_pairs_filtered['both_Hist'] = _relative_ranks(both_Hist)

Target address not in nearest 20 neighbors
Target address not in nearest 20 neighbors
Target address not in nearest 20 neighbors
Target address not in nearest 20 neighbors
Target address not in nearest 20 neighbors
Target address not in nearest 20 neighbors
Target address not in nearest 20 neighbors
Target address not in nearest 20 neighbors
Target address not in nearest 20 neighbors
Target address not in nearest 20 neighbors
Target address not in nearest 20 neighbors
Target address not in nearest 20 neighbors
Target address not in nearest 20 neighbors
Target address not in nearest 20 neighbors
Target address not in nearest 20 neighbors
Target address not in nearest 20 neighbors
Target address not in nearest 20 neighbors
Target address not in nearest 20 neighbors


In [77]:
# Numeric result columns only; the listed columns are removed when they are numeric
exclude_columns = ['addr1', 'addr2', 'Distance']
numeric_cols = ens_pairs_filtered.select_dtypes(include=[np.number])
columns_to_drop = [name for name in exclude_columns if name in numeric_cols.columns]
numeric_cols = numeric_cols.drop(columns=columns_to_drop)

# Per-column summary statistics
df_mean, df_median, df_std = numeric_cols.mean(), numeric_cols.median(), numeric_cols.std()

# One row per evaluated column.
# NOTE(review): this rebinds stats_df, which earlier held the merged time-of-day / gas
# features - re-run that cell before using stats_df as a feature table again.
stats_df = pd.DataFrame(dict(
    Column_Name=numeric_cols.columns,
    mean=df_mean.values,
    median=df_median.values,
    std=df_std.values,
))

print(stats_df)

   Column_Name      mean  median       std
0         Rank  3.810345     2.0  3.860084
1   ToD_noHist  4.000000     3.0  3.633180
2     ToD_Hist  5.500000     3.0  4.925444
3    nG_noHist  6.846154     4.0  6.116812
4      nG_Hist  7.307692     6.0  6.051573
5  both_noHist  4.923077     3.0  3.979177
6    both_Hist  5.615385     4.0  4.775579
