# Cluster

In this step we will group the attacker accounts and bot contracts into attacker clusters.

In [1]:
import collections
import json
import os
import tarfile

import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
from pandas.plotting import register_matplotlib_converters
from web3 import Web3


register_matplotlib_converters()

%matplotlib inline

plt.style.use("seaborn")

Consider installing rusty-rlp to improve pyrlp performance with a rust based backend


You can change the data directory if you use a different project structure.

In [2]:
# the data folder sits three levels above the notebook's working directory
data_directory = os.path.join(os.pardir, os.pardir, os.pardir, "data")

Decompress the results file (if needed).

In [3]:
results_file_path = os.path.join(data_directory, "insertion_results.json")

# only unpack when the plain json file is not there yet
if not os.path.exists(results_file_path):
    # NOTE(review): the archive file name is an assumption, the notebook never
    # defined `compressed_path`; confirm it against the project's data folder
    compressed_path = os.path.join(data_directory, "insertion_results.tar.xz")

    # mode "r" lets tarfile detect the compression of the archive by itself
    with tarfile.open(compressed_path, "r", encoding="utf-8") as compressed_file:
        compressed_file.extract("insertion_results.json", data_directory)

Read the results one line at a time. Each line is json encoded.

In [4]:
# every non-empty line of the file holds one json encoded result
results = []
with open(results_file_path, "r", encoding="utf-8") as results_file:
    # iterate over the whole file: a blank line is skipped instead of being
    # treated as the end of the data, so nothing after it is silently dropped
    for line in results_file:
        line = line.strip()
        if line:
            results.append(json.loads(line))

We create a graph where each node is either an attacker account (the sender of the first or of the second transaction in the sandwich) or a bot contract. We also create an edge between an attacker account and a bot contract when they are, respectively, the source and the target of an insertion attack transaction.

In [5]:
role_graph = nx.Graph()
bots = set()

for result in results:
    # only results that went through a bot contract take part in the clustering
    if result["interface"] != "bot":
        continue

    # senders of the first and the second transaction of the sandwich
    attackers = (
        result["first_transaction"]["from"],
        result["second_transaction"]["from"],
    )
    bot = result["bot_address"]

    # remember every bot address we have seen
    bots.add(bot)

    # create the attacker nodes the first time each account shows up
    for attacker in attackers:
        if not role_graph.has_node(attacker):
            role_graph.add_node(attacker, role="attacker")

    # create the bot node with zeroed stats the first time it shows up
    if not role_graph.has_node(bot):
        role_graph.add_node(bot, role="bot", attacks=0, cost=0, profit=0)

    # accumulate this attack into the bot stats
    bot_stats = role_graph.nodes[bot]
    bot_stats["attacks"] += 1
    bot_stats["cost"] += result["cost_usd"]
    bot_stats["profit"] += result["profit_usd"]

    # link each attacker account to the bot it called (one edge per pair)
    for attacker in attackers:
        if not role_graph.has_edge(attacker, bot):
            role_graph.add_edge(attacker, bot, role="attack")

Find all the byte code files (downloaded in step 1) and keep them by bot address.

In [6]:
bytecode_file_name_by_bot = {}
bytecode_directory = os.path.join(data_directory, "insertion_bot_bytecode")

# os.listdir returns the names in arbitrary order; sorting them makes the
# file kept for an address reproducible when several files share the address
# (the last name in sorted order wins)
for file_name in sorted(os.listdir(bytecode_directory)):
    # validate the extension
    assert file_name.endswith(".bin"), f"unexpected file in byte code directory: {file_name}"

    # validate the name format {address}-{block}.bin
    tokens = file_name.split("-")
    assert len(tokens) == 2, f"unexpected byte code file name: {file_name}"

    # keep the file name by bot address (first token of the file name)
    bot = tokens[0]
    bytecode_file_name_by_bot[bot] = file_name

A function to load the byte code of a bot.

In [7]:
def load_bot_bytecode(bot):
    """Return the raw content of the byte code file indexed for `bot`.

    The address is passed through `Web3.toChecksumAddress` before it is looked
    up in `bytecode_file_name_by_bot`. When no file is indexed for the
    resulting address, empty bytes are returned.
    """
    checksum_address = Web3.toChecksumAddress(bot)
    file_name = bytecode_file_name_by_bot.get(checksum_address)

    # no file indexed for this address: return no bytes by default
    if file_name is None:
        return bytes()

    with open(os.path.join(bytecode_directory, file_name), "rb") as bytecode_file:
        return bytecode_file.read()

Check that there are no bots with empty byte code (can happen when we download the byte code of a destroyed contract).

In [8]:
print("Bots with empty bytecode:")

# collect the bots for which no byte code could be loaded
bots_without_bytecode = [
    candidate for candidate in bots if len(load_bot_bytecode(candidate)) == 0
]
for bot in bots_without_bytecode:
    print(bot)

no_bytecode_count = len(bots_without_bytecode)
if no_bytecode_count == 0:
    print("None")

Bots with empty bytecode:
None


Go through every bot pair and create an edge between them if they have exactly the same byte code.

In [9]:
bytecode_matched_by_bot = {}

# sort the bots by address to do this deterministically
sorted_bots = sorted(bots)

# read every byte code file once up front instead of once per compared pair
bytecode_by_bot = {bot: load_bot_bytecode(bot) for bot in sorted_bots}

# iterate through each sorted bot (skip the last one, it has no bot after it)
for i, bot in enumerate(sorted_bots[:-1]):
    assert role_graph.has_node(bot)

    # bots already linked to an earlier bot with the same byte code are skipped
    if bot in bytecode_matched_by_bot:
        continue
    bytecode_matched_by_bot[bot] = True  # not really necessary, we will not visit this one again

    bytecode = bytecode_by_bot[bot]

    # do not match if the bot has no byte code
    # this can happen if we did not download the code yet
    # or if the bot was actually an account and should be checked
    if len(bytecode) == 0:
        continue

    # iterate through each sorted bot after the current one
    for other_bot in sorted_bots[i + 1:]:
        assert role_graph.has_node(other_bot)

        # `bytecode` is not empty here, so a bot without byte code can never
        # compare equal and needs no separate check
        if bytecode_by_bot[other_bot] == bytecode:
            bytecode_matched_by_bot[other_bot] = True

            # if the byte codes match add an edge between them
            assert not role_graph.has_edge(bot, other_bot)
            role_graph.add_edge(bot, other_bot, role="bytecode_match")

Create the clusters using an algorithm to detect connected components in the graph.

In [10]:
cluster_id = 1
bot_hash_to_cluster_id = {}
attacker_hash_to_cluster_id = {}
clusters_by_id = {}
cluster_stats = []
big_cluster_count = 0

# iterate through each connected component of the graph
for component in nx.connected_components(role_graph):
    # count members by role
    role_counter = collections.Counter(role_graph.nodes[node]["role"] for node in component)

    # check if there is more than one bot
    if role_counter["bot"] > 1:
        big_cluster_count += 1

    cluster = []
    # distinct byte codes seen among the bots of this cluster
    unique_bytecodes = set()

    attacks = 0
    cost = 0
    profit = 0

    # the component is a set of address strings, whose iteration order can
    # change from one run to the next; sorting keeps the member order in the
    # saved files (and the order of the float additions) reproducible
    for member_hash in sorted(component):
        member = role_graph.nodes[member_hash]
        member_role = member["role"]
        cluster.append({"hash": member_hash, "role": member_role})

        # if the member is a bot
        if member_role == "bot":
            assert member_hash not in bot_hash_to_cluster_id
            bot_hash_to_cluster_id[member_hash] = cluster_id

            # accumulate variables
            attacks += member["attacks"]
            cost += member["cost"]
            profit += member["profit"]

            # each byte code is read once; the set keeps one entry per
            # distinct byte code, which is the count reported below
            unique_bytecodes.add(load_bot_bytecode(member_hash))

        # if the member is an attacker
        elif member_role == "attacker":
            assert member_hash not in attacker_hash_to_cluster_id
            attacker_hash_to_cluster_id[member_hash] = cluster_id

    # save the cluster
    clusters_by_id[cluster_id] = cluster

    # save the cluster stats
    cluster_stats.append({
        "Cluster ID": cluster_id,
        "Attacks": attacks,
        "Cost": cost,
        "Profit": profit,
        "Attacker Accounts": role_counter["attacker"],
        "Bot Contracts": role_counter["bot"],
        "Unique Bot Count": len(unique_bytecodes)
    })

    # next cluster
    cluster_id += 1

print("Amount of clusters:", len(clusters_by_id))
print("Amount of clusters with more than one bot:", big_cluster_count)

Amount of clusters: 98
Amount of clusters with more than one bot: 57


Save all the results.

In [11]:
# json outputs, written in this order:
# - a list of hash and role per member, by cluster id
# - a map of bot hash to cluster id
# - a map of attacker hash to cluster id
json_outputs = {
    "insertion_clusters.json": clusters_by_id,
    "insertion_bot_hash_to_cluster_id.json": bot_hash_to_cluster_id,
    "insertion_attacker_hash_to_cluster_id.json": attacker_hash_to_cluster_id,
}
for json_file_name, payload in json_outputs.items():
    with open(os.path.join(data_directory, json_file_name), "w") as json_file:
        json.dump(payload, json_file, indent=2)

# csv file with cluster stats (one row per cluster, no index column)
df_cluster_stats = pd.DataFrame(cluster_stats)
cluster_stats_csv_path = os.path.join(data_directory, "insertion_cluster_stats.csv")
df_cluster_stats.to_csv(cluster_stats_csv_path, index=False)

Show the collected cluster stats.

In [12]:
# columns shown in the summary, in display order
summary_columns = ["Cost", "Profit", "Attacks", "Attacker Accounts", "Bot Contracts"]
df_cluster_stats[summary_columns].describe()

Unnamed: 0,Cost,Profit,Attacks,Attacker Accounts,Bot Contracts
count,98.0,98.0,98.0,98.0,98.0
mean,38807.627235,130246.9,1979.77551,14.867347,4.806122
std,135351.998952,462464.4,6053.684,90.588868,10.093029
min,0.9808,-2319.415,1.0,1.0,1.0
25%,43.83728,-9.775363,4.25,1.0,1.0
50%,419.739034,691.4771,68.5,2.0,2.0
75%,3510.93721,8350.461,529.25,3.0,4.0
max,686850.374887,2262412.0,39162.0,891.0,80.0


Save the collected cluster stats as a LaTeX table for the publication.

In [13]:
# columns of the published table, in display order
latex_columns = ["Cost", "Profit", "Attacks", "Attacker Accounts", "Bot Contracts"]

# the money columns get a thousands separator, the count columns do not
money_format = "{:,.2f}".format
count_format = "{:.2f}".format
column_formatters = {
    "Cost": money_format,
    "Profit": money_format,
    "Attacks": count_format,
    "Attacker Accounts": count_format,
    "Bot Contracts": count_format,
}

stats_summary = df_cluster_stats[latex_columns].describe()
latex = stats_summary.to_latex(index=True, formatters=column_formatters)

latex_file_path = os.path.join(data_directory, "insertion_cluster_stats.tex")
with open(latex_file_path, "w") as latex_file:
    latex_file.write(latex)