The methodology is based on Section 6.2 of the paper "Quantifying measurement quality and load distribution in Tor": https://dl.acm.org/doi/pdf/10.1145/3427228.3427238

* I can always create a new container image with a Tor process representing each client and it will always choose a new guard node, so I don't have to change the default Tor definitions
* Log all relays along with their data, specifically geographical location, used in each client circuit.
* In the paper, the authors logged over 8.6 million circuits, representing about 275,000 circuits per day
* Drop the circuit immediately after creation to avoid overloading the guard relays
* Wait 16 minutes between each circuit

In [1]:
!pip install stem
!pip install retrying
!pip install joblib



In [2]:
import os
import shlex
import subprocess
import time
import traceback

import joblib
import requests
import stem.descriptor.remote
import stem.process
from retrying import retry
from stem import CircStatus
from stem.control import Controller
from tqdm import tqdm




In [3]:
# Directory (with trailing slash) under which the per-position tallies are stored
BASE_DIR = './results/data/'

# Local Tor listener ports
SOCKS_PORT = 9050
CONTROL_PORT = 9051

# One joblib file per circuit position
GUARD_NODES_FILE = BASE_DIR + "client_guard_nodes.joblib"
MIDDLE_NODES_FILE = BASE_DIR + "client_middle_nodes.joblib"
EXIT_NODES_FILE = BASE_DIR + "client_exit_nodes.joblib"

In [4]:
def stop_tor(tor_process, timeout=30):
    """Terminate the Tor subprocess and wait for it to exit.

    Args:
        tor_process: Process handle exposing ``terminate()``, ``wait()`` and
            ``kill()`` (a ``subprocess.Popen``). ``None`` is accepted and ignored.
        timeout: Seconds to wait after the terminate request before forcing
            the process down with ``kill()``.
    """
    if tor_process is None:
        return
    tor_process.terminate()
    try:
        tor_process.wait(timeout=timeout)
    except subprocess.TimeoutExpired:
        # The process did not exit in time; force it so the caller is not blocked forever.
        tor_process.kill()
        tor_process.wait()

In [5]:
def start_tor(tor_binary="/opt/homebrew/opt/tor/bin/tor", torrc="data/torrc"):
    """Launch a Tor process in the background.

    Args:
        tor_binary: Path to the tor executable. The default is the path the
            notebook was originally written against.
        torrc: Path to the configuration file handed to tor via ``-f``.

    Returns:
        The ``subprocess.Popen`` handle of the started process.
    """
    # Build argv as a list so paths containing spaces are passed through intact.
    tor_process = subprocess.Popen([tor_binary, "-f", torrc])
    return tor_process

### More stem references
* https://tor.stackexchange.com/questions/7049/stem-how-to-get-current-in-use-circuit
* https://github.com/webfp/tor-browser-selenium/blob/main/examples/stem_adv.py
* https://stem.torproject.org/tutorials/to_russia_with_love.html#custom-path-selection
* https://stem.torproject.org/api.html

In [6]:
class RelayGeolocationException(Exception):
    """Error raised when the geolocation of a relay could not be fetched."""

    def __init__(self, message):
        # Forward the message to Exception so str(error) yields it unchanged.
        Exception.__init__(self, message)

In [7]:
def get_geolocation(ip_address, timeout=10):
    """Look up the geolocation of an IP address through ip-api.com.

    Args:
        ip_address: Address of the relay to look up.
        timeout: Seconds to wait for the HTTP response before ``requests``
            gives up, so a stalled lookup cannot block the run forever.

    Returns:
        Tuple ``(ip_address, city, region, country, countryCode, isp, as, org)``
        taken from the JSON response.

    Raises:
        RelayGeolocationException: If no address was given, the HTTP status is
            not 200, or the service reports a failed lookup.
    """
    if not ip_address:
        # Avoid querying the service with a literal "None" or an empty path.
        raise RelayGeolocationException("Could not fetch geolocation: no IP address given")

    url = f"http://ip-api.com/json/{ip_address}" # Limited by 45 requests per minute
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise RelayGeolocationException(f"Could not fetch geolocation for {ip_address} due to {response.status_code} error")
    data = response.json()

    # ip-api reports failed lookups with HTTP 200 and status "fail" in the payload;
    # those answers do not carry the location fields read below.
    if data.get('status') == 'fail':
        raise RelayGeolocationException(
            f"Could not fetch geolocation for {ip_address}: {data.get('message', 'unknown error')}")

    print(f"IP Address: {ip_address}")
    print(f"Location: {data['city']}, {data['region']}, {data['country']}, {data['countryCode']}")
    print(f"ISP: {data['isp']}, AS: {data['as']}, Org: {data['org']}")
    print("=" * 30)

    return ip_address, data['city'], data['region'], data['country'], data['countryCode'], data['isp'], data['as'], data['org']

In [8]:
def get_circuit_data(controller, circuit):
    """Return the guard, middle and exit relay addresses of a built circuit.

    The circuit is printed first. If its status is not BUILT or its path has
    fewer than three hops, "Circuit skipped" is printed and
    ``(None, None, None)`` is returned. Otherwise the first three hops are
    resolved through ``controller.get_network_status`` and their addresses
    are returned in path order.
    """
    print("Circuit characteristics:", circuit)

    usable = circuit.status == CircStatus.BUILT and len(circuit.path) >= 3
    if not usable:
        print("Circuit skipped")
        return None, None, None

    # Each hop is a (fingerprint, nickname) pair; the first hop is the guard relay.
    hops = circuit.path[:3]
    for label, hop in zip(("GUARD", "MIDDLE", "LAST"), hops):
        print(label, hop)

    # Get relay details for each hop
    relays = [controller.get_network_status(hop[0]) for hop in hops]
    for role, relay in zip(("Guard", "Middle", "Exit"), relays):
        print(f"{role} relay flags", relay.flags)

    return tuple(relay.address for relay in relays)

### Getting Tor descriptors
* https://stem.torproject.org/api/descriptor/remote.html

In [9]:
# Per-position relay tallies, keyed by relay IP address.
# Each starts empty and is filled by create_new_circuit_and_close().
guard_nodes, middle_nodes, exit_nodes = {}, {}, {}

In [10]:
def _tally_relay(nodes, geolocation):
    """Count one use of a relay in ``nodes``, storing its geolocation on first sight."""
    ip, city, region, country, country_code, isp, a_sys, org = geolocation
    if ip in nodes:
        nodes[ip]['count'] += 1
    else:
        nodes[ip] = {
            'city': city,
            'region': region,
            'country': country,
            'country_code': country_code,
            'isp': isp,
            'as': a_sys,
            'org': org,
            'count': 1
        }


# Retries this block up to 10 times, with exponential backoff capped at 10000 milliseconds
@retry(wait_exponential_multiplier=1000, wait_exponential_max=10000, stop_max_attempt_number=10)
def create_new_circuit_and_close(controller, client_id):
    """Build a fresh circuit, record the geolocation of its relays, and close it.

    The guard, middle and exit relays are geolocated and tallied in the
    module-level ``guard_nodes``, ``middle_nodes`` and ``exit_nodes`` dicts.
    The tallies are only updated once all three lookups have succeeded, so a
    failed attempt leaves no partial counts behind.

    Args:
        controller: Authenticated stem controller.
        client_id: Identifier of the simulated client, used for logging only.

    Raises:
        RuntimeError: If the new circuit is not a built circuit with at least
            three hops.
        RelayGeolocationException: If a relay could not be geolocated.
        Any exception raised here is handed to ``@retry`` for another attempt.
    """
    print(f"RETRY create_new_circuit({client_id})")

    circuit_id = controller.new_circuit(await_build=True, timeout=60)
    try:
        # Look the circuit up by its id rather than assuming it is the last
        # entry of controller.get_circuits().
        circuit = controller.get_circuit(circuit_id)

        guard_ip, middle_ip, exit_ip = get_circuit_data(controller, circuit)
        if None in (guard_ip, middle_ip, exit_ip):
            raise RuntimeError(f"Circuit {circuit_id} is not a built circuit with three hops")

        try:
            geolocations = [get_geolocation(ip) for ip in (guard_ip, middle_ip, exit_ip)]
        except RelayGeolocationException:
            # Show the failure, then let it propagate so the attempt is retried.
            traceback.print_exc()
            raise

        for nodes, geolocation in zip((guard_nodes, middle_nodes, exit_nodes), geolocations):
            _tally_relay(nodes, geolocation)
    finally:
        # Always drop the circuit, including on failure, so failed attempts
        # do not leave circuits open.
        controller.close_circuit(circuit_id)

In [11]:
def change_guard_node(controller):
    """Send the NEWNYM signal through the controller and log the request.

    Note from the original author: the torrc needs the line
    "UseEntryGuards 0" for this to work.
    """
    newnym_signal = "NEWNYM"
    controller.signal(newnym_signal)
    print("Requested a new identity (including new guard nodes)")

In [12]:

def generate_client_circuits(client_id):
    """Create, record and close one circuit for a simulated client.

    Connects to the local Tor control port, builds a circuit through
    ``create_new_circuit_and_close`` and then requests a new identity. A
    failure to build or record the circuit is logged with its cause and the
    run continues.

    Args:
        client_id: Identifier of the simulated client, used for logging.
    """
    # Has to be Control Port
    with Controller.from_port(address="127.0.0.1", port=CONTROL_PORT) as controller:
        controller.authenticate(password="")
        print("Successfully authenticated with Tor control port")

        print("Created {} preemptive circuits".format(len(controller.get_circuits())))
        print("Tor will chose a random one from these to start the next session")

        print("Forcing the creation of new circuits")
        print("=== Client {}".format(client_id))
        try:
            create_new_circuit_and_close(controller, client_id)
        except Exception as e:
            print(f"create_new_circuit_and_close() failed for client {client_id} after 10 attempts. Continuing ...")
            # Report the cause; without it, failures are indistinguishable from each other.
            print(f"Last error: {type(e).__name__}: {e}")
            time.sleep(10)
        change_guard_node(controller)
    

### Check if Tor is already running
First, restart the kernel. Then, run the following commands:
```
sudo lsof -i -P | grep LISTEN | grep 9050
sudo kill -9 <PID>   # replace <PID> with the process id printed by the lsof command above
```

In [13]:
# Start Tor process in the background
tor_process = start_tor()
print("Wait for Tor process to start ...")
time.sleep(10)

try:
    # joblib.dump() does not create missing directories, so make sure the
    # output directory exists before the first checkpoint is written.
    os.makedirs(BASE_DIR, exist_ok=True)

    num_clients = 10000
    for client_id in tqdm(range(num_clients)):
        generate_client_circuits(client_id)
        # Pause between clients; Tor keeps running across iterations.
        time.sleep(10)

        # So that it updates at every iteration
        joblib.dump(guard_nodes, GUARD_NODES_FILE)
        joblib.dump(middle_nodes, MIDDLE_NODES_FILE)
        joblib.dump(exit_nodes, EXIT_NODES_FILE)

except Exception:
    traceback.print_exc()

finally:
    # Runs on normal completion, on errors and on KeyboardInterrupt alike.
    stop_tor(tor_process)
    print("Wait for Tor process to stop ...")
    time.sleep(10)
    print("Exited Tor process cleanly at the end")


    

Aug 17 18:36:25.655 [notice] Tor 0.4.7.14 running on Darwin with Libevent 2.1.12-stable, OpenSSL 3.1.2, Zlib 1.2.11, Liblzma N/A, Libzstd N/A and Unknown N/A as libc.
Aug 17 18:36:25.655 [notice] Tor can't help you if you use it wrong! Learn how to be safe at https://support.torproject.org/faq/staying-anonymous/
Aug 17 18:36:25.656 [notice] Read configuration file "/Users/danielalopes/coverage_analysis_ndss_2024/data/torrc".
Aug 17 18:36:25.658 [warn] CookieAuthFileGroupReadable is set, but will have no effect: you must specify an explicit CookieAuthFile to have it group-readable.
Aug 17 18:36:25.659 [notice] Opening Socks listener on 127.0.0.1:9050
Aug 17 18:36:25.659 [notice] Opened Socks listener connection (ready) on 127.0.0.1:9050
Aug 17 18:36:25.659 [notice] Opening Control listener on 127.0.0.1:9051
Aug 17 18:36:25.659 [notice] Opened Control listener connection (ready) on 127.0.0.1:9051
Aug 17 18:36:25.000 [notice] Parsing GEOIP IPv4 file /opt/homebrew/Cellar/tor/0.4.7.14/share

  0%|          | 0/10000 [00:00<?, ?it/s]

Aug 17 18:36:35.000 [notice] New control connection opened from 127.0.0.1.
Successfully authenticated with Tor control port
Created 6 preemptive circuits
Tor will chose a random one from these to start the next session
Forcing the creation of new circuits
=== Client 0
RETRY create_new_circuit(0)
RETRY create_new_circuit(0)
Circuit characteristics: CIRC 8 BUILT $DB1629B59707F744A0C7933E56B6802786FFC317~deimos,$2D938F19EAF660D902C656B5E6002F39B45C4BE4~VoxBox,$B09AB667E97470AAAA590077383A24437226A127~NTH11R3 BUILD_FLAGS=NEED_CAPACITY PURPOSE=GENERAL TIME_CREATED=2023-08-17T17:36:39.866708
GUARD ('DB1629B59707F744A0C7933E56B6802786FFC317', 'deimos')
MIDDLE ('2D938F19EAF660D902C656B5E6002F39B45C4BE4', 'VoxBox')
LAST ('B09AB667E97470AAAA590077383A24437226A127', 'NTH11R3')
Guard relay flags ['Fast', 'Guard', 'HSDir', 'Running', 'Stable', 'V2Dir', 'Valid']
Middle relay flags ['Fast', 'HSDir', 'Running', 'Stable', 'V2Dir', 'Valid']
Exit relay flags ['Exit', 'Fast', 'Running', 'Stable', 'Valid']

  0%|          | 1/10000 [00:14<41:38:28, 14.99s/it]

Exited Tor process cleanly at the end
Aug 17 18:36:50.000 [notice] New control connection opened from 127.0.0.1.
Successfully authenticated with Tor control port
Created 6 preemptive circuits
Tor will chose a random one from these to start the next session
Forcing the creation of new circuits
=== Client 1
RETRY create_new_circuit(1)
Circuit characteristics: CIRC 9 BUILT $1EDE608C5E190C8682A2F827664E758416104670~W4LS3R,$18474F89EC3E8ABE40148B424C8DD5399242CEC1~squid,$13F7EAE731CA4600951986921E08ECAB9B1D2AF6~CanopoIT BUILD_FLAGS=NEED_CAPACITY PURPOSE=GENERAL TIME_CREATED=2023-08-17T17:36:50.697438
GUARD ('1EDE608C5E190C8682A2F827664E758416104670', 'W4LS3R')
MIDDLE ('18474F89EC3E8ABE40148B424C8DD5399242CEC1', 'squid')
LAST ('13F7EAE731CA4600951986921E08ECAB9B1D2AF6', 'CanopoIT')
Guard relay flags ['Fast', 'Guard', 'HSDir', 'Running', 'Stable', 'V2Dir', 'Valid']
Middle relay flags ['Fast', 'Guard', 'HSDir', 'Running', 'Stable', 'V2Dir', 'Valid']
Exit relay flags ['Fast', 'Guard', 'HSDir', 

  0%|          | 1/10000 [00:21<59:15:06, 21.33s/it]


Aug 17 18:36:57.000 [notice] Catching signal TERM, exiting cleanly.
Wait for Tor process to stop ...
Exited Tor process cleanly at the end


KeyboardInterrupt: 

In [None]:
# Read back the guard-relay tallies written by the collection loop and report them.
guard_nodes = joblib.load(GUARD_NODES_FILE)
guard_count = len(guard_nodes)
print("\nguard_nodes", guard_nodes)
print("len(guard_nodes)", guard_count)