# Introduction

This notebook benchmarks synchronous versus asynchronous retrieval of phishing entries from multiple feeds. The asynchronous variant uses the `asyncio` library to run all feed retrievals concurrently, while the synchronous variant retrieves each feed one after the other. Each variant is run several times and the average wall-clock time is reported.


# Installation


In [1]:
!pip install phishing-web-collector>=0.2.0


[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


# Import libraries  

In [2]:
import asyncio
import shutil
import time
from pathlib import Path

from phishing_web_collector import FeedSource
from phishing_web_collector.feed_manager import SOURCES_MAP

# Configure experiment


In [4]:
# Feeds exercised by the benchmark; both retrieval variants iterate over this
# same list, in this order.
sources = [
    FeedSource.AD_GUARD_HOME,
    FeedSource.BINARY_DEFENCE_IP,
    FeedSource.BLOCKLIST_DE_IP,
    FeedSource.BOTVRIJ,
    FeedSource.C2_INTEL_DOMAIN,
    FeedSource.C2_TRACKER_IP,
    FeedSource.CERT_PL,
    FeedSource.DANGEROUS_DOMAINS,
    FeedSource.GREEN_SNOW_IP,
    FeedSource.MALWARE_WORLD,
    FeedSource.MIRAI_SECURITY_IP,
    FeedSource.OPEN_PHISH,
    FeedSource.PHISHING_ARMY,
    FeedSource.PHISHING_DATABASE,
    FeedSource.PHISH_STATS,
    FeedSource.PHISH_TANK,
    FeedSource.PROOF_POINT_IP,
    FeedSource.THREAT_VIEW_DOMAIN,
    FeedSource.TWEET_FEED,
    FeedSource.URL_ABUSE,
    FeedSource.URL_HAUS,
    FeedSource.VALDIN,
]

# Number of benchmark repetitions; the reported averages divide by this value.
N_RUNS = 5

# Working directories that the benchmark wipes before every run.
SYNC_DIR = Path("sync_check")
ASYNC_DIR = Path("async_check")

def clear_dir(path: Path):
    """Leave ``path`` as a freshly created, empty directory.

    If something already exists at ``path`` the whole tree is removed with
    ``shutil.rmtree`` first; the directory is then created, including any
    missing parent directories.
    """
    if not path.exists():
        # Nothing to wipe: just create the directory (and its parents).
        path.mkdir(parents=True)
        return

    shutil.rmtree(path)
    path.mkdir(parents=True)


# Async and Sync functions

In [5]:

def retrieve_all_sync() -> float:
    """Retrieve all phishing entries from all feeds synchronously (sequentially).

    Providers are instantiated before the timer starts, so the measured
    interval covers only the ``retrieve_sync()`` calls and the collection of
    their results into a single list.

    Returns:
        Elapsed time in seconds, measured with ``time.perf_counter``.
    """
    # Derive the provider argument from SYNC_DIR instead of repeating the
    # literal, so it cannot drift from the directory the benchmark clears.
    # NOTE(review): presumably this is the provider's storage directory - confirm.
    target_dir = str(SYNC_DIR)
    providers = [SOURCES_MAP[source](target_dir) for source in sources]

    start = time.perf_counter()
    # The merged list is not used afterwards; it is built inside the timed
    # region so this variant does the same aggregation work as retrieve_all().
    entries = []
    for provider in providers:
        entries.extend(provider.retrieve_sync())
    duration = time.perf_counter() - start

    print(f"Sync took {duration:.2f} seconds")
    return duration


async def retrieve_all() -> float:
    """Retrieve all phishing entries from all feeds asynchronously.

    Every provider's ``retrieve()`` coroutine is scheduled at once through
    ``asyncio.gather``. Providers are instantiated before the timer starts, so
    the measured interval covers only the concurrent retrieval and the
    flattening of the per-feed results.

    Returns:
        Elapsed time in seconds, measured with ``time.perf_counter``.
    """
    # Derive the provider argument from ASYNC_DIR instead of repeating the
    # literal, so it cannot drift from the directory the benchmark clears.
    # NOTE(review): presumably this is the provider's storage directory - confirm.
    target_dir = str(ASYNC_DIR)
    providers = [SOURCES_MAP[source](target_dir) for source in sources]

    start = time.perf_counter()
    results = await asyncio.gather(*(provider.retrieve() for provider in providers))
    # The flattened list is not used afterwards; it is built inside the timed
    # region so this variant does the same aggregation work as retrieve_all_sync().
    entries = [entry for result in results for entry in result]
    duration = time.perf_counter() - start

    print(f"Async took {duration:.2f} seconds")
    return duration

# Benchmark logic

In [6]:

async def run_benchmark():
    """Time both retrieval variants ``N_RUNS`` times and print the averages.

    Each iteration wipes ``SYNC_DIR`` and ``ASYNC_DIR`` with ``clear_dir`` and
    then runs the asynchronous variant followed by the synchronous one.
    Per-run timings are printed by ``retrieve_all`` / ``retrieve_all_sync``
    themselves; this function only adds the run header and the final averages.

    NOTE(review): the async variant always runs first within an iteration, so
    any warm-up effect from the first pass could favour the sync
    measurement - confirm whether that matters for the comparison.
    """
    sync_times = []
    async_times = []

    for i in range(N_RUNS):
        print(f"\n--- Run {i + 1} ---")
        clear_dir(SYNC_DIR)
        clear_dir(ASYNC_DIR)

        # The retrieval functions already print their own "... took" line, so
        # the duration is only recorded here (printing it again duplicated
        # every line of the output).
        async_times.append(await retrieve_all())
        sync_times.append(retrieve_all_sync())

    avg_async = sum(async_times) / N_RUNS
    avg_sync = sum(sync_times) / N_RUNS

    print(f"\nAverage async time: {avg_async:.2f} s")
    print(f"Average sync time: {avg_sync:.2f} s")

# Run the benchmark

In [7]:
# Top-level `await` works in a notebook cell (IPython autoawait); a plain
# Python script would need asyncio.run(run_benchmark()) instead.
await run_benchmark()


--- Run 1 ---
Async took 13.45 seconds
Async took 13.45 s
Sync took 25.26 seconds
Sync took 25.26 s

--- Run 2 ---
Async took 15.54 seconds
Async took 15.54 s
Sync took 27.62 seconds
Sync took 27.62 s

--- Run 3 ---
Async took 16.79 seconds
Async took 16.79 s
Sync took 29.09 seconds
Sync took 29.09 s

--- Run 4 ---
Async took 13.97 seconds
Async took 13.97 s
Sync took 29.26 seconds
Sync took 29.26 s

--- Run 5 ---
Async took 15.96 seconds
Async took 15.96 s
Sync took 27.00 seconds
Sync took 27.00 s

Average async time: 15.14 s
Average sync time: 27.65 s
