# Introduction

This notebook benchmarks the performance of synchronous vs asynchronous retrieval of phishing entries from various feeds. For the asynchronous retrieval, it uses the `asyncio` library to run multiple feed retrievals concurrently. The synchronous retrieval runs each feed one after the other.


# Installation


In [16]:
!pip phishing-web-collector

ERROR: unknown command "phishing-web-collector"



# Import libraries  

In [17]:
import asyncio
import shutil
import time
from pathlib import Path

from phishing_web_collector import FeedSource
from phishing_web_collector.feed_manager import SOURCES_MAP

# Configure experiment


In [18]:
sources = list(FeedSource)
N_RUNS = 5
SYNC_DIR = Path("sync_check")
ASYNC_DIR = Path("async_check")

def clear_dir(path: Path):
    if path.exists():
        shutil.rmtree(path)
    path.mkdir(parents=True)


# Async and Sync functions

In [19]:

def retrieve_all_sync() -> float:
    """Retrieve all phishing entries from all feeds synchronously (sequentially)."""
    providers = [SOURCES_MAP[source]("sync_check") for source in
                 sources]
    start = time.perf_counter()
    entries = []
    for provider in providers:
        entries.extend(provider.retrieve())
    duration = time.perf_counter() - start
    print(f"Sync took {duration:.2f} seconds")
    return duration


async def retrieve_all() -> float:
    """Retrieve all phishing entries from all feeds asynchronously."""
    providers = [SOURCES_MAP[source]("async_check") for source in
                 sources]
    start = time.perf_counter()
    results = await asyncio.gather(
        *(asyncio.to_thread(provider.retrieve) for provider in
          providers)
    )
    entries = [entry for result in results for entry in result]
    duration = time.perf_counter() - start
    print(f"Async took {duration:.2f} seconds")
    return duration

# Benchmark logic

In [20]:

async def run_benchmark():
    sync_times = []
    async_times = []

    for i in range(N_RUNS):
        print(f"\n--- Run {i + 1} ---")
        clear_dir(SYNC_DIR)
        clear_dir(ASYNC_DIR)

        # Async
        async_time = await retrieve_all()
        async_times.append(async_time)
        print(f"Async took {async_time:.2f} s")

        # Sync
        sync_time = retrieve_all_sync()
        sync_times.append(sync_time)
        print(f"Sync took {sync_time:.2f} s")

    avg_async = sum(async_times) / N_RUNS
    avg_sync = sum(sync_times) / N_RUNS

    print(f"\nAverage async time: {avg_async:.2f} s")
    print(f"Average sync time: {avg_sync:.2f} s")

# Run the benchmark

In [21]:
await run_benchmark()


--- Run 1 ---


Skipping save - No data fetched for PhishingDatabase
No data found for feed: PhishingDatabase
Failed to fetch https://phishstats.info/phish_score.csv - Status: 404
Skipping save - No data fetched for PhishStats


Async took 3.29 seconds
Async took 3.29 s


RuntimeError: asyncio.run() cannot be called from a running event loop