# Get data on the currently running crawlers

In [1]:
from functools import partial
from subprocess import run

import pandas as pd

# Rebind ``run`` so that every call made through this name captures
# stdout/stderr and returns them as ``str`` instead of ``bytes``.
# NOTE: this intentionally shadows the ``subprocess.run`` imported above.
run = partial(run, text=True, capture_output=True)


def _count_newlines(path):
    """Return the number of newline characters in ``path`` (like ``wc -l``).

    Returns NaN when ``path`` is not a string (no output file was found in the
    command line) or when the file cannot be opened or read.
    """
    if not isinstance(path, str):
        return float("nan")
    try:
        with open(path, "rb") as file:
            # Read in 1 MiB chunks so large crawl files are never fully loaded
            # into memory.
            return sum(
                chunk.count(b"\n") for chunk in iter(lambda: file.read(1 << 20), b"")
            )
    except OSError:
        return float("nan")


def running_crawls():
    """Get details of currently running spiders.

    Lists the current user's processes with ``ps`` and keeps the ones whose
    command line contains ``scrapy runspider``.

    Returns a DataFrame with one row per running spider and these columns:

    * pid: Process ID. Use this to identify (or stop) the spider that you want.
    * started: The time when this spider started.
    * elapsed: The elapsed time since the spider started.
    * %mem: The percentage of memory that this spider is consuming.
    * %cpu: The percentage of CPU that this spider is consuming.
    * args: The full command that was used to start this spider. Use this to identify
      the spider(s) that you want to know about.
    * output_file: The path to the output file for each running crawl job, taken
      from the ``-o <path>.jl`` part of ``args`` (NaN if there is none).
    * crawled_urls: The current number of lines in ``output_file`` (NaN if the
      file is not available, e.g. it has not been created yet).

    An empty DataFrame (no columns) is returned when no spider is running.
    """
    # Name the columns explicitly instead of taking them from the ``ps`` header
    # line: the header labels are not the same in every ``ps`` implementation,
    # while the order of the fields is fixed by the format string below.
    columns = ["pid", "started", "elapsed", "%mem", "%cpu", "args"]
    ps = run(["ps", "xo", "pid,start,etime,%mem,%cpu,args"])
    # ``args`` is the last field and may itself contain spaces, so split only
    # on the first five runs of whitespace. The first line is the header.
    rows = [
        line.split(maxsplit=len(columns) - 1) for line in ps.stdout.splitlines()[1:]
    ]
    df = pd.DataFrame(rows, columns=columns)

    is_spider = df["args"].str.contains("scrapy runspider", na=False)
    crawls = df[is_spider].reset_index(drop=True)
    if crawls.empty:
        return pd.DataFrame()

    crawls["output_file"] = crawls["args"].str.extract(r"-o (.*?\.jl)")[0]
    # Count lines row by row so every count belongs to its own spider, paths
    # containing spaces stay intact, and a missing file yields NaN instead of
    # shifting or breaking the whole column.
    crawls["crawled_urls"] = [_count_newlines(path) for path in crawls["output_file"]]
    return crawls


In [2]:
# Take a snapshot of the spiders that are running right now. The bare name on
# the last line makes the notebook display the resulting DataFrame.
running = running_crawls()
running

Unnamed: 0,pid,started,elapsed,%mem,%cpu,args,output_file,crawled_urls
0,51735,3:15PM,19:01,0.5,0.0,/Library/Frameworks/Python.framework/Versions/...,/Users/me/Desktop/temp/time_crawl.jl,87
1,51808,3:15PM,18:41,1.2,0.0,/Library/Frameworks/Python.framework/Versions/...,/Users/me/Desktop/temp/nytimes_crawl.jl,96
2,51899,3:16PM,18:08,1.4,0.0,/Library/Frameworks/Python.framework/Versions/...,/Users/me/Desktop/temp/shopify_crawl.jl,80
3,51975,3:16PM,17:53,0.7,0.0,/Library/Frameworks/Python.framework/Versions/...,/Users/me/Desktop/temp/amazon_crawl.jl,75


In [4]:
# Split the full command line of the spider at row label 1 into its
# whitespace-separated arguments.
running['args'][1].split()

['/Library/Frameworks/Python.framework/Versions/3.12/Resources/Python.app/Contents/MacOS/Python',
 '/Users/me/venv312/bin/scrapy',
 'runspider',
 '/Users/me/venv312/lib/python3.12/site-packages/advertools/spider.py',
 '-a',
 'url_list=https://nytimes.com/',
 '-a',
 'allowed_domains=nytimes.com',
 '-a',
 'follow_links=True',
 '-a',
 'exclude_url_params=None',
 '-a',
 'include_url_params=None',
 '-a',
 'exclude_url_regex=None',
 '-a',
 'include_url_regex=None',
 '-a',
 'css_selectors=None',
 '-a',
 'xpath_selectors=None',
 '-o',
 '/Users/me/Desktop/temp/nytimes_crawl.jl',
 '-s',
 'DOWNLOAD_DELAY=20',
 '-s',
 'CLOSESPIDER_PAGECOUNT=200']