# Scrape and load data
Last updated: March 16, 2024

## Scrape the NC General Statutes Table of Contents

In [3]:
import logging
import re
import sys
from urllib.parse import urljoin
from strip_tags import strip_tags
from tqdm.notebook import tqdm
import redis
import requests
from bs4 import BeautifulSoup

# Redis client used by fetch_page as a cache of raw page content, keyed by URL path.
redis_client = redis.Redis(host="localhost", port=63791, db=0, protocol=3)

# Send log records to stdout at WARNING level and above.
logging.basicConfig(stream=sys.stdout, level=logging.WARNING)

# Captures the statute identifier that follows "GS_" in a section href.
# The leading ".*" is greedy, so the capture starts after the last "GS_" in the string.
STATUTE_RE = re.compile(r".*GS_([\d\w-]+)")
# Captures the chapter number token that follows "Chapter " at the start of the matched text.
CHAPTER_NUMBER_RE = re.compile(r"Chapter ([\d\w]+)")
# Captures the trailing run of word characters and spaces that precedes a final period.
# NOTE(review): because punctuation is excluded from the capture, a title ending in
# something like "s. 36." yields just "36" -- this appears to be what produced the numeric
# section_name values in the last rows of the DataFrame preview later in this notebook.
# Confirm whether that is intended.
SECTION_TITLE_RE = re.compile(r"([\w ]+)\.$")

# Logger used by the scraping functions below.
logger = logging.getLogger("scraper")


def fetch_page(path, timeout=30):
    """Return the page at ``path`` on www.ncleg.gov as parsed HTML, using Redis as a cache.

    The raw response body is looked up in Redis under ``path``. On a cache miss
    the page is downloaded, stored in Redis under the same key, and then parsed.

    Args:
        path: URL path (or absolute URL) resolved against ``https://www.ncleg.gov/``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can block indefinitely on a stalled connection.

    Returns:
        A ``BeautifulSoup`` document built with the ``html.parser`` backend.

    Raises:
        requests.exceptions.RequestException: If the download times out, the
            connection fails, or the server answers with an HTTP error status.
    """
    content = redis_client.get(path)
    if not content:
        uri = urljoin("https://www.ncleg.gov/", path)
        response = requests.get(uri, timeout=timeout)
        # Fail before caching: otherwise an error page would be stored in Redis
        # and served as if it were the real page on every later run.
        response.raise_for_status()
        content = response.content
        redis_client.set(path, content)
    return BeautifulSoup(content, features="html.parser")


def parse_section(section):
    """Download one section page and return its extracted fields.

    Args:
        section: Anchor tag whose ``href`` points at the section's HTML page.

    Returns:
        A dict with three keys: ``raw_html`` (the full page markup as a string),
        ``statute`` (the identifier captured from the href by ``STATUTE_RE``),
        and ``content`` (the result of ``strip_tags`` with the ``p`` selector).
    """
    href = section.get("href")
    html = str(fetch_page(href))
    statute_id = STATUTE_RE.match(href).group(1)
    return {
        "raw_html": html,
        "statute": statute_id,
        "content": strip_tags(html, ["p"]),
    }


def scrape_chapter_sections(chapter_page_soup):
    """Scrape every section linked from one chapter page.

    Args:
        chapter_page_soup: Parsed chapter page. Its ``h1.section-title`` text is
            split at the first hyphen into a chapter-number part and a
            chapter-name part.

    Returns:
        A list of dicts, one per section that could be fetched. Each dict holds
        the fields from ``parse_section`` plus ``chapter_number``,
        ``chapter_name``, ``section_name`` and ``url``. Sections whose download
        fails are logged and left out.
    """
    chapter = chapter_page_soup.select_one("h1.section-title").text
    # Split on the FIRST hyphen only, so a chapter name that itself contains a
    # hyphen is kept whole instead of being cut at its own hyphen.
    number_part, _, name_part = chapter.partition("-")
    chapter_name = name_part.strip().strip(".")
    chapter_number = CHAPTER_NUMBER_RE.match(number_part).group(1)
    section_anchors = chapter_page_soup.select(
        "div#chapter div.row a[href*=BySection][href*=HTML]"
    )
    sections = []
    for section in tqdm(section_anchors, desc=f"Chapter {chapter_number}"):
        href = section.get("href")
        row = section.find_parent("div", class_="row")
        # The title is taken from the second anchor of the nested row.
        title_tag = row.select_one("div.row").select("a")[1]
        title_match = SECTION_TITLE_RE.search(title_tag.text)
        section_name = title_match.group(1).strip() if title_match else ""
        try:
            data = parse_section(section)
        except requests.exceptions.RequestException as exc:
            # Any network failure (connect or read timeout, reset, HTTP error)
            # skips this one section rather than aborting the whole chapter.
            logger.error("Skipping %s: %s", href, exc)
            continue
        data["chapter_number"] = chapter_number
        data["chapter_name"] = chapter_name
        data["section_name"] = section_name
        data["url"] = urljoin("https://www.ncleg.gov/", href)
        sections.append(data)
    return sections


def scrape_chapters(stop=""):
    """Scrape sections chapter by chapter from the General Statutes table of contents.

    Args:
        stop: Chapter number (as a string, e.g. ``"35"``) after which scraping
            ends. That chapter is included in the result. The default empty
            string means every chapter is scraped.

    Returns:
        A flat list of section dicts, in table-of-contents order.
    """
    data = []
    soup = fetch_page("/Laws/GeneralStatutesTOC")
    chapter_anchors = soup.select(
        "div#gsTOC div.row a[title*=Chapter]:-soup-contains('Chapter')"
    )
    for anchor in chapter_anchors:
        chapter_page_soup = fetch_page(anchor.get("href"))
        sections = scrape_chapter_sections(chapter_page_soup)
        data.extend(sections)
        # The chapter number is read from the first scraped section, so check
        # that there is one: a chapter that yields no sections would otherwise
        # raise IndexError here. Such a chapter cannot trigger the stop.
        if stop and sections and sections[0]["chapter_number"] == stop:
            break
    return data

In [4]:
%%time

# Scrape from the start of the table of contents through Chapter 35 (inclusive).
# Pages already stored in Redis are read from the cache instead of downloaded.
sections = scrape_chapters(stop="35")

Chapter 1:   0%|          | 0/919 [00:00<?, ?it/s]

Chapter 1A:   0%|          | 0/74 [00:00<?, ?it/s]

Chapter 1B:   0%|          | 0/8 [00:00<?, ?it/s]

Chapter 1C:   0%|          | 0/70 [00:00<?, ?it/s]

Chapter 1D:   0%|          | 0/13 [00:00<?, ?it/s]

Chapter 1E:   0%|          | 0/20 [00:00<?, ?it/s]

Chapter 1F:   0%|          | 0/7 [00:00<?, ?it/s]

Chapter 1G:   0%|          | 0/5 [00:00<?, ?it/s]

Chapter 2:   0%|          | 0/1 [00:00<?, ?it/s]

Chapter 3:   0%|          | 0/1 [00:00<?, ?it/s]

Chapter 4:   0%|          | 0/1 [00:00<?, ?it/s]

Chapter 5:   0%|          | 0/1 [00:00<?, ?it/s]

Chapter 5A:   0%|          | 0/34 [00:00<?, ?it/s]

Chapter 6:   0%|          | 0/58 [00:00<?, ?it/s]

Chapter 7:   0%|          | 0/1 [00:00<?, ?it/s]

Chapter 7A:   0%|          | 0/721 [00:00<?, ?it/s]

Chapter 7B:   0%|          | 0/608 [00:00<?, ?it/s]

Chapter 8:   0%|          | 0/190 [00:00<?, ?it/s]

Chapter 8A:   0%|          | 0/1 [00:00<?, ?it/s]

Chapter 8B:   0%|          | 0/10 [00:00<?, ?it/s]

Chapter 8C:   0%|          | 0/67 [00:00<?, ?it/s]

Chapter 9:   0%|          | 0/26 [00:00<?, ?it/s]

Chapter 10:   0%|          | 0/1 [00:00<?, ?it/s]

Chapter 10A:   0%|          | 0/17 [00:00<?, ?it/s]

Chapter 10B:   0%|          | 0/175 [00:00<?, ?it/s]

Chapter 11:   0%|          | 0/12 [00:00<?, ?it/s]

Chapter 12:   0%|          | 0/5 [00:00<?, ?it/s]

Chapter 13:   0%|          | 0/5 [00:00<?, ?it/s]

Chapter 14:   0%|          | 0/1130 [00:00<?, ?it/s]

Chapter 15:   0%|          | 0/144 [00:00<?, ?it/s]

Chapter 15A:   0%|          | 0/768 [00:00<?, ?it/s]

Chapter 15B:   0%|          | 0/38 [00:00<?, ?it/s]

Chapter 15C:   0%|          | 0/13 [00:00<?, ?it/s]

Chapter 16:   0%|          | 0/6 [00:00<?, ?it/s]

Chapter 17:   0%|          | 0/45 [00:00<?, ?it/s]

Chapter 17A:   0%|          | 0/1 [00:00<?, ?it/s]

Chapter 17B:   0%|          | 0/1 [00:00<?, ?it/s]

Chapter 17C:   0%|          | 0/24 [00:00<?, ?it/s]

Chapter 17D:   0%|          | 0/4 [00:00<?, ?it/s]

Chapter 17E:   0%|          | 0/29 [00:00<?, ?it/s]

Chapter 17F:   0%|          | 0/17 [00:00<?, ?it/s]

Chapter 18:   0%|          | 0/1 [00:00<?, ?it/s]

Chapter 18A:   0%|          | 0/1 [00:00<?, ?it/s]

Chapter 18B:   0%|          | 0/189 [00:00<?, ?it/s]

Chapter 18C:   0%|          | 0/97 [00:00<?, ?it/s]

Chapter 19:   0%|          | 0/36 [00:00<?, ?it/s]

Chapter 19A:   0%|          | 0/55 [00:00<?, ?it/s]

Chapter 20:   0%|          | 0/857 [00:00<?, ?it/s]

Chapter 21:   0%|          | 0/5 [00:00<?, ?it/s]

Chapter 22:   0%|          | 0/5 [00:00<?, ?it/s]

Chapter 22A:   0%|          | 0/1 [00:00<?, ?it/s]

Chapter 22B:   0%|          | 0/20 [00:00<?, ?it/s]

Chapter 22C:   0%|          | 0/6 [00:00<?, ?it/s]

Chapter 23:   0%|          | 0/49 [00:00<?, ?it/s]

Chapter 24:   0%|          | 0/41 [00:00<?, ?it/s]

Chapter 25:   0%|          | 0/639 [00:00<?, ?it/s]

Chapter 25A:   0%|          | 0/46 [00:00<?, ?it/s]

Chapter 25B:   0%|          | 0/4 [00:00<?, ?it/s]

Chapter 25C:   0%|          | 0/13 [00:00<?, ?it/s]

Chapter 26:   0%|          | 0/13 [00:00<?, ?it/s]

Chapter 27:   0%|          | 0/10 [00:00<?, ?it/s]

Chapter 28:   0%|          | 0/1 [00:00<?, ?it/s]

Chapter 28A:   0%|          | 0/237 [00:00<?, ?it/s]

Chapter 28B:   0%|          | 0/10 [00:00<?, ?it/s]

Chapter 28C:   0%|          | 0/22 [00:00<?, ?it/s]

Chapter 29:   0%|          | 0/31 [00:00<?, ?it/s]

Chapter 30:   0%|          | 0/35 [00:00<?, ?it/s]

Chapter 31:   0%|          | 0/84 [00:00<?, ?it/s]

Chapter 31A:   0%|          | 0/16 [00:00<?, ?it/s]

Chapter 31B:   0%|          | 0/12 [00:00<?, ?it/s]

Chapter 31C:   0%|          | 0/12 [00:00<?, ?it/s]

Chapter 31D:   0%|          | 0/40 [00:00<?, ?it/s]

Chapter 32:   0%|          | 0/48 [00:00<?, ?it/s]

Chapter 32A:   0%|          | 0/54 [00:00<?, ?it/s]

Chapter 32C:   0%|          | 0/48 [00:00<?, ?it/s]

Chapter 33:   0%|          | 0/1 [00:00<?, ?it/s]

Chapter 33A:   0%|          | 0/24 [00:00<?, ?it/s]

Chapter 33B:   0%|          | 0/22 [00:00<?, ?it/s]

Chapter 34:   0%|          | 0/20 [00:00<?, ?it/s]

Chapter 35:   0%|          | 0/36 [00:00<?, ?it/s]

CPU times: user 17.7 s, sys: 302 ms, total: 18 s
Wall time: 23.1 s


In [5]:
# Sanity check: number of section records returned by the scrape.
len(sections)

8112

## Load into PostgreSQL database

In [11]:
import os

import pandas as pd
from sqlalchemy import create_engine, text

In [10]:
# Read the connection string from the environment so no credentials live in the notebook.
database_url = os.getenv("DATABASE_URL")
if not database_url:
    # Fail here with a clear message instead of an opaque SQLAlchemy argument error.
    raise RuntimeError("DATABASE_URL environment variable is not set")

pg_engine = create_engine(database_url)
# NOTE(review): pg_conn is not used by the cells visible here; it stays open
# until the kernel exits. Kept in case later cells rely on it.
pg_conn = pg_engine.connect()

In [31]:
# Set to True to drop and recreate the statutes table. This is destructive:
# all rows currently in the table are lost.
RESET_DB = False

if RESET_DB:
    # engine.begin() commits when the block exits without error and rolls back
    # otherwise, so no explicit commit is needed.
    with pg_engine.begin() as conn:
        conn.execute(text("CREATE EXTENSION IF NOT EXISTS vector"))
        # IF EXISTS lets the reset also work on a fresh database where the
        # table has never been created.
        conn.execute(text("DROP TABLE IF EXISTS statutes"))
        conn.execute(
            text(
                """
                CREATE TABLE statutes (
                    statute_id         integer PRIMARY KEY GENERATED BY DEFAULT AS IDENTITY,
                    created_at         timestamp with time zone NOT NULL DEFAULT NOW(),
                    chapter_number     varchar(64) NOT NULL,
                    chapter_name       varchar(2048) NOT NULL,
                    statute            varchar(64) NOT NULL,
                    section_name       varchar(2048) NOT NULL,
                    url                varchar(2048) NOT NULL,
                    raw_html           text NOT NULL,
                    content            text NOT NULL,
                    content_embedding  vector(768)
                )
                """
            )
        )

In [29]:
# Build a DataFrame with one row per scraped section; the columns come from
# the keys of the section dicts.
df = pd.DataFrame.from_records(sections)
df

Unnamed: 0,raw_html,statute,content,chapter_number,chapter_name,section_name,url
0,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...",1-1,Article 1. \nDefinitions. \n§ 1‑1. Remedies.\...,1,Civil Procedure,Remedies,https://www.ncleg.gov/EnactedLegislation/Statu...
1,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...",1-2,§ 1‑2. Actions.\nAn action is an ordinary pro...,1,Civil Procedure,Actions,https://www.ncleg.gov/EnactedLegislation/Statu...
2,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...",1-3,§ 1‑3. Special proceedings.\nEvery other reme...,1,Civil Procedure,Special proceedings,https://www.ncleg.gov/EnactedLegislation/Statu...
3,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...",1-4,§ 1‑4. Kinds of actions.\nActions are of two ...,1,Civil Procedure,Kinds of actions,https://www.ncleg.gov/EnactedLegislation/Statu...
4,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...",1-5,§ 1‑5. Criminal action.\nA criminal action is...,1,Civil Procedure,Criminal action,https://www.ncleg.gov/EnactedLegislation/Statu...
...,...,...,...,...,...,...,...
8107,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...",35-58_through_35-60,Article 8.\nTemporary Care and Restraint of In...,35,Sterilization Procedures,36,https://www.ncleg.gov/EnactedLegislation/Statu...
8108,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...",35-61_through_35-63,Article 9.\nMental Health Council.\n§§ 35‑61 t...,35,Sterilization Procedures,13,https://www.ncleg.gov/EnactedLegislation/Statu...
8109,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...",35-64_through_35-69,Article 10.\nInterstate Compact on Mental Heal...,35,Sterilization Procedures,12,https://www.ncleg.gov/EnactedLegislation/Statu...
8110,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...",35-70_through_35-72,Article 11.\nMedical Advisory Council to State...,35,Sterilization Procedures,133,https://www.ncleg.gov/EnactedLegislation/Statu...


In [30]:
# Save statutes to DB
# Rows are appended to the existing "statutes" table (if_exists="append"), so
# re-running this cell adds the same rows again unless the table is reset first.
# The DataFrame index is not written (index=False).

df.to_sql(
    "statutes",
    pg_engine,
    if_exists="append",
    index=False,
)

-1

In [23]:
# Save statutes to CSV file too

# to_csv does not create missing parent directories, so make sure the output
# directory exists before writing.
os.makedirs("output", exist_ok=True)

df.to_csv(
    "output/statutes.csv",
    index=False,
)