In [3]:
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
import re
import tqdm

In [5]:
# crape with beautiful soup and playwright
url = "https://constitutioncenter.org/the-constitution/amendments"

scraped_data = []

async with async_playwright() as p:
    browser = await p.chromium.launch()
    page = await browser.new_page()
    await page.goto(url)
    content = await page.content()

    # get links with href="/the-constitution/amendments/amendment-*"
    soup = BeautifulSoup(content, "html.parser")
    links = soup.find_all("a", href=re.compile("/the-constitution/amendments/amendment-"))

    for i, link in tqdm.tqdm(enumerate(links)):
        # wait one second to avoid getting blocked
        await page.wait_for_timeout(2000)

        data = {
            "number": i + 1
        }

        await page.goto("https://constitutioncenter.org" + link["href"])
        content = await page.content()
        soup = BeautifulSoup(content, "html.parser")
        # get the text
        parent = soup.find("div", class_="col-md-8")
        text = parent.find("p").text
        data["text"] = text

        # get the date ratified
        parent = soup.find("div", class_="card card-body")
        date = parent.find("p").text
        data["ratified"] = date

        scraped_data.append(data)

    await browser.close()

27it [01:03,  2.34s/it]


In [9]:
# Reduce each scraped "ratified" sentence to just the date that follows the
# word "Ratified", keeping the amendment number and text unchanged.
cleaned = []

for amendment in scraped_data:
    date_sentence_words = amendment["ratified"].split(" ")
    # list.index() raises ValueError instead of returning -1, so check
    # membership first; otherwise one sentence without "Ratified" would abort
    # the whole cell rather than being reported and skipped.
    if "Ratified" not in date_sentence_words:
        print("error")
        continue
    index_of_ratified = date_sentence_words.index("Ratified")
    # Take the three words after "Ratified" (expected: month, day, year).
    date = " ".join(date_sentence_words[index_of_ratified + 1: index_of_ratified + 4])
    # Drop a trailing sentence period; endswith() is safe on an empty string,
    # unlike indexing date[-1].
    if date.endswith("."):
        date = date[:-1]
    cleaned.append({
        "number": amendment["number"],
        "text": amendment["text"],
        "ratified": date
    })

In [10]:
import numpy as np
from sentence_transformers import SentenceTransformer

# Load the sentence-transformers model named "all-MiniLM-L6-v2"; a later cell
# calls model.encode() on each amendment's text.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [11]:
# Copy every cleaned amendment record and attach the embedding of its text,
# converted to a plain Python list via .tolist().
amendments_with_embeddings = [
    {
        "number": amendment["number"],
        "text": amendment["text"],
        "ratified": amendment["ratified"],
        "text_embedding": model.encode(amendment["text"]).tolist(),
    }
    for amendment in cleaned
]

In [17]:
import datetime

def get_date(date_string):
    """Convert a date like "December 15, 1791" to seconds since the Unix epoch.

    The date is interpreted as midnight UTC. Calling ``timestamp()`` on a naive
    datetime would instead use the machine's local timezone (so the value would
    differ between machines) and goes through the platform ``mktime``, which
    can raise ``OverflowError``/``OSError`` for far-past dates.

    Args:
        date_string: Date formatted as "%B %d, %Y" (full month name, day, year).

    Returns:
        float: Seconds since 1970-01-01T00:00:00Z; negative for earlier dates.

    Raises:
        ValueError: If ``date_string`` does not match the expected format.
    """
    date = datetime.datetime.strptime(date_string, "%B %d, %Y")
    # An aware datetime computes its timestamp arithmetically, independent of
    # the local timezone and of platform mktime limits.
    return date.replace(tzinfo=datetime.timezone.utc).timestamp()

# Annotate each record in place with its ratification date converted by get_date().
for amendment in amendments_with_embeddings:
    ratified_on = amendment["ratified"]
    amendment.update(ratified_seconds=get_date(ratified_on))

In [18]:
import json

# Persist all amendment records (including embeddings) as a single JSON document.
with open("amendments.json", "w") as out_file:
    out_file.write(json.dumps(amendments_with_embeddings))