# Embeddings Overview
Last updated: March 17, 2024

In [5]:
import os

import pandas as pd
from pgvector.psycopg import register_vector
from sqlalchemy import create_engine

In [6]:
# The connection string is read from the environment so no credentials live in
# the notebook. os.environ[...] raises a KeyError naming DATABASE_URL when it is
# unset, instead of handing None to create_engine.
pg_engine = create_engine(os.environ["DATABASE_URL"])

In [7]:
# Load every column of the statutes table into a DataFrame, then print its
# column/dtype summary. The query is a plain string: nothing is interpolated.
df = pd.read_sql(
    sql="""
    SELECT
        *
    FROM statutes
    """,
    con=pg_engine,
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8112 entries, 0 to 8111
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype              
---  ------             --------------  -----              
 0   statute_id         8112 non-null   int64              
 1   created_at         8112 non-null   datetime64[ns, UTC]
 2   chapter_number     8112 non-null   object             
 3   chapter_name       8112 non-null   object             
 4   statute            8112 non-null   object             
 5   section_name       8112 non-null   object             
 6   url                8112 non-null   object             
 7   raw_html           8112 non-null   object             
 8   content            8112 non-null   object             
 9   content_embedding  0 non-null      object             
dtypes: datetime64[ns, UTC](1), int64(1), object(8)
memory usage: 633.9+ KB


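## What are embeddings?

An embedding is a fixed-length vector of floats that represents a piece of text, so that texts with similar meaning map to nearby vectors. The cells below load a pretrained sentence-transformers model and encode three example sentences.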

In [8]:
%%time

# %%time reports how long the import and model load take.
from sentence_transformers import SentenceTransformer
# Load the pretrained checkpoint named "all-mpnet-base-v2"; `model` is reused by
# every encode() call in the cells that follow.
model = SentenceTransformer("all-mpnet-base-v2")

CPU times: user 2.92 s, sys: 4.53 s, total: 7.45 s
Wall time: 3.75 s


In [9]:
# Three sample sentences used to demonstrate the encoder.
sentences = [
    "This framework generates embeddings for each input sentence",
    "Sentences are passed as a list of string.",
    "The quick brown fox jumps over the lazy dog.",
]

# A single encode() call handles the whole list.
embeddings = model.encode(sentences)

# Pair each sentence with its vector and display them side by side.
both = list(zip(sentences, embeddings))
pd.DataFrame.from_records(both, columns=["sentence", "embedding"])

Unnamed: 0,sentence,embedding
0,This framework generates embeddings for each i...,"[0.006416996, 0.007041461, -0.028144179, 0.051..."
1,Sentences are passed as a list of string.,"[0.07764478, -0.059052207, -0.039118372, 0.046..."
2,The quick brown fox jumps over the lazy dog.,"[-0.034292687, -0.001339469, 0.0043361243, -0...."


## Statutes: calculate embeddings

In [10]:
df["content"]

0       Article 1. \nDefinitions. \n§ 1‑1.  Remedies.\...
1       § 1‑2.  Actions.\nAn action is an ordinary pro...
2       § 1‑3.  Special proceedings.\nEvery other reme...
3       § 1‑4.  Kinds of actions.\nActions are of two ...
4       § 1‑49.  Seven years.\nWithin seven years an a...
                              ...                        
8107    Article 8.\nTemporary Care and Restraint of In...
8108    Article 9.\nMental Health Council.\n§§ 35‑61 t...
8109    Article 10.\nInterstate Compact on Mental Heal...
8110    Article 11.\nMedical Advisory Council to State...
8111    Article 12.\nCouncil on Mental Retardation and...
Name: content, Length: 8112, dtype: object

In [11]:
%%time

# Encode the text of every statute in one call. This rebinds `embeddings`, which
# previously held the vectors for the three demo sentences.
embeddings = model.encode(df["content"].tolist())

CPU times: user 3min 58s, sys: 4min 44s, total: 8min 43s
Wall time: 1min 48s


In [17]:
# Store one embedding per row. .tolist() converts the encoder output so each
# cell holds a list of floats (see the displayed output).
# NOTE: this mutates `df` in place, so the earlier df.info() output is now stale.
df["content_embedding"] = embeddings.tolist()
df[["content_embedding", "statute_id"]]

Unnamed: 0,content_embedding,statute_id
0,"[0.04147131368517876, -0.0322742760181427, 0.0...",1
1,"[-0.033325351774692535, -0.06919444352388382, ...",2
2,"[0.02636847086250782, -0.03957170993089676, 0....",3
3,"[-0.02005830779671669, -0.010654936544597149, ...",4
4,"[0.0192421805113554, 0.0806661993265152, 0.054...",63
...,...,...
8107,"[-0.010719257406890392, 0.0025900101754814386,...",8108
8108,"[0.015394539572298527, -0.02387923374772072, 0...",8109
8109,"[-0.0015342566184699535, 0.036397114396095276,...",8110
8110,"[0.008933783508837223, -0.00815458595752716, 0...",8111


## Statutes: update rows with embeddings

In [22]:
%%time

embeddings_and_ids = df[["content_embedding", "statute_id"]].itertuples(index=False)

with pg_engine.connect() as conn:
    # https://docs.sqlalchemy.org/en/20/faq/connections.html#how-do-i-get-at-the-raw-dbapi-connection-when-using-an-engine
    register_vector(conn.connection.driver_connection)
    with conn.connection.cursor() as cursor:
        cursor.executemany(
            """
            UPDATE statutes SET content_embedding = %s WHERE statute_id = %s
            """, 
            embeddings_and_ids
        )
    conn.connection.commit()

CPU times: user 6.03 s, sys: 70.3 ms, total: 6.1 s
Wall time: 6.1 s


In [25]:
# %%time

# embeddings_and_ids = df[["content_embedding", "statute_id"]].itertuples(index=False)

# with pg_engine.connect() as conn:
#     # https://docs.sqlalchemy.org/en/20/faq/connections.html#how-do-i-get-at-the-raw-dbapi-connection-when-using-an-engine
#     register_vector(conn.connection.driver_connection)
#     with conn.connection.cursor() as cursor:
#         for embedding, statue_id in embeddings_and_ids:
#             cursor.execute('UPDATE statutes SET content_embedding = %s WHERE statute_id = %s', (embedding, statue_id))
#     conn.connection.commit()

In [29]:
# Read back the first ten statutes (ordered by statute) to confirm the
# embeddings were written.
pd.read_sql(
    sql="""
    SELECT
        statute
        , content
        , content_embedding
    FROM statutes
    ORDER BY statute
    LIMIT 10
    """,
    con=pg_engine,
)

Unnamed: 0,statute,content,content_embedding
0,1-1,Article 1. \nDefinitions. \n§ 1‑1. Remedies.\...,"[0.041471314, -0.032274276, 0.042042375, -0.00..."
1,1-10,§ 1‑10. Plaintiff and defendant.\nIn civil ac...,"[-0.014724571, -0.04928847, 0.031303033, 0.055..."
2,1-100_through_1-104,§§ 1‑100 through 1‑104. Repealed by Session L...,"[-0.018579958, -0.016739732, 0.032597266, 0.01..."
3,1-105,§ 1‑105.1. Service on residents who establish...,"[-0.023890587, -0.031677235, 0.014264807, 0.00..."
4,1-105,§ 1‑105. Service upon nonresident drivers of ...,"[-0.0056505073, 0.014521363, 0.05465263, 0.038..."
5,1-106_through_1-107,§§ 1‑106 through 1‑107.3. Repealed by Session...,"[-0.013538554, -0.026661754, 0.03608902, 0.027..."
6,1-108,§ 1‑108. Defense after judgment set aside.\nI...,"[0.03706608, 0.050787423, 0.029943377, 0.05583..."
7,1-109,Article 9.\nProsecution Bonds.\n§ 1‑109. Bond...,"[0.04233295, 0.0049180645, 0.04980833, 0.02374..."
8,1-11,§ 1‑11. How party may appear.\nA party may ap...,"[-0.017318211, -0.0143290665, 0.0007942389, 0...."
9,1-110,§ 1‑110. Suit as an indigent; counsel; suits ...,"[0.017942425, 0.016565232, 0.021090308, 0.0065..."


In [73]:
# foo = list()

In [20]:
import pandas as pd

import json
import logging
import pathlib
import re
import sys
from urllib.parse import urljoin, urlparse
from strip_tags import strip_tags
from tqdm.notebook import tqdm
from sqlalchemy import create_engine
from pgvector.psycopg import register_vector
from sqlalchemy import text
import redis
import requests
from bs4 import BeautifulSoup

# Redis instance on localhost that fetch_page uses as its page cache.
redis_client = redis.Redis(
    host="localhost",
    port=63791,
    db=0,
    protocol=3,
)

# Send log records of level WARNING and above to stdout.
logging.basicConfig(level=logging.WARNING, stream=sys.stdout)

# Captures the statute identifier from a section href such as
# ".../Chapter_1/GS_1-105.1.html". Identifiers can contain dots (1-105.1,
# 20-142.3), so dots are allowed inside the capture and only a trailing
# ".htm"/".html" extension (any case) is left out. The previous pattern stopped
# at the first dot, so 1-105 and 1-105.1 were both stored as "1-105".
STATUTE_RE = re.compile(r".*GS_([\w.-]+?)(?i:\.html?)?$")
# Captures the chapter number (e.g. "143B") from text starting with "Chapter ".
CHAPTER_NUMBER_RE = re.compile(r"Chapter ([\d\w]+)")
# Captures the run of word characters/spaces that precedes a final period.
SECTION_TITLE_RE = re.compile(r"([\w ]+)\.$")

logger = logging.getLogger("scraper")


def fetch_page(path, timeout=30):
    """Return the page at ``path`` on www.ncleg.gov as parsed HTML, cached in Redis.

    Args:
        path: URL path (or absolute URL) resolved against https://www.ncleg.gov/.
            It is also the Redis cache key.
        timeout: Seconds passed to ``requests.get`` so a stalled server cannot
            hang the scrape indefinitely.

    Returns:
        A ``BeautifulSoup`` document built with the ``html.parser`` backend.

    Raises:
        requests.exceptions.RequestException: on connection errors, timeouts,
            or a non-success HTTP status.
    """
    content = redis_client.get(path)
    if not content:
        uri = urljoin("https://www.ncleg.gov/", path)
        response = requests.get(uri, timeout=timeout)
        # Fail before caching: an error body stored in Redis would otherwise be
        # served as the page on every later call.
        response.raise_for_status()
        content = response.content
        redis_client.set(path, content)
    return BeautifulSoup(content, features="html.parser")


def parse_section(section):
    """Fetch one statute section page and return its extracted fields.

    Args:
        section: Anchor element whose ``href`` points at the section page.

    Returns:
        dict with ``raw_html`` (the fetched page as a string), ``statute``
        (first capture group of STATUTE_RE applied to the href) and ``content``
        (the result of ``strip_tags`` on the page with the ``["p"]`` selector list).
    """
    href = section.get("href")
    html = str(fetch_page(href))
    statute = STATUTE_RE.match(href).group(1)
    return {
        "raw_html": html,
        "statute": statute,
        "content": strip_tags(html, ["p"]),
    }


def scrape_chapter_sections(chapter_page_soup):
    """Scrape every section linked from a chapter page.

    Args:
        chapter_page_soup: Parsed chapter page; its ``h1.section-title`` text is
            expected to look like "Chapter <number> - <name>.".

    Returns:
        A list with one dict per successfully fetched section, containing the
        keys produced by ``parse_section`` plus ``chapter_number``,
        ``chapter_name``, ``section_name`` and ``url``. Sections whose page
        could not be fetched are logged and left out.
    """
    chapter = chapter_page_soup.select_one("h1.section-title").text
    # Split on the first hyphen only, so a chapter name that itself contains a
    # hyphen is kept whole instead of being cut at its first hyphen.
    number_part, _, name_part = chapter.partition("-")
    chapter_name = name_part.strip().strip(".")
    chapter_number = CHAPTER_NUMBER_RE.match(number_part).group(1)
    section_anchors = chapter_page_soup.select("div#chapter div.row a[href*=BySection][href*=HTML]")
    sections = []
    for section in tqdm(section_anchors, desc=f"Chapter {chapter_number}"):
        href = section.get("href")
        row = section.find_parent("div", class_="row")
        try:
            # The second anchor of the nested row carries the section title.
            title_tag = row.select_one("div.row").select("a")[1]
            section_name = SECTION_TITLE_RE.search(title_tag.text).group(1).strip()
        except (AttributeError, IndexError):
            # Missing row/anchor or a title that does not end in a period:
            # keep the section, with an empty name.
            section_name = ""
        try:
            data = parse_section(section)
        except requests.exceptions.RequestException:
            # Covers ConnectTimeout as before, plus every other requests
            # failure, so one bad page does not abort the whole chapter.
            logger.error(href)
            continue
        data["chapter_number"] = chapter_number
        data["chapter_name"] = chapter_name
        data["section_name"] = section_name
        data["url"] = urljoin("https://www.ncleg.gov/", href)
        sections.append(data)
    return sections

def scrape_chapters(stop_after_chapter="35"):
    """Scrape the sections of each chapter listed in the General Statutes TOC.

    Args:
        stop_after_chapter: Chapter number (as a string) after which scraping
            stops; that chapter's sections are included. The default "35"
            matches the previously hard-coded cut-off. Pass ``None`` to scrape
            every chapter.

    Returns:
        A flat list of section dicts as produced by ``scrape_chapter_sections``.
    """
    data = []
    soup = fetch_page("/Laws/GeneralStatutesTOC")
    chapter_anchors = soup.select("div#gsTOC div.row a[title*=Chapter]:-soup-contains('Chapter')")
    for anchor in chapter_anchors:
        chapter_page_soup = fetch_page(anchor.get("href"))
        sections = scrape_chapter_sections(chapter_page_soup)
        data.extend(sections)
        # A chapter can yield no sections (none listed, or every fetch failed);
        # guard the lookup so that case does not raise IndexError.
        if sections and sections[0]["chapter_number"] == stop_after_chapter:
            break
    return data

In [3]:
# os.environ[...] raises a KeyError naming DATABASE_URL when it is unset,
# instead of handing None to create_engine.
pg_engine = create_engine(os.environ["DATABASE_URL"])
# NOTE(review): this connection is opened at cell level and never closed here.
pg_conn = pg_engine.connect()
# https://docs.sqlalchemy.org/en/20/faq/connections.html#how-do-i-get-at-the-raw-dbapi-connection-when-using-an-engine
connection_fairy = pg_conn.connection
driver_conn = connection_fairy.driver_connection
# pgvector registration is applied to this one driver connection only.
register_vector(driver_conn)

In [24]:
%%time

from sentence_transformers import SentenceTransformer
# model = SentenceTransformer("all-MiniLM-L6-v2")
model = SentenceTransformer("all-MiniLM-L12-v2")all-mpnet-base-v2

CPU times: user 5.61 s, sys: 2.12 s, total: 7.73 s
Wall time: 4.46 s


In [25]:
# # Our sentences to encode
# sentences = [
#     "This framework generates embeddings for each input sentence",
#     "Sentences are passed as a list of string.",
#     "The quick brown fox jumps over the lazy dog."
# ]

# # Sentences are encoded by calling model.encode()
# embeddings = model.encode(sentences)

# # Print the embeddings
# for sentence, embedding in zip(sentences, embeddings):
#     print("Sentence:", sentence)
#     print("Embedding:", len(embedding))
#     print("")

In [26]:
%%time

# Encode every row's text with whichever `model` is currently loaded, then
# store the vectors on the frame as lists (in-place mutation of `df`).
embeddings = model.encode(df["content"].tolist())
df["content_embedding"] = embeddings.tolist()

In [27]:
# Append the frame's rows to the existing "statutes" table. The DataFrame index
# is not written as a column.
df.to_sql(name="statutes", con=pg_engine, if_exists="append", index=False)

991

In [56]:
# Print the text of every row, one print() call per statute.
for statute_text in df["content"]:
    print(statute_text)

Article 1.
General Provisions.
§ 143B‑1.  Short title.
This Chapter shall be known and may be cited as the "Executive Organization Act of 1973." (1973, c. 476, s. 1.)
§ 143B‑2.  Interim applicability of the Executive Organization Act of 1973.
The Executive Organization Act of 1973 shall be applicable only to the following named departments:
(1)	Department of Natural and Cultural Resources.
(2)	Department of Health and Human Services.
(3)	Department of Revenue.
(4)	Department of Public Safety.
(5)	Repealed by Session Laws 2012‑83, s. 47, effective June 26, 2012.
(6)	Department of Environmental Quality.
(7)	Department of Transportation.
(8)	Department of Administration.
(9)	Department of Commerce.
(10)	Repealed by Session Laws 2012‑83, s. 47, effective June 26, 2012.
(11)	Department of Information Technology.
(12)	Department of Adult Correction.  (1973, c. 476, s. 2; c. 620, s. 9; c. 1262, ss. 10, 86; 1975, c. 716, s. 5; c. 879, s. 46; 1977, c. 70, s. 22; c. 198, s. 21; c. 771, s. 4; 198

In [120]:
# Free-text query, encoded with the same `model` object used for the statutes
# so the query vector can be compared against the stored embeddings.
search = "traffic stops"
search_embedding = model.encode(search)

In [123]:
# Rank every statute by the value of `content_embedding <=> query vector`,
# smallest first. The vector is passed as a bound parameter, not interpolated
# into the SQL text.
search_df = pd.read_sql(
    sql="""
    SELECT
        statute
        , content
        , content_embedding <=> %(search_embedding)s AS nearest
    FROM statutes
    ORDER BY content_embedding <=> %(search_embedding)s ASC
    """,
    con=pg_engine,
    params={"search_embedding": search_embedding},
)
search_df

Unnamed: 0,statute,content,nearest
0,20-158,§ 20‑158. Vehicle control signs and signals.\...,0.454330
1,20-217,§ 20‑217. Motor vehicles to stop for properly...,0.461436
2,20-161,§ 20‑161. Stopping on highway prohibited; war...,0.494937
3,20-142,§ 20‑142.3. Certain vehicles must stop at rai...,0.499138
4,20-142,§ 20‑142.5. Stop when traffic obstructed.\nNo...,0.536066
...,...,...,...
8107,15-145,§ 15‑145. Form of bill for perjury.\nIn every...,1.117534
8108,28A-21-5,§ 28A‑21‑5. Vouchers presumptive evidence.\nV...,1.117835
8109,10B-102,"§ 10B‑102. (Effective until July 1, 2024) Sco...",1.121110
8110,10B-134,"§ 10B‑134.11. (Effective July 1, 2024) Verifi...",1.121283


In [66]:
sections[1].get("href")

'/EnactedLegislation/Statutes/HTML/BySection/Chapter_143B/GS_143B-2.html'

In [51]:
# Scratch check of the section-title extraction used in scrape_chapter_sections.
# Redefines SECTION_TITLE_RE with the same pattern as the scraper cell.
SECTION_TITLE_RE = re.compile(r"([\w ]+)\.$")

# NOTE(review): `sections` comes from another cell (soup.select(...)), so this
# only runs after that cell has been executed.
row = sections[0].find_parent("div", class_="row")
# The second anchor inside the nested row holds the title text.
title_tag = row.select_one('div.row').select('a')[1]
section_title = SECTION_TITLE_RE.search(title_tag.text).group(1).strip()

'Short title'

In [56]:
# `scrape_sections` is not defined in this notebook; the scraper that takes a
# chapter page soup is `scrape_chapter_sections`.
# NOTE(review): `soup` must hold a chapter page (see the Chapter143B request cell).
list(scrape_chapter_sections(soup))

[{'raw_html': '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n\n<html xmlns="http://www.w3.org/1999/xhtml">\n<head>\n<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/><title>\n</title>\n<style type="text/css">\r\n\t\t\t.cs258DCD3C{text-align:center;text-indent:0pt;margin:0pt 0pt 6pt 0pt}\r\n\t\t\t.cs102784{color:#000000;background-color:transparent;font-family:\'Times New Roman\';font-size:12pt;font-weight:bold;font-style:normal;}\r\n\t\t\t.cs23FB0664{color:#000000;background-color:transparent;font-family:\'Times New Roman\';font-size:12pt;font-weight:normal;font-style:normal;}\r\n\t\t\t.csF31EE8DA{text-align:justify;text-indent:-54pt;margin:0pt 0pt 0pt 54pt}\r\n\t\t\t.cs40666F14{text-align:justify;text-indent:18pt;margin:0pt 0pt 0pt 0pt}\r\n\t\t\t.cs4A4384B5{color:#000000;background-color:transparent;font-family:\'Microsoft Sans Serif\';font-size:12pt;font-weight:normal;font-style:normal;}\r\

In [68]:
%%time

# Run the full scrape (chapter TOC, then every section page) and keep the result.
all_sections = scrape_chapters()
# NOTE(review): echoing the whole list dumps each page's raw HTML into the output.
all_sections

CPU times: user 1.02 s, sys: 36.1 ms, total: 1.06 s
Wall time: 18.5 s


[{'raw_html': '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n\n<html xmlns="http://www.w3.org/1999/xhtml">\n<head>\n<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/><title>\n</title>\n<style type="text/css">\r\n\t\t\t.cs258DCD3C{text-align:center;text-indent:0pt;margin:0pt 0pt 6pt 0pt}\r\n\t\t\t.cs102784{color:#000000;background-color:transparent;font-family:\'Times New Roman\';font-size:12pt;font-weight:bold;font-style:normal;}\r\n\t\t\t.cs23FB0664{color:#000000;background-color:transparent;font-family:\'Times New Roman\';font-size:12pt;font-weight:normal;font-style:normal;}\r\n\t\t\t.csF31EE8DA{text-align:justify;text-indent:-54pt;margin:0pt 0pt 0pt 54pt}\r\n\t\t\t.cs40666F14{text-align:justify;text-indent:18pt;margin:0pt 0pt 0pt 0pt}\r\n\t\t\t.cs4A4384B5{color:#000000;background-color:transparent;font-family:\'Microsoft Sans Serif\';font-size:12pt;font-weight:normal;font-style:normal;}\r\

In [58]:
all_sections

In [96]:
# Same selector scrape_chapter_sections uses to find section links.
# NOTE(review): relies on `soup` having been set by one of the requests cells.
sections = soup.select("div#chapter div.row a[href*=BySection][href*=HTML]")

In [93]:
# First selected section anchor, kept for ad-hoc inspection in other cells.
section = sections[0]

In [138]:
parse_section(sections[60])

{'raw_html': '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n\n<html xmlns="http://www.w3.org/1999/xhtml">\n<head>\n<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/><title>\n</title>\n<style type="text/css">\r\n\t\t\t.csF31EE8DA{text-align:justify;text-indent:-54pt;margin:0pt 0pt 0pt 54pt}\r\n\t\t\t.cs102784{color:#000000;background-color:transparent;font-family:\'Times New Roman\';font-size:12pt;font-weight:bold;font-style:normal;}\r\n\t\t\t.cs40666F14{text-align:justify;text-indent:18pt;margin:0pt 0pt 0pt 0pt}\r\n\t\t\t.cs23FB0664{color:#000000;background-color:transparent;font-family:\'Times New Roman\';font-size:12pt;font-weight:normal;font-style:normal;}\r\n\t\t</style>\n</head>\n<body>\n<p class="csF31EE8DA"><span class="cs102784">§ 143B‑53.1. \xa0Appropriation, allotment, and expenditure of funds for historic and archeological property.</span></p><p class="cs40666F14" style="tab-stops:

'143B-1'

In [134]:
page_text

'Article 1.\nGeneral Provisions.\n§ 143B‑1. \xa0Short title.\nThis Chapter shall be known and may be cited as the "Executive Organization Act of 1973." (1973, c. 476, s. 1.)'

In [3]:
# NOTE(review): ROOT_URI is not defined in any visible cell; the urlparse output
# at the end of the notebook suggests it is
# 'https://www.ncleg.gov/Laws/GeneralStatutesTOC' (confirm).
# This request bypasses the Redis cache that fetch_page uses.
r = requests.get(ROOT_URI)
soup = BeautifulSoup(r.content, features="html.parser")

In [56]:
# Fetch the Chapter 143B section listing directly (no Redis cache) and rebind
# `soup` to it, replacing whatever page `soup` held before.
r = requests.get("https://www.ncleg.gov/Laws/GeneralStatuteSections/Chapter143B")
soup = BeautifulSoup(r.content, features="html.parser")

In [94]:
section.get("href")

'/EnactedLegislation/Statutes/HTML/BySection/Chapter_143B/GS_143B-1.html'

In [60]:

chapter_name

'Executive Organization Act of 1973'

In [61]:
# NOTE(review): `chapter` is not defined in any visible cell; presumably it is
# the h1.section-title text (confirm).
# The part before the first hyphen keeps its trailing space ('Chapter 143B ').
chapter_number = chapter.split("-")[0]

In [62]:
chapter_number

'Chapter 143B '

('143B', 'Executive Organization Act of 1973')

In [68]:
urlparse(ROOT_URI)

ParseResult(scheme='https', netloc='www.ncleg.gov', path='/Laws/GeneralStatutesTOC', params='', query='', fragment='')

'https://www.ncleg.gov/Laws/GeneralStatutesTOC'