# Embeddings Overview
Last updated: March 17, 2024

In [5]:
import os

import pandas as pd
from pgvector.psycopg import register_vector
from sentence_transformers import SentenceTransformer
from sqlalchemy import create_engine

In [6]:
# Read the connection string from the environment so no credentials live in
# the notebook. Fail fast with an explicit message when it is missing:
# os.getenv() would otherwise hand None to create_engine().
database_url = os.getenv("DATABASE_URL")
if not database_url:
    raise RuntimeError("DATABASE_URL environment variable is not set")
pg_engine = create_engine(database_url)

In [7]:
# Load the whole statutes table into memory; later cells read the "content"
# and "statute_id" columns from this frame.
# The query interpolates nothing, so it is a plain string rather than an f-string.
df = pd.read_sql(
    """
    SELECT
        *
    FROM statutes
    """,
    pg_engine
)
# Summarise column dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8112 entries, 0 to 8111
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype              
---  ------             --------------  -----              
 0   statute_id         8112 non-null   int64              
 1   created_at         8112 non-null   datetime64[ns, UTC]
 2   chapter_number     8112 non-null   object             
 3   chapter_name       8112 non-null   object             
 4   statute            8112 non-null   object             
 5   section_name       8112 non-null   object             
 6   url                8112 non-null   object             
 7   raw_html           8112 non-null   object             
 8   content            8112 non-null   object             
 9   content_embedding  0 non-null      object             
dtypes: datetime64[ns, UTC](1), int64(1), object(8)
memory usage: 633.9+ KB


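## What are embeddings?

An embedding is a fixed-length vector of numbers that represents a piece of text; texts with similar meaning map to vectors that lie close together. The cells below load a pretrained sentence-embedding model and encode three example sentences.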

In [8]:
%%time

# Load the pretrained "all-mpnet-base-v2" model that produces every embedding
# in this notebook. SentenceTransformer is imported in the notebook's import
# cell, so the timing of this cell covers only the model load.
model = SentenceTransformer("all-mpnet-base-v2")

CPU times: user 2.92 s, sys: 4.53 s, total: 7.45 s
Wall time: 3.75 s


In [9]:
# Three short example sentences to embed
sentences = [
    "This framework generates embeddings for each input sentence",
    "Sentences are passed as a list of string.",
    "The quick brown fox jumps over the lazy dog."
]

# model.encode() takes the whole list and yields one embedding per sentence
embeddings = model.encode(sentences)

# Pair each sentence with its vector and display the pairs as a two-column table
both = list(zip(sentences, embeddings))
pd.DataFrame.from_records(both, columns=("sentence", "embedding"))

Unnamed: 0,sentence,embedding
0,This framework generates embeddings for each i...,"[0.006416996, 0.007041461, -0.028144179, 0.051..."
1,Sentences are passed as a list of string.,"[0.07764478, -0.059052207, -0.039118372, 0.046..."
2,The quick brown fox jumps over the lazy dog.,"[-0.034292687, -0.001339469, 0.0043361243, -0...."


## Statutes: compute embeddings

In [10]:
# Preview the statute text column that will be embedded below.
df.loc[:, "content"]

0       Article 1. \nDefinitions. \n§ 1‑1.  Remedies.\...
1       § 1‑2.  Actions.\nAn action is an ordinary pro...
2       § 1‑3.  Special proceedings.\nEvery other reme...
3       § 1‑4.  Kinds of actions.\nActions are of two ...
4       § 1‑49.  Seven years.\nWithin seven years an a...
                              ...                        
8107    Article 8.\nTemporary Care and Restraint of In...
8108    Article 9.\nMental Health Council.\n§§ 35‑61 t...
8109    Article 10.\nInterstate Compact on Mental Heal...
8110    Article 11.\nMedical Advisory Council to State...
8111    Article 12.\nCouncil on Mental Retardation and...
Name: content, Length: 8112, dtype: object

In [11]:
%%time

# Encode the text of every statute in a single call; the resulting
# `embeddings` array is attached to the frame in the next cell.
statute_texts = df["content"].tolist()
embeddings = model.encode(statute_texts)

CPU times: user 3min 58s, sys: 4min 44s, total: 8min 43s
Wall time: 1min 48s


In [17]:
# Convert the encoded array to nested Python lists (one list per statute)
# and store them alongside the rows they were computed from.
embedding_lists = embeddings.tolist()
df["content_embedding"] = embedding_lists

# Show each vector next to the id of the row it belongs to.
df.loc[:, ["content_embedding", "statute_id"]]

Unnamed: 0,content_embedding,statute_id
0,"[0.04147131368517876, -0.0322742760181427, 0.0...",1
1,"[-0.033325351774692535, -0.06919444352388382, ...",2
2,"[0.02636847086250782, -0.03957170993089676, 0....",3
3,"[-0.02005830779671669, -0.010654936544597149, ...",4
4,"[0.0192421805113554, 0.0806661993265152, 0.054...",63
...,...,...
8107,"[-0.010719257406890392, 0.0025900101754814386,...",8108
8108,"[0.015394539572298527, -0.02387923374772072, 0...",8109
8109,"[-0.0015342566184699535, 0.036397114396095276,...",8110
8110,"[0.008933783508837223, -0.00815458595752716, 0...",8111


## Statutes: update rows with embeddings

In [22]:
%%time

# Lazily yield (content_embedding, statute_id) rows, matching the order of the
# two placeholders in the UPDATE statement below.
embeddings_and_ids = df[["content_embedding", "statute_id"]].itertuples(index=False)

with pg_engine.connect() as conn:
    # Work on the DBAPI-level connection behind the SQLAlchemy one, see
    # https://docs.sqlalchemy.org/en/20/faq/connections.html#how-do-i-get-at-the-raw-dbapi-connection-when-using-an-engine
    dbapi_conn = conn.connection
    # Register pgvector's adapters on the underlying driver connection before
    # any vectors are sent.
    register_vector(dbapi_conn.driver_connection)
    with dbapi_conn.cursor() as cur:
        cur.executemany(
            """
            UPDATE statutes SET content_embedding = %s WHERE statute_id = %s
            """,
            embeddings_and_ids
        )
    # Commit once, after every row has been sent.
    dbapi_conn.commit()

CPU times: user 6.03 s, sys: 70.3 ms, total: 6.1 s
Wall time: 6.1 s


In [25]:
# NOTE(review): dead code. This is a row-by-row cursor.execute() alternative to
# the executemany() cell above, left commented out. Consider deleting it once
# it is no longer needed for reference.

# %%time

# embeddings_and_ids = df[["content_embedding", "statute_id"]].itertuples(index=False)

# with pg_engine.connect() as conn:
#     # https://docs.sqlalchemy.org/en/20/faq/connections.html#how-do-i-get-at-the-raw-dbapi-connection-when-using-an-engine
#     register_vector(conn.connection.driver_connection)
#     with conn.connection.cursor() as cursor:
#         for embedding, statute_id in embeddings_and_ids:
#             cursor.execute('UPDATE statutes SET content_embedding = %s WHERE statute_id = %s', (embedding, statute_id))
#     conn.connection.commit()

In [29]:
# Spot-check: read back the first ten statutes (ordered by their statute
# label) together with the content_embedding column written above.
# The query interpolates nothing, so it is a plain string rather than an f-string.
pd.read_sql(
    """
    SELECT
        statute
        , content
        , content_embedding
    FROM statutes
    ORDER BY statute
    LIMIT 10
    """,
    pg_engine
)

Unnamed: 0,statute,content,content_embedding
0,1-1,Article 1. \nDefinitions. \n§ 1‑1. Remedies.\...,"[0.041471314, -0.032274276, 0.042042375, -0.00..."
1,1-10,§ 1‑10. Plaintiff and defendant.\nIn civil ac...,"[-0.014724571, -0.04928847, 0.031303033, 0.055..."
2,1-100_through_1-104,§§ 1‑100 through 1‑104. Repealed by Session L...,"[-0.018579958, -0.016739732, 0.032597266, 0.01..."
3,1-105,§ 1‑105.1. Service on residents who establish...,"[-0.023890587, -0.031677235, 0.014264807, 0.00..."
4,1-105,§ 1‑105. Service upon nonresident drivers of ...,"[-0.0056505073, 0.014521363, 0.05465263, 0.038..."
5,1-106_through_1-107,§§ 1‑106 through 1‑107.3. Repealed by Session...,"[-0.013538554, -0.026661754, 0.03608902, 0.027..."
6,1-108,§ 1‑108. Defense after judgment set aside.\nI...,"[0.03706608, 0.050787423, 0.029943377, 0.05583..."
7,1-109,Article 9.\nProsecution Bonds.\n§ 1‑109. Bond...,"[0.04233295, 0.0049180645, 0.04980833, 0.02374..."
8,1-11,§ 1‑11. How party may appear.\nA party may ap...,"[-0.017318211, -0.0143290665, 0.0007942389, 0...."
9,1-110,§ 1‑110. Suit as an indigent; counsel; suits ...,"[0.017942425, 0.016565232, 0.021090308, 0.0065..."
