In [1]:
import logging
from pathlib import Path

from lxml import etree as ET
import polars as pl

import embed
from embed.demos import usc

In [2]:
# Consider switching to DEBUG if this doesn't produce enough info.
logging.basicConfig(level=logging.INFO)

In [3]:
pl.Config.set_fmt_str_lengths(100)

polars.config.Config

In [4]:
data_dir = Path('../../data/usc/')
usc.extract_usc(data_dir, download=True)
usc42root = ET.parse(data_dir / usc.USC_STEM / 'usc42.xml').getroot()

In [5]:
# For now, we work just with sections small enough to embed without spliting.
sections = usc.get_direct_sections(usc42root)

In [6]:
# What we want to do: 
#  Save serializatinos of sections that are embeddeable

In [7]:
# identifier, status, text
# identifier, status, text, embedding

In [8]:
df = pl.DataFrame(
    {
        "Identifier": section.attrib['identifier'], 
        "Status": section.get('status'),
        "Text": text,
    }
    for section in sections[100:110]  # FIXME: After initial testing, stop slicing.
    if embed.count_tokens(text := usc.serialize_xml_clean(section)) <= embed.TOKEN_LIMIT
)

In [9]:
df

Identifier,Status,Text
str,bool,str
"""/us/usc/t42/s238b""",,"""<section xmlns=""http://xml.house.gov/schemas/uslm/1.0""><num>§ 238b.</num><heading> Disposition of m…"
"""/us/usc/t42/s238c""",,"""<section xmlns=""http://xml.house.gov/schemas/uslm/1.0""><num>§ 238c.</num><heading> Transportation o…"
"""/us/usc/t42/s238d""",,"""<section xmlns=""http://xml.house.gov/schemas/uslm/1.0""><num>§ 238d.</num><heading> Availability of …"
"""/us/usc/t42/s238e""",,"""<section xmlns=""http://xml.house.gov/schemas/uslm/1.0""><num>§ 238e.</num><heading> Transfer of fund…"
"""/us/usc/t42/s238f""",,"""<section xmlns=""http://xml.house.gov/schemas/uslm/1.0""><num>§ 238f.</num><heading> Availability of …"
"""/us/usc/t42/s238g""",,"""<section xmlns=""http://xml.house.gov/schemas/uslm/1.0""><num>§ 238g.</num><heading> Wearing of unifo…"
"""/us/usc/t42/s238h""",,"""<section xmlns=""http://xml.house.gov/schemas/uslm/1.0""><num>§ 238h.</num><heading> Biennial report<…"
"""/us/usc/t42/s238i""",,"""<section xmlns=""http://xml.house.gov/schemas/uslm/1.0""><num>§ 238i.</num><heading> Memorials and ot…"
"""/us/usc/t42/s238j""",,"""<section xmlns=""http://xml.house.gov/schemas/uslm/1.0""><num>§ 238j.</num><heading> Evaluation of pr…"
"""/us/usc/t42/s238k""",,"""<section xmlns=""http://xml.house.gov/schemas/uslm/1.0""><num>§ 238k.</num><heading> Contract authori…"


In [10]:
# df.filter(pl.col('Identifier') == '/us/usc/t42/s1...1j').get_column('Text')[0]
df['Text'][0]

'<section xmlns="http://xml.house.gov/schemas/uslm/1.0"><num>§\u202f238b.</num><heading> Disposition of money collected for care of patients</heading><content>\n<p>Money collected as provided by law for expenses incurred in the care and treatment of foreign seamen, and money received for the care and treatment of pay patients, including any amounts received from any executive department on account of care and treatment of pay patients, shall be covered into the appropriation from which the expenses of such care and treatment were paid.</p>\n</content><sourceCredit>(<ref>July 1, 1944, ch. 373</ref>, title II, §\u202f233, formerly title V, §\u202f503, <ref>58 Stat. 710</ref>, renumbered title XXI, §\u202f2103, <ref>Pub. L. 98–24, §\u202f2(a)(1)</ref>, <date>Apr. 26, 1983</date>, <ref>97 Stat. 176</ref>; renumbered title XXIII, §\u202f2303, <ref>Pub. L. 99–660, title III, §\u202f311(a)</ref>, <date>Nov. 14, 1986</date>, <ref>100 Stat. 3755</ref>; renumbered title XXV, §\u202f2503, <ref>Pu

In [11]:
from collections.abc import Sequence
isinstance(df['Text'], Sequence)

False

In [12]:
type(df['Text'])

polars.series.series.Series

In [13]:
df['Text'][0]

'<section xmlns="http://xml.house.gov/schemas/uslm/1.0"><num>§\u202f238b.</num><heading> Disposition of money collected for care of patients</heading><content>\n<p>Money collected as provided by law for expenses incurred in the care and treatment of foreign seamen, and money received for the care and treatment of pay patients, including any amounts received from any executive department on account of care and treatment of pay patients, shall be covered into the appropriation from which the expenses of such care and treatment were paid.</p>\n</content><sourceCredit>(<ref>July 1, 1944, ch. 373</ref>, title II, §\u202f233, formerly title V, §\u202f503, <ref>58 Stat. 710</ref>, renumbered title XXI, §\u202f2103, <ref>Pub. L. 98–24, §\u202f2(a)(1)</ref>, <date>Apr. 26, 1983</date>, <ref>97 Stat. 176</ref>; renumbered title XXIII, §\u202f2303, <ref>Pub. L. 99–660, title III, §\u202f311(a)</ref>, <date>Nov. 14, 1986</date>, <ref>100 Stat. 3755</ref>; renumbered title XXV, §\u202f2503, <ref>Pu

In [14]:
texts = df['Text'].to_list()

In [15]:
len(texts)

10

In [16]:
embeddings = embed.embed_many(texts)

In [17]:
embeddings

array([[ 0.01002261,  0.01300304,  0.02757402, ..., -0.02876348,
        -0.01050246, -0.02968262],
       [ 0.00114846,  0.01286544,  0.00877311, ..., -0.01285876,
        -0.00676706, -0.01158826],
       [ 0.00323707,  0.0193338 , -0.00365216, ..., -0.02534527,
         0.00191468, -0.02210328],
       ...,
       [-0.00553012,  0.02558189,  0.00691265, ..., -0.02263429,
        -0.00694984, -0.02980047],
       [-0.01751399,  0.01630028, -0.0036444 , ..., -0.01440793,
        -0.00047431, -0.03774252],
       [-0.0020269 ,  0.01241931, -0.00832583, ..., -0.0173791 ,
         0.00873584, -0.02766901]], dtype=float32)

In [18]:
query = embeddings @ embed.embed_one('laying people of rank to rest when they have died')

In [19]:
query

array([0.7257557 , 0.78433096, 0.7010728 , 0.71104324, 0.72935516,
       0.74076974, 0.7150748 , 0.723279  , 0.6990161 , 0.6882933 ],
      dtype=float32)

In [20]:
query.argmax()

1