In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np

In [2]:
# Download the State of the Union dataset (one row per speech).
sotu_url = "https://raw.githubusercontent.com/BrianWeinstein/state-of-the-union/refs/heads/master/transcripts.csv"
sotu_full = pd.read_csv(sotu_url)

# Reshape so that each row holds one paragraph instead of a full speech:
# split every transcript on newlines, then explode the resulting lists into rows.
# NOTE(review): blank lines in a transcript would become empty-string paragraphs
# here — confirm whether those should be filtered out before sampling.
sotu = (
    sotu_full
    .assign(paragraph=sotu_full["transcript"].str.split("\n"))
    .explode("paragraph")
    # DataFrame.drop returns a new frame, so its result has to be kept;
    # otherwise the complete transcript stays duplicated on every paragraph row.
    .drop(columns=["transcript"])
    .reset_index(drop=True)
)

# Take a small sample of paragraphs; the fixed seed keeps the sample reproducible.
sotu_sample = sotu.sample(2000, random_state=42)

In [None]:
# Load the sentence-embedding model used for both the paragraphs and the query.
# NOTE(review): trust_remote_code=True lets the library run custom model code
# shipped in the model repository — only keep it for sources you trust.
model_name_or_path = "Alibaba-NLP/gte-multilingual-base"
model = SentenceTransformer(
    model_name_or_path=model_name_or_path,
    trust_remote_code=True,
)


In [4]:
# Embed the sampled paragraphs.
# Pass the whole list to a single encode() call so the model can batch the
# texts internally, instead of running one forward pass per paragraph.
# With its default settings encode() returns a NumPy array with one row per
# input text, so no extra np.array(...) conversion is needed.
# normalize_embeddings=True yields unit-length vectors, which lets a plain dot
# product be used as cosine similarity later on.
paragraph_texts = sotu_sample["paragraph"].to_list()
embeddings = model.encode(paragraph_texts, normalize_embeddings=True)


In [7]:
# Embed the search query with the same model and normalization as the paragraphs.
# The query is wrapped in a list, so the result is a 2-D array with a single row.
query = "America first"
query_embedding = model.encode(
    [query],
    normalize_embeddings=True,
)


In [8]:
# Rank the sampled paragraphs by similarity to the query.
# Both the paragraphs and the query were encoded with normalize_embeddings=True,
# so the matrix product below is the cosine similarity of each paragraph
# with the query (one score per paragraph row).
scores = embeddings @ query_embedding.T
sotu_sample["cosine"] = scores

# Keep only the columns worth displaying, best matches first.
result_columns = ["date", "president", "paragraph", "cosine"]
sotu_best = sotu_sample[result_columns].sort_values(by="cosine", ascending=False)
sotu_best.head(10)


Unnamed: 0,date,president,paragraph,cosine
35,2018-01-30,Donald J. Trump,"Together, we are rediscovering the American way.",0.725819
7326,1972-01-20,Richard Nixon,DEVELOPING RURAL AMERICA,0.708726
230,2017-02-28,Donald J. Trump,"When we have all of this, we will have made Am...",0.706038
1747,1999-01-19,William J. Clinton,"My fellow Americans, this is our moment. Let u...",0.700362
2408,1991-01-29,George Bush,"Yes, the United States bears a major share of ...",0.700012
2476,1990-01-31,George Bush,"God bless all of you, and may God bless this g...",0.694463
1921,1997-02-04,William J. Clinton,America is far more than a place. It is an ide...,0.69055
8188,1966-01-12,Lyndon B. Johnson,In public statements and in private communicat...,0.690104
9853,1953-02-02,Dwight D. Eisenhower,Application of America's influence in world af...,0.689911
2767,1984-01-25,Ronald Reagan,Our progress in space—taking giant steps for a...,0.689634
