In [1]:
import polars as pl

## Speeches

In [2]:
# Read just the name and party columns of the presidents table, then shorten
# "President Name" to "President" so it lines up with the speeches frame's key.
presidents = (
    pl
    .read_csv(
        "datasets/presidents.csv",
        columns=["President Name", "Party"],
    )
    .rename({"President Name": "President"})
)

In [3]:
# Speech transcripts: rename the transcript column to "Text", parse the string
# dates into real dates, and derive a "Year" column from them.
speeches = (
    pl.read_parquet("datasets/speeches_sotu.parquet")
    .rename({"Transcript": "Text"})
    # Parse first: the .dt namespace below needs a date column, not a string.
    .with_columns(pl.col("Date").str.to_date())
    .with_columns(Year=pl.col("Date").dt.year())
)

In [11]:
# Attach each speech to its president's party and count the joined rows per party.
(
    presidents
    .join(speeches, on="President")
    .group_by("Party")
    .count()
)

Party,count
str,u32
"""Republican""",13
"""Democratic""",14


In [9]:
# Show the prepared speeches frame (Date, President, Text, Year) as the cell output.
# NOTE(review): consider `speeches.head()` if the full repr gets too long.
speeches

Date,President,Text,Year
date,str,str,i32
1988-01-25,"""Ronald Reagan""","""Mr. Speaker, M…",1988
1990-01-31,"""George H. W. B…","""Mr. President,…",1990
1991-01-29,"""George H. W. B…","""Mr. President …",1991
1992-01-28,"""George H. W. B…","""Mr. Speaker an…",1992
1994-01-25,"""Bill Clinton""","""Thank you very…",1994
1995-01-24,"""Bill Clinton""","""Mr. President,…",1995
1996-01-23,"""Bill Clinton""","""Mr. Speaker, M…",1996
1997-02-04,"""Bill Clinton""","""Mr. Speaker, M…",1997
1998-01-27,"""Bill Clinton""","""Mr. Speaker, M…",1998
1999-01-19,"""Bill Clinton""","""Mr. Speaker, M…",1999


## Embeddings

In [1]:
import embeddings_loader

In [2]:
# Build the loader through the EmbeddingsLoader.voyage_prompts_loader() factory.
# NOTE(review): what this factory configures lives in embeddings_loader, not in this notebook.
loader = embeddings_loader.EmbeddingsLoader.voyage_prompts_loader()

In [3]:
# Show the file name the loader produces when asked for the "parquet" extension.
loader.filename(extension="parquet")

'embeddings-years-literals-voyage-lite.parquet'

In [4]:
# Load the embeddings through the loader.
# NOTE(review): a trailing `.reset_index(names="Year")` was left commented out on this
# line, so whatever index load_embeddings() returns is kept as the index — presumably
# the year labels; confirm this is intended.
embeddings = loader.load_embeddings()

In [5]:
# Show the loaded embeddings frame as the cell output.
embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
1989,-0.009492,0.004519,0.028364,-0.042345,0.045038,0.000359,0.01688,0.028328,0.025432,-0.010775,...,-0.050587,-0.019475,0.071158,-0.041924,-0.018593,0.016568,0.001355,-0.05049,0.018346,-0.047144
1990,-0.000748,0.021579,0.033496,-0.055933,0.036463,0.000836,0.01699,0.017238,0.033606,-0.018769,...,-0.037796,-0.025032,0.081453,-0.034967,-0.044615,0.020624,-0.018477,-0.062114,0.020896,-0.060677
1991,-0.015098,0.011177,0.040743,-0.066637,0.032793,0.000632,0.00808,0.013265,0.006706,-0.013123,...,-0.033628,-0.0078,0.076751,-0.03081,-0.039061,0.021892,-0.009802,-0.040754,0.016588,-0.060861
1992,-0.019593,-0.011171,0.027419,-0.068543,0.044011,0.00933,-0.000855,0.009343,0.01244,-0.017023,...,-0.037337,-0.008758,0.060082,-0.051211,-0.020645,0.00356,0.000137,-0.049002,0.017913,-0.053519
1993,-0.001401,-0.000202,0.024326,-0.064624,0.051343,0.021028,-0.00019,0.025222,0.019891,-0.005967,...,-0.056764,-0.015343,0.075034,-0.047856,-0.0284,0.011146,0.005409,-0.044861,0.017671,-0.064174
1994,-0.001724,0.000654,0.029167,-0.073056,0.054183,0.027543,0.005945,0.012701,0.01097,-0.024226,...,-0.03772,-0.025983,0.07151,-0.045829,-0.041014,0.019445,-0.009533,-0.031333,0.01115,-0.060245
1995,-0.000413,-0.010333,0.041797,-0.057455,0.054694,0.021206,0.003181,0.025571,0.00106,-0.009709,...,-0.027077,-0.031363,0.075858,-0.044108,-0.020799,0.032363,-0.000529,-0.037806,0.0341,-0.054072
1996,-0.001297,-0.001772,0.041424,-0.04592,0.041411,-0.004255,0.01974,0.017424,0.00489,-0.015895,...,-0.028413,-0.023681,0.077297,-0.053484,-0.040801,0.020224,-0.002627,-0.041497,0.0282,-0.039052
1997,-0.007869,0.003674,0.026867,-0.048211,0.027559,-0.00622,0.00012,0.011272,0.01381,-0.008648,...,-0.056684,-0.029629,0.075486,-0.048085,-0.031204,0.022977,0.006504,-0.041596,0.014651,-0.030144
1998,-0.002681,0.004841,0.02017,-0.03537,0.04638,0.0024,0.01145,0.019704,0.015691,-0.013881,...,-0.055279,-0.02097,0.059458,-0.046934,-0.024485,0.013105,0.003757,-0.055356,0.013424,-0.042955


In [None]:
# Write the embeddings to embeddings/<loader file name> in parquet format.
# NOTE(review): index=False leaves the frame's index out of the file. The displayed
# frame appears to carry the years as row labels, so they would not be saved —
# confirm downstream readers do not need them (or reset the index into a column first).
embeddings.to_parquet(f"embeddings/{loader.filename(extension='parquet')}", index=False)

## Prompts