In [100]:
import polars as pl

In [None]:
def _scan_country(code):
    """Lazily scan one country's ParlaMint metadata file.

    Tags every row with the country code, records the character length of
    ``text`` as ``len``, flattens the ``metadata`` struct into top-level
    columns and keeps only the columns used by the analysis below.
    """
    return (
        pl.scan_ndjson(f"/cache/nikolal/parlasent/ParlaMint-{code}.meta.jsonl")
        .with_columns(
            country=pl.lit(code),
            len=pl.col("text").str.len_chars(),
        )
        # `metadata` is unnested *after* `country`/`len` are added, as in the
        # original ordering of the pipeline.
        .unnest("metadata")
        .select(
            "len",
            "logit",
            "Speaker_MP",
            "Speaker_role",
            "Speaker_ID",
            "Party_status",
            "country",
        )
    )


# One lazy query per country code; nothing is read from disk until
# `collect_all` runs them together.
queries = [
    _scan_country(code)
    for code in "AT BA BE BG CZ DK EE ES FI FR GB GR HR HU IS IT LV NL NO PL PT RS SE SI TR UA".split()
]

# Materialise every query and stack the per-country frames vertically.
df = pl.concat(pl.collect_all(queries))

df.head(5)

In [None]:
# Distinct country codes present in the loaded data, in alphabetical order.
df.select(pl.col("country").unique().sort())


In [None]:
# A speaker is kept only if they have strictly more than this many rows
# with a non-null `logit`.
MIN_SENTENCES_PER_SPEAKER = 100

# Single-column frame of Speaker_IDs that clear the threshold. The count is
# taken over ALL rows of `df` for that speaker, i.e. before the role / MP
# filters below are applied.
speakers_with_more_than_100_sentences = (
    df.group_by("Speaker_ID")
    .agg(pl.col("logit").count().alias("logitcount"))
    .filter(pl.col("logitcount") > MIN_SENTENCES_PER_SPEAKER)
    .select(pl.col("Speaker_ID"))
)

# NOTE: this cell rebinds `df`, so it is not idempotent. Re-running it without
# first re-running the loading cell applies the threshold to the already
# filtered rows and may drop additional speakers.
df = df.filter(
    # `is_in` is documented for Expr / Collection / Series inputs, so pass the
    # Speaker_ID column as a Series instead of relying on DataFrame coercion.
    pl.col("Speaker_ID").is_in(
        speakers_with_more_than_100_sentences.get_column("Speaker_ID")
    ),
    pl.col("Speaker_role") != "Chairperson",
    pl.col("Speaker_MP") == "MP",
)
df.shape

In [107]:
# Length-weighted mean `logit` per (speaker, Party_status):
# sum(logit * len) / sum(len). The pivot then turns each distinct
# Party_status value into its own column, giving one row per speaker.
gb1 = (
    df.group_by("Speaker_ID", "Party_status")
    .agg(
        pl.col("logit").dot("len") / pl.sum("len"),
    )
    .pivot(on="Party_status", index="Speaker_ID", values="logit")
)


# The same length-weighted mean over all of a speaker's rows regardless of
# Party_status ("General"), plus the number of non-null `logit` rows.
gb2 = df.group_by(
    "Speaker_ID",
).agg(
    (pl.col("logit").dot("len") / pl.sum("len")).alias("General"),
    pl.col("logit").count().alias("num_sentences"),
)

# Speaker -> country lookup. `df` has many rows per Speaker_ID (it is
# aggregated per speaker above), so the lookup must be de-duplicated;
# joining against the raw selection would repeat every per-speaker row of
# `gb` once for each of that speaker's rows in `df`.
speaker_country = df.select(
    pl.col("country"),
    pl.col("Speaker_ID"),
).unique()

gb = (
    gb1.join(gb2, on="Speaker_ID")
    # Drop the pivoted column for the "-" Party_status value, if present
    # (presumably a placeholder for an unknown status; verify against the data).
    .select(pl.exclude("-"))
    # Keep only speakers that have a value in every remaining column.
    .drop_nulls()
    .join(speaker_country, on="Speaker_ID")
)


In [None]:
# Show the `gb` frame as this cell's output.
gb