In [14]:
import polars as pl

In [15]:
# Two-letter country codes of the corpora to load; one NDJSON file is read per code.
COUNTRY_CODES = (
    "AT BA BE BG CZ DK EE ES FI FR GB GR HR HU IS IT LV NL NO PL PT RS SE SI TR UA"
).split()

# Directory holding the per-country `ParlaMint-<code>.meta.jsonl` files.
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
DATA_DIR = "/cache/nikolal/parlasent"


def scan_country(code):
    """Return a lazy query over one country's sentence-level file.

    The query tags every row with the country code, stores the character
    length of ``text`` as ``len``, flattens the ``metadata`` struct into
    top-level columns and keeps only the columns used later in the notebook.
    """
    return (
        pl.scan_ndjson(f"{DATA_DIR}/ParlaMint-{code}.meta.jsonl")
        .with_columns(country=pl.lit(code), len=pl.col("text").str.len_chars())
        # Unnest by column name, the documented argument form for `unnest`.
        .unnest("metadata")
        .select(
            "len",
            "logit",
            "Speaker_MP",
            "Speaker_role",
            "Speaker_ID",
            "Party_status",
            "country",
        )
    )


queries = [scan_country(code) for code in COUNTRY_CODES]

# Evaluate all per-country queries together, then stack the results into one frame.
df = pl.concat(pl.collect_all(queries))

df.limit(5)

len,logit,Speaker_MP,Speaker_role,Speaker_ID,Party_status,country
u32,f64,str,str,str,str,str
47,3.653868,"""MP""","""Chairperson""","""PAD_88386""","""Coalition""","""AT"""
109,2.15837,"""MP""","""Chairperson""","""PAD_88386""","""Coalition""","""AT"""
4,3.300054,"""MP""","""Chairperson""","""PAD_88386""","""Coalition""","""AT"""
40,3.191045,"""MP""","""Chairperson""","""PAD_88386""","""Coalition""","""AT"""
38,3.371469,"""MP""","""Chairperson""","""PAD_88386""","""Coalition""","""AT"""


In [16]:
# Sanity check: which country codes actually made it into the combined frame.
df.select("country").unique().sort("country")


country
str
"""AT"""
"""BA"""
"""BE"""
"""BG"""
"""CZ"""
…
"""RS"""
"""SE"""
"""SI"""
"""TR"""


In [17]:
# Speaker IDs with more than 100 rows carrying a non-null `logit`.
# The count is taken over all rows of `df`, i.e. before the Chairperson / MP
# filters below are applied.
speakers_with_more_than_100_sentences = (
    df.group_by("Speaker_ID")
    .agg(pl.col("logit").count().alias("logitcount"))
    .filter(pl.col("logitcount") > 100)
    .select(pl.col("Speaker_ID"))
)

# Keep only rows from those speakers that are spoken by an MP who is not
# acting as chairperson. Multiple predicates passed to `filter` are AND-ed.
# NOTE(review): this rebinds `df`, so re-running the cell recomputes the
# 100-row threshold on the already-filtered frame.
df = df.filter(
    # Pass the ID column explicitly rather than the whole DataFrame, so
    # `is_in` receives a Series.
    pl.col("Speaker_ID").is_in(
        speakers_with_more_than_100_sentences.get_column("Speaker_ID")
    ),
    pl.col("Speaker_role") != "Chairperson",
    pl.col("Speaker_MP") == "MP",
)
df.shape

(47396890, 7)

In [55]:
# Length-weighted mean logit per (speaker, party status): sum(logit * len) / sum(len).
# The pivot then spreads the Party_status values into one column each,
# leaving one row per speaker.
gb1 = (
    df.group_by("Speaker_ID", "Party_status")
    .agg(
        pl.col("logit").dot("len") / pl.sum("len"),
    )
    .pivot(on="Party_status", index="Speaker_ID", values="logit")
)


# Per-speaker totals regardless of party status: the same length-weighted mean
# ("General"), the number of non-null logits, and the speaker's country.
gb2 = df.group_by("Speaker_ID").agg(
    (pl.col("logit").dot("len") / pl.sum("len")).alias("General"),
    pl.col("logit").count().alias("num_sentences"),
    # `mode()` may return several values when counts tie; sorting before
    # taking the first makes the chosen country deterministic.
    pl.col("country").mode().sort().first(),
)


per_speaker = (
    gb1.join(
        gb2,
        on="Speaker_ID",
        how="left",
    )
    # Select the reported columns first so that `drop_nulls` only looks at
    # them: a speaker is kept when both a Coalition and an Opposition score
    # exist, whatever other Party_status columns (e.g. "-") the pivot produced.
    .select(
        "country",
        "Speaker_ID",
        "General",
        "Coalition",
        "Opposition",
        "num_sentences",
    )
    .drop_nulls()
)

per_speaker


country,Speaker_ID,General,Coalition,Opposition,num_sentences
str,str,f64,f64,f64,u32
"""PT""","""IsabelMariaMousinhodeAlmeidaGa…",1.963599,3.106387,1.893432,1812
"""SI""","""MogeRudolf""",1.895731,2.224799,1.72167,9934
"""PT""","""NilzaMaríliaMouzinhodeSena""",1.705012,1.625739,1.71578,520
"""HR""","""HabekMario""",1.988347,2.520753,1.649344,2247
"""ES""","""CarmenCalvoPoyato""",2.020232,1.966734,2.112815,1620
…,…,…,…,…,…
"""SI""","""RugeljBojan""",2.381377,2.381163,2.415186,1642
"""IT""","""BerardiRoberto""",2.012537,2.745127,1.904571,297
"""SI""","""PučnikMateja""",2.109586,2.680431,1.97649,1003
"""FI""","""TeuvoHakkarainen""",1.571673,1.597999,1.480148,3918


In [56]:
# Spot-check the per-speaker table for a single country.
per_speaker.filter(pl.col("country").eq("AT"))

country,Speaker_ID,General,Coalition,Opposition,num_sentences
str,str,f64,f64,f64,u32
"""AT""","""PAD_01079""",2.305509,2.563825,1.979034,4044
"""AT""","""PAD_01153""",2.235469,2.626971,1.840278,2462
"""AT""","""PAD_06997""",1.616327,2.456752,1.474798,3287
"""AT""","""PAD_05098""",1.777877,2.063451,1.765631,4846
"""AT""","""PAD_83101""",2.282554,2.681769,1.722246,5783
…,…,…,…,…,…
"""AT""","""PAD_00601""",2.614693,2.910295,2.043898,3583
"""AT""","""PAD_78586""",1.372861,2.241495,1.244675,6689
"""AT""","""PAD_22694""",1.436553,0.975549,1.439791,6305
"""AT""","""PAD_02009""",2.226536,2.617892,1.791423,4816


In [65]:
# Per-speaker gap between the Coalition and Opposition scores.
speaker_gap = per_speaker.with_columns(
    (pl.col("Coalition") - pl.col("Opposition")).alias("Difference")
)

# Average that gap per country (unweighted across speakers) and keep only
# countries represented by at least 10 speakers.
per_country = (
    speaker_gap.group_by("country")
    .agg(
        pl.col("Speaker_ID").count().alias("count"),
        pl.col("Difference").mean(),
    )
    .filter(pl.col("count").ge(10))
    .sort("country")
)

per_country

country,count,Difference
str,u32,f64
"""AT""",73,0.737479
"""BA""",16,-0.041224
"""BE""",88,0.264336
"""BG""",53,0.574203
"""CZ""",91,0.619173
…,…,…
"""NL""",14,0.262446
"""PT""",137,0.79255
"""RS""",112,0.129075
"""SI""",207,0.351458


In [77]:
from iso3166 import countries

# alpha-2 -> alpha-3 lookup built from every entry of `iso3166.countries`.
iso2_to_iso3_mapper = dict((entry.alpha2, entry.alpha3) for entry in countries)

# Attach the three-letter code next to each two-letter country code.
per_country = per_country.with_columns(
    pl.col("country").replace_strict(iso2_to_iso3_mapper).alias("iso3")
)
per_country

country,count,Difference,iso3
str,u32,f64,str
"""AT""",73,0.737479,"""AUT"""
"""BA""",16,-0.041224,"""BIH"""
"""BE""",88,0.264336,"""BEL"""
"""BG""",53,0.574203,"""BGR"""
"""CZ""",91,0.619173,"""CZE"""
…,…,…,…
"""NL""",14,0.262446,"""NLD"""
"""PT""",137,0.79255,"""PRT"""
"""RS""",112,0.129075,"""SRB"""
"""SI""",207,0.351458,"""SVN"""


In [74]:
# Show the installed polars version for reproducibility.
pl.__version__

'1.7.1'