In [75]:
import pandas as pd


df = pd.read_json(
    "agg_data/ParlaMint-SI.speeches.jsonl",
    lines=True,
    # nrows=10000
)
df = pd.concat([df, df.metadata.apply(pd.Series)], axis=1)

# Filtering
# Keep only MPs
c1 = df.Speaker_MP == "MP"
# Keep only speeches where speaker is either Opposition or Coalition:
c2 = df.Party_status.isin(["Opposition", "Coalition"])
# Keep only people that were in opposition AND coalition party_roles AND have at least 10 speeches as both:
gb = df.groupby(["Speaker_name", "Party_status"]).logits_pondered.count().reset_index()
ngb = gb.Speaker_name.value_counts().reset_index()
speakers_to_keep = gb[(gb.logits_pondered > 10) & gb.Speaker_name.isin(
    ngb.Speaker_name[ngb["count"] >= 2]
    )].Speaker_name.unique()
c3 = df.Speaker_name.isin(speakers_to_keep)

df = df[c1&c2&c3].reset_index(drop=True)
df.head(3)

Unnamed: 0,newdoc id,logits_pondered,logits_averaged,char_length,metadata,Text_ID,ID,Title,Date,Body,...,Speaker_MP,Speaker_minister,Speaker_party,Speaker_party_name,Party_status,Party_orientation,Speaker_ID,Speaker_name,Speaker_gender,Speaker_birth
0,ParlaMint-SI_2022-04-06-SDZ8-Izredna-99.ana.u6,3.297177,3.303611,2782,{'Text_ID': 'ParlaMint-SI-en_2022-04-06-SDZ8-I...,ParlaMint-SI-en_2022-04-06-SDZ8-Izredna-99,ParlaMint-SI_2022-04-06-SDZ8-Izredna-99.ana.u6,Minutes of the National Assembly of the Republ...,2022-04-06,Lower house,...,MP,notMinister,SAB,Stranka Alenke Bratušek,Opposition,Centre-left,BandelliMarko,"Bandelli, Marko",M,1967
1,ParlaMint-SI_2022-04-06-SDZ8-Izredna-99.ana.u10,3.459094,3.665817,1908,{'Text_ID': 'ParlaMint-SI-en_2022-04-06-SDZ8-I...,ParlaMint-SI-en_2022-04-06-SDZ8-Izredna-99,ParlaMint-SI_2022-04-06-SDZ8-Izredna-99.ana.u10,Minutes of the National Assembly of the Republ...,2022-04-06,Lower house,...,MP,notMinister,SNS,Slovenska nacionalna stranka,Opposition,Right to far-right,ŠiškoDušan,"Šiško, Dušan",M,1969
2,ParlaMint-SI_2022-04-06-SDZ8-Izredna-99.ana.u12,3.868118,3.847332,2388,{'Text_ID': 'ParlaMint-SI-en_2022-04-06-SDZ8-I...,ParlaMint-SI-en_2022-04-06-SDZ8-Izredna-99,ParlaMint-SI_2022-04-06-SDZ8-Izredna-99.ana.u12,Minutes of the National Assembly of the Republ...,2022-04-06,Lower house,...,MP,notMinister,SDS,Slovenska demokratska stranka,Coalition,Right,ŠkrinjarMojca,"Škrinjar, Mojca",F,1955


In [76]:
pd.set_option('display.max_rows', 10)
gb1 = df.groupby(["Speaker_name", "Party_status"]).agg({
    "logits_pondered": "mean"
}).reset_index()
gb2 = df.groupby(["Speaker_name"]).agg({
    "logits_pondered": "mean"
}).reset_index().rename(columns={"logits_pondered": "Overall"})

gb2["Coalition"] = gb2.merge(gb1[gb1["Party_status"]=="Coalition"], on="Speaker_name").logits_pondered
gb2["Opposition"] = gb2.merge(gb1[gb1["Party_status"]=="Opposition"], on="Speaker_name").logits_pondered
gb2

Unnamed: 0,Speaker_name,Overall,Coalition,Opposition
0,"Ambrožič, Borut",2.348645,1.967031,2.348645
1,"Anderlič, Anton",1.881291,1.868605,1.605560
2,"Bah Žibert, Anja",1.668794,2.628853,1.602672
3,"Bajc, Josip",2.628646,4.033764,2.610820
4,"Bajuk, Andrej",1.988748,2.171821,1.986032
...,...,...,...,...
289,"Židan, Gregor",2.485488,,
290,"Žnidar, Ljubo",2.111894,,
291,"Žnidaršič, Franc",2.338910,,
292,"Župevc, Melita",1.774113,,


In [77]:
df.groupby(["Speaker_name", "Party_status"]).logits_pondered.count().reset_index().Speaker_name.value_counts()

Speaker_name
Žveglič, Roman              2
Moge, Rudolf                2
Lisec, Tomaž                2
Vlačič, Patrick             2
Anderlič, Anton             2
                           ..
Krajčič, Darij              1
Krajnc, Bojan               1
Kozlovič, Lilijana          1
Korenjak Kramar, Ksenija    1
Majhenič, Silven            1
Name: count, Length: 294, dtype: int64

In [78]:
gb.Speaker_name.value_counts().reset_index()

Unnamed: 0,Speaker_name,count
0,"Podobnik, Janez",3
1,"Stepišnik, Stanko",3
2,"Dimic, Iva",3
3,"Pogačnik, Marko",3
4,"Divjak Mirnik, Lidija",3
...,...,...
968,"Ferluga, Marko",1
969,"Mikuž, Lara",1
970,"Mladenovič, Zoran",1
971,"Mlinar, Angelika",1
