# Champions of sentiment discourse

AUTHOR: Michal Mochtak (michal.mochtak@ru.nl), Peter Rupnik (peter.rupnik@ijs.si), Nikola Ljubešić

DATE: 2024-06-24

---

In this notebook we look at specific countries and their sentiment scores at the speaker and party level.

On the first run, the data will be downloaded from the internet by the first code cell. The cell after that defines a function that filters the dataset by specific conditions (e.g. keeping only the MPs that have a minimum number of speeches on the record). In the following cells we inspect two countries over a comparable time frame, Croatia and France, and then the entire corpus across its full time span.

In [1]:
import pandas as pd
pd.set_option('display.max_rows', None)
from pathlib import Path
import seaborn as sns
from IPython.display import display
# Fetch the dataset once; later runs reuse the local archive.
if not Path("speeches.csv.zip").exists():
    import shutil
    from urllib.request import urlopen

    # Download with the standard library instead of shelling out to `wget`:
    # this works on systems without wget and raises on HTTP/network errors
    # instead of silently continuing.  The data is written to a temporary name
    # and renamed only after the transfer finished, so an interrupted download
    # never leaves a truncated "speeches.csv.zip" that would pass the
    # exists() check on the next run.
    with urlopen(
        "https://huggingface.co/datasets/5roop/parlasent_data/resolve/main/speeches.csv.zip"
    ) as response, open("speeches.csv.zip.part", "wb") as partial_file:
        shutil.copyfileobj(response, partial_file)
    Path("speeches.csv.zip.part").replace("speeches.csv.zip")
df = pd.read_csv("speeches.csv.zip")

  df = pd.read_csv("speeches.csv.zip")


In [2]:
def calculate_sentiment(target="Speaker_name", *, country=None, term=None, filter=None, data=None):
    """Rank speakers or parties by their share of negative speeches.

    A speech counts as negative when its ``logits_pondered`` value is below 2.0.
    Only speeches by MPs (``Speaker_MP == "MP"``) with ``char_length >= 100``
    are used, and only speakers with at least 10 such speeches are kept.

    Parameters
    ----------
    target : str
        Column to group by, e.g. ``"Speaker_name"`` or ``"Speaker_party"``.
    country : str or None
        Value of the ``country`` column to select. ``""`` or ``"all"`` selects
        every country. ``None`` prompts interactively.
    term : str or None
        Value of the ``Term`` column to select. ``""`` selects every term.
        ``None`` displays the available terms and prompts interactively.
    filter : {"1sd", "2sd", "interquartile"} or None
        Optional restriction on the per-group speech count. ``"1sd"``/``"2sd"``
        keep groups whose count lies within one/two standard deviations of the
        *median* count; ``"interquartile"`` keeps counts between the first and
        third quartile (inclusive). Falsy values disable the restriction.
    data : pandas.DataFrame or None
        Speeches to analyse. Defaults to the notebook-level ``df``.

    Returns
    -------
    pandas.DataFrame
        Columns ``[target, "percentage_of_negative", "count"]``, sorted by
        ``percentage_of_negative`` in descending order, with a fresh index.

    Raises
    ------
    NotImplementedError
        If ``filter`` is a truthy value other than the supported ones.
    """
    # Reject an unsupported filter up front, before any prompt or computation.
    if filter and filter not in ("1sd", "2sd", "interquartile"):
        raise NotImplementedError(
            "Only accepts '1sd', '2sd' or 'interquartile' for now"
        )

    # Fall back to the notebook-level frame only when no data was passed in.
    speeches = df if data is None else data

    if country is None:
        all_countries = speeches.country.unique().tolist()
        country = input(f"Choose country from {all_countries} \n(empty for all): ")

    # Select speeches from a specific country (or all of them). The all-True
    # mask reuses the frame's own index so it aligns with the other masks.
    if country in ("all", ""):
        in_country = pd.Series(True, index=speeches.index)
    else:
        in_country = speeches.country == country
    # Keep only MPs.
    is_mp = speeches.Speaker_MP == "MP"
    # Limit to speeches of at least 100 characters.
    long_enough = speeches.char_length >= 100
    eligible = in_country & is_mp & long_enough

    # Include only speakers with at least 10 eligible speeches.
    speech_counts = speeches[eligible].groupby("Speaker_name").logits_pondered.count()
    speakers_to_keep = speech_counts.index[speech_counts >= 10]
    selected = speeches[eligible & speeches.Speaker_name.isin(speakers_to_keep)]

    if term is None:
        print("Available terms:")
        # String aggregation names avoid the pandas deprecation warning raised
        # when the builtins `min`/`max` are passed to `agg`.
        display(
            selected.groupby("Term")
            .agg({"Date": ["min", "max", "count"]})
            .sort_values(("Date", "min")),
            clear=True,
        )
        term = input(f"Choose term from {selected.Term.unique().tolist()} (empty for all): ")
    if term:
        selected = selected[selected.Term == term].reset_index(drop=True)

    def percentage_of_negative(logits: pd.Series) -> float:
        """Return the fraction of values in ``logits`` that are below 2.0."""
        return (logits < 2.0).sum() / logits.shape[0]

    ranking = selected.groupby([target]).agg({
        "logits_pondered": [percentage_of_negative, "count"],
    }).reset_index()
    # Flatten the two-level column index; an explicit list also works when
    # `target` contains whitespace.
    ranking.columns = [target, "percentage_of_negative", "count"]
    ranking = ranking.sort_values(by="percentage_of_negative", ascending=False)

    if filter:
        counts = ranking["count"]
        if filter == "interquartile":
            lower, upper = counts.quantile([0.25, 0.75]).tolist()
        else:
            # NOTE: the band is centred on the median of the counts, not the
            # mean, while its width comes from the standard deviation.
            centre = counts.median()
            half_width = counts.std() if filter == "1sd" else 2 * counts.std()
            lower, upper = centre - half_width, centre + half_width
        ranking = ranking[(counts >= lower) & (counts <= upper)]
    return ranking.reset_index(drop=True)

Let's inspect the terms we have available, so that an approximately equal timeframe can be set:

In [3]:
# For Croatia (HR) and France (FR): earliest and latest `Date` plus the number
# of rows per country, term and MP status, ordered by each group's earliest date.
# The aggregations are given as strings because passing the builtins `min`/`max`
# to `agg` triggers a pandas FutureWarning.
df[df.country.isin(["HR", "FR"])].groupby(["country", "Term", "Speaker_MP"]).agg({
    "Date": ["min", "max", "count"]
}).sort_values(by=("Date", "min"))

  df[df.country.isin(["HR", "FR"])].groupby("country Term Speaker_MP".split()).agg({
  df[df.country.isin(["HR", "FR"])].groupby("country Term Speaker_MP".split()).agg({


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Date,Date,Date
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,min,max,count
country,Term,Speaker_MP,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
HR,5. mandat,MP,2003-12-22,2007-10-12,74856
HR,5. mandat,notMP,2003-12-22,2007-10-12,4404
HR,5. mandat,-,2004-04-01,2007-10-11,2632
HR,6. mandat,notMP,2008-01-11,2011-10-28,4200
HR,6. mandat,MP,2008-01-11,2011-10-28,68561
HR,6. mandat,-,2008-02-21,2011-10-27,455
HR,7. mandat,notMP,2011-12-22,2015-09-24,3824
HR,7. mandat,MP,2011-12-22,2015-09-25,96544
HR,7. mandat,-,2012-01-27,2015-07-03,1650
HR,8. mandat,MP,2015-12-03,2016-06-20,10559


In [4]:
# Speaker-level ranking for country "HR", term "9. mandat", sorted by the share
# of negative speeches (highest first).
calculate_sentiment("Speaker_name", country="HR", term="9. mandat")

Unnamed: 0,Speaker_name,percentage_of_negative,count
0,"Esih, Bruna",1.0,13
1,"Glasnović, Željko",0.973545,567
2,"Klarin, Ivan",0.942857,35
3,"Pernar, Ivan",0.939297,1565
4,"Bunjac, Branimir",0.875089,1401
5,"Stazić, Nenad",0.867617,491
6,"Lenart, Željko",0.867168,399
7,"Culej, Stevo",0.866667,375
8,"Hasanbegović, Zlatko",0.860759,79
9,"Bulj, Miro",0.845632,3537


In [5]:
# Speaker-level ranking for country "FR", term "15e législature", sorted by the
# share of negative speeches (highest first).
calculate_sentiment("Speaker_name", country="FR", term="15e législature")

Unnamed: 0,Speaker_name,percentage_of_negative,count
0,"Parigi, Jean-François",0.928571,14
1,"Dupont-Aignan, Nicolas",0.926335,543
2,"Houplain, Myriane",0.923077,13
3,"Cornut-Gentille, François",0.857143,84
4,"Evrard, José",0.846154,26
5,"Corbière, Alexis",0.832402,1611
6,"Aubert, Julien",0.828783,1758
7,"Bilde, Bruno",0.826347,167
8,"Mélenchon, Jean-Luc",0.81795,1961
9,"Collard, Gilbert",0.814815,108


In [6]:
# Party-level ranking for country "HR", term "9. mandat", sorted by the share of
# negative speeches (highest first).
calculate_sentiment("Speaker_party", country="HR", term="9. mandat")


Unnamed: 0,Speaker_party,percentage_of_negative,count
0,Živi zid,0.894138,3429
1,HRAST,0.791436,724
2,PH,0.767142,1473
3,HSS,0.677419,1674
4,MOST,0.656041,10446
5,SDP,0.62389,14634
6,HSLS,0.615385,13
7,SNAGA,0.601911,628
8,HSU,0.5888,625
9,IDS,0.477612,469


In [7]:
# Party-level ranking for country "FR", term "15e législature", sorted by the
# share of negative speeches (highest first).
calculate_sentiment("Speaker_party", country="FR", term="15e législature")

Unnamed: 0,Speaker_party,percentage_of_negative,count
0,FI,0.781236,12982
1,GDR,0.694278,6519
2,UDI_I,0.659779,2710
3,-,0.64128,6250
4,NG,0.630577,1838
5,LT,0.602067,4355
6,LR,0.521784,27360
7,LC,0.517361,576
8,EDS,0.513178,645
9,UDI-A-I,0.492188,896


# Overall most negative and most positive parties



In [8]:
# Speaker-level ranking over all countries and all terms (empty strings select
# everything), keeping only speakers whose speech count passes the "1sd" filter.
calculate_sentiment("Speaker_name", country="", term="",filter="1sd")

Unnamed: 0,Speaker_name,percentage_of_negative,count
0,"Milovankić, Radovan",1.0,11
1,"Jiménez-Becerril Barrios, María Teresa",1.0,11
2,"Karlsson, Mattias",1.0,14
3,"Stupar, Dušan",1.0,16
4,"Stošić, Predrag",1.0,19
5,"Stefanović, Živojin",1.0,18
6,"Moskal', Hennadij Hennadijovyč",1.0,18
7,"Ruiz Navarro, Eduardo Luis",1.0,16
8,"Utrilla Cano, Julio",1.0,10
9,"Alcaraz Martos, Francisco José",1.0,11
