# Parliament Speaker Recognition Model
This notebook aims to predict which speaker said a given passage, based on speeches held in the Austrian parliament.

This small project is based on the dataset assembled by Mario Zechner (https://marioslab.io/).

In [1]:
# Speaker metadata: one row per member of parliament.
import pandas as pd

# The file is read from the working directory. To fetch it directly instead, point
# `persons_source` at https://woswormeileistung.marioslab.io/data/persons.json
persons_source = "persons.json"
speaker = pd.read_json(persons_source)

# Show (rows, columns) first, then preview the first records.
print(speaker.shape)
speaker.head()

(681, 6)


Unnamed: 0,id,name,parties,periods,url,imageUrl
0,3612,Sonja Ablinger,[SPÖ],"[XX, XXIII, XXIV]",https://parlament.gv.at/person/3612,https://parlament.gv.at/dokument/bild/34898/34...
1,14854,Dipl.-Ing. Elke Achleitner,"[BZÖ, FPÖ]",[XXII],https://parlament.gv.at/person/14854,https://parlament.gv.at/dokument/bild/21013/21...
2,83119,Mag. Nikolaus Alm,"[LIF, NEOS]",[XXV],https://parlament.gv.at/person/83119,https://parlament.gv.at/dokument/bild/43808/43...
3,51879,"Mag. Hannes Amesbauer, BA",[FPÖ],"[XXVI, XXVII]",https://parlament.gv.at/person/51879,https://parlament.gv.at/dokument/bild/200696/2...
4,2819,"Werner Amon, MBA",[ÖVP],"[XIX, XX, XXI, XXII, XXIII, XXIV, XXV, XXVI]",https://parlament.gv.at/person/2819,https://parlament.gv.at/dokument/bild/90356/90...


As one can see, the dataset contains 681 speakers from the Austrian parliament, starting from 2002.

In [2]:
# Session transcripts.
# Be aware: the sessions file is about 440 MB, so it is only loaded from a local copy.
# Download it from https://woswormeileistung.marioslab.io/data/sessions.json
sessions_source = "sessions.json"
sessions = pd.read_json(sessions_source)

# Show (rows, columns) first, then preview the first records.
print(sessions.shape)
sessions.head()

(1001, 7)


Unnamed: 0,url,period,sessionNumber,sessionLabel,date,protocolUrls,sections
0,https://parlament.gv.at/gegenstand/XXVII/NRSIT...,XXVII,251,251. Sitzung (251/NRSITZ),2024-01-31,[],[]
1,https://parlament.gv.at/gegenstand/XXVII/NRSIT...,XXVII,250,250. Sitzung (250/NRSITZ),2024-01-31,[],[]
2,https://parlament.gv.at/gegenstand/XXVII/NRSIT...,XXVII,249,249. Sitzung (249/NRSITZ),2024-01-31,[],[]
3,https://parlament.gv.at/gegenstand/XXVII/NRSIT...,XXVII,248,248. Sitzung (248/NRSITZ),2023-12-15,[],[]
4,https://parlament.gv.at/gegenstand/XXVII/NRSIT...,XXVII,247,247. Sitzung (247/NRSITZ),2023-12-15,[],[]


In [3]:
# Peek at the first few entries of the "sections" column.
sessions["sections"].head()

0    []
1    []
2    []
3    []
4    []
Name: sections, dtype: object

In [4]:
# Keep only the first occurrence of each distinct "sections" value.
sessions["sections"].drop_duplicates()

0                                                      []
35      [{'speaker': '88386', 'text': 'Ich darf die 21...
36      [{'speaker': '88386', 'text': 'Meine sehr geeh...
39      [{'speaker': '88386', 'text': 'Ich eröffne die...
40      [{'speaker': '88386', 'text': 'Meine sehr geeh...
                              ...                        
996     [{'speaker': '799', 'text': 'Ich eröffne die 5...
997     [{'speaker': '799', 'text': 'Die Sitzung ist e...
998     [{'speaker': '799', 'text': 'Meine Damen und H...
999     [{'speaker': '799', 'text': 'Die Sitzung ist e...
1000    [{'speaker': '334', 'text': 'Meine Damen und H...
Name: sections, Length: 955, dtype: object

In [5]:
# Summary statistics (count / unique / most frequent value) of the "sections" column.
sessions["sections"].describe()

count     1001
unique     955
top         []
freq        47
Name: sections, dtype: object

In [6]:
# Flag every session whose "sections" entry is an empty list (1 = empty, 0 = has content).
session_na = [int(section == []) for section in sessions["sections"]]

# Report the number of empty sessions, both absolute and as a share of all sessions.
na_count = sum(session_na)
print("Absolute number of na: "+ str(na_count) + " || relative number of na: "+ str(na_count/len(sessions)))

Absolute number of na: 47 || relative number of na: 0.04695304695304695


In [7]:
# Each entry of sessions["sections"] is a list of dicts, so flatten all of those
# lists into one long list of dicts.
text_dict = [
    section
    for session_sections in sessions["sections"]
    for section in session_sections
]

# Build a DataFrame from the flattened dicts; the dict keys become the columns.
text = pd.DataFrame(text_dict)

# Combine speaker id and speech text into a single string column.
# NOTE(review): `input` embeds the speaker id. If `speaker` is the label the model
# should predict, this leaks the target into the features - confirm intent.
text["input"] = "SPEAKER: "+text["speaker"]+"; TEXT: "+text["text"]

print(text.shape)
text.head()

(173498, 5)


Unnamed: 0,speaker,text,callouts,links,input
0,88386,Ich darf die 216. Sitzung des Nationalrates um...,"[{'caller': '2343', 'text': 'Ich bin noch nich...",[],SPEAKER: 88386; TEXT: Ich darf die 216. Sitzun...
1,88386,Hinsichtlich der eingelangten Verhandlungsgege...,[],"[{'label': '14141/AB', 'url': 'https://parlame...",SPEAKER: 88386; TEXT: Hinsichtlich der eingela...
2,88386,"Außerdem weise ich\n\ndie Anträge 3425/A, 3426...",[],"[{'label': '3425/A', 'url': 'https://parlament...",SPEAKER: 88386; TEXT: Außerdem weise ich\n\ndi...
3,88386,"Weiters darf ich mitteilen, dass folgende Fris...",[],"[{'label': '1774 der Beilagen', 'url': 'https:...",SPEAKER: 88386; TEXT: Weiters darf ich mitteil...
4,88386,Wir kommen nunmehr zur Abstimmung über den Ant...,[],"[{'label': '1774 der Beilagen', 'url': 'https:...",SPEAKER: 88386; TEXT: Wir kommen nunmehr zur A...


In [8]:
# Quick check on the first ten rows: is the "callouts" entry an empty list?
has_no_callouts = [text["callouts"][row] == [] for row in range(10)]
tryout = pd.DataFrame(has_no_callouts)
tryout.describe()

Unnamed: 0,0
count,10
unique,2
top,True
freq,9


In [9]:
# Hugging Face `datasets` containers.
# NOTE(review): DatasetDict is imported but not used in the visible cells.
from datasets import Dataset,DatasetDict

# Convert the pandas DataFrame of sections into a `Dataset`.
ds = Dataset.from_pandas(text)
# Bare expression so the notebook displays the dataset summary.
ds

Dataset({
    features: ['speaker', 'text', 'callouts', 'links', 'input'],
    num_rows: 173498
})

In [11]:
from transformers import AutoModelForSequenceClassification,AutoTokenizer

# Checkpoint whose tokenizer is loaded below.
model_nm = 'microsoft/deberta-v3-small'
tokz = AutoTokenizer.from_pretrained(model_nm)

# Tokenize the first 400 inputs as a batch of individual strings.
# Wrapping the Series in str() would hand the tokenizer pandas' abbreviated printout
# of the Series (index labels, shortened cells, "..." and the dtype footer) as ONE
# string, so the actual speeches would never be tokenized.
tok_ds = tokz(text["input"][:400].tolist())