# 1. Imports

In [2]:
import numpy as np
import pandas as pd
import regex as re
import matplotlib.pyplot as plt
from transformers import BertTokenizer, BertModel, pipeline
import nltk
import warnings
from nltk.tokenize import sent_tokenize
import torch
import pickle
from IPython.display import FileLink



In [9]:
# Download the Punkt sentence-tokenizer resources used by nltk's `sent_tokenize`.
# Both 'punkt' and 'punkt_tab' are requested; presumably so that whichever one the
# installed NLTK release looks for is available — TODO confirm against the NLTK version in use.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /Users/toni/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/toni/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [10]:
# Suppress all Python warnings from here on.
# NOTE(review): this is a blanket filter, so deprecation warnings raised by the
# libraries imported above are hidden as well.
warnings.filterwarnings('ignore')

In [11]:
#from google.colab import drive
#drive.mount('/content/drive')

In [12]:
#connect to drive
#from google.colab import drive
#drive.mount('/content/drive')

In [13]:
# Load the speeches table (with precomputed features) from its pickle.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
content_df = pd.read_pickle('content_df_features.pkl')

# Keep electoral terms 17-20 only, and leave out rows whose speaker position is
# the Presidium of Parliament.
in_selected_terms = content_df['electoral_term'].isin([17, 18, 19, 20])
not_presidium = content_df['position_short'] != 'Presidium of Parliament'
subset_df = content_df.loc[in_selected_terms & not_presidium]
print(len(subset_df))
subset_df.head()



131058


Unnamed: 0,id,electoral_term,session,first_name,document_url,last_name,faction_id,position_short,position_long,politician_id,...,speech_content,date,faction,year,speech_length,age,gender,tenure,sentiment_score_1,sentiment_score_2
728977,729471,17,1,volker,https://dip21.bundestag.de/dip21/btp/17/17001.pdf,kauder,5,Member of Parliament,,11001074,...,"Herr Alterspräsident, ich schlage für die CDU/...",2009-10-27,CDU/CSU,2009,105,60.0,männlich,18.616438,0.0,0.0
728980,729474,17,1,norbert,https://dip21.bundestag.de/dip21/btp/17/17001.pdf,lammert,5,Member of Parliament,,11001274,...,"Herr Präsident, ich nehme die Wahl gerne an.",2009-10-27,CDU/CSU,2009,44,60.0,männlich,28.627397,0.0,0.2
728984,729478,17,1,gerda,https://dip21.bundestag.de/dip21/btp/17/17001.pdf,hasselfeldt,5,Member of Parliament,,11000825,...,"Herr Präsident, ich nehme die Wahl gerne an un...",2009-10-27,CDU/CSU,2009,88,59.0,weiblich,22.413699,0.0,0.375
728986,729480,17,1,wolfgang,https://dip21.bundestag.de/dip21/btp/17/17001.pdf,thierse,25,Member of Parliament,,11002318,...,"Ja, ich nehme die Wahl an.",2009-10-27,SPD,2009,26,66.0,männlich,19.076712,0.0,0.333333
728988,729482,17,1,hermann otto,https://dip21.bundestag.de/dip21/btp/17/17001.pdf,solms,15,Member of Parliament,,11002190,...,Ich bedanke mich. Ich nehme die Wahl gerne an.,2009-10-27,FDP,2009,46,68.0,männlich,27.441096,0.0,0.25


# 2. Tokenize whole dataset



## 2.1 Pre-processing

In [14]:
# German cased BERT tokenizer, used here only to measure how long each speech is.
tokenizer = BertTokenizer.from_pretrained('bert-base-german-cased')

# Work on a copy so that subset_df keeps its original columns.
subset_pre_tokenization = subset_df.copy()

# Per speech: the list of tokens, and the number of tokens in that list.
subset_pre_tokenization['tokenized_content'] = subset_pre_tokenization['speech_content'].map(tokenizer.tokenize)
subset_pre_tokenization['token_length'] = subset_pre_tokenization['tokenized_content'].map(len)



In [15]:
# Save the frame (including the token lists and token counts) as a pickle.
# NOTE(review): the path is relative ('content/drive/...'), so it resolves against the
# current working directory — confirm that this directory exists where the notebook runs.
subset_pre_tokenization.to_pickle('content/drive/MyDrive/subset_pre_tokenization.pkl')

## 2.2 Split Tokens

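**Strategy:** define the split based on token length.

- Short speeches (up to 128 tokens): no split needed; they can be fed to BERT directly.
- Moderate-length speeches (128–256 tokens): split into two parts at the sentence boundary nearest the middle.
- Long speeches (over 256 tokens): split into multiple chunks, each within the token limit, so that context is preserved without exceeding the maximum input length.

Note: `split_speech` below uses a single 128-token threshold and packs whole sentences greedily into chunks, so moderate-length and long speeches go through the same code path.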


In [3]:
# Reload the pre-tokenized subset from its pickle, so this section can start from the
# saved file instead of re-running the tokenization cell.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
subset_pre_tokenization = pd.read_pickle('content/drive/MyDrive/subset_pre_tokenization.pkl')

In [4]:
# German cased BERT tokenizer; passed to split_speech when the speeches are chunked.
tokenizer = BertTokenizer.from_pretrained("bert-base-german-cased")

def split_speech(speech, tokenizer, max_tokens=128, overlap=20):
    """Split a speech into groups of sentences that each fit a token budget.

    Sentences (from ``sent_tokenize``) are packed greedily: a new chunk is started
    whenever adding the next sentence would push the running token count above
    ``max_tokens``. Sentences are never cut, so a single sentence that is longer
    than ``max_tokens`` ends up alone in an oversized chunk.

    Args:
        speech: Raw speech text.
        tokenizer: Object with a ``tokenize(text)`` method returning a list of
            tokens (a ``BertTokenizer`` in this notebook).
        max_tokens: Maximum number of tokens per chunk.
        overlap: Not used by this implementation; kept so existing calls keep working.

    Returns:
        ``[speech]`` (a list holding the unchanged string) if the whole speech has
        at most ``max_tokens`` tokens; otherwise a list of chunks, each chunk being
        a list of sentence strings. Callers therefore see two element types.
    """
    if len(tokenizer.tokenize(speech)) <= max_tokens:
        return [speech]

    chunks = []
    current_chunk = []
    current_chunk_length = 0

    for sentence in sent_tokenize(speech):
        sentence_length = len(tokenizer.tokenize(sentence))

        # Flush only a non-empty chunk: if the very first sentence is already over
        # budget there is nothing to flush, and appending the empty list would
        # create an empty chunk (and an empty row after DataFrame.explode).
        if current_chunk and current_chunk_length + sentence_length > max_tokens:
            chunks.append(current_chunk)
            current_chunk = []
            current_chunk_length = 0

        current_chunk.append(sentence)  # keep the sentence text, not its tokens
        current_chunk_length += sentence_length

    if current_chunk:
        chunks.append(current_chunk)  # final, possibly partial, chunk

    return chunks


In [5]:
# Chunk the speeches one electoral term at a time and write each term to its own pickle.
for term in subset_pre_tokenization['electoral_term'].unique():
    is_current_term = subset_pre_tokenization['electoral_term'] == term
    term_df = subset_pre_tokenization.loc[is_current_term].copy()
    term_df['split_speeches'] = term_df['speech_content'].map(
        lambda speech: split_speech(speech, tokenizer)
    )

    # Persist this term's frame and report progress.
    # (Optional download link, disabled: FileLink(f'content/drive/MyDrive/term_{term}_tokenized.pkl'))
    term_df.to_pickle(f'content/drive/MyDrive/term_{term}_tokenized.pkl')
    print(f"Saved term_{term}_tokenized.pkl")

Saved term_17_tokenized.pkl
Saved term_18_tokenized.pkl
Saved term_19_tokenized.pkl
Saved term_20_tokenized.pkl


In [6]:
# Reload the four per-term pickles written by the loop above.
term_pickle_paths = [
    'content/drive/MyDrive/term_17_tokenized.pkl',
    'content/drive/MyDrive/term_18_tokenized.pkl',
    'content/drive/MyDrive/term_19_tokenized.pkl',
    'content/drive/MyDrive/term_20_tokenized.pkl',
]
term_17, term_18, term_19, term_20 = map(pd.read_pickle, term_pickle_paths)

# Stack them back into one frame, in term order.
subset_tokenized = pd.concat([term_17, term_18, term_19, term_20])


In [7]:
# Save the concatenated frame as a pickle in the current working directory.
with open('subset_tokenized.pkl', 'wb') as f:
    pickle.dump(subset_tokenized, f)

In [8]:
# Optional reload of the concatenated frame (disabled).
# NOTE(review): the previous cell writes 'subset_tokenized.pkl' to the working directory,
# not to this Drive path — adjust the path before enabling this line.
#subset_tokenized = pd.read_pickle('content/drive/MyDrive/subset_tokenized.pkl')

# Explode 'split_speeches' so that every chunk gets its own row; the remaining columns
# and the index label are repeated for each chunk of the same speech.

subset_tokenized_exploded = subset_tokenized.explode('split_speeches')
print(len(subset_tokenized_exploded))

# Save the exploded frame as a pickle in the current working directory.
with open('subset_tokenized_exploded.pkl', 'wb') as f:
    pickle.dump(subset_tokenized_exploded, f)
# Alternative (disabled): write to the Drive folder instead.
#subset_tokenized_exploded.to_pickle('content/drive/MyDrive/subset_tokenized_exploded.pkl')

709864


In [20]:
# Display the exploded frame (one row per chunk) to inspect the result.
subset_tokenized_exploded

Unnamed: 0,id,electoral_term,session,first_name,document_url,last_name,faction_id,position_short,position_long,politician_id,...,year,speech_length,age,gender,tenure,sentiment_score_1,sentiment_score_2,tokenized_content,token_length,split_speeches
728977,729471,17,1,volker,https://dip21.bundestag.de/dip21/btp/17/17001.pdf,kauder,5,Member of Parliament,,11001074,...,2009,105,60.0,männlich,18.616438,0.000000,0.000000,"[Herr, Alters, ##präsident, ,, ich, schl, ##ag...",24,"Herr Alterspräsident, ich schlage für die CDU/..."
728980,729474,17,1,norbert,https://dip21.bundestag.de/dip21/btp/17/17001.pdf,lammert,5,Member of Parliament,,11001274,...,2009,44,60.0,männlich,28.627397,0.000000,0.200000,"[Herr, Präsident, ,, ich, nehme, die, Wahl, ge...",10,"Herr Präsident, ich nehme die Wahl gerne an."
728984,729478,17,1,gerda,https://dip21.bundestag.de/dip21/btp/17/17001.pdf,hasselfeldt,5,Member of Parliament,,11000825,...,2009,88,59.0,weiblich,22.413699,0.000000,0.375000,"[Herr, Präsident, ,, ich, nehme, die, Wahl, ge...",19,"Herr Präsident, ich nehme die Wahl gerne an un..."
728986,729480,17,1,wolfgang,https://dip21.bundestag.de/dip21/btp/17/17001.pdf,thierse,25,Member of Parliament,,11002318,...,2009,26,66.0,männlich,19.076712,0.000000,0.333333,"[Ja, ,, ich, nehme, die, Wahl, an, .]",8,"Ja, ich nehme die Wahl an."
728988,729482,17,1,hermann otto,https://dip21.bundestag.de/dip21/btp/17/17001.pdf,solms,15,Member of Parliament,,11002190,...,2009,46,68.0,männlich,27.441096,0.000000,0.250000,"[Ich, bed, ##anke, mich, ., Ich, nehme, die, W...",12,Ich bedanke mich. Ich nehme die Wahl gerne an.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
966045,1075926,20,187,angelika,https://dip21.bundestag.de/dip21/btp/20/20187.pdf,glöckner,25,Member of Parliament,,11004614,...,2024,2561,62.0,weiblich,9.780822,0.196735,0.016216,"[Sehr, geehrt, ##e, Frau, Präsidentin, !, Koll...",493,Sehr geehrte Frau Präsidentin ! Kolleginnen un...
966045,1075926,20,187,angelika,https://dip21.bundestag.de/dip21/btp/20/20187.pdf,glöckner,25,Member of Parliament,,11004614,...,2024,2561,62.0,weiblich,9.780822,0.196735,0.016216,"[Sehr, geehrt, ##e, Frau, Präsidentin, !, Koll...",493,"diffiziler und etwas differenzierter , als Sie..."
966045,1075926,20,187,angelika,https://dip21.bundestag.de/dip21/btp/20/20187.pdf,glöckner,25,Member of Parliament,,11004614,...,2024,2561,62.0,weiblich,9.780822,0.196735,0.016216,"[Sehr, geehrt, ##e, Frau, Präsidentin, !, Koll...",493,"Viele von ihnen haben Dinge erlebt , die wir u..."
966045,1075926,20,187,angelika,https://dip21.bundestag.de/dip21/btp/20/20187.pdf,glöckner,25,Member of Parliament,,11004614,...,2024,2561,62.0,weiblich,9.780822,0.196735,0.016216,"[Sehr, geehrt, ##e, Frau, Präsidentin, !, Koll...",493,Jeder Mensch hat ein Recht auf Schutz und auf ...


# 3. Labels

!!! Note Silja: Ab hier könnt ihr einfach unser Datenset reinladen und die Tokenization machen. Der Code war eher darauf ausgelegt, dass wir die Daten für das Double-Checken vorbereiten.


In [15]:
# Load the labelled training and validation sets from their pickles.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
training_labels = pd.read_pickle('content/drive/MyDrive/augmented_training_data.pkl')
validation_labels = pd.read_pickle('content/drive/MyDrive/augmented_validation_data.pkl')

# Save both as Excel workbooks (the index is not written).
training_labels.to_excel('content/drive/MyDrive/training_labels.xlsx', index=False)
validation_labels.to_excel('content/drive/MyDrive/validation_labels.xlsx', index=False)

# Preview the first training rows.
training_labels.head()

Unnamed: 0,speech_content,speech_id_long,polarizing,populist,extremist,neutral,source_file
0,Wir jedenfalls sagen: Diese Regierung nimmt di...,18_66_11004079_10,0,0,0,1,silja_02manual_pre_labeling_1610_final.xlsx
1,"Wäre es so, wie Sie schildern, würden sie das ...",18_161_11004135_4,0,0,0,1,silja_02manual_pre_labeling_1610_final.xlsx
2,"Der zwölfte Menschenrechtsbericht zeigt, dass ...",18_236_11003742_1,0,0,0,1,silja_02manual_pre_labeling_1610_final.xlsx
3,Wir wollen Menschen helfen und Fluchtursachen ...,19_179_11003614_3,1,0,0,0,silja_02manual_pre_labeling_1610_final.xlsx
4,Hierüber kann man reden. In der Sache muss ich...,19_59_11004155_2,0,0,0,1,silja_02manual_pre_labeling_1610_final.xlsx


In [16]:
# German cased BERT tokenizer.
tokenizer = BertTokenizer.from_pretrained("bert-base-german-cased")

# Keep the tokens as text (not token IDs) next to each snippet, plus their count.
training_labels["tokenized_text"] = training_labels["speech_content"].map(tokenizer.tokenize)
training_labels["token_length"] = training_labels["tokenized_text"].map(len)

# Quick look at the new columns and at how the token counts are distributed.
preview_columns = ["speech_content", "tokenized_text", "token_length"]
print(training_labels[preview_columns].head())
print(training_labels["token_length"].describe())

# Snippets longer than 128 tokens: print the rows, then how many there are.
over_limit = training_labels[training_labels["token_length"] > 128]
print(over_limit)
print(len(over_limit))

# Write the augmented table to CSV without the index.
training_labels.to_csv("training_labels.csv", index=False)



                                      speech_content  \
0  Wir jedenfalls sagen: Diese Regierung nimmt di...   
1  Wäre es so, wie Sie schildern, würden sie das ...   
2  Der zwölfte Menschenrechtsbericht zeigt, dass ...   
3  Wir wollen Menschen helfen und Fluchtursachen ...   
4  Hierüber kann man reden. In der Sache muss ich...   

                                      tokenized_text  token_length  
0  [Wir, jedenfalls, sagen, :, Diese, Regierung, ...            63  
1  [Wäre, es, so, ,, wie, Sie, sch, ##ildern, ,, ...            50  
2  [Der, zwölf, ##te, Menschenrechts, ##bericht, ...            47  
3  [Wir, wollen, Menschen, helfen, und, Flucht, #...            51  
4  [Hier, ##über, kann, man, reden, ., In, der, S...            77  
count    5009.000000
mean       62.645438
std        17.272276
min        25.000000
25%        49.000000
50%        60.000000
75%        74.000000
max       136.000000
Name: token_length, dtype: float64
                                         speec

In [17]:
# German cased BERT tokenizer.
tokenizer = BertTokenizer.from_pretrained("bert-base-german-cased")

# Tokens as text (not token IDs) and their count for every validation snippet.
validation_labels["tokenized_text"] = validation_labels["speech_content"].map(tokenizer.tokenize)
validation_labels["token_length"] = validation_labels["tokenized_text"].map(len)

# Flag the snippets that exceed 128 tokens.
exceeds_limit = validation_labels["token_length"] > 128
validation_labels["truncated"] = exceeds_limit

# Keep at most the first 128 tokens; shorter token lists are stored unchanged.
validation_labels["truncated_text"] = validation_labels["tokenized_text"].map(
    lambda tokens: tokens if len(tokens) <= 128 else tokens[:128]
)

# Write original tokens, truncated tokens and the truncation flag to CSV (no index).
validation_labels.to_csv("validation_labels_with_truncation.csv", index=False)

validation_labels.head()


Unnamed: 0,speech_content,speech_id_long,polarizing,populist,extremist,neutral,source_file,tokenized_text,token_length,truncated,truncated_text
1670,Seit der Rede des Bundeskanzlers ist das Wort ...,20_69_11004705_16,0,0,0,1,silja_05manual_pre_labeling_1610_validation_fi...,"[Seit, der, Rede, des, Bundeskanzler, ##s, ist...",203,True,"[Seit, der, Rede, des, Bundeskanzler, ##s, ist..."
1671,Aus diesem Grund begrüßen wir ausdrücklich die...,20_25_11005049_1,0,0,0,1,silja_05manual_pre_labeling_1610_validation_fi...,"[Aus, diesem, Grund, begrü, ##ßen, wir, ausdrü...",101,False,"[Aus, diesem, Grund, begrü, ##ßen, wir, ausdrü..."
1672,"Das Zweite, was Sie ansprechen, sind Veränderu...",20_90_999990119_2,0,0,0,1,silja_05manual_pre_labeling_1610_validation_fi...,"[Das, Zweite, ,, was, Sie, ans, ##prech, ##en,...",54,False,"[Das, Zweite, ,, was, Sie, ans, ##prech, ##en,..."
1673,"Ich bin aber der Überzeugung, dass jemand, der...",20_73_11004342_3,0,0,0,1,silja_05manual_pre_labeling_1610_validation_fi...,"[Ich, bin, aber, der, Überzeugung, ,, dass, je...",84,False,"[Ich, bin, aber, der, Überzeugung, ,, dass, je..."
1674,"also zum Beispiel die Geschlechterforschung, d...",20_114_11004041_3,0,0,0,1,silja_05manual_pre_labeling_1610_validation_fi...,"[also, zum, Beispiel, die, Geschlechter, ##for...",78,False,"[also, zum, Beispiel, die, Geschlechter, ##for..."


In [18]:
# Only needed for double-checking the labels.
# The code is disabled: it is wrapped in a string literal, so nothing below is executed.
# When enabled, it splits the 'paraphrased' training rows and the validation rows into
# 5 parts each via np.array_split and saves every part as an Excel file.
'''training_paraphrased = training_labels[training_labels['source_file']=="paraphrased"]
training_labels_split = np.array_split(training_paraphrased, 5)
validation_labels_split = np.array_split(validation_labels, 5)

for i, split in enumerate(training_labels_split):
    split.to_excel(f"training_labels_split_{i+1}.xlsx", index=False)
    print(f"Saved training_labels_split_{i+1}.xlsx")

for i, split in enumerate(validation_labels_split):
    split.to_excel(f"validation_labels_split_{i+1}.xlsx", index=False)'''

'training_paraphrased = training_labels[training_labels[\'source_file\']=="paraphrased"]\ntraining_labels_split = np.array_split(training_paraphrased, 5)\nvalidation_labels_split = np.array_split(validation_labels, 5)\n\nfor i, split in enumerate(training_labels_split):\n    split.to_excel(f"training_labels_split_{i+1}.xlsx", index=False)\n    print(f"Saved training_labels_split_{i+1}.xlsx")\n\nfor i, split in enumerate(validation_labels_split):\n    split.to_excel(f"validation_labels_split_{i+1}.xlsx", index=False)'

In [37]:
# Load the contributions table from its pickle in the working directory.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
df = pd.read_pickle('contributions_extended.pkl')

In [38]:
# Display the contributions frame for inspection.
df

Unnamed: 0,id,type,first_name,last_name,faction_id,speech_id,text_position,politician_id,content
0,0,Lachen,,,-1,4545,7,-1,links
1,1,Personen-Einruf,,hütter,15,4545,2,11000979,Nein!
0,2,Zuruf,,,22,494,3,-1,§ 51!
1,3,Zuruf,,,-1,496,0,-1,links
2,4,Personen-Einruf,,blücher,15,497,0,11000202,Jawohl!
...,...,...,...,...,...,...,...,...,...
1044,2864760,Beifall,,,4,1075926,6,-1,
1045,2864761,Personen-Einruf,jörn,könig,0,1075926,6,11004788,Ich habe es gesagt: Anerkennungsquote unter 1 ...
1046,2864762,Beifall,wolfgang,strengmann-kuhn,4,1075926,7,11003888,
1047,2864763,Beifall,,,25,1075926,8,-1,


In [39]:
# Write the contributions frame back to the same pickle file it was loaded from.
# NOTE(review): no visible cell modifies `df` between the load and this dump —
# confirm whether this round trip is actually needed.
with open('contributions_extended.pkl', 'wb') as f:
        pickle.dump(df, f)

In [None]:
def split_speech(speech, tokenizer, max_tokens=128, overlap=20):
    """Split a speech into overlapping chunks of space-joined tokens.

    NOTE: this cell redefines ``split_speech`` from the earlier chunking cell; once it
    has been run, this version is the one that later calls use.

    Sentences (from ``sent_tokenize``) are tokenized and packed greedily. When the
    next sentence would push the current chunk above ``max_tokens``, the chunk is
    closed and the next one starts with the last ``overlap`` tokens of the closed
    chunk followed by the complete new sentence. The carried-over tail is shortened
    if needed so that tail plus sentence stay within ``max_tokens``. Sentences are
    never cut, so a single sentence longer than ``max_tokens`` still produces an
    oversized chunk.

    Args:
        speech: Raw speech text.
        tokenizer: Object with a ``tokenize(text)`` method returning a list of tokens
            (a ``BertTokenizer`` in this notebook).
        max_tokens: Maximum number of tokens per chunk.
        overlap: Number of trailing tokens of a closed chunk that are repeated at the
            start of the next chunk; values <= 0 disable the overlap.

    Returns:
        ``[speech]`` (the unchanged text) if the whole speech has at most
        ``max_tokens`` tokens; otherwise a list of strings, each being the tokens of
        one chunk joined with single spaces.
        NOTE(review): the joined strings contain the tokenizer's raw tokens (e.g.
        '##' sub-word pieces for BERT), not the original text — confirm that this is
        what downstream code expects.
    """
    if len(tokenizer.tokenize(speech)) <= max_tokens:
        return [speech]

    chunks = []
    current_chunk = []

    for sentence in sent_tokenize(speech):
        sentence_tokens = tokenizer.tokenize(sentence)

        # Flush only a non-empty chunk, so an over-long first sentence does not
        # emit an empty string as the first chunk.
        if current_chunk and len(current_chunk) + len(sentence_tokens) > max_tokens:
            chunks.append(" ".join(current_chunk))

            # The overlap comes from the END of the chunk just closed. Seeding the new
            # chunk with only the tail of the new sentence would silently drop the
            # beginning of that sentence.
            carry = min(overlap, max_tokens - len(sentence_tokens))
            # A slice like [-0:] would copy the whole list, hence the explicit guard.
            carried_tokens = current_chunk[-carry:] if carry > 0 else []
            current_chunk = carried_tokens + sentence_tokens
        else:
            current_chunk.extend(sentence_tokens)

    if current_chunk:
        chunks.append(" ".join(current_chunk))  # final, possibly partial, chunk

    return chunks