# 1. Imports

In [None]:
import numpy as np
import pandas as pd
import regex as re

import matplotlib.pyplot as plt

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

from transformers import BertTokenizer
from transformers import BertTokenizer, BertModel, pipeline


import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import torch
import pickle

from IPython.display import FileLink




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Mount Google Drive at /content/drive; the cells below read and write pickles under /content/drive/MyDrive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the full speech table from Drive
content_df = pd.read_pickle('/content/drive/MyDrive/content_df 6.pkl')

# Keep electoral terms 17-20 and drop rows whose position_short is 'Presidium of Parliament'
in_selected_terms = content_df['electoral_term'].isin([17, 18, 19, 20])
not_presidium = content_df['position_short'] != 'Presidium of Parliament'
subset_df = content_df.loc[in_selected_terms & not_presidium]

# Row count of the subset, then a preview
print(subset_df.shape[0])
subset_df.head()



131481


Unnamed: 0,id,electoral_term,session,first_name,document_url,last_name,faction_id,position_short,position_long,politician_id,speech_content,date,faction,year
729471,729471,17,1,volker,https://dip21.bundestag.de/dip21/btp/17/17001.pdf,kauder,5,Member of Parliament,,11001074,"Herr Alterspräsident, ich schlage für die CDU/...",2009-10-26,CDU/CSU,2009
729474,729474,17,1,norbert,https://dip21.bundestag.de/dip21/btp/17/17001.pdf,lammert,5,Member of Parliament,,11001274,"Herr Präsident, ich nehme die Wahl gerne an.",2009-10-26,CDU/CSU,2009
729478,729478,17,1,gerda,https://dip21.bundestag.de/dip21/btp/17/17001.pdf,hasselfeldt,5,Member of Parliament,,11000825,"Herr Präsident, ich nehme die Wahl gerne an un...",2009-10-26,CDU/CSU,2009
729480,729480,17,1,wolfgang,https://dip21.bundestag.de/dip21/btp/17/17001.pdf,thierse,25,Member of Parliament,,11002318,"Ja, ich nehme die Wahl an.",2009-10-26,SPD,2009
729482,729482,17,1,hermann otto,https://dip21.bundestag.de/dip21/btp/17/17001.pdf,solms,15,Member of Parliament,,11002190,Ich bedanke mich. Ich nehme die Wahl gerne an.,2009-10-26,FDP,2009


# 2. Tokenize whole dataset



## 2.1 Pre-processing

In [None]:
# German (cased) BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-german-cased')

# Work on a copy so subset_df itself is left untouched
subset_pre_tokenization = subset_df.copy()

# One token list per speech, plus the number of tokens in it
speech_tokens = subset_pre_tokenization['speech_content'].map(tokenizer.tokenize)
subset_pre_tokenization['tokenized_content'] = speech_tokens
subset_pre_tokenization['token_length'] = speech_tokens.map(len)



In [None]:
# Checkpoint: write the frame (including the token columns) to Drive as a pickle;
# the first cell of section 2.2 reloads it from this same path
subset_pre_tokenization.to_pickle('/content/drive/MyDrive/subset_pre_tokenization.pkl')


## 2.2 Split Tokens

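Strategy: choose how to split each speech based on its token length.

- Short speeches (128 tokens or fewer): no split needed; they can be fed to BERT directly.
- Moderate-length speeches (128–256 tokens): split into two parts at the sentence boundary nearest the middle.
- Long speeches (over 256 tokens): split into multiple chunks, preserving context while staying under the maximum token limit.

Note: `split_speech` below handles the moderate and long cases with a single rule. Every speech over 128 tokens is cut at sentence boundaries into chunks of at most 128 tokens, using a 20-token overlap.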


In [None]:
# Reload the pre-tokenized checkpoint from Drive (same path the previous section wrote to)
subset_pre_tokenization = pd.read_pickle('/content/drive/MyDrive/subset_pre_tokenization.pkl')

In [None]:
# German BERT tokenizer that is passed to split_speech in the per-term loop below
tokenizer = BertTokenizer.from_pretrained("bert-base-german-cased")

# Split one speech into text chunks of at most `max_tokens` tokens each
def split_speech(speech, tokenizer, max_tokens=128, overlap=20, language="german"):
    """Split a speech into text chunks of at most ``max_tokens`` tokens.

    A speech that already fits is returned unchanged as a one-element list.
    Otherwise the speech is cut into sentences and whole sentences are packed
    into chunks. When a chunk is full, the next chunk starts with up to
    ``overlap`` tokens repeated from the END of the chunk just finished, and
    the sentence that did not fit is then added in full, so no text is lost
    between chunks. A single sentence longer than ``max_tokens`` is cut into
    several chunks, preferably in front of a token that does not start with
    ``##`` so that words stay intact.

    Args:
        speech: Raw speech text.
        tokenizer: Object providing ``tokenize(text)`` (list of token strings)
            and ``convert_tokens_to_string(tokens)``; the notebook passes a
            ``BertTokenizer``.
        max_tokens: Upper bound on the number of tokens per chunk.
            NOTE(review): ``tokenize`` output presumably excludes [CLS]/[SEP];
            confirm the later encoding step leaves room for them.
        overlap: Number of trailing tokens of a finished chunk repeated at the
            start of the next one. Clamped to ``[0, max_tokens - 1]`` and
            reduced when the next sentence would otherwise not fit as a whole.
        language: Punkt model name handed to ``sent_tokenize``. Defaults to
            German because the speeches are German.

    Returns:
        List of chunk strings in their original order. Never contains an
        empty chunk.

    Raises:
        ValueError: If the speech has to be split and ``max_tokens`` is < 1.
    """
    if len(tokenizer.tokenize(speech)) <= max_tokens:
        return [speech]
    if max_tokens < 1:
        raise ValueError("max_tokens must be at least 1")

    # The carried-over prefix must leave room for at least one new token.
    overlap = max(0, min(overlap, max_tokens - 1))

    def _carry_over(tokens, keep):
        """Return the last `keep` tokens, without leading '##' continuation pieces."""
        tail = tokens[max(0, len(tokens) - keep):]
        while tail and tail[0].startswith("##"):
            tail = tail[1:]
        return tail

    chunks = []
    current_chunk = []
    # True once current_chunk holds tokens beyond the carried-over prefix;
    # a chunk consisting only of repeated tokens is never emitted.
    has_new_tokens = False

    for sentence in sent_tokenize(speech, language=language):
        pending = tokenizer.tokenize(sentence)
        while pending:
            room = max_tokens - len(current_chunk)
            if len(pending) <= room:
                # The (rest of the) sentence fits into the open chunk.
                current_chunk.extend(pending)
                has_new_tokens = True
                pending = []
            elif has_new_tokens:
                # Close the open chunk and seed the next one with its tail.
                chunks.append(current_chunk)
                if len(pending) <= max_tokens:
                    # Shrink the overlap rather than cut a sentence that fits on its own.
                    keep = min(overlap, max_tokens - len(pending))
                else:
                    keep = overlap
                current_chunk = _carry_over(current_chunk, keep)
                has_new_tokens = False
            else:
                # The sentence alone exceeds the budget: hard-split it, backing
                # off to a word start where possible.
                cut = room
                while cut > 1 and pending[cut].startswith("##"):
                    cut -= 1
                current_chunk.extend(pending[:cut])
                pending = pending[cut:]
                has_new_tokens = True

    if has_new_tokens:
        chunks.append(current_chunk)

    return [tokenizer.convert_tokens_to_string(chunk) for chunk in chunks]

# Chunk the speeches one electoral term at a time and checkpoint each term separately
for term in subset_pre_tokenization['electoral_term'].unique():
    is_current_term = subset_pre_tokenization['electoral_term'] == term
    term_df = subset_pre_tokenization.loc[is_current_term].copy()

    # One list of text chunks per speech
    term_df['split_speeches'] = term_df['speech_content'].apply(split_speech, tokenizer=tokenizer)

    # Write this term's frame to Drive
    term_df.to_pickle(f'/content/drive/MyDrive/term_{term}_tokenized.pkl')
    # NOTE(review): this FileLink object is created but never displayed (it is not the
    # cell's last expression), so no download link is rendered. Wrap it in display(...)
    # if a link is actually wanted.
    FileLink(f'/content/drive/MyDrive/term_{term}_tokenized.pkl')
    print(f"Saved term_{term}_tokenized.pkl")




Saved term_17_tokenized.pkl
Saved term_18_tokenized.pkl
Saved term_19_tokenized.pkl
Saved term_20_tokenized.pkl


In [None]:
# Reload the per-term checkpoints written by the splitting loop
term_17, term_18, term_19, term_20 = (
    pd.read_pickle(f'/content/drive/MyDrive/term_{term_no}_tokenized.pkl')
    for term_no in (17, 18, 19, 20)
)

# Stack the four terms back into one frame, in term order
subset_tokenized = pd.concat([term_17, term_18, term_19, term_20])

subset_tokenized.head()


Unnamed: 0,id,electoral_term,session,first_name,document_url,last_name,faction_id,position_short,position_long,politician_id,speech_content,date,faction,year,tokenized_content,token_length,split_speeches
729471,729471,17,1,volker,https://dip21.bundestag.de/dip21/btp/17/17001.pdf,kauder,5,Member of Parliament,,11001074,"Herr Alterspräsident, ich schlage für die CDU/...",2009-10-26,CDU/CSU,2009,"[Herr, Alters, ##präsident, ,, ich, schl, ##ag...",24,"[Herr Alterspräsident, ich schlage für die CDU..."
729474,729474,17,1,norbert,https://dip21.bundestag.de/dip21/btp/17/17001.pdf,lammert,5,Member of Parliament,,11001274,"Herr Präsident, ich nehme die Wahl gerne an.",2009-10-26,CDU/CSU,2009,"[Herr, Präsident, ,, ich, nehme, die, Wahl, ge...",10,"[Herr Präsident, ich nehme die Wahl gerne an.]"
729478,729478,17,1,gerda,https://dip21.bundestag.de/dip21/btp/17/17001.pdf,hasselfeldt,5,Member of Parliament,,11000825,"Herr Präsident, ich nehme die Wahl gerne an un...",2009-10-26,CDU/CSU,2009,"[Herr, Präsident, ,, ich, nehme, die, Wahl, ge...",19,"[Herr Präsident, ich nehme die Wahl gerne an u..."
729480,729480,17,1,wolfgang,https://dip21.bundestag.de/dip21/btp/17/17001.pdf,thierse,25,Member of Parliament,,11002318,"Ja, ich nehme die Wahl an.",2009-10-26,SPD,2009,"[Ja, ,, ich, nehme, die, Wahl, an, .]",8,"[Ja, ich nehme die Wahl an.]"
729482,729482,17,1,hermann otto,https://dip21.bundestag.de/dip21/btp/17/17001.pdf,solms,15,Member of Parliament,,11002190,Ich bedanke mich. Ich nehme die Wahl gerne an.,2009-10-26,FDP,2009,"[Ich, bed, ##anke, mich, ., Ich, nehme, die, W...",12,[Ich bedanke mich. Ich nehme die Wahl gerne an.]


In [None]:
# Row count of the concatenated frame (one row per speech at this point)
print(len(subset_tokenized))

# Save as pickle so the next cell can reload it from Drive
subset_tokenized.to_pickle('/content/drive/MyDrive/subset_tokenized.pkl')


131481


In [None]:
# Reload the concatenated checkpoint from Drive
subset_tokenized = pd.read_pickle('/content/drive/MyDrive/subset_tokenized.pkl')

# Explode the split speeches: one row per chunk in 'split_speeches';
# the other columns and the index value of the source row are repeated for each chunk

subset_tokenized_exploded = subset_tokenized.explode('split_speeches')
print(len(subset_tokenized_exploded))

# Save the chunk-level frame as pickle and preview the first 20 rows
subset_tokenized_exploded.to_pickle('/content/drive/MyDrive/subset_tokenized_exploded.pkl')
subset_tokenized_exploded.head(20)


655084


Unnamed: 0,id,electoral_term,session,first_name,document_url,last_name,faction_id,position_short,position_long,politician_id,speech_content,date,faction,year,tokenized_content,token_length,split_speeches
729471,729471,17,1,volker,https://dip21.bundestag.de/dip21/btp/17/17001.pdf,kauder,5,Member of Parliament,,11001074,"Herr Alterspräsident, ich schlage für die CDU/...",2009-10-26,CDU/CSU,2009,"[Herr, Alters, ##präsident, ,, ich, schl, ##ag...",24,"Herr Alterspräsident, ich schlage für die CDU/..."
729474,729474,17,1,norbert,https://dip21.bundestag.de/dip21/btp/17/17001.pdf,lammert,5,Member of Parliament,,11001274,"Herr Präsident, ich nehme die Wahl gerne an.",2009-10-26,CDU/CSU,2009,"[Herr, Präsident, ,, ich, nehme, die, Wahl, ge...",10,"Herr Präsident, ich nehme die Wahl gerne an."
729478,729478,17,1,gerda,https://dip21.bundestag.de/dip21/btp/17/17001.pdf,hasselfeldt,5,Member of Parliament,,11000825,"Herr Präsident, ich nehme die Wahl gerne an un...",2009-10-26,CDU/CSU,2009,"[Herr, Präsident, ,, ich, nehme, die, Wahl, ge...",19,"Herr Präsident, ich nehme die Wahl gerne an un..."
729480,729480,17,1,wolfgang,https://dip21.bundestag.de/dip21/btp/17/17001.pdf,thierse,25,Member of Parliament,,11002318,"Ja, ich nehme die Wahl an.",2009-10-26,SPD,2009,"[Ja, ,, ich, nehme, die, Wahl, an, .]",8,"Ja, ich nehme die Wahl an."
729482,729482,17,1,hermann otto,https://dip21.bundestag.de/dip21/btp/17/17001.pdf,solms,15,Member of Parliament,,11002190,Ich bedanke mich. Ich nehme die Wahl gerne an.,2009-10-26,FDP,2009,"[Ich, bed, ##anke, mich, ., Ich, nehme, die, W...",12,Ich bedanke mich. Ich nehme die Wahl gerne an.
729484,729484,17,1,petra,https://dip21.bundestag.de/dip21/btp/17/17001.pdf,pau,7,Member of Parliament,,11003206,Ich nehme die Wahl gerne an und freue mich auf...,2009-10-26,DIE LINKE.,2009,"[Ich, nehme, die, Wahl, gerne, an, und, fre, #...",21,Ich nehme die Wahl gerne an und freue mich auf...
729486,729486,17,1,katrin,https://dip21.bundestag.de/dip21/btp/17/17001.pdf,göring-eckardt,4,Member of Parliament,,11003132,Ich nehme die Wahl an und bedanke mich sehr he...,2009-10-26,Bündnis 90/Die Grünen,2009,"[Ich, nehme, die, Wahl, an, und, bed, ##anke, ...",13,Ich nehme die Wahl an und bedanke mich sehr he...
729490,729490,17,2,angela,https://dip21.bundestag.de/dip21/btp/17/17002.pdf,merkel,5,Member of Parliament,,11001478,"Herr Präsident, ich nehme die Wahl an und beda...",2009-10-27,CDU/CSU,2009,"[Herr, Präsident, ,, ich, nehme, die, Wahl, an...",16,"Herr Präsident, ich nehme die Wahl an und beda..."
729493,729493,17,2,angela,https://dip21.bundestag.de/dip21/btp/17/17002.pdf,merkel,5,Chancellor,,11001478,"Ich schwöre, dass ich meine Kraft dem Wohle de...",2009-10-27,CDU/CSU,2009,"[Ich, schw, ##ör, ##e, ,, dass, ich, meine, Kr...",59,"Ich schwöre, dass ich meine Kraft dem Wohle de..."
729496,729496,17,2,guido,https://dip21.bundestag.de/dip21/btp/17/17002.pdf,westerwelle,15,Minister,bundesminister des auswärtigen,11002944,"Ich schwöre es, so wahr mir Gott helfe.",2009-10-27,FDP,2009,"[Ich, schw, ##ör, ##e, es, ,, so, wahr, mir, G...",13,"Ich schwöre es, so wahr mir Gott helfe."


# 3. Labels

!!! Note Silja: Ab hier könnt ihr einfach unser Datenset reinladen und die Tokenization machen. Der Code war eher darauf ausgelegt, dass wir die Daten für das Double-Checken vorbereiten.


In [None]:
# Load the augmented training and validation label sets from Drive
training_labels = pd.read_pickle('/content/drive/MyDrive/augmented_training_data.pkl')
validation_labels = pd.read_pickle('/content/drive/MyDrive/augmented_validation_data.pkl')

# Save both as Excel files (without the index column)
training_labels.to_excel('/content/drive/MyDrive/training_labels.xlsx', index=False)
validation_labels.to_excel('/content/drive/MyDrive/validation_labels.xlsx', index=False)

training_labels.head()

Unnamed: 0,speech_content,speech_id_long,polarizing,populist,extremist,neutral,source_file
0,Wir jedenfalls sagen: Diese Regierung nimmt di...,18_66_11004079_10,0,0,0,1,silja_02manual_pre_labeling_1610_final.xlsx
1,"Wäre es so, wie Sie schildern, würden sie das ...",18_161_11004135_4,0,0,0,1,silja_02manual_pre_labeling_1610_final.xlsx
2,"Der zwölfte Menschenrechtsbericht zeigt, dass ...",18_236_11003742_1,0,0,0,1,silja_02manual_pre_labeling_1610_final.xlsx
3,Wir wollen Menschen helfen und Fluchtursachen ...,19_179_11003614_3,1,0,0,0,silja_02manual_pre_labeling_1610_final.xlsx
4,Hierüber kann man reden. In der Sache muss ich...,19_59_11004155_2,0,0,0,1,silja_02manual_pre_labeling_1610_final.xlsx


In [None]:
# German BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-german-cased")

# Token strings (not token IDs) for every labelled snippet, plus the token count
training_tokens = training_labels["speech_content"].map(tokenizer.tokenize)
training_labels["tokenized_text"] = training_tokens
training_labels["token_length"] = training_tokens.map(len)

# Quick look at text, tokens and length
print(training_labels[["speech_content", "tokenized_text", "token_length"]].head())

# Summary statistics of the token lengths
print(training_labels["token_length"].describe())

# Rows longer than 128 tokens, followed by how many there are
over_limit = training_labels.loc[training_labels["token_length"] > 128]
print(over_limit)

print(len(over_limit))

# Write the frame with the new columns to CSV in the working directory
training_labels.to_csv("training_labels.csv", index=False)



tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/255k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/485k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/433 [00:00<?, ?B/s]

                                      speech_content  \
0  Wir jedenfalls sagen: Diese Regierung nimmt di...   
1  Wäre es so, wie Sie schildern, würden sie das ...   
2  Der zwölfte Menschenrechtsbericht zeigt, dass ...   
3  Wir wollen Menschen helfen und Fluchtursachen ...   
4  Hierüber kann man reden. In der Sache muss ich...   

                                      tokenized_text  token_length  
0  [Wir, jedenfalls, sagen, :, Diese, Regierung, ...            63  
1  [Wäre, es, so, ,, wie, Sie, sch, ##ildern, ,, ...            50  
2  [Der, zwölf, ##te, Menschenrechts, ##bericht, ...            47  
3  [Wir, wollen, Menschen, helfen, und, Flucht, #...            51  
4  [Hier, ##über, kann, man, reden, ., In, der, S...            77  
count    5009.000000
mean       62.645438
std        17.272276
min        25.000000
25%        49.000000
50%        60.000000
75%        74.000000
max       136.000000
Name: token_length, dtype: float64
                                         speec

In [None]:
# German BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-german-cased")

# Token strings (not token IDs) for every validation snippet, plus the token count
validation_tokens = validation_labels["speech_content"].map(tokenizer.tokenize)
validation_labels["tokenized_text"] = validation_tokens
validation_labels["token_length"] = validation_tokens.map(len)

# Flag snippets whose token list is longer than 128 tokens
validation_labels["truncated"] = validation_labels["token_length"].gt(128)

# Keep at most the first 128 tokens; shorter lists are passed through as they are
validation_labels["truncated_text"] = validation_tokens.map(
    lambda tokens: tokens if len(tokens) <= 128 else tokens[:128]
)

# Persist original tokens, truncated tokens and the truncation flag together
validation_labels.to_csv("validation_labels_with_truncation.csv", index=False)

validation_labels.head()


Unnamed: 0,speech_content,speech_id_long,polarizing,populist,extremist,neutral,source_file,tokenized_text,token_length,truncated,truncated_text
1670,Seit der Rede des Bundeskanzlers ist das Wort ...,20_69_11004705_16,0,0,0,1,silja_05manual_pre_labeling_1610_validation_fi...,"[Seit, der, Rede, des, Bundeskanzler, ##s, ist...",203,True,"[Seit, der, Rede, des, Bundeskanzler, ##s, ist..."
1671,Aus diesem Grund begrüßen wir ausdrücklich die...,20_25_11005049_1,0,0,0,1,silja_05manual_pre_labeling_1610_validation_fi...,"[Aus, diesem, Grund, begrü, ##ßen, wir, ausdrü...",101,False,"[Aus, diesem, Grund, begrü, ##ßen, wir, ausdrü..."
1672,"Das Zweite, was Sie ansprechen, sind Veränderu...",20_90_999990119_2,0,0,0,1,silja_05manual_pre_labeling_1610_validation_fi...,"[Das, Zweite, ,, was, Sie, ans, ##prech, ##en,...",54,False,"[Das, Zweite, ,, was, Sie, ans, ##prech, ##en,..."
1673,"Ich bin aber der Überzeugung, dass jemand, der...",20_73_11004342_3,0,0,0,1,silja_05manual_pre_labeling_1610_validation_fi...,"[Ich, bin, aber, der, Überzeugung, ,, dass, je...",84,False,"[Ich, bin, aber, der, Überzeugung, ,, dass, je..."
1674,"also zum Beispiel die Geschlechterforschung, d...",20_114_11004041_3,0,0,0,1,silja_05manual_pre_labeling_1610_validation_fi...,"[also, zum, Beispiel, die, Geschlechter, ##for...",78,False,"[also, zum, Beispiel, die, Geschlechter, ##for..."


In [None]:
# Only needed for double-checking.
# Disabled: the code below is wrapped in a string literal, so it is not executed.
# It takes the training rows whose source_file is "paraphrased" and all validation rows,
# splits each set into 5 parts with np.array_split, and saves every part as its own Excel file.
'''training_paraphrased = training_labels[training_labels['source_file']=="paraphrased"]
training_labels_split = np.array_split(training_paraphrased, 5)
validation_labels_split = np.array_split(validation_labels, 5)

for i, split in enumerate(training_labels_split):
    split.to_excel(f"training_labels_split_{i+1}.xlsx", index=False)
    print(f"Saved training_labels_split_{i+1}.xlsx")

for i, split in enumerate(validation_labels_split):
    split.to_excel(f"validation_labels_split_{i+1}.xlsx", index=False)'''

Saved training_labels_split_1.xlsx
Saved training_labels_split_2.xlsx
Saved training_labels_split_3.xlsx
Saved training_labels_split_4.xlsx
Saved training_labels_split_5.xlsx
