In [1]:
from tqdm import tqdm
import pandas as pd
import string
from nltk.corpus import stopwords
import nltk

In [2]:
# Download stopwords if necessary
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/Silja/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
# Load sentiment dictionaries and speech dataframe
sentiment_df_1 = pd.read_csv('sent_dictionary_1.csv')
sentiment_df_2 = pd.read_csv('sent_dictionary_2.csv')
speeches_df = pd.read_pickle('content_df_features.pkl')

In [5]:
# Prepare German stopwords and process speech content
stop_words = set(stopwords.words('german'))
sentiment_scores_1 = dict(zip(sentiment_df_1['v2'], sentiment_df_1['v3']))
sentiment_scores_2 = dict(zip(sentiment_df_2['feature'].str.strip(), sentiment_df_2['sentiment']))

In [6]:
def preprocess_text(text):
    # Lowercase, remove punctuation, and filter out stopwords
    return [word for word in text.lower().translate(str.maketrans('', '', string.punctuation)).split()
            if word not in stop_words]

In [7]:
# Apply preprocessing with tqdm
tqdm.pandas(desc="Preprocessing speeches")
speeches_df['processed_content'] = speeches_df['speech_content'].progress_apply(preprocess_text)

Preprocessing speeches: 100%|██████████| 966046/966046 [11:08<00:00, 1445.80it/s]


In [8]:
# Calculate sentiment score with sentiment dictionary
def calculate_sentiment_score(words, sentiment_scores):
    total_score = sum(sentiment_scores.get(word, 0) for word in words if word in sentiment_scores)
    return total_score / len(words) if len(words) > 0 else 0

In [9]:
def calculate_two_sentiment_scores(words, sentiment_scores):
    positive_score = sum(sentiment_scores.get(word, 0) for word in words if word in sentiment_scores and sentiment_scores[word] > 0)
    negative_score = sum(abs(sentiment_scores.get(word, 0)) for word in words if word in sentiment_scores and sentiment_scores[word] < 0)
    
    total_words = len(words)
    positivity_score = positive_score / total_words if total_words > 0 else 0
    negativity_score = negative_score / total_words if total_words > 0 else 0
    
    return positivity_score, negativity_score

In [10]:
tqdm.pandas(desc="Computing sentiment score with dictionary 1")
speeches_df['sentiment_score_1'] = speeches_df['processed_content'].progress_apply(
    lambda words: calculate_sentiment_score(words, sentiment_scores_1)
)

Computing sentiment score with dictionary 1:   0%|          | 0/966046 [00:00<?, ?it/s]

Computing sentiment score with dictionary 1: 100%|██████████| 966046/966046 [02:05<00:00, 7682.59it/s] 


In [11]:
tqdm.pandas(desc="Computing sentiment score with dictionary 2")
speeches_df['sentiment_score_2'] = speeches_df['processed_content'].progress_apply(
    lambda words: calculate_sentiment_score(words, sentiment_scores_2)
)

Computing sentiment score with dictionary 2: 100%|██████████| 966046/966046 [01:59<00:00, 8116.32it/s] 


In [12]:
tqdm.pandas(desc="Computing positivity and negativity scores with dictionary 2.1")

speeches_df[['sentiment_score_2_positivity', 'sentiment_score_2_negativity']] = speeches_df['processed_content'].progress_apply(
    lambda words: pd.Series(calculate_two_sentiment_scores(words, sentiment_scores_2))
)

Computing positivity and negativity scores with dictionary 2.1: 100%|██████████| 966046/966046 [28:27<00:00, 565.85it/s]  


In [13]:
speeches_df.drop(columns=['processed_content'], inplace=True)

In [14]:
speeches_df

Unnamed: 0,id,electoral_term,session,first_name,document_url,last_name,faction_id,position_short,position_long,politician_id,...,faction,year,speech_length,age,gender,tenure,sentiment_score_1,sentiment_score_2,sentiment_score_2_positivity,sentiment_score_2_negativity
0,0,1,2,,https://dip21.bundestag.de/dip21/btp/01/01002.pdf,köhler,5,Presidium of Parliament,präsident,11001150,...,CDU/CSU,1949,546,57.0,männlich,0.000000,0.000000,0.047619,0.047619,0.000000
1,1,1,2,,https://dip21.bundestag.de/dip21/btp/01/01002.pdf,arnold,-1,Guest,präsident des bundesrats,-1,...,,1949,45,,,0.000000,0.000000,0.000000,0.000000,0.000000
2,2,1,2,,https://dip21.bundestag.de/dip21/btp/01/01002.pdf,köhler,5,Presidium of Parliament,präsident,11001150,...,CDU/CSU,1949,895,57.0,männlich,0.000000,0.133946,0.181818,0.212121,0.030303
3,3,1,2,,https://dip21.bundestag.de/dip21/btp/01/01002.pdf,heuss,-1,Guest,bundespräsident,-1,...,,1949,24,,,0.000000,0.000000,1.000000,1.000000,0.000000
4,4,1,2,,https://dip21.bundestag.de/dip21/btp/01/01002.pdf,köhler,5,Presidium of Parliament,präsident,11001150,...,CDU/CSU,1949,49,57.0,männlich,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
966041,1075922,20,187,thomas,https://dip21.bundestag.de/dip21/btp/20/20187.pdf,seitz,0,Member of Parliament,,11004891,...,AfD,2024,1799,57.0,männlich,6.594521,0.112649,0.030534,0.099237,0.068702
966042,1075923,20,187,petra,https://dip21.bundestag.de/dip21/btp/20/20187.pdf,pau,7,Presidium of Parliament,Vizepräsidentin,11003206,...,DIE LINKE.,2024,18,61.0,weiblich,25.887671,0.000000,0.000000,0.000000,0.000000
966043,1075924,20,187,thomas,https://dip21.bundestag.de/dip21/btp/20/20187.pdf,seitz,18,Member of Parliament,,11004891,...,Fraktionslos,2024,12,57.0,männlich,6.594521,0.000000,0.500000,0.500000,0.000000
966044,1075925,20,187,petra,https://dip21.bundestag.de/dip21/btp/20/20187.pdf,pau,7,Presidium of Parliament,Vizepräsidentin,11003206,...,DIE LINKE.,2024,65,61.0,weiblich,25.887671,0.000000,0.000000,0.000000,0.000000


In [15]:
# Save results
speeches_df.to_pickle('content_df_features_sentiment.pkl')

In [16]:
speeches_df.to_csv('content_df_features_sentiment.csv')