In [None]:
from tqdm import tqdm
import pandas as pd
import string
from nltk.corpus import stopwords
import nltk

In [None]:
# Download the NLTK stopword corpus only when it is not already installed,
# so re-running the notebook needs no network access once the data is present.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    # nltk.data.find raises LookupError when the resource cannot be located.
    nltk.download('stopwords')

In [None]:
# Load the two sentiment dictionaries and the speech dataframe.
sentiment_df_1 = pd.read_csv('sent_dictionary_1.csv')
sentiment_df_2 = pd.read_csv('sent_dictionary_2.csv')
# NOTE(security): unpickling can execute arbitrary code -- only load a pickle
# file that comes from a trusted source.
speeches_df = pd.read_pickle('content_df_features.pkl')

# Fail fast, with a readable message, if a column this notebook indexes is missing.
assert {'v2', 'v3'} <= set(sentiment_df_1.columns), \
    "sent_dictionary_1.csv must provide the columns 'v2' and 'v3'"
assert {'feature', 'sentiment'} <= set(sentiment_df_2.columns), \
    "sent_dictionary_2.csv must provide the columns 'feature' and 'sentiment'"
assert 'speech_content' in speeches_df.columns, \
    "content_df_features.pkl must provide the column 'speech_content'"

In [None]:
# Prepare the German stopword set and the word -> score lookup tables.
stop_words = set(stopwords.words('german'))

# preprocess_text lowercases every token before lookup, so the dictionary keys
# are normalised the same way (trimmed and lowercased). Without this, any
# capitalised dictionary entry could never match a processed token.
# If several rows collapse to the same key, dict() keeps the last one.
sentiment_scores_1 = dict(zip(
    sentiment_df_1['v2'].str.strip().str.lower(),
    sentiment_df_1['v3'],
))
sentiment_scores_2 = dict(zip(
    sentiment_df_2['feature'].str.strip().str.lower(),
    sentiment_df_2['sentiment'],
))

In [None]:
# Translation table that deletes ASCII punctuation; built once instead of on
# every call to preprocess_text.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text):
    """Tokenise one speech into lowercase words with stopwords removed.

    Args:
        text: Raw speech text. Any non-string value (for example ``None`` or
            a float NaN standing in for a missing speech) is treated as an
            empty speech.

    Returns:
        list[str]: The whitespace-separated tokens of the lowercased text
        after ASCII punctuation has been deleted, excluding every token
        found in the module-level ``stop_words`` set.
    """
    if not isinstance(text, str):
        # A missing speech would otherwise abort the run with AttributeError.
        return []
    # NOTE(review): string.punctuation covers ASCII only, so typographic
    # characters such as „ “ » « – stay attached to their tokens -- confirm
    # whether the speeches contain them.
    cleaned = text.lower().translate(_PUNCTUATION_TABLE)
    return [word for word in cleaned.split() if word not in stop_words]

In [None]:
# Tokenise every speech; tqdm.pandas registers progress_apply so the run
# shows a progress bar.
tqdm.pandas(desc="Preprocessing speeches")
speeches_df['processed_content'] = speeches_df['speech_content'].progress_apply(preprocess_text)

In [None]:
# Calculate sentiment score with sentiment dictionary
def calculate_sentiment_score(words, sentiment_scores):
    """Return the mean dictionary score per token of one speech.

    Args:
        words: Sized collection of tokens (e.g. the list produced by
            ``preprocess_text``).
        sentiment_scores: Mapping from token to numeric sentiment score.

    Returns:
        float: Sum of the scores of all tokens present in
        ``sentiment_scores`` divided by the total number of tokens, so
        tokens missing from the dictionary count as 0. Returns ``0.0``
        when ``words`` is empty.
    """
    if len(words) == 0:
        # Avoid dividing by zero for speeches with no remaining tokens.
        return 0.0
    # The membership filter already skips unknown tokens, so a plain index
    # lookup is enough here.
    total_score = sum(sentiment_scores[word] for word in words if word in sentiment_scores)
    return total_score / len(words)

In [None]:
# Score every processed speech against dictionary 1.
# tqdm appends ": " to the description itself, so the label carries no
# trailing colon (otherwise the bar shows "::").
tqdm.pandas(desc="Computing sentiment score with dictionary 1")
speeches_df['sentiment_score_1'] = speeches_df['processed_content'].progress_apply(
    lambda words: calculate_sentiment_score(words, sentiment_scores_1)
)

In [None]:
# Score every processed speech against dictionary 2.
# tqdm appends ": " to the description itself, so the label carries no
# trailing colon (otherwise the bar shows "::").
tqdm.pandas(desc="Computing sentiment score with dictionary 2")
speeches_df['sentiment_score_2'] = speeches_df['processed_content'].progress_apply(
    lambda words: calculate_sentiment_score(words, sentiment_scores_2)
)

In [None]:
# Persist the speech dataframe, including the columns added by the cells
# above, as a pickle file in the working directory.
speeches_df.to_pickle(path='speeches_with_sentiment.pkl')