In [1]:
import pandas as pd
import glob
import os
import re
from itertools import combinations

In [8]:

folder_path = 'labeled_speech_data'
xlsx_files = glob.glob(os.path.join(folder_path, '*.xlsx'))
label_cols = ['polarizing', 'populist', 'extremist', 'neutral']

dfs = []

total_rows_initial = 0
rows_removed_checking_col = 0
rows_removed_neutral_conflict = 0
rows_removed_max_sentences = 0
rows_removed_max_characters = 0

# Process each file
for file in xlsx_files:
    # Ignore columns beyond the 7th column
    df = pd.read_excel(file, usecols=range(7), header=0)
    total_rows_initial += df.shape[0]
    
    # Assign standard column names
    df.columns = ['speech_content', 'speech_id_long', 'polarizing', 'populist', 'extremist', 'neutral', 'check_col']
    
    # Add 'is_validation' flag
    df['is_validation'] = 'validation' in file.lower()
    
    # Keep track of the source file (optional)
    df['source_file'] = os.path.basename(file)
    
    dfs.append(df)


combined_df = pd.concat(dfs, ignore_index=True)

# Remove rows based on the 'check_col' column (1 or x indicating keep row)
combined_df['check_col'] = combined_df['check_col'].astype(str).str.lower()
rows_before_check = combined_df.shape[0]
combined_df = combined_df[combined_df['check_col'].isin(['1', 'x'])]
rows_after_check = combined_df.shape[0]
rows_removed_checking_col = rows_before_check - rows_after_check


print("Number of rows removed using the checking column:", rows_removed_checking_col)

# Compute statistics on labels
print("\nCounts of each label:")
label_counts = combined_df[label_cols].sum()
print(label_counts)

# Co-occurrence of labels
co_occurrence = pd.DataFrame(0, index=label_cols, columns=label_cols)

for label1, label2 in combinations(label_cols, 2):
    count = ((combined_df[label1] == 1) & (combined_df[label2] == 1)).sum()
    co_occurrence.loc[label1, label2] = count
    co_occurrence.loc[label2, label1] = count

print("\nCo-occurrence of labels:")
print(co_occurrence)

# Find and remove rows where 'neutral' is not exclusive
neutral_conflicts = combined_df[
    (combined_df['neutral'] == 1) & (combined_df[label_cols[:-1]].sum(axis=1) > 0)
]

rows_removed_neutral_conflict = neutral_conflicts.shape[0]
print("\nNumber of rows where 'neutral' is not exclusive with other labels:", rows_removed_neutral_conflict)

# Remove conflicting rows
combined_df = combined_df.drop(neutral_conflicts.index)

# Clean the text content in 'speech_content' column
def clean_text(text):
    # Remove leading/trailing whitespaces and linebreaks
    text = text.strip().replace('\n', ' ').replace('\r', ' ')
    # Replace multiple spaces with a single space
    text = re.sub(' +', ' ', text)
    return text

combined_df['speech_content'] = combined_df['speech_content'].astype(str).apply(clean_text)

# Count how many speech excerpts contain quotation marks
quotation_marks = ['"', "'", '“', '”', '‘', '’', '«', '»', '‹', '›']
quotation_pattern = '[' + ''.join(quotation_marks) + ']'
num_quotes = combined_df['speech_content'].str.contains(quotation_pattern).sum()
print("\nNumber of rows with quotation marks:", num_quotes)

# Filter rows based on max sentences and max characters
def count_sentences(text):
    # Count number of sentence-ending punctuation marks
    return len(re.findall(r'[.!?:]', text))

# Specify max sentences and max characters (adjust as needed)
max_sentences = None  
max_characters = None  

# Filter rows based on max sentences
if max_sentences is not None:
    before_sentences = combined_df.shape[0]
    combined_df = combined_df[combined_df['speech_content'].apply(count_sentences) <= max_sentences]
    after_sentences = combined_df.shape[0]
    rows_removed_max_sentences = before_sentences - after_sentences
    print("\nRows removed based on max sentences:", rows_removed_max_sentences)

# Filter rows based on max characters
if max_characters is not None:
    before_chars = combined_df.shape[0]
    combined_df = combined_df[combined_df['speech_content'].str.len() <= max_characters]
    after_chars = combined_df.shape[0]
    rows_removed_max_characters = before_chars - after_chars
    print("Rows removed based on max characters:", rows_removed_max_characters)

# Separate validation and training data
combined_df = combined_df.drop(columns=['check_col'])
validation_df = combined_df[combined_df['is_validation'] == True]
training_df = combined_df[combined_df['is_validation'] == False]
training_df = training_df.drop(columns=['is_validation'])
validation_df = validation_df.drop(columns=['is_validation'])

# Check for duplicates in validation data based on 'speech_id_long'
duplicates = validation_df[validation_df.duplicated(subset=['speech_id_long'], keep=False)]

num_duplicates = duplicates.shape[0]
print("\nNumber of duplicate entries in validation data:", num_duplicates)

# Remove duplicates
validation_df = validation_df.drop_duplicates(subset=['speech_id_long'])
training_df = training_df.reset_index(drop=True)
validation_df = validation_df.reset_index(drop=True)
print("\nFinal training data shape:", training_df.shape)
print("Final validation data shape:", validation_df.shape)

# Additional statistics
total_rows_final = training_df.shape[0] + validation_df.shape[0]
total_rows_removed = total_rows_initial - total_rows_final

print("\nTotal rows in initial data:", total_rows_initial)
print("Total rows after processing:", total_rows_final)
print("Total rows removed during processing:", total_rows_removed)

print("\nBreakdown of rows removed:")
print(" - Removed by checking column:", rows_removed_checking_col)
print(" - Removed due to 'neutral' conflicts:", rows_removed_neutral_conflict)
print(" - Removed by max sentences filter:", rows_removed_max_sentences)
print(" - Removed by max characters filter:", rows_removed_max_characters)
print(" - Removed duplicates in validation data:", num_duplicates)

# Optionally, save the final datasets
training_df.to_pickle('training_data.pkl')
validation_df.to_pickle('validation_data.pkl')

Number of rows removed using the checking column: 273

Counts of each label:
polarizing     392
populist       132
extremist        5
neutral       2024
dtype: int64

Co-occurrence of labels:
            polarizing  populist  extremist  neutral
polarizing           0        83          4        1
populist            83         0          2        1
extremist            4         2          0        0
neutral              1         1          0        0

Number of rows where 'neutral' is not exclusive with other labels: 2

Number of rows with quotation marks: 178

Number of duplicate entries in validation data: 79

Final training data shape: (1670, 7)
Final validation data shape: (766, 7)

Total rows in initial data: 2762
Total rows after processing: 2436
Total rows removed during processing: 326

Breakdown of rows removed:
 - Removed by checking column: 273
 - Removed due to 'neutral' conflicts: 2
 - Removed by max sentences filter: 0
 - Removed by max characters filter: 0
 - Removed d

In [9]:
training_df.head()

Unnamed: 0,speech_content,speech_id_long,polarizing,populist,extremist,neutral,source_file
0,Wir jedenfalls sagen: Diese Regierung nimmt di...,18_66_11004079_10,0,0,0,1,silja_02manual_pre_labeling_1610_final.xlsx
1,"Wäre es so, wie Sie schildern, würden sie das ...",18_161_11004135_4,0,0,0,1,silja_02manual_pre_labeling_1610_final.xlsx
2,"Der zwölfte Menschenrechtsbericht zeigt, dass ...",18_236_11003742_1,0,0,0,1,silja_02manual_pre_labeling_1610_final.xlsx
3,Wir wollen Menschen helfen und Fluchtursachen ...,19_179_11003614_3,1,0,0,0,silja_02manual_pre_labeling_1610_final.xlsx
4,Hierüber kann man reden. In der Sache muss ich...,19_59_11004155_2,0,0,0,1,silja_02manual_pre_labeling_1610_final.xlsx


In [10]:
validation_df.head()

Unnamed: 0,speech_content,speech_id_long,polarizing,populist,extremist,neutral,source_file
0,Seit der Rede des Bundeskanzlers ist das Wort ...,20_69_11004705_16,0,0,0,1,silja_05manual_pre_labeling_1610_validation_fi...
1,Aus diesem Grund begrüßen wir ausdrücklich die...,20_25_11005049_1,0,0,0,1,silja_05manual_pre_labeling_1610_validation_fi...
2,"Das Zweite, was Sie ansprechen, sind Veränderu...",20_90_999990119_2,0,0,0,1,silja_05manual_pre_labeling_1610_validation_fi...
3,"Ich bin aber der Überzeugung, dass jemand, der...",20_73_11004342_3,0,0,0,1,silja_05manual_pre_labeling_1610_validation_fi...
4,"also zum Beispiel die Geschlechterforschung, d...",20_114_11004041_3,0,0,0,1,silja_05manual_pre_labeling_1610_validation_fi...
