In [None]:
import pandas as pd
import os
import jsonlines
import re
# Report the current working directory; the Excel and JSONL file names used
# in this notebook are relative paths.
os.getcwd()

In [None]:
# Load the labelled training and validation speech tables from their Excel workbooks.
training_df, validation_df = (
    pd.read_excel(workbook)
    for workbook in (
        'training_labels_final_comments.xlsx',
        'validation_labels_final_comments.xlsx',
    )
)

In [None]:
# Draw a balanced training subset: the same number of speeches for each class
# (polarizing, neutral), sampled without replacement.
N_PER_CLASS = 824
# Fixed seed so that a kernel restart + "Run All" reproduces the same subset.
SAMPLE_SEED = 8

polarizing_1 = training_df[training_df['polarizing'] == 1].sample(
    n=N_PER_CLASS, replace=False, random_state=SAMPLE_SEED
)
neutral_1 = training_df[training_df['neutral'] == 1].sample(
    n=N_PER_CLASS, replace=False, random_state=SAMPLE_SEED
)

# Combine sampled data
sampled_df = pd.concat([polarizing_1, neutral_1])

# Shuffle the final sampled dataframe to randomize the order
sampled_df = sampled_df.sample(frac=1, random_state=SAMPLE_SEED).reset_index(drop=True)

In [None]:
def adjust_speech_properties(speech_id, speech_content, polarizing):
    """Build one chat-format fine-tuning record for a single speech.

    Parameters
    ----------
    speech_id
        Identifier of the speech; interpolated into the user prompt and
        echoed back in the expected assistant answer.
    speech_content
        Text of the speech; interpolated into the user prompt.
    polarizing
        Label for the speech; its string form becomes the ``"polarizing"``
        value of the expected assistant answer.

    Returns
    -------
    dict
        ``{"messages": [system, user, assistant]}`` where every message is a
        ``{"role": ..., "content": ...}`` dict. The assistant content is a
        compact JSON object string with the keys ``speech_id`` and
        ``polarizing``.
    """
    import json  # local import: keeps this cell runnable without touching the import cell

    system_prompt = "Label the provided parliamentary speeches of the German Bundestag as polarizing or neutral.\n\nIdentify if a given speech is politically polarizing or neutral. Take into account tone, subject, language used, as well as direct and indirect indications of polarization. Make a decision based on these factors and provide the appropriate label.\n\n# Steps\n\n1. Analyze the content of the provided speech, taking into account:\n    - The language and tone: Is it divisive or confrontational?\n    - Content that explicitly criticizes or alienates opposing parties.\n    - The promotion of an \"us vs. them\" narrative.\n\n2. Compare linguistic markers and content with patterns of polarising speech to assess if there is alignment with polarizing content, or if it resembles neutral and objective content.\n3. Provide a label:\n    - \"0\" or \"1\" whether the speech is polarizing\n4. Always base your conclusion only after critically considering each relevant element of the speech.\n\n# Output Format\n\nProvide your output in the following Json format:\n```json\n{\n  \"speech_id\": \"ID_placeholder\",\n  \"polarizing\": \"0 or 1\"\n}\n```\n\nEnsure your response includes appropriate format based on the requested format structure.\n\n# Notes\n\n- Be mindful of implicit division or subtle language elements that could indicate polarization.\n- Always justify the given labels, explicitly linking elements from speech to your decision-making criteria."

    # The user turn is plain text, so simple interpolation is sufficient here.
    user_prompt = f"ID: {speech_id}\nSpeech: \"{speech_content}\""

    # Serialise the expected answer with json.dumps instead of hand-built
    # string formatting, so quotes or backslashes in the id are escaped and
    # the target is always valid JSON. Compact separators and
    # ensure_ascii=False keep the output byte-identical to the previous
    # hand-written layout for ordinary ids.
    assistant_answer = json.dumps(
        {"speech_id": str(speech_id), "polarizing": str(polarizing)},
        ensure_ascii=False,
        separators=(",", ":"),
    )

    # The record is returned as a dict; serialisation to JSONL happens at write time.
    return {
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
            {"role": "assistant", "content": assistant_answer},
        ]
    }

In [None]:
# Turn every sampled training speech into one chat-format record, removing a
# single leading and/or trailing quotation mark from the speech text first.
jsons_training = [
    adjust_speech_properties(
        speech['speech_id_long'],
        re.sub(r'^[„“"]|[„“"]$', '', speech['speech_content']),
        speech['polarizing'],
    )
    for _idx, speech in sampled_df.iterrows()
]

In [None]:
# Persist the training records, one JSON object per line.
with jsonlines.open('training.jsonl', mode='w') as training_out:
    training_out.write_all(jsons_training)

In [None]:
# Turn every validation speech into one chat-format record, removing a single
# leading and/or trailing quotation mark from the speech text first.
jsons_validation = [
    adjust_speech_properties(
        speech['speech_id_long'],
        re.sub(r'^[„“"]|[„“"]$', '', speech['speech_content']),
        speech['polarizing'],
    )
    for _idx, speech in validation_df.iterrows()
]

In [None]:
# Persist the validation records, one JSON object per line.
# Iterate over jsons_validation: iterating over jsons_training here would make
# validation.jsonl a copy of the training data.
with jsonlines.open('validation.jsonl', mode='w') as writer:
    for item in jsons_validation:
        writer.write(item)