In [50]:
import re
import typing
import json

import numpy as np
import pandas as pd
import requests
import tqdm

In [51]:
# --- Input data -------------------------------------------------------------
# The dataset is read from f'{DATA_PATH}.{DATA_SUBSET}.parquet'.
DATA_PATH: str = './dataset'
DATA_SUBSET: str = 'full'

# --- Selection and grouping -------------------------------------------------
# Only rows whose 'language' / 'topic' columns equal these values are kept.
LANGUAGE: str = 'German'
TOPIC: str = 'ukraine'
# Column used to group the posts; also part of the exported file names.
GROUPER: str = 'persona'
# Captures the text of each numbered list item ("1. some argument") in the
# model response. An item is terminated by a newline OR by the end of the
# response, so the last item is not lost when the response has no trailing
# newline.
EXTRACTOR: str = r'\d\.\s(.+)(?:\n|$)'

# --- Batching ---------------------------------------------------------------
# The size of each group is divided by this value to get the number of
# request batches for that group.
SAMPLE_SIZE: int = 100

# --- Model ------------------------------------------------------------------
# Sent as the 'model' field of every request.
MODEL: str = 'mixtral:8x7b-instruct-v0.1-q6_K'  # 'llama3:70b-instruct-q6_K', 'mixtral:8x7b-instruct-v0.1-q6_K'

In [52]:
# System prompt sent as the 'system' field of every request. It states the
# task and gives one worked example: a list of posts followed by the list of
# arguments retrieved from them.
# The whitespace inside the literal (mixed tabs and spaces) is part of the
# prompt text that is sent, so it is left exactly as is.
# NOTE(review): the example shows the arguments as a bracketed list of quoted
# strings, while EXTRACTOR matches numbered lines ("1. ...") - confirm which
# format the model actually returns.
SYSTEM: str = \
    """
	You retrieve the arguments used in lists of social media posts relating to a certain topic, list as many arguments as appropriate. You reply with a list of arguments in the list format specified below and nothing else.
	
	Posts:
	[
        "Sad to see Ukraine's energy infrastructure under attack while the West twiddles its thumbs on providing critical military aid - time to put America first and focus on our own national security!", 
        "It's unconscionable that the US House of Representatives is dragging their feet on passing a Ukraine aid bill. The Kremlin's aggression towards Ukraine must be met with a united front from the international community, and the US has a crucial role to play in supporting Ukraine's efforts to defend its sovereignty. Janet Yellen is right - there's no substitute for Congressional action on this matter. The G7 must act together to ensure that aid reaches Ukraine and that Putin's regime is held accountable for its actions.", 
        "📢 US Treasury Sec Yellen urges Congress to pass Ukraine aid bill, stating there's 'no substitute' for our support in this critical time. Let's stand together with Ukraine, managing risks as a united G7 to uphold human rights, peace, and democracy worldwide! 🌍🇺🇦", "Standing with Ukraine 🇺🇦, we must urgently pass the aid bill. The time for debate is over; it's time for action. Our unity & support are crucial for justice and peace. #StandWithUkraine 🌍💪"
	]
	
	Arguments:
	[
        "Ukraine's energy infrastructure is under attack", 
        "The West, particularly the US, should provide critical military aid to Ukraine", 
        "The US should prioritize its national security and take action in supporting Ukraine", 
        "The US House of Representatives is delaying the passage of a Ukraine aid bill", 
        "The Kremlin's aggression towards Ukraine must be met with a united front from the international community", 
        "The US has a crucial role to play in supporting Ukraine's efforts to defend its sovereignty", 
        "There is no substitute for Congressional action on this matter", 
        "US Treasury Secretary Janet Yellen urges Congress to pass the Ukraine aid bill", 
        "The G7 must act together to ensure that aid reaches Ukraine and that Putin's regime is held accountable for its actions", 
        "It's time for the US to stand together with Ukraine, managing risks as a united G7 to uphold human rights, peace, and democracy worldwide", 
        "The aid bill must be passed urgently", "The time for debate is over; it's time for action", 
        "Unity and support from the US and the international community are crucial for justice and peace in Ukraine"
	]
	"""

In [53]:
# Read the configured parquet subset and keep only the rows that match both
# the configured language and the configured topic.
data: pd.DataFrame = pd.read_parquet(f'{DATA_PATH}.{DATA_SUBSET}.parquet').loc[
    lambda posts: (posts['language'] == LANGUAGE) & (posts['topic'] == TOPIC)
]
data

Unnamed: 0_level_0,persona,model,topic,language,text,retrieved_source,annotation.topic,annotation.persona,annotation.authenticity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
e905c64ed39ce239700abf6ee3d73fb9a75b1917ce27268d,neutral,mixtral:8x7b-instruct-v0.1-q6_K,ukraine,German,Die Stabilität der Welt hängt auch von der Lös...,,,,
0d79f1537c2efb299b37084db31eb912ca551e3e4f94d9ec,conservative,mixtral:8x7b-instruct-v0.1-q6_K,ukraine,German,Unterstützen wir weiterhin Ukraine in ihrem Ka...,https://www.gov.uk/government/news/foreign-sec...,,,
87c70235e5ad0c1005636a4ab2e0b389e2c81b87a00fa347,liberal,llama2:70b-chat-q6_K,ukraine,German,@Liberaler_Insgesamt🚨Die Energiepreise in Euro...,https://www.dynamikeseidhseis.gr/en/european-e...,,,
40bf732f9e96cfadf214bd969dd0e98b0c7975a3344ad7b8,conservative,mixtral:8x7b-instruct-v0.1-q6_K,ukraine,German,Staatliche Eingriffe in Wirtschaft und Außenpo...,https://www.ft.com/content/98f15b60-bc4d-4d3c-...,,,
96d620855906a606baf62736f84e4f9e3970ce56c5db1e85,alt_right,mixtral:8x7b-instruct-v0.1-q6_K,ukraine,German,Ukraine standhaft trotz übermächtiger russisch...,https://www.voanews.com/a/at-least-50-000-russ...,,,
...,...,...,...,...,...,...,...,...,...
fb762108836c05d7195f810c89440d16e4653f3070366c06,alt_right,llama2:70b-chat-q6_K,ukraine,German,It's a joke how the EU pretends to be concerne...,https://www.euronews.com/2024/04/13/ukraine-wa...,,,
d691424c960b13a040b1619b4d0220a0f84026a9068423ac,neutral,llama2:70b-chat-q6_K,ukraine,German,Ukraine's struggle against Russia's aggression...,https://www.politico.eu/article/why-ukraine-lo...,,,
45a4fe47837c350b488be2993cdb737796b820aa5c93c33d,alt_right,mixtral:8x7b-instruct-v0.1-q6_K,ukraine,German,Die Ukraine spielt eine zentrale Rolle für die...,,,,
44f506523ed36bfb80e8f20912058dc79855267447f13ada,liberal,llama2:70b-chat-q6_K,ukraine,German,@Liberal_Voice: Day 783 of the Russia-Ukraine ...,https://www.aljazeera.com/news/2024/4/17/russi...,,,


In [54]:
# Accumulator for the per-batch result frames: the request loop appends one
# DataFrame per successfully processed batch, and the export cell
# concatenates them into a single table.
chunked_result: typing.List[pd.DataFrame] = []

In [55]:
# Retrieve the arguments for every group of posts.
#
# Each group (one per distinct GROUPER value) is split into batches, every
# batch is sent to the inference endpoint together with the SYSTEM prompt,
# and the list items found in the reply are stored as one DataFrame per batch
# (columns: 'arguments', 'label') in chunked_result. A batch whose request
# fails is reported and skipped; the remaining batches are still processed.

# Compile the extraction pattern once; it does not change between batches.
argument_pattern = re.compile(EXTRACTOR)

# Upper bound (seconds) for a single request so that an unresponsive server
# cannot stall the cell indefinitely. Generous on purpose: single batches were
# observed to take about a minute.
REQUEST_TIMEOUT: float = 600.0

# Empty the accumulator in place so that re-running this cell does not append
# a second copy of every batch.
chunked_result.clear()

for label, group in data.groupby(GROUPER):
    # Same section count as the truncated len(group) / SAMPLE_SIZE, but clamped
    # to at least one: np.array_split rejects a section count of 0, which is
    # what groups smaller than SAMPLE_SIZE used to produce.
    n_batches: int = max(1, len(group) // SAMPLE_SIZE)

    # Split the row positions rather than the frame itself and slice with
    # .iloc; this yields the same batches without handing a DataFrame to numpy.
    for positions in tqdm.tqdm(np.array_split(np.arange(len(group)), n_batches)):
        chunk = group.iloc[positions]

        try:
            reply = requests.post(
                'https://inf.cl.uni-trier.de/',
                json={
                    'model': MODEL,
                    'system': SYSTEM,
                    'prompt': f'Posts:\n{chunk["text"].tolist()}\n\nArguments:\n'
                },
                timeout=REQUEST_TIMEOUT,
            )
            # Turn HTTP error statuses into exceptions instead of trying to
            # parse an error page as a model reply.
            reply.raise_for_status()
            response_text = reply.json()['response']

        except json.JSONDecodeError:
            # Must precede the RequestException handler: recent versions of
            # requests raise a JSON error that is a subclass of both.
            print("invalid json response, skipping to next batch")
            continue

        except requests.RequestException as error:
            # Timeouts, connection failures and HTTP error statuses.
            print(f"request failed ({error}), skipping to next batch")
            continue

        except KeyError:
            # Valid JSON that does not carry the expected 'response' field.
            print("reply has no 'response' field, skipping to next batch")
            continue

        chunked_result.append(
            pd.DataFrame(
                data=[
                    match.group(1)
                    for match in argument_pattern.finditer(response_text)
                ],
                columns=['arguments']
            )
            .assign(label=label)
        )



  0%|          | 0/3 [00:00<?, ?it/s][A[A

 33%|███▎      | 1/3 [00:13<00:27, 13.53s/it][A[A

 67%|██████▋   | 2/3 [00:38<00:20, 20.23s/it][A[A

100%|██████████| 3/3 [01:38<00:00, 32.85s/it][A[A


invalid json response, skipping to next batch




  0%|          | 0/3 [00:00<?, ?it/s][A[A

 33%|███▎      | 1/3 [01:00<02:00, 60.07s/it][A[A

invalid json response, skipping to next batch




 67%|██████▋   | 2/3 [01:59<00:59, 59.54s/it][A[A

100%|██████████| 3/3 [02:08<00:00, 42.82s/it][A[A


  0%|          | 0/3 [00:00<?, ?it/s][A[A

 33%|███▎      | 1/3 [00:14<00:28, 14.15s/it][A[A

 67%|██████▋   | 2/3 [00:33<00:17, 17.29s/it][A[A

100%|██████████| 3/3 [00:47<00:00, 15.76s/it][A[A


  0%|          | 0/3 [00:00<?, ?it/s][A[A

 33%|███▎      | 1/3 [00:23<00:47, 23.90s/it][A[A

 67%|██████▋   | 2/3 [00:45<00:22, 22.53s/it][A[A

100%|██████████| 3/3 [01:26<00:00, 28.81s/it][A[A

invalid json response, skipping to next batch





In [58]:
# Stack all per-batch frames into one table with a fresh consecutive index.
argument_df = pd.concat(objs=chunked_result, ignore_index=True)

# Export the combined table twice, as parquet and as CSV; both file names
# carry the grouping column.
argument_df.to_parquet(path=f'arguments.by.{GROUPER}.parquet')
argument_df.to_csv(path_or_buf=f'arguments.by.{GROUPER}.csv')