In [1]:
import re
import typing
import json

import numpy as np
import pandas as pd
import requests
import tqdm

In [87]:
DATA_PATH: str = './dataset'
DATA_SUBSET: str = 'full'

LANGUAGE: str = 'English'
TOPIC: str = 'ukraine'
GROUPER: str = 'retrieved_source'
EXTRACTOR: str = r'\d\.\s(.+)\n'

SAMPLE_SIZE: int = 100

MODEL: str = 'mixtral:8x7b-instruct-v0.1-q6_K'  # 'llama3:70b-instruct-q6_K', 'mixtral:8x7b-instruct-v0.1-q6_K'

In [3]:
SYSTEM: str = \
    """
        You are a helpful assistant that retrieves lists of arguments contained in sets of social media posts relating to a certain topic, list as many arguments as appropriate. Your reply consists of nothing but a list of arguments and follows the list format specified below.
        
        Example input:
        Please find arguments within the following set of social media posts about ukraine:
	"Posts":
	[
        "Sad to see Ukraine's energy infrastructure under attack while the West twiddles its thumbs on providing critical military aid - time to put America first and focus on our own national security!", 
        "It's unconscionable that the US House of Representatives is dragging their feet on passing a Ukraine aid bill. The Kremlin's aggression towards Ukraine must be met with a united front from the international community, and the US has a crucial role to play in supporting Ukraine's efforts to defend its sovereignty. Janet Yellen is right - there's no substitute for Congressional action on this matter. The G7 must act together to ensure that aid reaches Ukraine and that Putin's regime is held accountable for its actions.", 
        "📢 US Treasury Sec Yellen urges Congress to pass Ukraine aid bill, stating there's 'no substitute' for our support in this critical time. Let's stand together with Ukraine, managing risks as a united G7 to uphold human rights, peace, and democracy worldwide! 🌍🇺🇦", "Standing with Ukraine 🇺🇦, we must urgently pass the aid bill. The time for debate is over; it's time for action. Our unity & support are crucial for justice and peace. #StandWithUkraine 🌍💪"
	]
	
	Example output:
	"Ukraine's energy infrastructure is under attack", 
        "The West, particularly the US, should provide critical military aid to Ukraine", 
        "The US should prioritize its national security and take action in supporting Ukraine", 
        "The US House of Representatives is delaying the passage of a Ukraine aid bill", 
        "The Kremlin's aggression towards Ukraine must be met with a united front from the international community", 
        "The US has a crucial role to play in supporting Ukraine's efforts to defend its sovereignty", 
        "There is no substitute for Congressional action on this matter", 
        "US Treasury Secretary Janet Yellen urges Congress to pass the Ukraine aid bill", 
        "The G7 must act together to ensure that aid reaches Ukraine and that Putin's regime is held accountable for its actions", 
        "It's time for the US to stand together with Ukraine, managing risks as a united G7 to uphold human rights, peace, and democracy worldwide", 
        "The aid bill must be passed urgently", "The time for debate is over; it's time for action", 
        "Unity and support from the US and the international community are crucial for justice and peace in Ukraine"
	
	"""

In [4]:
data: pd.DataFrame = (
    pd.read_parquet(f'{DATA_PATH}.{DATA_SUBSET}.parquet')
    .pipe(lambda _df: _df[_df['language'] == LANGUAGE])
    .pipe(lambda _df: _df[_df['topic'] == TOPIC])
)
data

Unnamed: 0_level_0,persona,model,topic,language,text,retrieved_source,annotation.topic,annotation.persona,annotation.authenticity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
c0e20a320406ea9273e4b8a5f9d5d6a93e21e110c8bf77ae,alt_right,mixtral:8x7b-instruct-v0.1-q6_K,ukraine,English,Sad to see Ukraine's energy infrastructure und...,https://oilprice.com/Energy/Energy-General/Rus...,,,
695b60f976c5dad0e0852181c0f0e04cce6ec93401b91a9a,neutral,mixtral:8x7b-instruct-v0.1-q6_K,ukraine,English,China's Xi proposes 4 priorities for Ukraine. ...,https://www.cnbc.com/2024/04/16/ukraine-war-li...,,,
7377850436bd8d2889e0f2037e2ecc379c9a5b821d60e5ec,liberal,mixtral:8x7b-instruct-v0.1-q6_K,ukraine,English,Absolutely! It's time for NATO to adopt a forw...,https://en.interfax.com.ua/news/general/980528...,,,
40e120d328fc13049d591f1d7e7655098de034ae5197e26b,liberal,llama2:70b-chat-q6_K,ukraine,English,Devastating news out of Ukraine. The destroyed...,https://www.vaticannews.va/en/world/news/2024-...,,,
0e8c31e5ef8b359ee97a368a10491c2753f1c2c1e05dedb2,liberal,mixtral:8x7b-instruct-v0.1-q6_K,ukraine,English,A year has passed since Sudan was plunged into...,https://www.washingtonpost.com/world/2024/04/1...,,,
...,...,...,...,...,...,...,...,...,...
1fc8d275bcb5a157c37ffcee21c0ada691201790221f5b93,liberal,llama2:70b-chat-q6_K,ukraine,English,Heartbreaking news from Ukraine as Russian dou...,https://www.bbc.com/news/world-europe-68761490,,,
3fcc2434d2b22c6e576947a6ef089f35dcad17ad667cea21,conservative,mixtral:8x7b-instruct-v0.1-q6_K,ukraine,English,"Impressive! Amidst conflict, Ukraine's economy...",https://www.atlanticcouncil.org/blogs/ukrainea...,,,
72102782412f624afc6b8d23223c27c3669f94ccee0f1b3d,liberal,llama2:70b-chat-q6_K,ukraine,English,The war in Ukraine is not just a local conflic...,https://www.express.co.uk/news/world/1888999/r...,,,
590bfa3fb88d60a32531d1377a960451c88cf2d9ad288a2e,alt_right,mixtral:8x7b-instruct-v0.1-q6_K,ukraine,English,"China's Xi proposes his ""solution"" to the manu...",https://www.cnbc.com/2024/04/16/ukraine-war-li...,,,


In [93]:
data['retrieved_source'] = data['retrieved_source'].astype('category')


In [5]:
data.language.value_counts()

English    1281
Dutch         0
German        0
Name: language, dtype: int64

In [114]:
chunked_result: typing.List[pd.DataFrame] = []

In [104]:
# Count the occurrences of each value in the column
counts = data['retrieved_source'].value_counts()

# Filter the DataFrame based on the count condition
threshold = 40
selected_values = counts[counts > threshold].index

# Select rows where the value in 'column_name' occurs more than 40 times
selected_rows = data[data['retrieved_source'].isin(selected_values)]

print(selected_rows)

                                                       persona  \
id                                                               
c0e20a320406ea9273e4b8a5f9d5d6a93e21e110c8bf77ae     alt_right   
695b60f976c5dad0e0852181c0f0e04cce6ec93401b91a9a       neutral   
7377850436bd8d2889e0f2037e2ecc379c9a5b821d60e5ec       liberal   
0e8c31e5ef8b359ee97a368a10491c2753f1c2c1e05dedb2       liberal   
d50cc14d1059eff7cc37b79c515781cc9479d263adbe15aa       liberal   
...                                                        ...   
4f6dd6db227e435d04d4bb0a7d78f96e579bbafa7bf6ade4     alt_right   
504b513e984b6edf9fda4200935b949388feb3c493c742f7     alt_right   
98b8081fcdabc88329b2ea8ffb0ff57a686349ede9fea035       liberal   
431218579a5e305acadcf2139060f4f5198b3f307cd20964  conservative   
590bfa3fb88d60a32531d1377a960451c88cf2d9ad288a2e     alt_right   

                                                                            model  \
id                                                      

In [115]:
for label, group in selected_rows.groupby(GROUPER):
    
        try: 
            chunked_result.append(
                pd.DataFrame(
                    data=[
                        requests.post(
                            'https://inf.cl.uni-trier.de/',
                            json={
                                'model': MODEL,
                                'system': SYSTEM,
                                'prompt': f'Please list the arguments within the following set of social media posts about'
                                        + TOPIC + 
                                        '.\nDo not add an introduction or qualification. The output will be machine processed so stick to the prescribed list format.\n"Posts":\n{group["text"].tolist()}'
                                }).json()['response']                       
                    ],
                    columns=['arguments']
                )
                .assign(label=label)
            )
            
        except json.JSONDecodeError:
            print("invalid json response, skipping to next batch")

In [None]:
for label, group in data.groupby(GROUPER):
    for chunk in tqdm.tqdm(np.array_split(group, len(group) / SAMPLE_SIZE)):
        
        try: 
            chunked_result.append(
                pd.DataFrame(
                    data=[
                        requests.post(
                            'https://inf.cl.uni-trier.de/',
                            json={
                                'model': MODEL,
                                'system': SYSTEM,
                                'prompt': f'Please list the arguments within the following set of social media posts about'
                                        + TOPIC + 
                                        '.\nDo not add an introduction or qualification. The output will be machine processed so stick to the prescribed list format.\n"Posts":\n{chunk["text"].tolist()}'
                                }).json()['response']                       
                    ],
                    columns=['arguments']
                )
                .assign(label=label)
            )
            
        except json.JSONDecodeError:
            print("invalid json response, skipping to next batch")
    

In [117]:
chunked_result


[                                           arguments  \
 0   * Continued support for Ukraine is crucial fo...   
 
                                                label  
 0  https://cepa.org/article/europe-slumbers-at-uk...  ,
                                            arguments  \
 0   "The US should provide Ukraine with military ...   
 
                                                label  
 0  https://dppa.un.org/en/mtg-sc-9600-asg-jenca-u...  ,
                                            arguments  \
 0   "Russian aggression towards Ukraine is a thre...   
 
                                                label  
 0  https://en.interfax.com.ua/news/general/980528...  ,
                                            arguments  \
 0   * Continued Russian aggression towards Ukrain...   
 
                                                label  
 0  https://news.sky.com/story/are-we-heading-for-...  ,
                                            arguments  \
 0   * Continued Russian ag

In [118]:
argument_df = pd.concat(chunked_result, ignore_index=True)
print(argument_df)


                                            arguments  \
0    * Continued support for Ukraine is crucial fo...   
1    "The US should provide Ukraine with military ...   
2    "Russian aggression towards Ukraine is a thre...   
3    * Continued Russian aggression towards Ukrain...   
4    * Continued Russian aggression towards Ukrain...   
..                                                ...   
90   "Russia's aggression towards Ukraine must be ...   
91   * Continued Russian aggression towards Ukrain...   
92   "Russian military buildup on Ukraine's border...   
93   * Russia's actions towards Ukraine are a thre...   
94   "Ukraine's right to self-defense against exte...   

                                                label  
0   https://cepa.org/article/europe-slumbers-at-uk...  
1   https://dppa.un.org/en/mtg-sc-9600-asg-jenca-u...  
2   https://en.interfax.com.ua/news/general/980528...  
3   https://news.sky.com/story/are-we-heading-for-...  
4   https://news.sky.com/story/russ

In [119]:
argument_df['arguments'] = argument_df['arguments'].str.split(',')
argument_df = argument_df.explode('arguments')
print(argument_df['arguments'])

                                            arguments  \
0    * Continued support for Ukraine is crucial fo...   
0                                          not debate   
0    in supporting Ukraine\n* Unity and urgency ar...   
1    "The US should provide Ukraine with military ...   
1   \n"Ukraine needs support from the internationa...   
..                                                ...   
94    \n"Standing with Ukraine requires urgent action   
94                    including passing the aid bill"   
94        \n"Debate on the matter should be concluded   
94            and action should be taken immediately"   
94   \n"Unity and support from the US and internat...   

                                                label  
0   https://cepa.org/article/europe-slumbers-at-uk...  
0   https://cepa.org/article/europe-slumbers-at-uk...  
0   https://cepa.org/article/europe-slumbers-at-uk...  
1   https://dppa.un.org/en/mtg-sc-9600-asg-jenca-u...  
1   https://dppa.un.org/en/mtg-sc-9

In [121]:
argument_df['arguments'] = argument_df['arguments'].str.replace('\n|\*', '')
print(argument_df['arguments'])

0       Continued support for Ukraine is crucial for...
0                                            not debate
0      in supporting Ukraine Unity and urgency are n...
1      "The US should provide Ukraine with military ...
1     "Ukraine needs support from the international ...
                            ...                        
94        "Standing with Ukraine requires urgent action
94                      including passing the aid bill"
94            "Debate on the matter should be concluded
94              and action should be taken immediately"
94     "Unity and support from the US and internatio...
Name: arguments, Length: 809, dtype: object


  argument_df['arguments'] = argument_df['arguments'].str.replace('\n|\*', '')


In [122]:
argument_df['arguments'] = argument_df['arguments'].str.split(',')
argument_df = argument_df.explode('arguments')
argument_df['arguments'] = argument_df['arguments'].str.replace('\n|\*', '')
argument_df = argument_df.reset_index(drop=True)
argument_df = argument_df.drop_duplicates()
print(argument_df)

                                             arguments  \
0      Continued support for Ukraine is crucial for...   
1                                           not debate   
2     in supporting Ukraine Unity and urgency are n...   
3     "The US should provide Ukraine with military ...   
4    "Ukraine needs support from the international ...   
..                                                 ...   
804      "Standing with Ukraine requires urgent action   
805                    including passing the aid bill"   
806          "Debate on the matter should be concluded   
807            and action should be taken immediately"   
808   "Unity and support from the US and internatio...   

                                                 label  
0    https://cepa.org/article/europe-slumbers-at-uk...  
1    https://cepa.org/article/europe-slumbers-at-uk...  
2    https://cepa.org/article/europe-slumbers-at-uk...  
3    https://dppa.un.org/en/mtg-sc-9600-asg-jenca-u...  
4    https://dppa.

In [123]:
argument_df.to_parquet(f'arguments.by.{GROUPER}.{LANGUAGE}.parquet')
argument_df.to_csv(f'arguments.by.{GROUPER}.{LANGUAGE}.csv')


In [None]:
for label, group in data.groupby(GROUPER):
    for chunk in tqdm.tqdm(np.array_split(group, len(group) / SAMPLE_SIZE)):
        print(
            requests.post(
                                    'https://inf.cl.uni-trier.de/',
                                    json={
                                        'model': MODEL,
                                        'system': SYSTEM,
                                        'prompt': f'Please list the arguments within the following set of social media posts about'
                                        + TOPIC + 
                                        '.\nDo not add an introduction or qualification. The output will be machine processed so stick to the prescribed output format.\n"Posts":\n{chunk["text"].tolist()}'
                                    }
                                ).json()['response']
        )
        break

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:07<?, ?it/s]


 "Ukraine's energy infrastructure is at risk and needs protection", 
"The West should provide military aid to Ukraine for its defense", 
"The US should prioritize national security while supporting Ukraine", 
"The US House of Representatives is delaying the passage of a Ukraine aid bill", 
"International unity is necessary to counter the Kremlin's aggression towards Ukraine", 
"The US has a significant role in helping Ukraine defend its sovereignty", 
"Congressional action is essential for supporting Ukraine", 
"US Treasury Secretary Janet Yellen urges Congress to pass the Ukraine aid bill", 
"The G7 should act together to ensure that aid reaches Ukraine and hold Putin's regime accountable", 
"Immediate action and unity from the US and global community are vital for upholding human rights, peace, and democracy worldwide", 
"Passing the aid bill is urgent for justice and peace in Ukraine".


  0%|          | 0/3 [00:08<?, ?it/s]


 "Ukraine is experiencing cyber attacks on its energy infrastructure", 
"The international community should provide critical military aid to Ukraine", 
"The US should prioritize its national security and take action in supporting Ukraine", 
"The US House of Representatives is delaying the passage of a Ukraine aid bill", 
"The Kremlin's aggression towards Ukraine must be met with a united front from the international community", 
"The US has a crucial role to play in supporting Ukraine's efforts to defend its sovereignty", 
"There is no substitute for Congressional action on this matter", 
"US Treasury Secretary Janet Yellen urges Congress to pass the Ukraine aid bill", 
"The G7 must act together to ensure that aid reaches Ukraine and that Putin's regime is held accountable for its actions", 
"The US should stand together with Ukraine, managing risks as a united G7 to uphold human rights, peace, and democracy worldwide", 
"The aid bill must be passed urgently", 
"Unity and support from 

  0%|          | 0/3 [00:05<?, ?it/s]


 "The Russian military buildup near Ukraine is a threat to peace and stability in the region", 
"NATO should increase its presence and support for Eastern European countries, including Ukraine, to deter Russian aggression", 
"The US has an important role to play in supporting Ukraine's sovereignty and territorial integrity", 
"International sanctions against Russia are necessary to hold them accountable for their actions in Ukraine", 
"Political and economic support from the international community is crucial for Ukraine's stability and development", 
"The Russian annexation of Crimea and ongoing conflict in eastern Ukraine violates international law and must be condemned", 
"Ukraine has the right to defend itself against external threats, including through military means", 
"A diplomatic solution is needed to resolve the conflict in eastern Ukraine and ensure long-term peace and stability in the region".


  0%|          | 0/3 [00:14<?, ?it/s]

 "The Nord Stream 2 pipeline will increase Europe's dependence on Russian gas", 
"Ukraine has been a reliable transit country for Russian gas to Europe and should continue to play this role", 
"The US should impose sanctions on the Nord Stream 2 pipeline to protect Ukraine's interests", 
"Russian aggression towards Ukraine is a threat to European security", 
"The EU should diversify its energy sources to reduce dependence on Russian gas", 
"Germany's support for the Nord Stream 2 pipeline undermines Ukraine's sovereignty and security", 
"The US and EU should work together to counter Russian aggression in Ukraine", 
"Nord Stream 2 will make it harder for Ukraine to negotiate favorable gas transit deals with Russia", 
"Russia has a history of using energy exports as a political weapon", 
"The Nord Stream 2 pipeline is a geopolitical project that benefits Russia at the expense of Ukraine and Europe", 
"Ukraine's economy suffers when it loses gas transit fees from Russia", 
"The US should 




In [None]:
for label, group in data.groupby(GROUPER):
    for chunk in tqdm.tqdm(np.array_split(group, len(group) / SAMPLE_SIZE)):
        
        try: 
            chunked_result.append(
                pd.DataFrame(
                    data=[
                        match.group(1) for match in
                        (
                            re.compile(EXTRACTOR)
                            .finditer(
                                requests.post(
                                    'https://inf.cl.uni-trier.de/',
                                    json={
                                        'model': MODEL,
                                        'system': SYSTEM,
                                        'prompt': f'Please list the arguments within the following set of social media posts about' + TOPIC + '.\nDo not add an introduction or qualification.\n"Posts":\n{chunk["text"].tolist()}'                                    }
                                ).json()['response']
                            )
                        )
                    ],
                    columns=['arguments']
                )
                .assign(label=label)
            )
            break    
        except json.JSONDecodeError:
            print("invalid json response, skipping to next batch")

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:06<?, ?it/s]
  0%|          | 0/3 [00:08<?, ?it/s]
  0%|          | 0/3 [00:07<?, ?it/s]
  0%|          | 0/3 [00:06<?, ?it/s]
