In [1]:

import typing

import pandas as pd

import config
import src

import re
import json
import numpy as np
import requests
import tqdm

In [3]:
# Project configuration object; its `final_data_files` mapping is used by the
# data-loading cell to locate the input parquet file.
CFG = config.Config()

In [6]:
# --- Run parameters -------------------------------------------------------
# Topic name that is spliced into every prompt sent to the model.
TOPIC: str = "ukraine"
# Column the dataset is grouped by; each group key becomes the `label` column.
GROUPER: str = "topic"
# Language tag; appears in the output file-name template together with GROUPER.
LANGUAGE: str = "English"

# Regex capturing the text of one numbered list item ("1. some text\n").
# NOTE(review): not referenced by any cell visible here — confirm it is still needed.
EXTRACTOR: str = r"\d\.\s(.+)\n"

# NOTE(review): not referenced by any cell visible here — confirm it is still needed.
SAMPLE_SIZE: int = 100

In [4]:
# Look up the parquet path in the project config, then load it.
# NOTE(review): the recorded run of this cell raised KeyError: 'user_content' —
# confirm that this key exists in CFG.final_data_files.
user_content_path = CFG.final_data_files["user_content"]
dataset: pd.DataFrame = pd.read_parquet(user_content_path)
dataset

KeyError: 'user_content'

In [None]:
# Model identifier sent as the 'model' field of every inference request.
MODEL: str = "llama3:70b-instruct-q6_K"

In [None]:
# System prompt for claim mining. It is sent verbatim as the 'system' field of
# each request, so its wording and whitespace are part of the model input.
SYSTEM_claim: str = \
    """
        Instruction:

        You are a text annotation assistant. Analyze a collection of social media comments, enclosed in chevrons <..>. Identify and list the claims within these comments. Claims can be related to events, issues, opinions or concerns in relation to the specified topic.
        Claims are defined as the main assertion or conclusion of an argument.
        You summarize each claim into a short simple sentence.

        Response format:

        You provide only the list of claims, separated by commas, without any additional text or explanations. If no claims can be identified, return an empty list [].

        Response format template:
        
        ["claim 1", "claim 2", "claim 3"]
	"""

In [None]:
# Smoke test for generalized claim mining: send the first five posts in one
# request and display the model's raw 'response' text.
# NOTE(review): this cell reads dataset["content"] while the batch loop reads
# row["text"] — confirm which column actually holds the post text.
smoke_response = requests.post(
    'https://inf.cl.uni-trier.de/',
    json={
        'model': MODEL,
        'system': SYSTEM_claim,
        'prompt': (
            'The following set of social media posts are about '
            + TOPIC
            + f'. Check whether your answer only consists of a list of claims. \n"Posts":\n<{dataset["content"][:5].to_list()}>'
        ),
    },
    # (connect, read) timeouts in seconds; without a timeout requests waits
    # indefinitely on a stalled server.
    timeout=(10, 600),
)
# Fail with the HTTP status itself rather than with an opaque decode/KeyError.
smoke_response.raise_for_status()
smoke_response.json()['response']

In [None]:
# Apply generalized claim mining to the whole dataset: one request per row,
# grouped by GROUPER. Each successful request contributes a single-row frame
# holding the raw model output ('claims') and the group key ('label').
# Rows whose response cannot be decoded, or whose request times out, are
# reported and skipped; other network errors propagate.
chunked_result: typing.List[pd.DataFrame] = []
for label, group in dataset.groupby(GROUPER):
    # iterrows() is a generator with no length, so tqdm needs `total`
    # to render a progress bar and ETA.
    for _, row in tqdm.tqdm(group.iterrows(), total=len(group), desc=str(label)):
        try:
            claims = requests.post(
                'https://inf.cl.uni-trier.de/',
                json={
                    'model': MODEL,
                    'system': SYSTEM_claim,
                    'prompt': (
                        'The following set of social media posts are about '
                        + TOPIC
                        + f'. Check whether your answer strictly adheres to the specified format. \n"Posts":\n<{row["text"]}>'
                    ),
                },
                # (connect, read) timeouts in seconds; without a timeout a
                # stalled server would block the loop forever.
                timeout=(10, 600),
            ).json()['response']
        except requests.exceptions.Timeout:
            print("request timed out, skipping to next batch")
            continue
        except ValueError:
            # json.JSONDecodeError, simplejson's decode error and requests'
            # own JSONDecodeError are all ValueError subclasses, so this
            # catches a bad body regardless of which JSON backend is in use.
            print("invalid json response, skipping to next batch")
            continue

        chunked_result.append(
            pd.DataFrame(data=[claims], columns=['claims']).assign(label=label)
        )

In [None]:
# Stack the per-row result frames into one table with a fresh running index,
# then print its plain-text representation.
claim_df = pd.concat(objs=chunked_result, axis=0, ignore_index=True)
print(claim_df)

In [None]:
# Persist the mined claims as JSON (records) and as parquet.
# The paths must be f-strings: without the prefix the files were written to the
# literal name 'claims.by.{GROUPER}.{LANGUAGE}.*' instead of using the values
# of GROUPER and LANGUAGE.
output_stem = f'data/claims.by.{GROUPER}.{LANGUAGE}'
claim_df.to_json(f'{output_stem}.json', orient="records", force_ascii=False, indent=4)
claim_df.to_parquet(f'{output_stem}.parquet')