In [64]:
import hashlib
import itertools
import json
import os
import random
from time import sleep
from typing import List, Dict, Tuple

import requests
from tqdm import tqdm

In [65]:
# Target folder (relative to the notebook's working directory) that receives
# one JSON file per generated sample.
RAW_DATA_DIR: str = "./raw"

In [66]:
# experiment setup

integrations: List[dict] = [
    {
        "provider": "huggingFace",
        "model": "mistralai/Mistral-7B-Instruct-v0.2",
    },
    # disabling openai during first test
    # {
    #     "provider": "OpenAI",
    #     "model": "gpt-3.5-turbo"
    # }
]

personas: List[List[str]] = [
    ["liberal"],
    ["conservative"],
    ["alt_right"]
]
languages: List[str] = ["English", "German", "Dutch"]
platforms: List[str] = ["Twitter", "Reddit"]

# topics with subtopics
topics: List[Dict[str, str | List[str]]] = [
    {
        "theme": "Economy",
        "aspect": ["taxes", "inflation", "unemployment", "wages"]
    },
    {
        "theme": "Covid-19",
        "aspect": ["vaccines", "prevention", "government regulations"]
    },
    {
        "theme": "Ukraine War",
        "aspect": ["military conflict", "international stability", "energy prices"]
    },
    {
        "theme": "Healthcare",
        "aspect": ["affordability", "public and private options", "screenings and prevention", "medical research"]
    },
    {
        "theme": "Environment",
        "aspect": ["recycling", "energy consumption", "climate change"]
    }
]

# number of iterations
x: int = 1

# hidden parameters (randomly chosen)
length: List[str] = ['few-word', 'single-sentence', 'short', 'long']

In [67]:
# Build the full experiment grid: one tuple
# (integration, persona, language, platform, topic) per combination.
configurations: List[Tuple] = [
    *itertools.product(integrations, personas, languages, platforms, topics)
]

# Randomise the processing order in place.
random.shuffle(configurations)

In [68]:
# Query the generation API once per configuration (the whole grid is repeated
# `x` times) and store every generated text as its own JSON file inside
# RAW_DATA_DIR. The run stops completely on the first failed request.

# Upper bound in seconds for a single API call; without a timeout `requests`
# waits indefinitely on a stalled connection.
REQUEST_TIMEOUT: int = 120

# Create the output folder up front so that no API call is wasted on a sample
# that cannot be written afterwards.
os.makedirs(RAW_DATA_DIR, exist_ok=True)

# Set as soon as a request fails. A plain `break` only leaves the inner loop,
# so the flag is needed to skip the remaining iterations as well.
aborted: bool = False

for _ in range(x):
    for integration, persona, language, platform, topic in tqdm(configurations):

        # The aspect and the text length are drawn at random for every request.
        payload: dict = {
            "personas": persona,
            "integration": integration,
            "language": language,
            "platform": platform,
            "topic": f'{random.choice(topic["aspect"])} in view of {topic["theme"]}',
            "length": random.choice(length)
        }

        try:
            response = requests.post(
                'https://api.twon.uni-trier.de/generate/',
                json=payload,
                timeout=REQUEST_TIMEOUT
            )

        except requests.RequestException as e:
            # Connection problems and timeouts end the run the same way a
            # server-side error does.
            print("Request failed:", e)
            aborted = True
            break

        if response.status_code in (500, 502):
            print("500/502: Connection Error, too many request, try again later.")
            aborted = True
            break

        try:
            data: dict = response.json()

        except ValueError as e:
            # `Response.json()` signals an undecodable body with a ValueError
            # subclass.
            print(e, ':', response)
            aborted = True
            break

        # Guard against JSON answers that carry no generated text (for
        # example an error description) instead of failing with a KeyError.
        if not isinstance(data, dict) or "response" not in data:
            print("Unexpected API answer:", response, data)
            aborted = True
            break

        sample: dict = {
            "persona": persona[0],
            "model": integration["model"],
            "topic": topic["theme"],
            "platform": platform,
            "language": language,
            "text": data["response"],
            "annotation": {
                "topic": None,
                "persona": None,
                "authenticity": None
            }
        }

        # The id is a 24-byte SHAKE-256 digest (48 hex characters) of the
        # serialised sample, computed before the id itself is added.
        sample['id'] = hashlib.shake_256(str.encode(json.dumps(sample))).hexdigest(24)

        # The context manager closes the file even if writing fails.
        with open(f'{RAW_DATA_DIR}/{sample["id"]}.json', 'w', encoding='utf-8') as sample_file:
            sample_file.write(json.dumps(sample, indent=4))

        # Pause between requests (presumably to avoid overloading the API,
        # see the 500/502 message above).
        sleep(5)

    if aborted:
        break

 22%|██▏       | 20/90 [02:53<10:08,  8.69s/it]

500/502: Connection Error, too many request, try again later.



