In [1]:
import concurrent.futures

from dotenv import load_dotenv
from openai import OpenAI
from tqdm import tqdm

# Load environment configuration before the API client is constructed.
# NOTE(review): presumably this supplies the OpenAI API key from a local .env file — confirm.
load_dotenv()
# Shared client object; process_gpt_request sends every request through it.
client = OpenAI()

In [None]:
import random

# Vocabulary for the synthetic headline templates. generate_headlines draws one
# entry from each of the four lists and joins them as
# "<subject> <verb> <object> <context>".

# Who the headline is about.
subjects = [
    "Government",
    "Tech Company",
    "Celebrity",
    "Startup",
    "Research Team",
    "Local Resident",
    "Police",
    "Investor",
]
# What the subject does (present tense, so it reads naturally after a subject).
verbs = [
    "announces",
    "investigates",
    "launches",
    "reveals",
    "warns about",
    "plans",
    "criticizes",
    "embraces",
]
# The thing the action is directed at.
objects = [
    "new policy",
    "AI breakthrough",
    "security flaw",
    "data leak",
    "controversial statement",
    "market crash",
    "climate report",
    "merger deal",
    "tax reform",
    "cyberattack",
    "digital currency",
    "vaccine trial",
    "space mission",
    "privacy concern",
    "natural disaster",
    "trade agreement",
    "immigration plan",
    "education reform",
    "job cuts",
    "stock surge",
    "interest rate hike",
    "housing crisis",
    "energy shortage",
    "drought warning",
    "public protest",
    "military operation",
    "court ruling",
    "pandemic update",
    "inflation spike",
    "budget proposal",
    "surveillance program",
    "AI regulation",
]
# Trailing clause that sets the circumstances of the event.
contexts = [
    "amid rising tensions",
    "after public backlash",
    "during global summit",
    "in unexpected move",
    "with bipartisan support",
    "sparking debate",
    "to curb inflation",
    "ahead of elections",
]


def generate_headlines(n=1000, seed=None):
    """Return ``n`` distinct template headlines.

    Each headline is built as "<subject> <verb> <object> <context>" from one
    random entry of each module-level vocabulary list (``subjects``, ``verbs``,
    ``objects``, ``contexts``).

    Args:
        n: Number of unique headlines to produce. Values <= 0 yield an empty list.
        seed: Optional seed for a private random generator. With a seed, the
            same call returns the same headlines in the same order. With
            ``None`` (the default) the module-level ``random`` state is used.

    Returns:
        A list of ``n`` unique headline strings, in the order they were drawn.

    Raises:
        ValueError: If ``n`` is larger than the number of distinct headlines
            the vocabulary can form.
    """
    # Enumerate everything the vocabulary can express so an impossible request
    # fails immediately instead of spinning forever in the sampling loop below.
    possible = {
        f"{s} {v} {o} {c}"
        for s in subjects
        for v in verbs
        for o in objects
        for c in contexts
    }
    if n > len(possible):
        raise ValueError(
            f"cannot generate {n} unique headlines; "
            f"the vocabulary only allows {len(possible)}"
        )

    rng = random if seed is None else random.Random(seed)

    # A dict is used as an insertion-ordered set: set iteration order for
    # strings varies between interpreter runs, which would defeat the seed.
    picked = {}
    while len(picked) < n:
        headline = f"{rng.choice(subjects)} {rng.choice(verbs)} {rng.choice(objects)} {rng.choice(contexts)}"
        picked[headline] = None
    return list(picked)


# Build 1000 unique template headlines and preview the first ten.
# NOTE: no random seed is set in the cells shown, so the sample differs between runs.
headlines = generate_headlines(1000)
print(headlines[:10])



In [3]:
def process_gpt_request(input_msg, line):
    """Fill a prompt template with one line of text and return the model's reply.

    Args:
        input_msg: Prompt template containing a ``{line}`` placeholder.
        line: Text substituted for the placeholder; surrounding whitespace
            (including a trailing newline from file iteration) is removed first.

    Returns:
        The response's ``output_text`` with leading/trailing whitespace stripped.
    """
    prompt = input_msg.format(line=line.strip())
    reply = client.responses.create(model="gpt-4o-mini", input=prompt)
    return reply.output_text.strip()


# Shared thread pool (up to 10 worker threads); the batch cells submit
# process_gpt_request calls to it so requests run concurrently.
executor = concurrent.futures.ThreadPoolExecutor(max_workers=10)

## Headlines

In [4]:
# Prompt template for the headline stage; process_gpt_request fills "{line}"
# with one template headline via str.format.
input_msg = "Please write a concrete news article headline taking inspiration from the following situation: {line}. You should mention the details of the situation in the headline. Please just return the headline in plain text."

In [5]:
# Try the headline prompt on a single example situation before the batch run.
res = process_gpt_request(
    input_msg, "Celebrity embraces education reform amid rising tensions"
)
print(res)

"Celebrity Activist Launches Education Reform Initiative as Tensions Escalate in Local School Districts"


In [None]:
# Rewrite every template headline with the model, in parallel, and store the
# replies one per line. Replies are written in completion order, not input order.
with open("/workspace/datasets/news/headlines.txt", "w") as out:
    futures = [
        executor.submit(process_gpt_request, input_msg, line) for line in headlines
    ]
    for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
        # The article stage reads this file line by line, so a reply spanning
        # several lines is folded onto a single line, and an empty reply is
        # dropped rather than leaving a blank line behind.
        headline = " ".join(future.result().split())
        if headline:
            out.write(headline + "\n")

## Articles

In [None]:
# Prompt template for the article stage; "{line}" receives one generated headline.
# NOTE: this rebinds input_msg, so re-running the headline cells after this one
# would send them this article prompt instead of the headline prompt.
input_msg = "Please write a three paragraph news story about the fictional headline '{line}'. Please just return the three paragraphs in plain text."

In [11]:
# Try the article prompt on a single example headline before the batch run.
res = process_gpt_request(
    input_msg,
    "Startup Admits to Ignoring Natural Disaster Reports After Public Outcry Over Delayed Safety Measures",
)
print(res)

In a startling revelation, TechX, a rapidly growing startup known for its innovative safety solutions, has admitted to overlooking critical reports on impending natural disasters. This admission comes on the heels of public outcry and mounting criticism from both consumers and safety advocates who charged the company with negligence after the launch of a new safety product that failed to incorporate necessary precautions during recent catastrophic weather events.

The backlash intensified following multiple instances where the startup's technology fell short of protecting users in affected areas. Customers reported that key features designed to alert users about imminent dangers were either delayed or completely absent. In response to the public's concerns, TechX held a press conference where CEO Angela Rowe apologized for the oversight, asserting that the company's commitment to user safety is paramount. She promised a comprehensive review of their protocols and the immediate implemen

In [5]:
# Generate one article per line of the headlines file. Futures are consumed in
# submission order, so output line i corresponds to input line i.
with (
    open("/workspace/datasets/news/headlines.txt") as f,
    open("/workspace/datasets/news/three_paragraph.txt", "w") as out,
):
    futures = [executor.submit(process_gpt_request, input_msg, line) for line in f]
    for future in tqdm(futures, total=len(futures)):
        # Normalise CRLF / lone CR to LF before escaping: a stray "\r" left in
        # the file is treated as a line break by universal-newline readers and
        # would split one article across several lines.
        article = future.result().replace("\r\n", "\n").replace("\r", "\n")
        # Escape newlines so each article occupies exactly one line of the file.
        out.write(article.replace("\n", "\\n") + "\n")

100%|██████████| 737/737 [07:59<00:00,  1.54it/s]
