In [3]:
import os
import time
import uuid
import random
from datetime import datetime
import pandas as pd
import ollama

# --- Topic list ---
# Labels of the form "<Section> - <Subsection>". The main loop draws one of
# these at random per article and the label is interpolated into the prompt.
# Entries are grouped by section below; their order is left unchanged.
topics = [
    # US
    'US - Crime + Justice',
    # World
    'World - Africa', 'World - Americas', 'World - Asia', 'World - Australia',
    'World - China', 'World - Europe', 'World - India', 'World - Middle East',
    'World - United Kingdom',
    # Politics
    'Politics - CNN Polls', 'Politics - Elections',
    # Business
    'Business - Tech', 'Business - Media', 'Business - Markets',
    'Business - Pre-markets', 'Business - After-Hours', 'Business - Investing',
    'Business - Markets Now',
    # Health
    'Health - Fitness', 'Health - Food', 'Health - Sleep',
    'Health - Mindfulness', 'Health - Relationships',
    # Entertainment
    'Entertainment - Movies', 'Entertainment - Television',
    'Entertainment - Celebrity',
    # Tech
    'Tech - Innovate', 'Tech - Foreseeable Future', 'Tech - Innovative Cities',
    # Style
    'Style - Arts', 'Style - Design', 'Style - Fashion',
    'Style - Architecture', 'Style - Luxury', 'Style - Beauty',
    # Travel
    'Travel - Destinations', 'Travel - Food & Drink',
    'Travel - Lodging and Hotels', 'Travel - News',
    # Sports
    'Sports - Pro Football', 'Sports - College Football',
    'Sports - Basketball', 'Sports - Baseball', 'Sports - Soccer',
    'Sports - Olympics', 'Sports - Hockey',
    # Science
    'Science - Space', 'Science - Life', 'Science - Medicine',
    'Science - Climate', 'Science - Solutions', 'Science - Weather',
]

# --- Generation function ---
def generate_article(topic, model="phi3:mini", temperature=0.9, max_tokens=750):
    """
    Generate one news-style article about ``topic`` via ``ollama.generate``.

    Args:
        topic: Subject label interpolated into the prompt.
        model: Model name handed to Ollama (defaults to "phi3:mini").
        temperature: Forwarded as the ``temperature`` generation option.
        max_tokens: Forwarded as the ``num_predict`` generation option.

    Returns:
        dict with three keys:
            "topic": the ``topic`` argument, unchanged.
            "article": the ``"response"`` field of the Ollama result with
                leading/trailing whitespace stripped.
            "generation_time_sec": elapsed seconds for the generate call,
                rounded to two decimals.

    Any exception raised by ``ollama.generate`` propagates to the caller.
    """
    prompt = f"""
Write a full news article in the style of CNN or DailyMail.
The story should sound realistic, factual, and human-written.
Use natural journalistic language with short and medium-length sentences.
Start with a strong lead paragraph summarizing who, what, where, and when.
Then expand with quotes, context, background, and a final paragraph about next steps or reactions.
Include realistic numbers, dates, and locations.
Add 1–3 short quotes attributed to plausible people (officials, witnesses, or experts).
Use neutral tone — no opinions, exaggeration, or bullet points.
The topic should be about {topic}.
Output only the article text (no title, no list, no explanation).
End cleanly after several paragraphs.
"""
    # perf_counter() is monotonic, so the measured duration cannot go negative
    # or jump if the system wall clock is adjusted while the model is running.
    start_time = time.perf_counter()
    response = ollama.generate(
        model=model,
        prompt=prompt,
        options={"temperature": temperature, "num_predict": max_tokens}
    )
    duration = time.perf_counter() - start_time

    return {
        "topic": topic,
        "article": response["response"].strip(),
        "generation_time_sec": round(duration, 2)
    }



In [6]:
# --- Main loop ---
# Generates `num_articles` articles on randomly chosen topics, then writes one
# text file per article plus a combined CSV, and prints a timing summary.
results = []
num_articles = 3000  # adjust dataset size as needed
article_columns = ['uniqueID', 'topic', 'article']
article_rows = []

try:
    for i in range(num_articles):
        print(f"Generating article {i+1}/{num_articles}...")
        topic = random.choice(topics)
        article_data = generate_article(topic)

        article_uuid = str(uuid.uuid4())

        # Collect plain rows; enlarging a DataFrame with .loc on every
        # iteration reallocates the frame each time.
        article_rows.append([
            article_uuid,
            article_data['topic'],
            article_data['article']
        ])

        results.append(article_data)
finally:
    # Built exactly once, and also when the loop is interrupted or raises, so
    # whatever was generated so far remains available as `phi_df`.
    phi_df = pd.DataFrame(article_rows, columns=article_columns)

# --- Save outputs ---
os.makedirs("Phi3_Generations", exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Save each article as text (optional)
for i, res in enumerate(results, 1):
    filename = f"Phi3_Generations/article_{i:04d}_{timestamp}.txt"
    with open(filename, "w", encoding="utf-8") as f:
        f.write(f"Topic: {res['topic']}\n\n{res['article']}")

# Save combined CSV
csv_path = f"Phi3_Generations/phi3_articles_{timestamp}.csv"
phi_df.to_csv(csv_path, index=False, encoding="utf-8")

# --- Summary ---
# Guard the average so num_articles = 0 does not raise ZeroDivisionError.
if results:
    avg_time = sum(r['generation_time_sec'] for r in results) / len(results)
else:
    avg_time = 0.0
print(f"\n Generated {len(results)} articles.")
print(f"Average generation time: {avg_time:.2f} sec/article")
print(f"CSV saved at: {csv_path}")


Generating article 1/3000...
Generating article 2/3000...
Generating article 3/3000...
Generating article 4/3000...
Generating article 5/3000...
Generating article 6/3000...
Generating article 7/3000...
Generating article 8/3000...
Generating article 9/3000...
Generating article 10/3000...
Generating article 11/3000...
Generating article 12/3000...
Generating article 13/3000...
Generating article 14/3000...
Generating article 15/3000...
Generating article 16/3000...
Generating article 17/3000...
Generating article 18/3000...
Generating article 19/3000...
Generating article 20/3000...
Generating article 21/3000...
Generating article 22/3000...
Generating article 23/3000...
Generating article 24/3000...
Generating article 25/3000...
Generating article 26/3000...
Generating article 27/3000...
Generating article 28/3000...
Generating article 29/3000...
Generating article 30/3000...
Generating article 31/3000...
Generating article 32/3000...
Generating article 33/3000...
Generating article 