In [2]:
import os
import time
import uuid
import random
from datetime import datetime
import pandas as pd
import ollama

# --- Topic list ---
# "Section - Subsection" labels, grouped by section; the generation loop
# picks one of these at random for every article.
topics = [
    # US
    "US - Crime + Justice",
    # World
    "World - Africa",
    "World - Americas",
    "World - Asia",
    "World - Australia",
    "World - China",
    "World - Europe",
    "World - India",
    "World - Middle East",
    "World - United Kingdom",
    # Politics
    "Politics - CNN Polls",
    "Politics - Elections",
    # Business
    "Business - Tech",
    "Business - Media",
    "Business - Markets",
    "Business - Pre-markets",
    "Business - After-Hours",
    "Business - Investing",
    "Business - Markets Now",
    # Health
    "Health - Fitness",
    "Health - Food",
    "Health - Sleep",
    "Health - Mindfulness",
    "Health - Relationships",
    # Entertainment
    "Entertainment - Movies",
    "Entertainment - Television",
    "Entertainment - Celebrity",
    # Tech
    "Tech - Innovate",
    "Tech - Foreseeable Future",
    "Tech - Innovative Cities",
    # Style
    "Style - Arts",
    "Style - Design",
    "Style - Fashion",
    "Style - Architecture",
    "Style - Luxury",
    "Style - Beauty",
    # Travel
    "Travel - Destinations",
    "Travel - Food & Drink",
    "Travel - Lodging and Hotels",
    "Travel - News",
    # Sports
    "Sports - Pro Football",
    "Sports - College Football",
    "Sports - Basketball",
    "Sports - Baseball",
    "Sports - Soccer",
    "Sports - Olympics",
    "Sports - Hockey",
    # Science
    "Science - Space",
    "Science - Life",
    "Science - Medicine",
    "Science - Climate",
    "Science - Solutions",
    "Science - Weather",
]

# --- Generation function ---
def generate_article(topic, model="phi3:mini", temperature=0.9, max_tokens=750):
    """
    Generate a realistic news article locally using Ollama (Phi-3 Mini model).

    Parameters
    ----------
    topic : str
        Subject label interpolated into the prompt
        (e.g. ``"Science - Space"``).
    model : str, optional
        Model tag passed to ``ollama.generate``.
    temperature : float, optional
        Sampling temperature, forwarded in the ``options`` dict.
    max_tokens : int, optional
        Forwarded to Ollama as the ``num_predict`` option
        (generation length limit).

    Returns
    -------
    dict
        ``"topic"``: the topic that was passed in;
        ``"article"``: the model's response text with surrounding
        whitespace stripped;
        ``"generation_time_sec"``: elapsed seconds for the call, rounded
        to two decimals.

    Notes
    -----
    Any exception raised by ``ollama.generate`` propagates to the caller.
    """
    prompt = f"""
Write a full news article in the style of CNN or DailyMail.
The story should sound realistic, factual, and human-written.
Use natural journalistic language with short and medium-length sentences.
Start with a strong lead paragraph summarizing who, what, where, and when.
Then expand with quotes, context, background, and a final paragraph about next steps or reactions.
Include realistic numbers, dates, and locations.
Add 1–3 short quotes attributed to plausible people (officials, witnesses, or experts).
Use neutral tone — no opinions, exaggeration, or bullet points.
The topic should be about {topic}.
Output only the article text (no title, no list, no explanation).
End cleanly after several paragraphs.
"""
    # perf_counter() is monotonic, so the measured duration cannot go
    # negative or jump if the system clock is adjusted mid-generation
    # (which wall-clock time.time() is vulnerable to).
    start_time = time.perf_counter()
    response = ollama.generate(
        model=model,
        prompt=prompt,
        options={"temperature": temperature, "num_predict": max_tokens}
    )
    duration = time.perf_counter() - start_time

    return {
        "topic": topic,
        "article": response["response"].strip(),
        "generation_time_sec": round(duration, 2)
    }



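Crash-safe version: each article is written to its own text file as soon as it is generated, and rows are appended to the CSV in batches of `CHECKPOINT_EVERY`, so an interrupted run keeps most of its work.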

In [None]:
# Number of rows buffered in memory between CSV appends.
CHECKPOINT_EVERY = 60  # adjust: 25–100 is a nice sweet spot

# --- Prep: same paths/names as before ---
os.makedirs("Phi3_Generations", exist_ok=True)
os.makedirs("Phi3_Generations/articles_txt_felipe", exist_ok=True)
# Per-run timestamp keeps text-file names from colliding across runs.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
csv_path = "Phi3_Generations/phi3_articles_felipe.csv"
CSV_COLUMNS = ['uniqueID', 'topic', 'article']

# Create CSV with header once; if it already exists (e.g. from an earlier,
# interrupted run) its rows are kept and new rows are appended after them.
if not os.path.exists(csv_path):
    pd.DataFrame(columns=CSV_COLUMNS).to_csv(
        csv_path, index=False, encoding="utf-8"
    )

results = []          # raw dicts returned by generate_article (includes timing)
num_articles = 6000
all_rows = []         # every (uuid, topic, article) row produced in this run

# Keep a small in-memory buffer for rows not yet appended to the CSV
buffer_rows = []

try:
    for i in range(num_articles):
        print(f"Generating article {i+1}/{num_articles}...")
        topic = random.choice(topics)
        article_data = generate_article(topic)

        article_uuid = str(uuid.uuid4())
        row = (article_uuid, article_data['topic'], article_data['article'])

        # In-memory structures
        all_rows.append(row)
        results.append(article_data)
        buffer_rows.append(row)

        # Save article text immediately (still very cheap)
        filename = f"Phi3_Generations/articles_txt_felipe/article_{i+1:04d}_{timestamp}.txt"
        with open(filename, "w", encoding="utf-8") as f:
            f.write(f"Topic: {article_data['topic']}\n\n{article_data['article']}")

        # Append to CSV in batches
        if len(buffer_rows) >= CHECKPOINT_EVERY:
            pd.DataFrame(buffer_rows, columns=CSV_COLUMNS).to_csv(
                csv_path, mode='a', header=False, index=False, encoding="utf-8"
            )
            buffer_rows.clear()
finally:
    # Runs on normal completion AND on an exception / kernel interrupt, so
    # rows still sitting in the buffer are not lost when generation fails.

    # Build the in-memory frame once instead of growing it row by row
    # (row-wise .loc enlargement re-allocates the frame on every insert).
    phi_df = pd.DataFrame(all_rows, columns=CSV_COLUMNS)

    # Flush any remaining rows
    if buffer_rows:
        pd.DataFrame(buffer_rows, columns=CSV_COLUMNS).to_csv(
            csv_path, mode='a', header=False, index=False, encoding="utf-8"
        )
        buffer_rows.clear()

    # NOTE: the CSV is intentionally NOT rewritten from phi_df here. The
    # appends above already make it complete, and overwriting it would
    # discard rows written by any previous run.

    # Summary (same text)
    avg_time = sum(r.get('generation_time_sec', 0) for r in results) / len(results) if results else 0.0
    print(f"\n Generated {len(results)} articles.")
    print(f"Average generation time: {avg_time:.2f} sec/article")
    print(f"CSV saved at: {csv_path}")

Generating article 1/6000...
Generating article 2/6000...
Generating article 3/6000...
Generating article 4/6000...
Generating article 5/6000...
Generating article 6/6000...
Generating article 7/6000...
Generating article 8/6000...
Generating article 9/6000...
Generating article 10/6000...
Generating article 11/6000...
Generating article 12/6000...
Generating article 13/6000...
Generating article 14/6000...
Generating article 15/6000...
Generating article 16/6000...
Generating article 17/6000...
Generating article 18/6000...
Generating article 19/6000...
Generating article 20/6000...
Generating article 21/6000...
Generating article 22/6000...
Generating article 23/6000...
Generating article 24/6000...
Generating article 25/6000...
Generating article 26/6000...
Generating article 27/6000...
Generating article 28/6000...
Generating article 29/6000...
Generating article 30/6000...
Generating article 31/6000...
Generating article 32/6000...
Generating article 33/6000...
Generating article 