In [9]:
from openai import OpenAI
import json
import os
from tqdm import tqdm
import math
import glob

In [5]:
def get_summary_from_gpt4(article):
    client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))
    response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {"role": "system", "content": "You are a helpful assistant that summarizes news articles. Provide a concise summary in 2-4 sentences."},
                    {"role": "user", "content": f"Please summarize this article: {article}"}
                ],
                max_tokens=400,
                temperature=0
            )
    return response.choices[0].message.content.strip()

In [None]:
with open('../data/bitcoin/bitcoin_news.json', 'r', encoding='utf-8') as f:
    news_data = json.load(f)

chunk_size = 1000
num_chunks = math.ceil(len(news_data) / chunk_size)

for chunk_id in range(4,num_chunks):
    start_idx = chunk_id * chunk_size
    end_idx = min((chunk_id + 1) * chunk_size, len(news_data))
    chunk_data = news_data[start_idx:end_idx]

    print(f"summarizing articles in chunk id: {chunk_id}")
    for article in tqdm(chunk_data):
        summary = get_summary_from_gpt4(article['full_article'])
        article['summary'] = summary

    # Save chunk to separate file
    chunk_filename = f'../data/bitcoin/bitcoin_news_with_summaries_chunk_{chunk_id}.json'
    with open(chunk_filename, 'w', encoding='utf-8') as f:
        json.dump(chunk_data, f, indent=4, ensure_ascii=False)
    
    print(f'Saved chunk {chunk_id} to {chunk_filename}')


summarizing articles in chunk id: 4


100%|██████████| 1000/1000 [39:15<00:00,  2.36s/it]


Saved chunk 4 to ../data/bitcoin/bitcoin_news_with_summaries_chunk_4.json
summarizing articles in chunk id: 5


100%|██████████| 827/827 [33:33<00:00,  2.43s/it]

Saved chunk 5 to ../data/bitcoin/bitcoin_news_with_summaries_chunk_5.json





In [13]:
# Path pattern for your chunk files
chunk_files = sorted(glob.glob('../data/bitcoin/bitcoin_news_with_summaries_chunk_*.json'))

all_articles = []

for chunk_file in chunk_files:
    with open(chunk_file, 'r', encoding='utf-8') as f:
        chunk_data = json.load(f)
        all_articles.extend(chunk_data)  # Add all articles from this chunk

# Save the concatenated result
with open('../data/bitcoin/bitcoin_news_with_summaries.json', 'w', encoding='utf-8') as f:
    json.dump(all_articles, f, indent=4, ensure_ascii=False)

print(f"Combined {len(chunk_files)} chunks into {len(all_articles)} articles.")


Combined 6 chunks into 5827 articles.
