In [15]:
from openai import OpenAI
import json
import os
from tqdm import tqdm
import math
import glob
import re
from collections import defaultdict

In [5]:
def get_summary_from_gpt4(article):
    client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))
    response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {"role": "system", "content": "You are a helpful assistant that summarizes news articles. Provide a concise summary in 2-4 sentences."},
                    {"role": "user", "content": f"Please summarize this article: {article}"}
                ],
                max_tokens=400,
                temperature=0
            )
    return response.choices[0].message.content.strip()

In [None]:
with open('../data/bitcoin/bitcoin_news.json', 'r', encoding='utf-8') as f:
    news_data = json.load(f)

chunk_size = 1000
num_chunks = math.ceil(len(news_data) / chunk_size)

for chunk_id in range(4,num_chunks):
    start_idx = chunk_id * chunk_size
    end_idx = min((chunk_id + 1) * chunk_size, len(news_data))
    chunk_data = news_data[start_idx:end_idx]

    print(f"summarizing articles in chunk id: {chunk_id}")
    for article in tqdm(chunk_data):
        summary = get_summary_from_gpt4(article['full_article'])
        article['summary'] = summary

    # Save chunk to separate file
    chunk_filename = f'../data/bitcoin/bitcoin_news_with_summaries_chunk_{chunk_id}.json'
    with open(chunk_filename, 'w', encoding='utf-8') as f:
        json.dump(chunk_data, f, indent=4, ensure_ascii=False)
    
    print(f'Saved chunk {chunk_id} to {chunk_filename}')


summarizing articles in chunk id: 4


100%|██████████| 1000/1000 [39:15<00:00,  2.36s/it]


Saved chunk 4 to ../data/bitcoin/bitcoin_news_with_summaries_chunk_4.json
summarizing articles in chunk id: 5


100%|██████████| 827/827 [33:33<00:00,  2.43s/it]

Saved chunk 5 to ../data/bitcoin/bitcoin_news_with_summaries_chunk_5.json





In [13]:
# Path pattern for your chunk files
chunk_files = sorted(glob.glob('../data/bitcoin/bitcoin_news_with_summaries_chunk_*.json'))

all_articles = []

for chunk_file in chunk_files:
    with open(chunk_file, 'r', encoding='utf-8') as f:
        chunk_data = json.load(f)
        all_articles.extend(chunk_data)  # Add all articles from this chunk

# Save the concatenated result
with open('../data/bitcoin/bitcoin_news_with_summaries.json', 'w', encoding='utf-8') as f:
    json.dump(all_articles, f, indent=4, ensure_ascii=False)

print(f"Combined {len(chunk_files)} chunks into {len(all_articles)} articles.")


Combined 6 chunks into 5827 articles.


In [17]:
# Ask GPT to select one summary per day

# Load all articles
with open('../data/bitcoin/bitcoin_news_with_summaries.json', 'r', encoding='utf-8') as f:
    all_articles = json.load(f)

# Group articles by date
articles_by_date = defaultdict(list)
for article in all_articles:
    date = article['publication_time'][:10]  # Extract YYYY-MM-DD
    articles_by_date[date].append(article)

client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))

selected_articles = []

for date, articles in tqdm(articles_by_date.items()):
    # Prepare a prompt with all summaries for the day
    summaries = [
        f"Article {i+1}: {a.get('summary')}"
        for i, a in enumerate(articles)
    ]
    system_prompt = (
        "You are an expert in financial forecasting."
        "Given the following news article summaries for a single day,"
        "select the one article that would be most useful for forecasting the price of Bitcoin."
        "Reply ONLY with the article number (e.g., 'Article 2')."
    )

    user_prompt = "\n\n".join(summaries)

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        max_tokens=100,
        temperature=0
    )
    answer = response.choices[0].message.content.strip()
    # Extract the article number from the response
    match = re.search(r'Article\s*(\d+)', answer)
    if match:
        idx = int(match.group(1)) - 1
        if 0 <= idx < len(articles):
            selected_articles.append(articles[idx])
        else:
            print(f"Index out of range for date {date}: {answer}")
    else:
        print(f"Could not parse GPT response for date {date}: {answer}")

# Save the selected articles
with open('../data/bitcoin/bitcoinprice_news_selected_one_per_day.json', 'w', encoding='utf-8') as f:
    json.dump(selected_articles, f, indent=4, ensure_ascii=False)

print(f"Selected {len(selected_articles)} articles (one per day).")

  0%|          | 2/1166 [00:01<10:35,  1.83it/s]

Could not parse GPT response for date 2018-01-02: I'm sorry, but none of the articles provided are relevant to forecasting the price of Bitcoin.


  5%|▍         | 55/1166 [00:26<09:31,  1.94it/s]

Could not parse GPT response for date 2018-03-05: None of the articles provided are directly related to Bitcoin or cryptocurrency markets. Therefore, none of them would be particularly useful for forecasting the price of Bitcoin.


 11%|█         | 131/1166 [01:06<07:50,  2.20it/s]

Could not parse GPT response for date 2018-05-29: I'm sorry, but none of the articles provided seem to be directly related to Bitcoin or its price forecasting. Could you please provide more relevant articles?


 12%|█▏        | 136/1166 [01:12<18:59,  1.11s/it]

Could not parse GPT response for date 2018-06-04: None of the articles provided are directly related to Bitcoin or cryptocurrency markets. Therefore, none of them would be particularly useful for forecasting the price of Bitcoin.


 12%|█▏        | 138/1166 [01:13<15:35,  1.10it/s]

Could not parse GPT response for date 2018-06-06: None of the articles provided are directly useful for forecasting the price of Bitcoin.


 21%|██        | 246/1166 [02:08<06:41,  2.29it/s]

Could not parse GPT response for date 2018-10-07: None of the articles provided contain information relevant to forecasting the price of Bitcoin.


 24%|██▍       | 279/1166 [02:23<06:17,  2.35it/s]

Could not parse GPT response for date 2018-11-12: None of the articles provided are directly related to Bitcoin or its market dynamics. Therefore, none of them would be useful for forecasting the price of Bitcoin.


 28%|██▊       | 332/1166 [02:50<08:58,  1.55it/s]

Could not parse GPT response for date 2019-01-07: None of the articles provided are directly related to Bitcoin or cryptocurrency markets. Therefore, none of them would be particularly useful for forecasting the price of Bitcoin.


 30%|██▉       | 348/1166 [02:58<07:44,  1.76it/s]

Could not parse GPT response for date 2019-01-23: None of the articles provided are directly relevant to forecasting the price of Bitcoin.


 35%|███▌      | 409/1166 [03:30<08:01,  1.57it/s]

Could not parse GPT response for date 2019-03-27: None of the articles provided are directly related to Bitcoin or cryptocurrency markets. Therefore, none of them would be particularly useful for forecasting the price of Bitcoin.


 35%|███▌      | 410/1166 [03:31<07:40,  1.64it/s]

Could not parse GPT response for date 2019-03-28: None of the articles provided are directly related to Bitcoin or cryptocurrency markets. Therefore, none of them would be particularly useful for forecasting the price of Bitcoin.


 36%|███▋      | 425/1166 [03:40<06:50,  1.81it/s]

Could not parse GPT response for date 2019-04-12: None of the articles provided are relevant to forecasting the price of Bitcoin.


 42%|████▏     | 488/1166 [04:20<05:10,  2.18it/s]

Could not parse GPT response for date 2019-06-19: None


 57%|█████▋    | 669/1166 [06:22<04:16,  1.93it/s]  

Could not parse GPT response for date 2019-12-29: None


 58%|█████▊    | 680/1166 [06:27<04:04,  1.99it/s]

Could not parse GPT response for date 2020-01-11: None of the articles provided are directly related to Bitcoin or cryptocurrency markets. Therefore, none of them would be particularly useful for forecasting the price of Bitcoin.


 65%|██████▌   | 762/1166 [07:15<03:28,  1.94it/s]

Could not parse GPT response for date 2020-04-06: None of the articles provided are directly useful for forecasting the price of Bitcoin.


 67%|██████▋   | 779/1166 [07:24<04:15,  1.51it/s]

Could not parse GPT response for date 2020-04-24: None of the articles provided are directly related to Bitcoin or the cryptocurrency market. Therefore, none of them would be particularly useful for forecasting the price of Bitcoin.


 74%|███████▍  | 863/1166 [08:05<02:49,  1.78it/s]

Could not parse GPT response for date 2020-07-23: None of the articles are directly useful for forecasting the price of Bitcoin, as they all focus on Bitcoin SV, which is a different cryptocurrency.


 75%|███████▌  | 879/1166 [08:14<02:27,  1.94it/s]

Could not parse GPT response for date 2020-08-10: None of the articles provided are directly related to Bitcoin or cryptocurrency markets. Therefore, none of them would be useful for forecasting the price of Bitcoin.


 76%|███████▌  | 883/1166 [08:17<03:51,  1.22it/s]

Could not parse GPT response for date 2020-08-14: I'm sorry, but none of the articles provided seem to be directly related to Bitcoin or its price forecasting. Could you please provide more relevant articles?


 91%|█████████ | 1063/1166 [09:46<00:48,  2.12it/s]

Could not parse GPT response for date 2021-02-22: I'm sorry, but none of the articles provided seem to be directly related to Bitcoin or its price forecasting. Could you please provide more relevant articles?


 95%|█████████▌| 1113/1166 [10:14<00:35,  1.48it/s]

Could not parse GPT response for date 2021-04-15: None of the articles provided are directly related to Bitcoin or cryptocurrency markets. Therefore, none of them would be particularly useful for forecasting the price of Bitcoin.


100%|█████████▉| 1163/1166 [10:38<00:01,  1.88it/s]

Could not parse GPT response for date 2021-12-05: None of the articles provided are relevant for forecasting the price of Bitcoin.


100%|██████████| 1166/1166 [10:40<00:00,  1.82it/s]

Selected 1143 articles (one per day).



