In [1]:
import os
from dotenv import load_dotenv

load_dotenv()

RAW_DATA_FOLDER = os.getenv("RAW_DATA_FOLDER", "/data/raw")
CLEAN_DATA_FOLDER = os.getenv("CLEAN_DATA_FOLDER", "/data/clean")

In [2]:
import polars as pl

cryptopanic_news = pl.read_parquet(
    f"{CLEAN_DATA_FOLDER}/cryptopanic_news_light_normalized.parquet"
)

cryptopanic_news

cryptopanic_id,title,description,source_domain,published_at,cryptopanic_url,currencies
i64,str,str,str,datetime[μs],str,str
176510,"""Crypto Investor Makes Over 3,6…","""An investor has made an incred…","""cryptodnes.bg""",2024-12-20 10:00:54,"""https://cryptopanic.com/news/2…","""SOL"""
176215,"""Cardano’s 10% jump sparks hope…","""After a quiet December, 2025 s…","""ambcrypto.com""",2025-01-02 11:00:07,"""https://cryptopanic.com/news/2…","""ADA"""
87112,"""Someone Paid 93 ETH In Fees Fo…","""Data shows someone has today p…","""bitcoinist.com""",2022-11-18 16:00:04,"""https://cryptopanic.com/news/1…","""ETH"""
82678,"""‘Critical’ Polygon bug put $24…","""Polygon was at risk of losing …","""theblockcrypto.com""",2021-12-29 19:01:22,"""https://cryptopanic.com/news/m…","""MATIC"""
120931,"""Charles Hoskinson reveals ‘con…","""The cryptocurrency market may …","""finbold.com""",2022-12-08 14:16:55,"""https://cryptopanic.com/news/1…","""ADA,XCN"""
…,…,…,…,…,…,…
92034,"""$0.001 SHIB Price Beckons As S…","""Moreover, the surging interest…","""zycrypto.com""",2023-07-11 17:53:13,"""https://cryptopanic.com/news/1…","""SHIB"""
183725,"""U.S banks can now legally hand…","""Market watchers believe allowi…","""ambcrypto.com""",2025-03-08 10:00:24,"""https://cryptopanic.com/news/2…","""BTC"""
88544,"""Cardano Founder Charles Hoskin…","""Cardano founder, Hoskinson has…","""bitcoinist.com""",2023-11-06 16:00:18,"""https://cryptopanic.com/news/1…","""ADA"""
82629,"""‘Wormhole’ protocol goes live,…","""Wormhole, a cross-chain messag…","""finbold.com""",2021-08-09 19:44:29,"""https://cryptopanic.com/news/t…","""ETH,SOL"""


In [3]:
cryptopanic_news.schema

Schema([('cryptopanic_id', Int64),
        ('title', String),
        ('description', String),
        ('source_domain', String),
        ('published_at', Datetime(time_unit='us', time_zone=None)),
        ('cryptopanic_url', String),
        ('currencies', String)])

In [8]:
import pandas as pd
import openai
import ast
import concurrent.futures
from tqdm import tqdm
import tiktoken
import os
import time
from collections import deque
from datetime import datetime, timedelta, timezone

API_KEY = os.getenv("OPENAI_API_KEY")

client = openai.Client(api_key=API_KEY)

MODEL = "gpt-4.1-nano"
ENCODING = tiktoken.encoding_for_model("gpt-4")
MAX_TOKENS = 8000  # Safe margin

MAX_TOKENS_PER_MINUTE = 200_000

SYSTEM_PROMPT = (
    "You are an expert financial news sentiment classifier specializing in cryptocurrencies. "
    "Your job is to assess whether each news item is likely to have a Positive, Neutral, or Negative "
    "impact on the mentioned cryptocurrencies. Consider market context and how crypto investors might respond."
)

PROMPT_PREFIX = (
    "You are a financial news sentiment classifier specializing in "
    "cryptocurrencies. Classify the sentiment (Positive, Neutral, Negative) "
    "for the following cryptocurrency news articles. Consider the impact on "
    "the mentioned cryptocurrencies in the context of the news.\n\n"
)

PROMPT_SUFFIX = (
    "Respond with a list of sentiment labels only in order "
    "(e.g., ['Positive', 'Neutral', ...])."
)

RESERVED_RESPONSE_TOKENS = 1000 
token_usage_log = deque()

In [9]:
def count_tokens(text: str) -> int:
    return len(ENCODING.encode(text))

def rate_limited_request(prompt: str, model: str = MODEL, max_tokens: int = RESERVED_RESPONSE_TOKENS):
    global token_usage_log

    # Calculate tokens used in the prompt + expected response
    tokens_used = count_tokens(prompt) + max_tokens

    # Clean up old entries (older than 60 seconds)
    now = datetime.now(timezone.utc)
    one_minute_ago = now - timedelta(seconds=60)
    while token_usage_log and token_usage_log[0][0] < one_minute_ago:
        token_usage_log.popleft()

    # Sum tokens used in the last 60 seconds
    tokens_last_minute = sum(t for ts, t in token_usage_log)

    # Wait if we're over the limit
    if tokens_last_minute + tokens_used > MAX_TOKENS_PER_MINUTE:
        wait_time = (token_usage_log[0][0] + timedelta(seconds=60) - now).total_seconds()
        print(f"[Rate Limit] Sleeping for {wait_time:.2f} seconds...")
        time.sleep(wait_time)

    # Make the API call
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": prompt}
            ],
            max_tokens=max_tokens,
            temperature=0.2,
        )
        # Log token usage
        token_usage_log.append((datetime.now(timezone.utc), tokens_used))
        return response
    except Exception as e:
        print(f"Error during request: {e}")
        time.sleep(20)
        return None

def split_dataframe_by_token_limit(df: pd.DataFrame, token_limit: int = MAX_TOKENS - RESERVED_RESPONSE_TOKENS):
    batches = []
    current_batch = []
    static_token_cost = count_tokens(PROMPT_PREFIX) + count_tokens(PROMPT_SUFFIX)
    current_token_count = static_token_cost

    for idx, row in df.iterrows():
        row_text = (
            f"{idx}. Title: {row['title']}\n"
            f"Description: {row['description']}\n"
            f"Currencies: {row.get('currencies', 'N/A')}\n\n"
        )
        row_tokens = count_tokens(row_text)

        if current_token_count + row_tokens > token_limit:
            if current_batch:
                batches.append(pd.DataFrame(current_batch))
            current_batch = [row.to_dict()]
            current_token_count = static_token_cost + row_tokens
        else:
            current_batch.append(row.to_dict())
            current_token_count += row_tokens

    if current_batch:
        batches.append(pd.DataFrame(current_batch))

    return batches

def create_batch_prompt(batch: pd.DataFrame) -> str:
    prompt = PROMPT_PREFIX
    for idx, row in batch.iterrows():
        prompt += (
            f"{idx}. Title: {row['title']}\n"
            f"Description: {row['description']}\n"
            f"Currencies: {row.get('currencies', 'N/A')}\n\n"
        )
    prompt += PROMPT_SUFFIX
    return prompt

def classify_sentiment(batch: pd.DataFrame) -> list:
    prompt = create_batch_prompt(batch)
    response = rate_limited_request(prompt)

    if response is None:
        return ["Error"] * len(batch)

    try:
        content = response.choices[0].message.content.strip()
        sentiment_list = ast.literal_eval(content)
        return sentiment_list
    except Exception as e:
        print(f"Error parsing response: {e}")
        return ["Error"] * len(batch)
    
def process_all_batches(df: pd.DataFrame, max_workers: int = 4):
    results = []
    batches = split_dataframe_by_token_limit(df)

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(classify_sentiment, batch) for batch in batches]
        for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
            results.extend(future.result())

    df = df.iloc[:len(results)].copy()
    df['sentiment'] = results
    return df

In [10]:
cryptopanic_news_pd = cryptopanic_news.to_pandas()

cryptopanic_news_with_labels = process_all_batches(cryptopanic_news_pd, max_workers=6)

  9%|▊         | 25/290 [00:12<01:49,  2.41it/s]

[Rate Limit] Sleeping for 49.04 seconds...


  9%|▉         | 26/290 [00:13<02:04,  2.12it/s]

[Rate Limit] Sleeping for 48.33 seconds...
[Rate Limit] Sleeping for 48.33 seconds...
[Rate Limit] Sleeping for 48.33 seconds...


 10%|█         | 29/290 [00:14<01:36,  2.71it/s]

[Rate Limit] Sleeping for 47.59 seconds...


 10%|█         | 30/290 [00:14<01:36,  2.69it/s]

[Rate Limit] Sleeping for 47.21 seconds...


 11%|█         | 31/290 [01:03<44:39, 10.35s/it]

[Rate Limit] Sleeping for 0.18 seconds...


 11%|█▏        | 33/290 [01:04<26:17,  6.14s/it]

[Rate Limit] Sleeping for 0.81 seconds...
[Rate Limit] Sleeping for 0.64 seconds...


 12%|█▏        | 34/290 [01:04<19:40,  4.61s/it]

[Rate Limit] Sleeping for 0.40 seconds...


 12%|█▏        | 35/290 [01:04<14:47,  3.48s/it]

[Rate Limit] Sleeping for 1.00 seconds...


 12%|█▏        | 36/290 [01:05<11:14,  2.66s/it]

[Rate Limit] Sleeping for 0.48 seconds...


 13%|█▎        | 37/290 [01:06<08:59,  2.13s/it]

[Rate Limit] Sleeping for 0.08 seconds...


 13%|█▎        | 38/290 [01:07<07:26,  1.77s/it]

[Rate Limit] Sleeping for 0.42 seconds...


 13%|█▎        | 39/290 [01:07<05:33,  1.33s/it]

Error parsing response: invalid syntax (<unknown>, line 2)
[Rate Limit] Sleeping for 0.17 seconds...


 14%|█▍        | 40/290 [01:07<04:11,  1.01s/it]

[Rate Limit] Sleeping for 0.85 seconds...


 14%|█▍        | 41/290 [01:07<03:15,  1.28it/s]

[Rate Limit] Sleeping for 0.60 seconds...
[Rate Limit] Sleeping for 0.55 seconds...


 15%|█▍        | 43/290 [01:08<02:42,  1.52it/s]

[Rate Limit] Sleeping for 0.15 seconds...


 15%|█▌        | 44/290 [01:09<02:51,  1.43it/s]

[Rate Limit] Sleeping for 0.29 seconds...


 16%|█▌        | 46/290 [01:10<02:07,  1.91it/s]

[Rate Limit] Sleeping for 0.09 seconds...
[Rate Limit] Sleeping for 0.72 seconds...


 16%|█▌        | 47/290 [01:10<01:41,  2.40it/s]

[Rate Limit] Sleeping for 0.58 seconds...


 17%|█▋        | 48/290 [01:11<02:04,  1.95it/s]

[Rate Limit] Sleeping for 1.09 seconds...


 17%|█▋        | 49/290 [01:12<02:21,  1.71it/s]

[Rate Limit] Sleeping for 0.33 seconds...
[Rate Limit] Sleeping for 0.25 seconds...


 18%|█▊        | 51/290 [01:13<02:06,  1.89it/s]

[Rate Limit] Sleeping for 0.60 seconds...


 18%|█▊        | 52/290 [01:13<01:55,  2.06it/s]

[Rate Limit] Sleeping for 0.25 seconds...


 18%|█▊        | 53/290 [01:13<01:41,  2.34it/s]

[Rate Limit] Sleeping for 0.73 seconds...


 19%|█▊        | 54/290 [01:14<01:50,  2.13it/s]

[Rate Limit] Sleeping for 0.15 seconds...


 19%|█▉        | 55/290 [01:14<01:55,  2.04it/s]

[Rate Limit] Sleeping for 49.06 seconds...


 19%|█▉        | 56/290 [01:15<01:35,  2.44it/s]

[Rate Limit] Sleeping for 48.85 seconds...


 20%|█▉        | 57/290 [01:16<02:23,  1.62it/s]

[Rate Limit] Sleeping for 47.72 seconds...
[Rate Limit] Sleeping for 47.72 seconds...


 20%|██        | 59/290 [01:16<01:39,  2.32it/s]

[Rate Limit] Sleeping for 47.31 seconds...


 21%|██        | 60/290 [01:17<01:49,  2.10it/s]

[Rate Limit] Sleeping for 46.69 seconds...


 21%|██        | 61/290 [02:06<49:32, 12.98s/it]

[Rate Limit] Sleeping for 0.30 seconds...
[Rate Limit] Sleeping for 0.30 seconds...


 22%|██▏       | 65/290 [02:06<17:36,  4.70s/it]

[Rate Limit] Sleeping for 0.10 seconds...
[Rate Limit] Sleeping for 0.08 seconds...
[Rate Limit] Sleeping for 0.86 seconds...


 23%|██▎       | 66/290 [02:07<14:30,  3.88s/it]

[Rate Limit] Sleeping for 0.24 seconds...


 23%|██▎       | 67/290 [02:08<12:04,  3.25s/it]

[Rate Limit] Sleeping for 0.60 seconds...
[Rate Limit] Sleeping for 0.53 seconds...


 24%|██▍       | 69/290 [02:08<07:28,  2.03s/it]

[Rate Limit] Sleeping for 0.20 seconds...


 24%|██▍       | 70/290 [02:09<05:59,  1.63s/it]

[Rate Limit] Sleeping for 0.77 seconds...


 24%|██▍       | 71/290 [02:09<04:46,  1.31s/it]

[Rate Limit] Sleeping for 0.50 seconds...


 25%|██▍       | 72/290 [02:10<04:17,  1.18s/it]

[Rate Limit] Sleeping for 0.19 seconds...


 25%|██▌       | 73/290 [02:11<04:01,  1.11s/it]

[Rate Limit] Sleeping for 0.35 seconds...
[Rate Limit] Sleeping for 0.27 seconds...


 26%|██▌       | 75/290 [02:11<02:44,  1.31it/s]

[Rate Limit] Sleeping for 0.50 seconds...


 27%|██▋       | 78/290 [02:12<01:31,  2.32it/s]

[Rate Limit] Sleeping for 0.08 seconds...
[Rate Limit] Sleeping for 0.07 seconds...
[Rate Limit] Sleeping for 0.04 seconds...


 27%|██▋       | 79/290 [02:13<02:29,  1.41it/s]

[Rate Limit] Sleeping for 0.41 seconds...
[Rate Limit] Sleeping for 0.33 seconds...


 29%|██▊       | 83/290 [02:14<01:10,  2.93it/s]

[Rate Limit] Sleeping for 0.11 seconds...
[Rate Limit] Sleeping for 0.06 seconds...
[Rate Limit] Sleeping for 0.51 seconds...
[Rate Limit] Sleeping for 0.42 seconds...


 29%|██▉       | 85/290 [02:16<01:50,  1.86it/s]

[Rate Limit] Sleeping for 0.41 seconds...


 30%|██▉       | 86/290 [02:16<01:39,  2.05it/s]

[Rate Limit] Sleeping for 0.12 seconds...
[Rate Limit] Sleeping for 0.09 seconds...


 30%|███       | 88/290 [02:16<01:22,  2.44it/s]

[Rate Limit] Sleeping for 0.20 seconds...[Rate Limit] Sleeping for 0.20 seconds...



 31%|███       | 90/290 [02:17<01:15,  2.66it/s]

[Rate Limit] Sleeping for 48.42 seconds...


 31%|███▏      | 91/290 [02:18<01:43,  1.92it/s]

[Rate Limit] Sleeping for 47.29 seconds...
[Rate Limit] Sleeping for 47.29 seconds...


 32%|███▏      | 94/290 [02:19<01:06,  2.94it/s]

[Rate Limit] Sleeping for 46.93 seconds...
[Rate Limit] Sleeping for 46.83 seconds...


 33%|███▎      | 95/290 [02:19<01:02,  3.14it/s]

[Rate Limit] Sleeping for 46.58 seconds...


 33%|███▎      | 96/290 [03:08<39:06, 12.10s/it]

[Rate Limit] Sleeping for 0.20 seconds...[Rate Limit] Sleeping for 0.20 seconds...



 34%|███▍      | 99/290 [03:08<17:58,  5.65s/it]

[Rate Limit] Sleeping for 0.23 seconds...
[Rate Limit] Sleeping for 0.10 seconds...
[Rate Limit] Sleeping for 0.06 seconds...


 35%|███▍      | 101/290 [03:09<11:02,  3.50s/it]

[Rate Limit] Sleeping for 0.81 seconds...


 35%|███▌      | 102/290 [03:10<09:12,  2.94s/it]

[Rate Limit] Sleeping for 0.81 seconds...


 36%|███▌      | 103/290 [03:10<07:31,  2.41s/it]

[Rate Limit] Sleeping for 0.07 seconds...


 36%|███▌      | 105/290 [03:12<04:46,  1.55s/it]

[Rate Limit] Sleeping for 0.10 seconds...
[Rate Limit] Sleeping for 0.06 seconds...


 37%|███▋      | 106/290 [03:12<03:44,  1.22s/it]

[Rate Limit] Sleeping for 1.38 seconds...


 37%|███▋      | 107/290 [03:13<03:22,  1.11s/it]

[Rate Limit] Sleeping for 0.56 seconds...


 37%|███▋      | 108/290 [03:13<02:34,  1.18it/s]

[Rate Limit] Sleeping for 0.35 seconds...


 38%|███▊      | 111/290 [03:14<01:15,  2.37it/s]

[Rate Limit] Sleeping for 0.05 seconds...
[Rate Limit] Sleeping for 0.17 seconds...
[Rate Limit] Sleeping for 0.13 seconds...


 39%|███▊      | 112/290 [03:16<02:28,  1.20it/s]

[Rate Limit] Sleeping for 0.02 seconds...
[Rate Limit] Sleeping for 0.25 seconds...


 39%|███▉      | 114/290 [03:16<01:34,  1.87it/s]

[Rate Limit] Sleeping for 0.08 seconds...


 40%|███▉      | 115/290 [03:17<01:42,  1.71it/s]

[Rate Limit] Sleeping for 0.48 seconds...


 40%|████      | 116/290 [03:17<01:38,  1.77it/s]

[Rate Limit] Sleeping for 1.10 seconds...[Rate Limit] Sleeping for 1.10 seconds...



 41%|████      | 118/290 [03:17<01:09,  2.48it/s]

[Rate Limit] Sleeping for 0.75 seconds...


 41%|████      | 119/290 [03:18<01:05,  2.62it/s]

[Rate Limit] Sleeping for 0.45 seconds...


 41%|████▏     | 120/290 [03:18<01:02,  2.72it/s]

[Rate Limit] Sleeping for 0.12 seconds...


 42%|████▏     | 121/290 [03:19<01:41,  1.67it/s]

[Rate Limit] Sleeping for 48.31 seconds...


 42%|████▏     | 122/290 [03:20<01:27,  1.92it/s]

Error parsing response: invalid syntax (<unknown>, line 2)
[Rate Limit] Sleeping for 48.01 seconds...


 42%|████▏     | 123/290 [03:20<01:19,  2.09it/s]

[Rate Limit] Sleeping for 47.63 seconds...


 43%|████▎     | 124/290 [03:20<01:08,  2.43it/s]

[Rate Limit] Sleeping for 47.39 seconds...


 43%|████▎     | 126/290 [03:21<00:50,  3.24it/s]

[Rate Limit] Sleeping for 47.19 seconds...
[Rate Limit] Sleeping for 46.98 seconds...


 44%|████▍     | 127/290 [04:10<39:41, 14.61s/it]

[Rate Limit] Sleeping for 0.18 seconds...


 45%|████▌     | 131/290 [04:10<12:12,  4.61s/it]

[Rate Limit] Sleeping for 0.52 seconds...
[Rate Limit] Sleeping for 0.48 seconds...
[Rate Limit] Sleeping for 0.46 seconds...
[Rate Limit] Sleeping for 0.39 seconds...


 46%|████▌     | 132/290 [04:10<09:45,  3.71s/it]

[Rate Limit] Sleeping for 0.07 seconds...


 46%|████▌     | 133/290 [04:12<08:30,  3.25s/it]

[Rate Limit] Sleeping for 0.66 seconds...


 46%|████▌     | 134/290 [04:12<06:30,  2.50s/it]

[Rate Limit] Sleeping for 0.42 seconds...


 47%|████▋     | 137/290 [04:13<02:51,  1.12s/it]

[Rate Limit] Sleeping for 0.19 seconds...
[Rate Limit] Sleeping for 0.11 seconds...
[Rate Limit] Sleeping for 0.06 seconds...


 48%|████▊     | 138/290 [04:13<02:23,  1.06it/s]

[Rate Limit] Sleeping for 0.27 seconds...


 49%|████▉     | 143/290 [04:15<01:13,  1.99it/s]

[Rate Limit] Sleeping for 0.78 seconds...
[Rate Limit] Sleeping for 0.74 seconds...
[Rate Limit] Sleeping for 0.73 seconds...
[Rate Limit] Sleeping for 0.71 seconds...
[Rate Limit] Sleeping for 0.57 seconds...


 50%|████▉     | 144/290 [04:16<01:15,  1.94it/s]

[Rate Limit] Sleeping for 0.04 seconds...


 51%|█████     | 147/290 [04:18<01:16,  1.86it/s]

[Rate Limit] Sleeping for 0.18 seconds...
[Rate Limit] Sleeping for 0.12 seconds...
[Rate Limit] Sleeping for 0.02 seconds...
[Rate Limit] Sleeping for 0.28 seconds...
[Rate Limit] Sleeping for 0.27 seconds...


 52%|█████▏    | 150/290 [04:18<00:51,  2.73it/s]

[Rate Limit] Sleeping for 1.17 seconds...


 52%|█████▏    | 151/290 [04:20<01:15,  1.84it/s]

[Rate Limit] Sleeping for 0.08 seconds...


 52%|█████▏    | 152/290 [04:20<01:07,  2.03it/s]

Error parsing response: invalid syntax (<unknown>, line 2)
[Rate Limit] Sleeping for 0.17 seconds...


 53%|█████▎    | 153/290 [04:20<00:59,  2.32it/s]

[Rate Limit] Sleeping for 0.20 seconds...


 53%|█████▎    | 154/290 [04:20<00:54,  2.51it/s]

[Rate Limit] Sleeping for 0.11 seconds...
Error during request: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1-nano in organization org-xNx4bgcMtYDCt9h8gAsxgBbj on tokens per min (TPM): Limit 200000, Used 200000, Requested 7468. Please try again in 2.24s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}


 53%|█████▎    | 155/290 [04:22<01:45,  1.28it/s]

[Rate Limit] Sleeping for 47.26 seconds...


 54%|█████▍    | 156/290 [04:23<01:31,  1.47it/s]

[Rate Limit] Sleeping for 46.85 seconds...
Error during request: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1-nano in organization org-xNx4bgcMtYDCt9h8gAsxgBbj on tokens per min (TPM): Limit 200000, Used 194717, Requested 7706. Please try again in 726ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}
Error during request: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1-nano in organization org-xNx4bgcMtYDCt9h8gAsxgBbj on tokens per min (TPM): Limit 200000, Used 194667, Requested 7363. Please try again in 609ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}
Error during request: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1-nano in organization org-xNx4bgcMtYDCt9h8gAsxgBbj on tokens per min (TPM): Limit 200000, Used 194454, Requested 7

 54%|█████▍    | 157/290 [04:41<12:36,  5.69s/it]

[Rate Limit] Sleeping for 28.02 seconds...


 54%|█████▍    | 158/290 [04:44<10:21,  4.71s/it]

[Rate Limit] Sleeping for 25.79 seconds...
[Rate Limit] Sleeping for 25.77 seconds...
[Rate Limit] Sleeping for 25.71 seconds...


 56%|█████▌    | 161/290 [05:11<15:26,  7.18s/it]

[Rate Limit] Sleeping for 0.75 seconds...


 57%|█████▋    | 164/290 [05:12<07:44,  3.68s/it]

[Rate Limit] Sleeping for 0.13 seconds...
[Rate Limit] Sleeping for 0.06 seconds...
[Rate Limit] Sleeping for 0.24 seconds...
[Rate Limit] Sleeping for 0.23 seconds...
[Rate Limit] Sleeping for 0.16 seconds...


 58%|█████▊    | 167/290 [05:14<04:47,  2.33s/it]

[Rate Limit] Sleeping for 0.80 seconds...


 58%|█████▊    | 168/290 [05:14<03:59,  1.97s/it]

[Rate Limit] Sleeping for 0.58 seconds...
[Rate Limit] Sleeping for 0.52 seconds...
[Rate Limit] Sleeping for 0.51 seconds...


 59%|█████▉    | 171/290 [05:15<02:22,  1.19s/it]

[Rate Limit] Sleeping for 0.09 seconds...
[Rate Limit] Sleeping for 0.01 seconds...


 60%|██████    | 174/290 [05:17<01:59,  1.03s/it]

[Rate Limit] Sleeping for 0.35 seconds...
[Rate Limit] Sleeping for 0.15 seconds...
[Rate Limit] Sleeping for 0.10 seconds...


 61%|██████    | 176/290 [05:18<01:22,  1.37it/s]

[Rate Limit] Sleeping for 0.01 seconds...


 61%|██████    | 177/290 [05:18<01:13,  1.54it/s]

[Rate Limit] Sleeping for 0.19 seconds...


 62%|██████▏   | 180/290 [05:20<01:04,  1.69it/s]

[Rate Limit] Sleeping for 0.14 seconds...


 62%|██████▏   | 181/290 [05:20<00:58,  1.87it/s]

[Rate Limit] Sleeping for 0.02 seconds...


 63%|██████▎   | 182/290 [05:20<00:50,  2.15it/s]

[Rate Limit] Sleeping for 0.07 seconds...
[Rate Limit] Sleeping for 0.07 seconds...
[Rate Limit] Sleeping for 0.05 seconds...


 64%|██████▍   | 185/290 [05:23<01:04,  1.64it/s]

[Rate Limit] Sleeping for 0.12 seconds...
[Rate Limit] Sleeping for 0.12 seconds...


 65%|██████▍   | 188/290 [05:23<00:40,  2.50it/s]

[Rate Limit] Sleeping for 48.48 seconds...
[Rate Limit] Sleeping for 48.37 seconds...


 65%|██████▌   | 189/290 [05:23<00:40,  2.50it/s]

[Rate Limit] Sleeping for 47.97 seconds...
Error during request: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1-nano in organization org-xNx4bgcMtYDCt9h8gAsxgBbj on tokens per min (TPM): Limit 200000, Used 195342, Requested 7459. Please try again in 840ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}


 66%|██████▌   | 190/290 [05:27<01:58,  1.18s/it]

[Rate Limit] Sleeping for 44.21 seconds...


 66%|██████▌   | 191/290 [05:28<01:40,  1.01s/it]

[Rate Limit] Sleeping for 43.71 seconds...


 66%|██████▌   | 192/290 [05:44<08:02,  4.92s/it]

[Rate Limit] Sleeping for 27.82 seconds...


 67%|██████▋   | 193/290 [06:13<18:56, 11.71s/it]

[Rate Limit] Sleeping for 0.71 seconds...
[Rate Limit] Sleeping for 0.71 seconds...


 67%|██████▋   | 195/290 [06:14<10:33,  6.67s/it]

[Rate Limit] Sleeping for 0.31 seconds...
[Rate Limit] Sleeping for 0.24 seconds...


 68%|██████▊   | 197/290 [06:14<06:32,  4.22s/it]

[Rate Limit] Sleeping for 0.40 seconds...
[Rate Limit] Sleeping for 0.37 seconds...


 69%|██████▊   | 199/290 [06:16<04:30,  2.98s/it]

[Rate Limit] Sleeping for 1.42 seconds...


 69%|██████▉   | 200/290 [06:17<03:50,  2.56s/it]

[Rate Limit] Sleeping for 0.53 seconds...[Rate Limit] Sleeping for 0.53 seconds...



 70%|██████▉   | 202/290 [06:17<02:31,  1.72s/it]

[Rate Limit] Sleeping for 0.19 seconds...


 70%|███████   | 204/290 [06:19<01:59,  1.39s/it]

[Rate Limit] Sleeping for 0.32 seconds...


 71%|███████   | 205/290 [06:20<01:39,  1.17s/it]

[Rate Limit] Sleeping for 0.21 seconds...


 72%|███████▏  | 210/290 [06:22<00:49,  1.63it/s]

[Rate Limit] Sleeping for 0.60 seconds...


 73%|███████▎  | 211/290 [06:22<00:41,  1.89it/s]

[Rate Limit] Sleeping for 0.29 seconds...
[Rate Limit] Sleeping for 0.22 seconds...


 74%|███████▍  | 214/290 [06:25<00:50,  1.50it/s]

[Rate Limit] Sleeping for 2.46 seconds...
[Rate Limit] Sleeping for 2.46 seconds...
[Rate Limit] Sleeping for 2.42 seconds...


 75%|███████▍  | 217/290 [06:25<00:31,  2.28it/s]

[Rate Limit] Sleeping for 1.83 seconds...


 75%|███████▌  | 218/290 [06:28<00:58,  1.24it/s]

[Rate Limit] Sleeping for 45.65 seconds...


 76%|███████▌  | 219/290 [06:29<01:06,  1.06it/s]

[Rate Limit] Sleeping for 44.23 seconds...


 76%|███████▌  | 220/290 [06:30<00:58,  1.19it/s]

[Rate Limit] Sleeping for 43.71 seconds...
[Rate Limit] Sleeping for 43.71 seconds...
[Rate Limit] Sleeping for 43.70 seconds...


 77%|███████▋  | 223/290 [06:52<04:41,  4.20s/it]

[Rate Limit] Sleeping for 21.18 seconds...


 77%|███████▋  | 224/290 [07:15<08:41,  7.90s/it]

[Rate Limit] Sleeping for 0.29 seconds...


 79%|███████▊  | 228/290 [07:16<03:22,  3.27s/it]

[Rate Limit] Sleeping for 0.00 seconds...
[Rate Limit] Sleeping for 0.82 seconds...
[Rate Limit] Sleeping for 0.81 seconds...
[Rate Limit] Sleeping for 0.76 seconds...


 79%|███████▉  | 229/290 [07:17<02:58,  2.92s/it]

[Rate Limit] Sleeping for 1.83 seconds...


 79%|███████▉  | 230/290 [07:18<02:31,  2.52s/it]

[Rate Limit] Sleeping for 0.81 seconds...[Rate Limit] Sleeping for 0.81 seconds...



 80%|████████  | 232/290 [07:19<01:34,  1.63s/it]

[Rate Limit] Sleeping for 0.50 seconds...


 80%|████████  | 233/290 [07:19<01:19,  1.40s/it]

[Rate Limit] Sleeping for 0.09 seconds...


 81%|████████  | 234/290 [07:20<01:05,  1.18s/it]

[Rate Limit] Sleeping for 0.15 seconds...


 82%|████████▏ | 238/290 [07:22<00:35,  1.45it/s]

[Rate Limit] Sleeping for 0.09 seconds...
[Rate Limit] Sleeping for 0.06 seconds...
[Rate Limit] Sleeping for 0.04 seconds...
[Rate Limit] Sleeping for 0.20 seconds...
[Rate Limit] Sleeping for 0.19 seconds...


 83%|████████▎ | 240/290 [07:22<00:24,  2.01it/s]

[Rate Limit] Sleeping for 0.30 seconds...


 83%|████████▎ | 241/290 [07:24<00:40,  1.20it/s]

[Rate Limit] Sleeping for 0.48 seconds...
[Rate Limit] Sleeping for 0.48 seconds...
[Rate Limit] Sleeping for 0.48 seconds...


 84%|████████▍ | 244/290 [07:25<00:24,  1.86it/s]

[Rate Limit] Sleeping for 0.00 seconds...
[Rate Limit] Sleeping for 0.57 seconds...


 85%|████████▍ | 246/290 [07:25<00:20,  2.15it/s]

[Rate Limit] Sleeping for 2.35 seconds...


 85%|████████▌ | 247/290 [07:26<00:22,  1.94it/s]

[Rate Limit] Sleeping for 1.60 seconds...


 86%|████████▌ | 248/290 [07:27<00:25,  1.66it/s]

[Rate Limit] Sleeping for 0.66 seconds...


 86%|████████▌ | 249/290 [07:27<00:23,  1.76it/s]

[Rate Limit] Sleeping for 0.20 seconds...


 86%|████████▌ | 250/290 [07:28<00:22,  1.80it/s]

[Rate Limit] Sleeping for 1.11 seconds...
Error during request: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1-nano in organization org-xNx4bgcMtYDCt9h8gAsxgBbj on tokens per min (TPM): Limit 200000, Used 200000, Requested 7905. Please try again in 2.371s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}


 87%|████████▋ | 251/290 [07:30<00:31,  1.23it/s]

[Rate Limit] Sleeping for 0.08 seconds...
[Rate Limit] Sleeping for 0.01 seconds...


 87%|████████▋ | 253/290 [07:30<00:23,  1.57it/s]

[Rate Limit] Sleeping for 21.80 seconds...


 88%|████████▊ | 254/290 [07:32<00:31,  1.15it/s]

[Rate Limit] Sleeping for 20.17 seconds...
Error during request: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1-nano in organization org-xNx4bgcMtYDCt9h8gAsxgBbj on tokens per min (TPM): Limit 200000, Used 198256, Requested 7676. Please try again in 1.779s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}
Error during request: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1-nano in organization org-xNx4bgcMtYDCt9h8gAsxgBbj on tokens per min (TPM): Limit 200000, Used 193111, Requested 7671. Please try again in 234ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}


 88%|████████▊ | 255/290 [07:38<01:11,  2.06s/it]

[Rate Limit] Sleeping for 14.53 seconds...


 88%|████████▊ | 256/290 [07:49<02:32,  4.48s/it]

[Rate Limit] Sleeping for 3.23 seconds...


 89%|████████▊ | 257/290 [07:54<02:34,  4.67s/it]

[Rate Limit] Sleeping for 21.40 seconds...


 89%|████████▉ | 258/290 [07:54<01:49,  3.42s/it]

[Rate Limit] Sleeping for 21.18 seconds...
[Rate Limit] Sleeping for 21.13 seconds...


 90%|████████▉ | 260/290 [07:55<00:59,  1.99s/it]

[Rate Limit] Sleeping for 20.78 seconds...


 90%|█████████ | 261/290 [07:55<00:45,  1.56s/it]

[Rate Limit] Sleeping for 20.57 seconds...


 90%|█████████ | 262/290 [07:56<00:38,  1.38s/it]

[Rate Limit] Sleeping for 19.72 seconds...


 91%|█████████ | 264/290 [08:17<02:08,  4.94s/it]

[Rate Limit] Sleeping for 0.09 seconds...
[Rate Limit] Sleeping for 0.93 seconds...
[Rate Limit] Sleeping for 0.86 seconds...


 92%|█████████▏| 266/290 [08:18<01:07,  2.82s/it]

[Rate Limit] Sleeping for 0.65 seconds...


 92%|█████████▏| 267/290 [08:18<00:52,  2.30s/it]

[Rate Limit] Sleeping for 0.29 seconds...
[Rate Limit] Sleeping for 0.29 seconds...


 93%|█████████▎| 269/290 [08:20<00:34,  1.64s/it]

[Rate Limit] Sleeping for 1.83 seconds...


 94%|█████████▍| 273/290 [08:21<00:12,  1.32it/s]

[Rate Limit] Sleeping for 1.06 seconds...
[Rate Limit] Sleeping for 1.06 seconds...
[Rate Limit] Sleeping for 0.97 seconds...
[Rate Limit] Sleeping for 0.90 seconds...
[Rate Limit] Sleeping for 0.84 seconds...


 96%|█████████▌| 277/290 [08:24<00:09,  1.35it/s]

[Rate Limit] Sleeping for 0.19 seconds...[Rate Limit] Sleeping for 0.19 seconds...

[Rate Limit] Sleeping for 0.09 seconds...
[Rate Limit] Sleeping for 0.07 seconds...


 96%|█████████▌| 279/290 [08:25<00:06,  1.68it/s]

[Rate Limit] Sleeping for 0.08 seconds...
[Rate Limit] Sleeping for 0.06 seconds...


 97%|█████████▋| 281/290 [08:26<00:05,  1.59it/s]

[Rate Limit] Sleeping for 0.02 seconds...


 97%|█████████▋| 282/290 [08:26<00:04,  1.70it/s]

[Rate Limit] Sleeping for 0.55 seconds...


 98%|█████████▊| 283/290 [08:27<00:04,  1.71it/s]

[Rate Limit] Sleeping for 0.44 seconds...


 98%|█████████▊| 284/290 [08:29<00:04,  1.21it/s]

[Rate Limit] Sleeping for 0.83 seconds...
Error during request: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1-nano in organization org-xNx4bgcMtYDCt9h8gAsxgBbj on tokens per min (TPM): Limit 200000, Used 200000, Requested 7738. Please try again in 2.321s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}
Error during request: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1-nano in organization org-xNx4bgcMtYDCt9h8gAsxgBbj on tokens per min (TPM): Limit 200000, Used 197003, Requested 7720. Please try again in 1.416s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}
Error during request: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1-nano in organization org-xNx4bgcMtYDCt9h8gAsxgBbj on tokens per min (TPM): Limit 200000, Used 197190, Requested 

 98%|█████████▊| 285/290 [08:31<00:06,  1.21s/it]

Error during request: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1-nano in organization org-xNx4bgcMtYDCt9h8gAsxgBbj on tokens per min (TPM): Limit 200000, Used 193944, Requested 7176. Please try again in 336ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}


100%|██████████| 290/290 [08:51<00:00,  1.83s/it]


In [13]:
# check which ones have sentiment as 'Error'
error_sentiments = cryptopanic_news_with_labels[cryptopanic_news_with_labels['sentiment'] == 'Error']
print(f"Number of errors: {len(error_sentiments)}")

# Batch processing also the ones with 'Error' sentiment
while True:
    news_with_errors = cryptopanic_news_with_labels[cryptopanic_news_with_labels['sentiment'] == 'Error']
    if news_with_errors.empty:
        print("No more errors to process.")
        break

    print(f"Processing {len(news_with_errors)} news items with 'Error' sentiment...")

    new_batches = process_all_batches(news_with_errors, max_workers=6)
    cryptopanic_news_with_labels.update(new_batches)

Number of errors: 1164
Processing 1164 news items with 'Error' sentiment...


100%|██████████| 15/15 [00:08<00:00,  1.84it/s]


Processing 176 news items with 'Error' sentiment...


100%|██████████| 3/3 [00:03<00:00,  1.19s/it]


Processing 23 news items with 'Error' sentiment...


100%|██████████| 1/1 [00:00<00:00,  1.02it/s]

No more errors to process.





In [15]:
# check statistics of the sentiment labels
sentiment_counts = cryptopanic_news_with_labels['sentiment'].value_counts()
print("Sentiment counts:")
print(sentiment_counts)

# print all the news with not 'Positive', 'Neutral', 'Negative'
non_standard_sentiments = cryptopanic_news_with_labels[~cryptopanic_news_with_labels['sentiment'].isin(['Positive', 'Neutral', 'Negative'])]
print("News with non-standard sentiments:")
print(non_standard_sentiments[['title', 'description', 'sentiment']])

Sentiment counts:
sentiment
Positive               12461
Negative                4977
Neutral                 2030
Bullish                   16
Bullish/Positive          10
Bullish/Speculative        1
Name: count, dtype: int64
News with non-standard sentiments:
                                                   title  \
919    Polygon Launches Burning Mechanism Similar to ...   
922    Why Did Elon Musk Choose Dogecoin Over Bitcoin...   
923    Number of SHIB holders surpass 800,000 adding ...   
1547   Proposal on SEC website suggests XRP as a stra...   
3615   Bitcoin transactions need 2 hours to be safe, ...   
3622   Crypto Winter Is Rough. Here Are Five Essentia...   
5127            AI predicts XRP price for Halloween 2023   
5129   Bitcoin price plunges below $43K in minutes in...   
7581   Dogecoin founder slams every token on the Bina...   
9003   Bitcoin, Ether, SOL, Cardano, XRP, Shiba Inu B...   
9133   Cardano-based DeFi ADALend Announces IDO, Hire...   
9136   Algorand (

In [20]:
# not sure how to handle these, so as there are not many, we will skip them
cryptopanic_news_with_labels = cryptopanic_news_with_labels[
    cryptopanic_news_with_labels['sentiment'].isin(['Positive', 'Neutral', 'Negative'])
]

# save the processed DataFrame to a new Parquet file
output_file = f"{CLEAN_DATA_FOLDER}/cryptopanic_news_with_labels.parquet"

cryptopanic_news_with_labels.to_parquet(output_file, index=False)

In [19]:
# read test
cryptopanic_news_with_labels_test = pl.read_parquet(output_file)
cryptopanic_news_with_labels_test

cryptopanic_id,title,description,source_domain,published_at,cryptopanic_url,currencies,sentiment
i64,str,str,str,datetime[μs],str,str,str
176510,"""Crypto Investor Makes Over 3,6…","""An investor has made an incred…","""cryptodnes.bg""",2024-12-20 10:00:54,"""https://cryptopanic.com/news/2…","""SOL""","""Positive"""
176215,"""Cardano’s 10% jump sparks hope…","""After a quiet December, 2025 s…","""ambcrypto.com""",2025-01-02 11:00:07,"""https://cryptopanic.com/news/2…","""ADA""","""Negative"""
87112,"""Someone Paid 93 ETH In Fees Fo…","""Data shows someone has today p…","""bitcoinist.com""",2022-11-18 16:00:04,"""https://cryptopanic.com/news/1…","""ETH""","""Neutral"""
82678,"""‘Critical’ Polygon bug put $24…","""Polygon was at risk of losing …","""theblockcrypto.com""",2021-12-29 19:01:22,"""https://cryptopanic.com/news/m…","""MATIC""","""Positive"""
120931,"""Charles Hoskinson reveals ‘con…","""The cryptocurrency market may …","""finbold.com""",2022-12-08 14:16:55,"""https://cryptopanic.com/news/1…","""ADA,XCN""","""Neutral"""
…,…,…,…,…,…,…,…
79592,"""We Should All Expect Repercuss…","""Year-to-date gains for Dogecoi…","""newsbtc.com""",2021-04-18 12:15:49,"""https://cryptopanic.com/news/d…","""DOGE""","""Negative"""
77552,"""Bitcoin Dives to $37,000, Nasd…","""The Powell rally did not last …","""trustnodes.com""",2022-05-05 15:14:48,"""https://cryptopanic.com/news/b…","""BTC""","""Negative"""
83671,"""Shiba Inu Goes Live on Mexico’…","""The second-biggest canine cryp…","""u.today""",2022-01-12 05:30:39,"""https://cryptopanic.com/news/s…","""SHIB""","""Positive"""
86423,"""Fed Leaves Policy on Hold, End…","""The U.S. central bank signaled…","""coindesk.com""",2023-06-14 18:04:07,"""https://cryptopanic.com/news/1…","""BTC""","""Neutral"""
