<a href="https://colab.research.google.com/github/dude123studios/TradingBots/blob/main/NVIDIA_STOCK.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
import pandas as pd
from datetime import datetime, timedelta

# Define API key and base URL
API_KEY = '658d2e0e1d4a4203babad16ca1612e72'
BASE_URL = 'https://newsapi.org/v2/everything?'

# Define search query
query = 'NVIDIA OR GPU OR "Artificial Intelligence" OR AI'

# Define date range
end_date = datetime.today().date()
start_date = end_date - timedelta(days=27)  # Past 3 months

# Fetch articles
url = (f"{BASE_URL}q={query}&from={start_date}&to={end_date}"
       f"&sortBy=popularity&pageSize=100&apiKey={API_KEY}")  # Fetch top 100 articles

response = requests.get(url)
articles = response.json().get('articles', [])
# Process articles into DataFrame
news_data = []
for article in articles:
    if article['content']:  # Ensure the content is not None
        news_data.append({
            'publishedAt': article['publishedAt'][:100],
            'content': article['title'] + ". " + (article['description'] or ""),
        })

df = pd.DataFrame(news_data)
df['publishedAt'] = pd.to_datetime(df['publishedAt']).dt.date

# Limit to 10 articles per day
df = df.groupby('publishedAt').head(10).reset_index(drop=True)

# Display the first few rows
print(df.head())

  publishedAt                                            content
0  2025-02-07  OnePlus 13 review: A focused flagship that ign...
1  2025-02-20  A $599 iPhone 16e is a cruel joke. The $599 iP...
2  2025-02-13  Gemini Advanced can now recall your past conve...
3  2025-02-12  Google will use machine learning to try and te...
4  2025-02-27  iPhone 16e review: What's your acceptable comp...


In [None]:
import json
import pandas as pd
import yfinance as yf
import requests
from datetime import datetime, timedelta

# Step 2: Fetch NVIDIA & S&P 500 stock data
nvidia_ticker = yf.Ticker("NVDA")
sp500_ticker = yf.Ticker("^GSPC")  # S&P 500 Index

# Fetch stock data from 3 months ago to today + 3 days (for labeling)
stock_start = start_date - timedelta(days=1)
stock_end = end_date + timedelta(days=3)

nvda_prices = nvidia_ticker.history(start=stock_start, end=stock_end)[["Close"]].reset_index()
sp500_prices = sp500_ticker.history(start=stock_start, end=stock_end)[["Close"]].reset_index()

# Convert stock dates
nvda_prices["Date"] = nvda_prices["Date"].dt.date
sp500_prices["Date"] = sp500_prices["Date"].dt.date

# Function to assign labels based on NVIDIA vs. S&P 500 performance
def get_relative_label(news_date):
    if news_date not in nvda_prices["Date"].values or news_date not in sp500_prices["Date"].values:
        return None

    # NVIDIA & S&P 500 closing prices on news day
    nvda_close_today = nvda_prices.loc[nvda_prices["Date"] == news_date, "Close"].values[0]
    sp500_close_today = sp500_prices.loc[sp500_prices["Date"] == news_date, "Close"].values[0]

    # Look at 1-2 trading days ahead for stock movement
    next_day = news_date + timedelta(days=1)
    next2_day = news_date + timedelta(days=2)

    while next_day not in nvda_prices["Date"].values and next2_day not in nvda_prices["Date"].values:
        next_day += timedelta(days=1)
        next2_day += timedelta(days=1)
        if (next_day - news_date).days > 4:  # Skip weekends but avoid infinite loops
            return None

    # Find closest available stock data
    nvda_close_future = nvda_prices.loc[nvda_prices["Date"] == (next_day if next_day in nvda_prices["Date"].values else next2_day), "Close"].values[0]
    sp500_close_future = sp500_prices.loc[sp500_prices["Date"] == (next_day if next_day in sp500_prices["Date"].values else next2_day), "Close"].values[0]

    # Calculate percentage change
    nvda_return = ((nvda_close_future - nvda_close_today) / nvda_close_today) * 100
    sp500_return = ((sp500_close_future - sp500_close_today) / sp500_close_today) * 100

    # NVIDIA relative performance
    relative_change = nvda_return - sp500_return

    # Assign labels
    if relative_change > 0.5:
        return 0  # NVIDIA outperformed the market
    elif relative_change < -0.5:
        return 1  # NVIDIA underperformed
    else:
        return 2  # NVIDIA moved similarly to the market

# Apply function to get labels
df["sentiment"] = df["publishedAt"].apply(get_relative_label)

# Remove rows where labels couldn't be assigned
df = df.dropna(subset=["sentiment"])

# Save labeled dataset
df.to_csv("nvidia_news_labeled.csv", index=False)

print("News dataset collected & labeled successfully!")

News dataset collected & labeled successfully!


In [None]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py311-none-any.whl.metadata (7.2 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:

import evaluate
import numpy as np
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)  # Convert logits to class predictions
    return accuracy_metric.compute(predictions=predictions, references=labels)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments, BertTokenizer
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset


# Initialize tokenizer
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')

# Custom Dataset class
class NewsDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(int(label), dtype=torch.long)
        }



# Split data into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['content'].tolist(), df['sentiment'].tolist(), test_size=0.2, random_state=42
)

# Create datasets
train_dataset = NewsDataset(train_texts, train_labels, tokenizer, max_len=512)
val_dataset = NewsDataset(val_texts, val_labels, tokenizer, max_len=512)

# Load pre-trained FinBERT model
model = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone', num_labels=3)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=8,   # batch size for training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    evaluation_strategy="epoch",     # evaluation frequency
    save_strategy="epoch",           # save model at the end of each epoch
    load_best_model_at_end=True,     # load the best model when finished training
)

# Define Trainer
trainer = Trainer(
    model=model,                         # the model to train
    args=training_args,                  # training arguments
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation dataset
    compute_metrics=compute_metrics
)

# Start training
trainer.train()

# Save the trained model
model.save_pretrained("./finbert_nvidia_stock_model")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mdude123studios[0m ([33mdude123studios-university-of-oxford[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,2.243104,0.8
2,3.289300,2.20709,0.8
3,3.289300,2.154343,0.8


In [None]:
import shutil
shutil.make_archive("finbert_nvidia_stock_model_1", 'zip', './finbert_nvidia_stock_model')

'/content/finbert_nvidia_stock_model_1.zip'

In [None]:
tokenizer.save_pretrained("./finbert_nvidia")

('./finbert_nvidia/tokenizer_config.json',
 './finbert_nvidia/special_tokens_map.json',
 './finbert_nvidia/vocab.txt',
 './finbert_nvidia/added_tokens.json')

In [None]:
from transformers import pipeline

# Load fine-tuned model
finbert_sentiment = pipeline("text-classification", model="./finbert_nvidia_stock_model", tokenizer="./finbert_nvidia")

# Test on a new article
test_text = "NVIDIA reports record profits due to AI boom."
result = finbert_sentiment(test_text)

print(f"Sentiment Prediction: {result}")

Device set to use cpu


Sentiment Prediction: [{'label': 'Positive', 'score': 1.0}]


In [None]:
test_text_2 = '''OpenAI announced on Thursday it is launching GPT-4.5, the much-anticipated AI model code-named Orion. GPT-4.5 is OpenAI’s largest model to date, trained using more computing power and data than any of the company’s previous releases.

Despite its size, OpenAI notes in a whitepaper that it does not consider GPT-4.5 to be a frontier model.

4-Pack Colorful Spring Cat Toys, Elastic Soft Tube Interactive Feline Playthings, High Visual Appeal Pet Supplies

Subscribers to ChatGPT Pro, OpenAI’s $200-a-month plan, will gain access to GPT-4.5 in ChatGPT starting Thursday as part of a research preview. Developers on paid tiers of OpenAI’s API will also be able to use GPT-4.5 starting today. As for other ChatGPT users, customers signed up for ChatGPT Plus and ChatGPT Team should get the model sometime next week, an OpenAI spokesperson told TechCrunch.

The industry has held its collective breath for Orion, which some consider to be a bellwether for the viability of traditional AI training approaches. GPT-4.5 was developed using the same key technique – dramatically increasing the amount of computing power and data during a “pre-training” phase called unsupervised learning — that OpenAI used to develop GPT-4, GPT-3, GPT-2, and GPT-1.

In every GPT generation before GPT-4.5, scaling up led to massive jumps in performance across domains including mathematics, writing, and coding. Indeed, OpenAI says that GPT-4.5’s increased size has given it “a deeper world knowledge” and “higher emotional intelligence.” However, there are signs that the gains from scaling up data and computing are beginning to level off. On several AI benchmarks, GPT-4.5 falls short of newer AI “reasoning” models from Chinese AI company DeepSeek, Anthropic, and OpenAI itself.'''

result = finbert_sentiment(test_text_2)

print(f"Sentiment Prediction: {result}")

Sentiment Prediction: [{'label': 'Neutral', 'score': 0.9988709092140198}]


In [None]:
import requests
import torch
import datetime
from collections import Counter

# Step 1: Fetch Today's AI/NVIDIA/GPU-related News
def fetch_latest_news(api_key):
    today = (datetime.datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')
    url = f"https://newsapi.org/v2/everything?q=(NVIDIA OR GPU OR AI)&from={today}&sortBy=popularity&pageSize=10&apiKey={api_key}"
    response = requests.get(url)
    data = response.json()

    if "articles" in data:
        articles = [article["title"] + ". " + article["description"] for article in data["articles"] if article["description"]]
        return articles
    else:
        print("Error fetching news:", data)
        return []

# Step 2: Load the Fine-Tuned FinBERT Model
model_path = "./finbert_nvidia"  # Change this to your model's path
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained('./finbert_nvidia_stock_model')
model.eval()

# Step 3: Predict NVIDIA Stock Movement
def predict_stock_movement(news_articles):
    predictions = []
    label_map = {0: "Stock Down", 1: "Stock Same", 2: "Stock Up"}  # Ensure this matches your fine-tuning labels

    for article in news_articles:
        inputs = tokenizer(article, truncation=True, padding=True, return_tensors="pt", max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)

        logits = outputs.logits
        predicted_label = torch.argmax(logits, dim=1).item()
        predictions.append((article, label_map[predicted_label]))

    return predictions

# Step 4: Compute the Majority Prediction
def majority_vote(predictions):
    labels = [pred[1] for pred in predictions]  # Extract predicted labels
    count = Counter(labels)  # Count occurrences
    most_common_label = count.most_common(1)[0][0]  # Get most common label

    print("\n--- Majority Prediction Based on Today's News ---")
    print(f"Stock Movement Prediction: {most_common_label}")
    print("Breakdown:", count)

# Step 5: Run the Prediction Pipeline
api_key = "658d2e0e1d4a4203babad16ca1612e72"  # Replace with your NewsAPI key
news_articles = fetch_latest_news(api_key)

if news_articles:
    predictions = predict_stock_movement(news_articles)

    # Print individual predictions
    for i, (article, prediction) in enumerate(predictions):
        print(f"News {i+1}: {article[:150]}...")  # Print first 150 chars for readability
        print(f"Predicted Impact on NVIDIA Stock: {prediction}\n")

    # Compute and Print Majority Result
    majority_vote(predictions)
else:
    print("No news articles found.")

News 1: TCL's 60 series phones pack premium features without the high-end price. TCL is bringing AI smarts to $200 phones with a little help from the cloud....
Predicted Impact on NVIDIA Stock: Stock Down

News 2: New MacBook Air Coming This Week: What to Expect. Apple CEO Tim Cook teased a new product announcement this week, sharing a short video that says "the...
Predicted Impact on NVIDIA Stock: Stock Down

News 3: Researchers Find Less-Educated Areas Adopting AI Writing Tools Faster. An anonymous reader quotes a report from Ars Technica: Since the launch of Chat...
Predicted Impact on NVIDIA Stock: Stock Down

News 4: Hijacking AirTag Infrastructure To Track Arbitrary Devices. In case you weren’t aware, Apple devices around you are constantly scanning for AirTags. N...
Predicted Impact on NVIDIA Stock: Stock Down

News 5: The $1000 RTX 5070 Ti may be the new normal. Nvidia’s vendor partners are intent on keeping the prices of its 50-series graphics cards high. Recent re...
Predicte