In [50]:
!pip install yfinance feedparser newspaper3k scikit-learn pandas numpy matplotlib seaborn xgboost sentence-transformers tqdm



In [51]:
!pip install lxml_html_clean



In [52]:
import os
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
import yfinance as yf
import feedparser
from newspaper import Article
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

In [65]:
# Choose ticker and date range
TICKER = "AAPL"
START_DATE = "2024-01-01" # Changed to a more recent start date
END_DATE = "2026-06-30" # Further extended END_DATE to ensure future trading days for news alignment


# News sources (we'll use Google News RSS for the ticker and optionally add others)
# No API key required for this RSS approach.
GOOGLE_NEWS_RSS_TEMPLATE = "https://news.google.com/rss/search?q={query}&hl=en-US&gl=US&ceid=US:en"


# Keyword query for Google News — ticker + company name helps
COMPANY_NAME = "Apple"
QUERY = f'{TICKER} OR {COMPANY_NAME}'


# Threshold to label a next-day move as up/down (e.g., 0 -> any positive move is Up)
RETURN_THRESHOLD = 0.0


# Whether to compute SBERT embeddings (better semantic features) — set to True to use
USE_SBERT = False
SBERT_MODEL = 'all-MiniLM-L6-v2' # small, fast

In [66]:
print('Downloading price data for', TICKER)
prices = yf.download(TICKER, start=START_DATE, end=END_DATE, progress=False)

# Store original columns to check if they are MultiIndex
original_columns = prices.columns

# Flatten the column names if they are a MultiIndex (common with yfinance output)
if isinstance(original_columns, pd.MultiIndex):
    # For yfinance with a single ticker, if previous attempt with level 1 resulted in 'AAPL',
    # then metrics are likely at level 0
    prices.columns = original_columns.get_level_values(0)

desired_cols = ['Open','High','Low','Close','Volume']
# Filter prices to only keep the desired columns that actually exist
existing_desired_cols = [col for col in desired_cols if col in prices.columns]

# Check if all desired columns are present; if not, raise an informative error
if len(existing_desired_cols) != len(desired_cols):
    missing_cols = set(desired_cols) - set(existing_desired_cols)
    raise KeyError(f"Missing one or more required columns: {missing_cols}. Existing columns after processing: {prices.columns.tolist()}")

prices = prices[existing_desired_cols]

prices.index = pd.to_datetime(prices.index)
prices.head()


# Add next-day return target
prices['Next_Close'] = prices['Close'].shift(-1)
prices['Next_Return'] = (prices['Next_Close'] - prices['Close']) / prices['Close']
prices = prices.dropna(subset=['Next_Return'])


# Label: 1 if next return > threshold else 0
prices['Target'] = (prices['Next_Return'] > RETURN_THRESHOLD).astype(int)

Downloading price data for AAPL


  prices = yf.download(TICKER, start=START_DATE, end=END_DATE, progress=False)


In [67]:
# This function fetches RSS entries and extracts full text using newspaper3k


def fetch_news_rss(query, start_date, end_date, max_items=500):
    rss_url = GOOGLE_NEWS_RSS_TEMPLATE.format(query=query.replace(' ', '+'))
    print('Fetching RSS:', rss_url)
    feed = feedparser.parse(rss_url)
    print(f'Raw RSS feed entries: {len(feed.entries)} before filtering by date')
    items = []
    for entry in feed.entries:
        # Google News gives published_parsed
        try:
            published = datetime(*entry.published_parsed[:6])
        except Exception:
            published = None
        link = entry.link
        title = entry.title
        items.append({'title': title, 'link': link, 'published': published})
    # Filter by date
    df = pd.DataFrame(items)
    df = df.dropna(subset=['published'])

    # Heuristic: if published dates are in the future relative to END_DATE, adjust them to END_DATE
    # This is a workaround for unreliable RSS feed published dates.
    max_allowed_date = pd.to_datetime(end_date)
    df.loc[df['published'] > max_allowed_date, 'published'] = max_allowed_date

    df = df[(df['published'] >= pd.to_datetime(start_date)) & (df['published'] <= pd.to_datetime(end_date))]
    if df.shape[0] > max_items:
        df = df.sample(max_items, random_state=1)
    # Extract article text (slow)
    texts = []
    for i, row in tqdm(df.iterrows(), total=df.shape[0], desc='Downloading articles'):
        try:
            art = Article(row.link)
            art.download()
            art.parse()
            txt = art.text
            if not txt:
                txt = row.title
        except Exception:
            txt = row.title
        texts.append(txt)
    df['text'] = texts
    return df

# Use the global START_DATE and END_DATE for news fetching to align with price data
news_df = fetch_news_rss(QUERY, START_DATE, END_DATE, max_items=300)
print('Fetched', len(news_df), 'articles')
news_df.head()

Fetching RSS: https://news.google.com/rss/search?q=AAPL+OR+Apple&hl=en-US&gl=US&ceid=US:en
Raw RSS feed entries: 100 before filtering by date


Downloading articles: 100%|██████████| 100/100 [00:19<00:00,  5.07it/s]


Fetched 100 articles


Unnamed: 0,title,link,published,text
0,Apple Cuts Jobs Across Its Sales Organization ...,https://news.google.com/rss/articles/CBMitwFBV...,2025-11-24 19:36:51,Apple Cuts Jobs Across Its Sales Organization ...
1,Apple (AAPL) Cuts Jobs in Sales Division - Tip...,https://news.google.com/rss/articles/CBMid0FVX...,2025-11-24 20:43:55,Apple (AAPL) Cuts Jobs in Sales Division - Tip...
2,Prediction: Taiwan Semiconductor Will Be Worth...,https://news.google.com/rss/articles/CBMimAFBV...,2025-11-24 12:00:00,Prediction: Taiwan Semiconductor Will Be Worth...
3,Apple reduces headcount across sales teams to ...,https://news.google.com/rss/articles/CBMirgFBV...,2025-11-24 20:52:59,Apple reduces headcount across sales teams to ...
4,"iPhone Pocket, China Sales, $600 Million Paten...",https://news.google.com/rss/articles/CBMi0AFBV...,2025-11-23 12:01:06,"iPhone Pocket, China Sales, $600 Million Paten..."


In [69]:
# We'll assign each article to the next trading day (i.e., article published on day D influences return on D+1)


# Ensure published has date only
news_df['date'] = news_df['published'].dt.date
prices['date'] = prices.index.date


# For each article, find the next trading day index in prices
price_dates = pd.Series(prices.index.date, index=prices.index)


def find_next_trading_date(article_date):
    # article_date is a datetime.date
    # find the earliest price date > article_date
    mask = price_dates.index[price_dates.index.date > article_date]
    if len(mask) == 0:
        return None
    return mask[0].date()


news_df['target_date'] = news_df['date'].apply(find_next_trading_date)
news_df = news_df.dropna(subset=['target_date'])


# Merge with prices to get label
prices_reset = prices.reset_index()
prices_reset['date'] = prices_reset['Date'].dt.date # Corrected from 'index' to 'Date'
merge_df = news_df.merge(prices_reset[['date','Target','Next_Return']], left_on='target_date', right_on='date', how='left')
merge_df = merge_df.dropna(subset=['Target'])
print('Dataset size after alignment:', merge_df.shape)
merge_df.head()

Dataset size after alignment: (11, 9)


Unnamed: 0,title,link,published,text,date_x,target_date,date_y,Target,Next_Return
0,Apple announces finalists for the 2025 App Sto...,https://news.google.com/rss/articles/CBMimwFBV...,2025-11-19 14:02:33,Apple announces finalists for the 2025 App Sto...,2025-11-19,2025-11-20,2025-11-20,1,0.019681
1,Mapping the future with 3D-printed titanium Ap...,https://news.google.com/rss/articles/CBMiowFBV...,2025-11-18 14:02:55,Mapping the future with 3D-printed titanium Ap...,2025-11-18,2025-11-19,2025-11-19,0,-0.008601
2,Introducing iPhone Pocket: a beautiful way to ...,https://news.google.com/rss/articles/CBMiqgFBV...,2025-11-11 08:00:00,Introducing iPhone Pocket: a beautiful way to ...,2025-11-11,2025-11-12,2025-11-12,0,-0.001901
3,"Apple introduces Digital ID, a new way to crea...",https://news.google.com/rss/articles/CBMivgFBV...,2025-11-12 08:00:00,"Apple introduces Digital ID, a new way to crea...",2025-11-12,2025-11-13,2025-11-13,0,-0.001978
4,"Introducing iPhone Air, a powerful new iPhone ...",https://news.google.com/rss/articles/CBMisAFBV...,2025-09-09 07:00:00,"Introducing iPhone Air, a powerful new iPhone ...",2025-09-09,2025-09-10,2025-09-10,1,0.014286


In [70]:
from sklearn.model_selection import StratifiedKFold


texts = merge_df['text'].fillna(merge_df['title']).values
labels = merge_df['Target'].astype(int).values


# TF-IDF vectorizer
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2), stop_words='english')
X_tfidf = tfidf.fit_transform(texts)
print('TF-IDF shape:', X_tfidf.shape)


# Optional: SBERT embeddings
if USE_SBERT:
    print('Computing SBERT embeddings...')
    sbert = SentenceTransformer(SBERT_MODEL)
    X_sbert = sbert.encode(texts, show_progress_bar=True)
    # combine features (simple concat)
    from scipy.sparse import hstack
    X = hstack([X_tfidf, X_sbert])
else:
    X = X_tfidf

TF-IDF shape: (11, 142)


In [72]:
X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(X, labels, merge_df.index, test_size=0.2, random_state=42, stratify=labels)


# Logistic Regression baseline
clf_lr = LogisticRegression(max_iter=1000)
clf_lr.fit(X_train, y_train)
preds_lr = clf_lr.predict(X_test)
print('Logistic Regression')
print(classification_report(y_test, preds_lr))


# Random Forest
clf_rf = RandomForestClassifier(n_estimators=200, random_state=42)
clf_rf.fit(X_train, y_train)
preds_rf = clf_rf.predict(X_test)
print('Random Forest')
print(classification_report(y_test, preds_rf))


# XGBoost
clf_xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
clf_xgb.fit(X_train, y_train)
preds_xgb = clf_xgb.predict(X_test)
print('XGBoost')
print(classification_report(y_test, preds_xgb))


# Compare accuracy
for name, preds in [('LR', preds_lr), ('RF', preds_rf), ('XGB', preds_xgb)]:
    print(name, 'acc:', accuracy_score(y_test, preds), 'roc_auc:', roc_auc_score(y_test, (clf_xgb.predict_proba(X_test)[:,1] if name=='XGB' else (clf_rf.predict_proba(X_test)[:,1] if name=='RF' else clf_lr.predict_proba(X_test)[:,1]))))

Logistic Regression
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.67      1.00      0.80         2

    accuracy                           0.67         3
   macro avg       0.33      0.50      0.40         3
weighted avg       0.44      0.67      0.53         3



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Random Forest
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.67      1.00      0.80         2

    accuracy                           0.67         3
   macro avg       0.33      0.50      0.40         3
weighted avg       0.44      0.67      0.53         3

XGBoost
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.67      1.00      0.80         2

    accuracy                           0.67         3
   macro avg       0.33      0.50      0.40         3
weighted avg       0.44      0.67      0.53         3

LR acc: 0.6666666666666666 roc_auc: 1.0
RF acc: 0.6666666666666666 roc_auc: 0.75
XGB acc: 0.6666666666666666 roc_auc: 0.5


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [75]:
from sklearn.metrics.pairwise import linear_kernel


corpus_vectors = X_tfidf # tfidf only


def retrieve(query, top_k=5):
    qv = tfidf.transform([query])
    sims = linear_kernel(qv, corpus_vectors).flatten()
    top_idx = sims.argsort()[::-1][:top_k]
    results = merge_df.iloc[top_idx][['title','link','published','text','Next_Return','Target']].copy()
    results['score'] = sims[top_idx]
    return results


# Example retrieve
print('Top 3 news for query: "earnings beat"')
print(retrieve('earnings beat', top_k=3))

Top 3 news for query: "earnings beat"
                                                title  \
10  A major evolution of Apple Security Bounty, wi...   
9   Apple increases U.S. commitment to $600 billio...   
8   Apple unleashes M5, the next big leap in AI pe...   

                                                 link           published  \
10  https://news.google.com/rss/articles/CBMib0FVX... 2025-10-10 07:00:00   
9   https://news.google.com/rss/articles/CBMitwFBV... 2025-08-06 07:00:00   
8   https://news.google.com/rss/articles/CBMisgFBV... 2025-10-15 07:00:00   

                                                 text  Next_Return  Target  \
10  A major evolution of Apple Security Bounty, wi...     0.000444       1   
9   Apple increases U.S. commitment to $600 billio...     0.042358       1   
8   Apple unleashes M5, the next big leap in AI pe...     0.019559       1   

    score  
10    0.0  
9     0.0  
8     0.0  


In [77]:
def predict_article_impact(article_text):
    v = tfidf.transform([article_text])
    pred = clf_xgb.predict(v)
    prob = clf_xgb.predict_proba(v)[0,1]
    return int(pred[0]), float(prob)


def query_and_predict(query, top_k=5):
    results = retrieve(query, top_k=top_k)
    out = []
    for _, row in results.iterrows():
        lab, prob = predict_article_impact(row['text'])
        out.append({
            'title': row['title'],
            'link': row['link'],
            'published': row['published'],
            'score': row['score'],
            'predicted_label': lab,
            'predicted_prob_up': prob,
            'actual_next_return': row['Next_Return'],
            'actual_label': int(row['Target'])
        })
    return pd.DataFrame(out)


print(query_and_predict('Apple quarterly earnings surprise', top_k=5))

                                               title  \
0                     Apple debuts iPhone 17 - Apple   
1  Apple unleashes M5, the next big leap in AI pe...   
2  Apple introduces Digital ID, a new way to crea...   
3  Apple is the exclusive new broadcast partner f...   
4  Apple announces finalists for the 2025 App Sto...   

                                                link           published  \
0  https://news.google.com/rss/articles/CBMib0FVX... 2025-09-09 07:00:00   
1  https://news.google.com/rss/articles/CBMisgFBV... 2025-10-15 07:00:00   
2  https://news.google.com/rss/articles/CBMivgFBV... 2025-11-12 08:00:00   
3  https://news.google.com/rss/articles/CBMirAFBV... 2025-10-17 07:00:00   
4  https://news.google.com/rss/articles/CBMimwFBV... 2025-11-19 14:02:33   

      score  predicted_label  predicted_prob_up  actual_next_return  \
0  0.282611                1               0.75            0.014286   
1  0.259448                1               0.75            0.019

In [78]:
import joblib
os.makedirs('models', exist_ok=True)
joblib.dump(tfidf, 'models/tfidf.pkl')
joblib.dump(clf_xgb, 'models/xgb.pkl')
joblib.dump(clf_rf, 'models/rf.pkl')
joblib.dump(clf_lr, 'models/lr.pkl')


# %%
# 11) Next steps / improvements (not executable):
# - Use higher-quality news sources or a paid news API (NewsAPI, LexisNexis, RavenPack)
# - Use event extraction (who, what, when) and named entity recognition to focus on company-related facts
# - Use SBERT/Transformers for semantic embeddings (improves retrieval + classification)
# - Try temporal models (LSTM, transformers) or incorporate price features (momentum, volatility)
# - Use ensemble of sentiment, event tags, and numeric features (volume, volatility)
# - Backtest a simple trading strategy based on predictions, accounting for transaction costs and slippage


# End of notebook
print('Done')

Done
