In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# ------------------ shared: lexicon labeling ------------------
import re
import pandas as pd
import numpy as np

# Expanded lexicons (you can further expand)
positive_words = [
    "gain","gains","gained","rally","rallies","rallied","growth","growing","rise","rises","rising",
    "soar","soars","soared","surge","surges","surged","jump","jumps","jumped","spike","spikes","spiked",
    "increase","increases","increased","higher","strength","strengthen","strengthened",
    "record","record-high","record breaking","all time high","beat","beats","beating","topped","tops",
    "exceed","exceeds","exceeded","outperform","outperforms","outperformed","above expectations",
    "profit","profits","profitable","margin expansion","raised forecast","upgrade","upgraded","improved",
    "recovery","rebound","bounce back","bullish","optimistic","resilient","solid","robust","momentum",
    "expands","expanding","expansion","success","secured","partnership","acquires","launch","launched",
    "innovation","breakthrough","best","better","stronger","positive","boost","boosted","skyrocket"
]

negative_words = [
    "loss","losses","losing","drop","drops","dropped","decline","declines","declined",
    "fall","falls","fell","selloff","sell-off","plunge","plunges","plunged","tumble","tumbles","tumbled",
    "slump","slumps","slumped","weak","weaker","weakness","downturn","slowdown","cooling demand",
    "missed expectations","miss","misses","missed","earnings miss","cuts forecast","cut forecast",
    "lower guidance","warn","warns","warning","profit slump","loss widening","margin compression",
    "downgrade","downgrades","downgraded","layoff","layoffs","job cuts","fired","recall","recalled",
    "bankrupt","bankruptcy","collapse","collapsed","probe","investigation","lawsuit","sued","sue","sues",
    "fraud","fraudulent","data breach","breach","fine","fined","penalty","risk","risks","uncertainty",
    "volatility","sell pressure","pullback","shortage","shortfall","delays","disruption","spooked"
]

neutral_words = [
    "news","update","report","reports","statement","statements","announces","announce","announcement",
    "says","said","according","reveals","revealed","shows","show","coverage","story","details","highlights",
    "press release","media","article","interview","note","notes","expected","scheduled","meeting","conference"
]

_url_pat = re.compile(r"http\S+|www\.\S+")
_nonalpha_pat = re.compile(r"[^a-z\s]")
_multispace_pat = re.compile(r"\s+")

def clean_text_basic(s: str) -> str:
    s = str(s).lower()
    s = _url_pat.sub(" ", s)
    s = _nonalpha_pat.sub(" ", s)
    s = _multispace_pat.sub(" ", s).strip()
    return s

def create_label(text: str) -> int:
    t = str(text).lower()
    # positive first
    for w in positive_words:
        if w in t:
            return 2
    for w in negative_words:
        if w in t:
            return 0
    for w in neutral_words:
        if w in t:
            return 1
    return 1  # default neutral
# --------------------------------------------------------------

In [3]:
# 2. LOAD TRAINING CSV (IGNORE LABEL COLUMN)
df = pd.read_csv("stock_news.csv")   # path to your data
df = df[["headline"]]

In [4]:
# 2. Split BEFORE preprocessing
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, shuffle=True)

In [5]:
# 3. Preprocess
train_df["clean"] = train_df["headline"].apply(clean_text_basic)
test_df["clean"]  = test_df["headline"].apply(clean_text_basic)

In [6]:
# 4. Label via lexicon (weak supervision)
train_df["label"] = train_df["clean"].apply(create_label).astype(int)
test_df["label"]  = test_df["clean"].apply(create_label).astype(int)

In [7]:
# 5. Vectorize (unigrams + bigrams)
vectorizer = TfidfVectorizer(max_features=80000, ngram_range=(1,2), min_df=2)
X_train = vectorizer.fit_transform(train_df["clean"])
X_test  = vectorizer.transform(test_df["clean"])
y_train = train_df["label"]
y_test  = test_df["label"]

In [8]:
# 6. Train Logistic Regression (use class_weight if very imbalanced)
model = LogisticRegression(C=1.0, max_iter=5000, class_weight="balanced", n_jobs=-1)
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,5000


In [9]:
# 7. Eval
preds = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, preds))
print(classification_report(y_test, preds))


Accuracy: 0.9551923076923077
              precision    recall  f1-score   support

           0       0.79      0.87      0.83       390
           1       0.97      0.98      0.97      3539
           2       0.98      0.92      0.95      1271

    accuracy                           0.96      5200
   macro avg       0.91      0.92      0.92      5200
weighted avg       0.96      0.96      0.96      5200



In [39]:
out = test_df.copy()
out["pred"] = preds
out["pred_proba"] = list(model.predict_proba(X_test)) if hasattr(model, "predict_proba") else None
display(out.head(20))

Unnamed: 0,headline,clean,label,pred,pred_proba
4148,Applied Materials Gives Weak Forecast as Short...,applied materials gives weak forecast as short...,0,0,"[0.8441924221437664, 0.05648101406211483, 0.09..."
7841,Rockwell Automation (ROK) Q2 Earnings & Sales ...,rockwell automation rok q earnings sales miss ...,0,0,"[0.9167833765638017, 0.02718935526872792, 0.05..."
18898,CSX (CSX) Stock Sinks As Market Gains: What Yo...,csx csx stock sinks as market gains what you s...,2,2,"[0.005070680855592842, 0.024057176440060484, 0..."
4194,5 Things About Take Two Interactive Smart Inve...,things about take two interactive smart invest...,1,1,"[0.04832756952644645, 0.8418379953222785, 0.10..."
8409,Pacific Coast Building Products buys $13 milli...,pacific coast building products buys million r...,1,1,"[0.10106801960164227, 0.7889731340824624, 0.10..."
24725,2 Reasons XPEL Could Excel,reasons xpel could excel,1,1,"[0.17199841242016767, 0.6118302290589632, 0.21..."
2508,Semiconductor Sales Hit All-Time High: 4 Solid...,semiconductor sales hit all time high solid st...,2,2,"[0.02065780920863062, 0.03053083293960021, 0.9..."
20879,Investors in PACCAR (NASDAQ:PCAR) have made a ...,investors in paccar nasdaq pcar have made a re...,1,1,"[0.07939950673933727, 0.7012761261236286, 0.21..."
17758,Wellmark® Selects Principal® as Workplace Reti...,wellmark selects principal as workplace retire...,1,1,"[0.10641822674759673, 0.6245504750986987, 0.26..."
2905,"Omnicom (OMC) Q1 Earnings Surpass Estimates, I...",omnicom omc q earnings surpass estimates incre...,2,2,"[0.03479853448418425, 0.25295723718847574, 0.7..."


In [40]:
import yfinance as yf
import pandas as pd
import re
import feedparser   # FREE rss reader

# ----------------------------------------------------
# CLEANING
# ----------------------------------------------------
_url_pat = re.compile(r"http\S+|www\.\S+")
_nonalpha_pat = re.compile(r"[^a-z\s]")
_multispace_pat = re.compile(r"\s+")

def clean_text_basic(s):
    s = s.lower()
    s = _url_pat.sub(" ", s)
    s = _nonalpha_pat.sub(" ", s)
    s = _multispace_pat.sub(" ", s).strip()
    return s


# ----------------------------------------------------
# FETCH NEWS (Yahoo Finance)
# ----------------------------------------------------
def fetch_yahoo_news(ticker):
    try:
        stock = yf.Ticker(ticker)
        raw = stock.news

        if not raw:
            return []

        return [item["title"] for item in raw if "title" in item]

    except:
        return []


# ----------------------------------------------------
# FALLBACK: GOOGLE FINANCE RSS (NO API KEY)
# ----------------------------------------------------
def fetch_google_news(company):
    url = f"https://news.google.com/rss/search?q={company}+stock"
    feed = feedparser.parse(url)

    return [entry.title for entry in feed.entries]


# ----------------------------------------------------
# FINAL NEWS FETCHER (SAFE + NEVER EMPTY)
# ----------------------------------------------------
def get_news(company):
    headlines = fetch_yahoo_news(company)

    if len(headlines) == 0:
        print("⚠ Yahoo returned 0 news — using Google Finance RSS instead.")
        headlines = fetch_google_news(company)

    if len(headlines) == 0:
        print("❌ No news found from ANY source.")
        return pd.DataFrame({"headline": []})

    return pd.DataFrame({"headline": headlines})


# ----------------------------------------------------
# USE MODEL TO PREDICT SENTIMENT
# ----------------------------------------------------
def test_model_live(company):
    df = get_news(company)

    if len(df) == 0:
        print("No news available — cannot run prediction.")
        return None

    df["clean"] = df["headline"].apply(clean_text_basic)

    # TF-IDF vectorizer must already be loaded
    X = vectorizer.transform(df["clean"])

    df["prediction"] = model.predict(X)
    mapping = {0:"Negative", 1:"Neutral", 2:"Positive"}
    df["sentiment"] = df["prediction"].map(mapping)

    print("\n===== LIVE NEWS SENTIMENT =====\n")
    print(df)



    return df


# ----------------------------------------------------
# RUN (NEVER FAILS)
# ----------------------------------------------------
test_model_live("TSLA")

⚠ Yahoo returned 0 news — using Google Finance RSS instead.

===== LIVE NEWS SENTIMENT =====

                                             headline  \
0   Tesla (TSLA) Live Share price, Invest From Ind...   
1   Tesla, Inc. Stock (TSLA) Opinions on AI Chip A...   
2   Elon Musk sends open invite to buy Tesla stock...   
3   Tesla Rises As Elon Musk Makes This Austin Rob...   
4   Evaluating TSLA Stock's Actual Performance - T...   
..                                                ...   
95  Tesla's stock plunges 8% after another weak qu...   
96  Why Tesla (TSLA) Stock Is Trading Lower Today ...   
97  Dow Jones Futures: Intel, AI Play Jump After B...   
98  Tesla stock falls amid broad tech sell-off, do...   
99  Why Tesla Stock Is Viewed As 'Must Own' With E...   

                                                clean  prediction sentiment  
0   tesla tsla live share price invest from india ...           1   Neutral  
1   tesla inc stock tsla opinions on ai chip advan...           1

Unnamed: 0,headline,clean,prediction,sentiment
0,"Tesla (TSLA) Live Share price, Invest From Ind...",tesla tsla live share price invest from india ...,1,Neutral
1,"Tesla, Inc. Stock (TSLA) Opinions on AI Chip A...",tesla inc stock tsla opinions on ai chip advan...,1,Neutral
2,Elon Musk sends open invite to buy Tesla stock...,elon musk sends open invite to buy tesla stock...,1,Neutral
3,Tesla Rises As Elon Musk Makes This Austin Rob...,tesla rises as elon musk makes this austin rob...,2,Positive
4,Evaluating TSLA Stock's Actual Performance - T...,evaluating tsla stock s actual performance the...,1,Neutral
...,...,...,...,...
95,Tesla's stock plunges 8% after another weak qu...,tesla s stock plunges after another weak quart...,0,Negative
96,Why Tesla (TSLA) Stock Is Trading Lower Today ...,why tesla tsla stock is trading lower today ya...,1,Neutral
97,"Dow Jones Futures: Intel, AI Play Jump After B...",dow jones futures intel ai play jump after bul...,2,Positive
98,"Tesla stock falls amid broad tech sell-off, do...",tesla stock falls amid broad tech sell off dow...,0,Negative
