In [None]:
# Import Libraries

import pandas as pd
import numpy as np
import re
import torch
import torch.nn as nn
import torch.nn.functional as F
import joblib
from collections import defaultdict
from tqdm import tqdm

tqdm.pandas()

class SmartTickerExtractor:
    """Find whitelisted stock tickers in free text using weighted regex patterns.

    Each pattern contributes a score every time it matches a symbol that is in
    ``valid_tickers``. Patterns that carry an explicit marker (``$AAPL``,
    ``NYSE: AAPL``, ``(AAPL)``) are matched case-insensitively. Patterns that
    match a bare word are matched case-sensitively against the original text,
    so ordinary lowercase words such as "so", "low", "well" or "cost" are not
    mistaken for the tickers SO, LOW, WELL or COST.
    """

    def __init__(self):
        # Each entry: (regex, capture-group index holding the symbol,
        # score added per match).
        self.patterns = [
            (r'\$([A-Z]{1,5})\b', 1, 10.0),
            (r'(?:NYSE|NASDAQ|LSE|TSE):\s*([A-Z]{1,5})\b', 1, 10.0),
            (r'\(([A-Z]{1,5})\)', 1, 8.0),
            (r'\$([A-Z]{2,5}\.[A-Z]{1,2})\b', 1, 9.0),
            (r'\(([A-Z]{2,5}\.[A-Z]{1,2})\)', 1, 8.0),
            (r'\b([A-Z]{2,5}\.[A-Z]{1,2})\b', 0, 5.0),
            (r'\b([A-Z]{1,5})\b', 0, 3.0),
        ]
        # Patterns with no surrounding marker. These must see the text in its
        # original case; matching them case-insensitively would turn almost
        # every short English word into a ticker candidate.
        self.case_sensitive_patterns = {
            r'\b([A-Z]{2,5}\.[A-Z]{1,2})\b',
            r'\b([A-Z]{1,5})\b',
        }
        # Whitelist of symbols that may be reported.
        self.valid_tickers = {
            "AAPL","MSFT","NVDA","AMD","INTC","QCOM","CSCO","ORCL","IBM","ADBE","CRM",
            "GOOGL","META","NFLX","DIS","VZ","T","TMUS","PARA","WBD","TTWO","EA",
            "AMZN","TSLA","HD","MCD","NKE","SBUX","LOW","BKNG","TGT","LVS","RCL",
            "PG","KO","PEP","WMT","COST","PM","MO","CL","KMB","TAP","GIS",
            "XOM","CVX","COP","SLB","HAL","EOG","PSX","VLO","MPC","OXY","BKR",
            "JPM","BAC","WFC","C","GS","MS","AXP","SCHW","BK","BLK","TFC",
            "JNJ","PFE","MRK","UNH","LLY","ABBV","TMO","DHR","BMY","AMGN","CVS",
            "CAT","GE","BA","HON","LMT","NOC","DE","MMM","RTX","GD","ETN",
            "LIN","SHW","APD","NEM","DD","FCX","ECL","VMC","MLM","CF","ALB",
            "NEE","DUK","SO","D","AEP","EXC","SRE","XEL","PEG","ED","WEC",
            "PLD","AMT","EQIX","CCI","O","PSA","SPG","WELL","VICI","DLR","AVB"
        }
        # Upper-case abbreviations rejected by is_valid_ticker().
        self.excluded_words = {
            'CEO','CFO','COO','IPO','ETF','SEC','FDA','US','UK','USD',
            'CNN','BBC','AI','IT','TV','APP','GDP','Q1','Q2','Q3','Q4',
            'LLC','INC','LTD','CORP','CO','GROUP','PLC'
        }

    def extract_pattern_tickers(self, text):
        """Score every whitelisted ticker found in *text*.

        Args:
            text: Headline or other free text. Falsy values yield no matches.

        Returns:
            A ``defaultdict(float)`` mapping ticker symbol to the summed score
            of all pattern matches, in first-match order.
        """
        ticker_scores = defaultdict(float)
        if not text:
            return ticker_scores
        text = str(text)
        for pattern, group_idx, score in self.patterns:
            # Marker-based patterns tolerate lowercase symbols ("$aapl");
            # bare-word patterns only accept text that is already upper-case.
            flags = 0 if pattern in self.case_sensitive_patterns else re.IGNORECASE
            for match in re.finditer(pattern, text, flags):
                ticker = match.group(group_idx).upper()
                if ticker in self.valid_tickers:
                    ticker_scores[ticker] += score
        return ticker_scores

    def is_valid_ticker(self, ticker):
        """Return True if *ticker* looks like a symbol and is not an excluded word.

        Accepts 1-5 upper-case letters, or 2-6 upper-case letters/digits
        followed by a dot and a 1-2 letter suffix.
        """
        if not ticker or ticker in self.excluded_words:
            return False
        return bool(re.match(r'^[A-Z]{1,5}$', ticker)) or bool(re.match(r'^[A-Z0-9]{2,6}\.[A-Z]{1,2}$', ticker))

    def extract_tickers(self, text):
        """Return the whitelisted tickers found in *text* (``None`` is treated as empty)."""
        scores = self.extract_pattern_tickers(text or "")
        return list(scores.keys())

# Lookup table from ticker symbol to sector name, used to tag each headline
# with the sectors of the tickers found in it. The keys are the same symbols
# listed in SmartTickerExtractor.valid_tickers; keep the two in sync when
# adding or removing a ticker.
sector_map = {
    "AAPL":"Technology","MSFT":"Technology","NVDA":"Technology","AMD":"Technology","INTC":"Technology",
    "QCOM":"Technology","CSCO":"Technology","ORCL":"Technology","IBM":"Technology","ADBE":"Technology","CRM":"Technology",
    "GOOGL":"Communication Services","META":"Communication Services","NFLX":"Communication Services",
    "DIS":"Communication Services","VZ":"Communication Services","T":"Communication Services","TMUS":"Communication Services",
    "PARA":"Communication Services","WBD":"Communication Services","TTWO":"Communication Services","EA":"Communication Services",
    "AMZN":"Consumer Discretionary","TSLA":"Consumer Discretionary","HD":"Consumer Discretionary","MCD":"Consumer Discretionary",
    "NKE":"Consumer Discretionary","SBUX":"Consumer Discretionary","LOW":"Consumer Discretionary","BKNG":"Consumer Discretionary",
    "TGT":"Consumer Discretionary","LVS":"Consumer Discretionary","RCL":"Consumer Discretionary",
    "PG":"Consumer Staples","KO":"Consumer Staples","PEP":"Consumer Staples","WMT":"Consumer Staples","COST":"Consumer Staples",
    "PM":"Consumer Staples","MO":"Consumer Staples","CL":"Consumer Staples","KMB":"Consumer Staples","TAP":"Consumer Staples","GIS":"Consumer Staples",
    "XOM":"Energy","CVX":"Energy","COP":"Energy","SLB":"Energy","HAL":"Energy","EOG":"Energy",
    "PSX":"Energy","VLO":"Energy","MPC":"Energy","OXY":"Energy","BKR":"Energy",
    "JPM":"Financials","BAC":"Financials","WFC":"Financials","C":"Financials","GS":"Financials","MS":"Financials",
    "AXP":"Financials","SCHW":"Financials","BK":"Financials","BLK":"Financials","TFC":"Financials",
    "JNJ":"Healthcare","PFE":"Healthcare","MRK":"Healthcare","UNH":"Healthcare","LLY":"Healthcare","ABBV":"Healthcare",
    "TMO":"Healthcare","DHR":"Healthcare","BMY":"Healthcare","AMGN":"Healthcare","CVS":"Healthcare",
    "CAT":"Industrials","GE":"Industrials","BA":"Industrials","HON":"Industrials","LMT":"Industrials","NOC":"Industrials",
    "DE":"Industrials","MMM":"Industrials","RTX":"Industrials","GD":"Industrials","ETN":"Industrials",
    "LIN":"Materials","SHW":"Materials","APD":"Materials","NEM":"Materials","DD":"Materials","FCX":"Materials",
    "ECL":"Materials","VMC":"Materials","MLM":"Materials","CF":"Materials","ALB":"Materials",
    "NEE":"Utilities","DUK":"Utilities","SO":"Utilities","D":"Utilities","AEP":"Utilities","EXC":"Utilities",
    "SRE":"Utilities","XEL":"Utilities","PEG":"Utilities","ED":"Utilities","WEC":"Utilities",
    "PLD":"Real Estate","AMT":"Real Estate","EQIX":"Real Estate","CCI":"Real Estate","O":"Real Estate","PSA":"Real Estate",
    "SPG":"Real Estate","WELL":"Real Estate","VICI":"Real Estate","DLR":"Real Estate","AVB":"Real Estate",
}

class FFNN_Embedding(nn.Module):
    """Feed-forward text classifier over mean-pooled token embeddings.

    Token ids are embedded (id 0 is the padding index and embeds to zeros),
    averaged along dimension 1, and passed through a three-layer MLP that
    outputs one logit per class.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes):
        super().__init__()
        half_hidden = hidden_dim // 2
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        # Layer order (and therefore the Sequential indices 0..5) must stay
        # fixed so that previously saved state dicts keep loading.
        mlp_layers = [
            nn.Linear(embed_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, half_hidden),
            nn.ReLU(),
            nn.Linear(half_hidden, num_classes),
        ]
        self.fc = nn.Sequential(*mlp_layers)

    def forward(self, x):
        """Return class logits for integer token ids *x*.

        The mean is taken over dimension 1 of the embedded input, so padded
        positions (zero vectors) still count towards the denominator.
        """
        pooled = self.embedding(x).mean(dim=1)
        return self.fc(pooled)

def preprocess_text(text, vocab, max_len=30):
    """Turn raw text into a fixed-length batch of token ids.

    The text is lower-cased, stripped of every character that is not a
    lowercase letter, digit or whitespace, and split on whitespace. Tokens
    missing from *vocab* map to id 1; the sequence is truncated or
    right-padded with id 0 to exactly *max_len* entries.

    Returns:
        A ``torch.long`` tensor of shape ``(1, max_len)``.
    """
    cleaned = re.sub(r"[^a-z0-9\s]", "", str(text).lower())
    token_ids = [vocab.get(token, 1) for token in cleaned.split()[:max_len]]
    padding = [0] * (max_len - len(token_ids))
    return torch.tensor([token_ids + padding], dtype=torch.long)

def predict_sentiment(text, model, vocab, le):
    """Classify *text* and convert the prediction into a signed score.

    Args:
        text: Raw text to classify.
        model: Module that returns class logits for the tensor produced by
            ``preprocess_text``.
        vocab: Token-to-id mapping passed to ``preprocess_text``.
        le: Label encoder whose ``inverse_transform`` maps a class index back
            to its label string.

    Returns:
        ``(label, score, confidence)`` where *confidence* is the softmax
        probability of the predicted class and *score* is ``-100 * confidence``
        for labels containing "negative", ``100 * confidence`` for labels
        containing "positive", and ``0`` otherwise.
    """
    # Inference only: no autograd graph is needed, and .numpy() requires a
    # tensor that does not track gradients.
    with torch.no_grad():
        batch = preprocess_text(text, vocab)
        probs = F.softmax(model(batch), dim=1).numpy()[0]

    pred_idx = np.argmax(probs)
    label = le.inverse_transform([pred_idx])[0]
    confidence = float(probs[pred_idx])

    lowered = label.lower()
    if "negative" in lowered:
        score = -100 * confidence
    elif "positive" in lowered:
        score = 100 * confidence
    else:
        score = 0
    return label, score, confidence

# =========================================================
# MAIN PIPELINE
# =========================================================
def process_pipeline(
    input_csv,
    output_csv,
    vocab_path=r"D:\CSE 6242\Project\data\vocab.pkl",
    label_encoder_path=r"D:\CSE 6242\Project\data\label_encoder.pkl",
    model_path=r"D:\CSE 6242\Project\data\sentiment_ffnn_model.pt",
):
    """Tag news titles with tickers, sectors and model sentiment, then save them.

    Args:
        input_csv: CSV with at least ``published`` and ``title`` columns.
        output_csv: Destination CSV for the enriched rows.
        vocab_path: joblib file holding the token-to-id vocabulary.
        label_encoder_path: joblib file holding the fitted label encoder.
        model_path: ``torch.save``-d state dict for ``FFNN_Embedding``.

    The three artefact paths default to the locations this notebook was
    written against, so existing two-argument calls behave as before.

    Raises:
        KeyError: If the input file has no ``title`` column.
    """
    print(f"Loading data from: {input_csv}")
    df = pd.read_csv(input_csv, dtype={"published": str})
    print(f"Loaded {len(df):,} rows\n")

    print("Extracting tickers and sectors")
    extractor = SmartTickerExtractor()
    tickers_list, sectors_list = [], []

    for title in df["title"]:
        tickers = extractor.extract_tickers(str(title))
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # sector list written to the CSV is identical from run to run (a set
        # of strings iterates in a hash-dependent order).
        sectors = list(dict.fromkeys(
            sector_map[t] for t in tickers if sector_map.get(t)
        ))
        tickers_list.append(tickers)
        sectors_list.append(sectors)

    df["tickers"] = tickers_list
    df["sectors"] = sectors_list

    print("Loading model and running sentiment inference")
    # NOTE: joblib.load and torch.load unpickle their input; only point these
    # paths at files from a trusted source.
    vocab = joblib.load(vocab_path)
    le = joblib.load(label_encoder_path)

    # Architecture hyper-parameters must match the ones the state dict was
    # trained with, otherwise load_state_dict raises on shape mismatch.
    model = FFNN_Embedding(
        vocab_size=len(vocab),
        embed_dim=100,
        hidden_dim=256,
        num_classes=len(le.classes_)
    )
    model.load_state_dict(torch.load(model_path, map_location="cpu"))
    model.eval()  # disable dropout for inference

    tqdm.pandas(desc="Running Sentiment Model")
    results = df["title"].progress_apply(lambda t: predict_sentiment(str(t), model, vocab, le))

    # Each result is a (label, score, confidence) tuple; split it into columns.
    df["sentiment_label"] = results.apply(lambda x: x[0])
    df["sentiment_score"] = results.apply(lambda x: x[1])
    df["confidence"] = results.apply(lambda x: x[2])

    columns_to_keep = [
        "published", "title", "tickers", "sectors",
        "sentiment_label", "sentiment_score", "confidence"
    ]
    df[columns_to_keep].to_csv(output_csv, index=False)
    print(f"Output saved to: {output_csv}")
    print(df[columns_to_keep].head(10))

# Run the pipeline as soon as this cell executes: reads the merged news CSV
# and writes the enriched CSV (tickers, sectors, sentiment) next to it.
process_pipeline(
    input_csv=r"D:\CSE 6242\Project\data\merged_sentiment_news.csv",
    output_csv=r"D:\CSE 6242\Project\data\final_merged_with_sentiment.csv"
)


In [None]:
import pandas as pd
import ast
from datetime import timedelta

# Load the enriched news file; this is the same path the process_pipeline
# call in the previous cell writes to.
input_path = r"D:\CSE 6242\Project\data\final_merged_with_sentiment.csv"
df = pd.read_csv(input_path)
print(f"Loaded {len(df):,} rows")

def safe_parse_list(x):
    """Coerce a CSV cell into a Python list without ever raising.

    Args:
        x: A list, a string holding a list literal such as ``"['AAPL']"``,
            a missing value, or any other scalar.

    Returns:
        * *x* itself if it is already a list;
        * the parsed list for a string whose first non-blank character is
          ``[`` (``[]`` if it cannot be parsed);
        * ``[]`` for empty/whitespace-only strings and missing values;
        * ``[str(x)]`` for anything else.
    """
    if isinstance(x, list):
        return x

    if isinstance(x, str):
        text = x.strip()
        if not text:
            # A blank cell carries no items; do not turn it into [''].
            return []
        if not text.startswith("["):
            return [str(x)]
        try:
            return ast.literal_eval(text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Malformed list literal: treat as "no items" (best effort).
            return []

    try:
        missing = bool(pd.isna(x))
    except (TypeError, ValueError):
        # pd.isna returns an array for array-likes, which has no single
        # truth value; such inputs are treated as empty.
        return []
    return [] if missing else [str(x)]

# The list columns come back from the CSV as text (presumably list literals
# such as "['AAPL']"); turn them back into Python lists.
df["tickers"] = df["tickers"].apply(safe_parse_list)
df["sectors"] = df["sectors"].apply(safe_parse_list)

# Keep only rows published on or after the day 90 days before the most recent
# publication date. Unparseable dates become NaT and fail the comparison, so
# they are dropped here as well.
df["published"] = pd.to_datetime(df["published"], errors="coerce")
max_date = df["published"].max().normalize()
cutoff = max_date - timedelta(days=90)
df = df[df["published"] >= cutoff].copy()
# Midnight-normalized timestamp used as the daily grouping key.
df["date"] = df["published"].dt.normalize()

print(f"Filtered to {cutoff.date()} to {max_date.date()} ({len(df):,} rows remain)")

# One row per (article, ticker) and per (article, sector). Articles with an
# empty list explode to NaN and are removed by the dropna calls below.
df_exp_tickers = df.explode("tickers")
df_exp_sectors = df.explode("sectors")

# Mean sentiment score per day and ticker.
ticker_daily = (
    df_exp_tickers.dropna(subset=["tickers"])
    .groupby(["date", "tickers"])["sentiment_score"]
    .mean()
    .reset_index()
    .rename(columns={"tickers": "ticker", "sentiment_score": "avg_sentiment_score"})
)

# Mean sentiment score per day and sector.
sector_daily = (
    df_exp_sectors.dropna(subset=["sectors"])
    .groupby(["date", "sectors"])["sentiment_score"]
    .mean()
    .reset_index()
    .rename(columns={"sectors": "sector", "sentiment_score": "avg_sentiment_score"})
)

def add_rolling(df, group_col, window_days=90):
    """Add a per-group rolling mean of ``avg_sentiment_score`` over calendar days.

    The window is time-based: for every row it averages the group's scores
    whose ``date`` falls within the *window_days* days ending on, and
    including, that row's date. A row-count window would instead average the
    last N observations, which for sparsely covered groups reaches further
    back than N days.

    Args:
        df: Frame with a datetime-like ``date`` column, *group_col*, and
            ``avg_sentiment_score``. It is not modified.
        group_col: Column that identifies each series (e.g. ``"ticker"``).
        window_days: Window length in days. Defaults to 90.

    Returns:
        A copy of *df* sorted by ``date`` then *group_col*, with an added
        ``rolling_<window_days>d_sentiment`` column
        (``rolling_90d_sentiment`` by default).
    """
    # Sorting on both keys keeps the output order deterministic when several
    # groups share a date; time-based rolling also needs ascending dates.
    out = df.sort_values(["date", group_col])
    rolling_col = f"rolling_{window_days}d_sentiment"

    # Work on a positional index so rows can be matched to their dates even
    # if the caller's index contains duplicate labels.
    work = out.reset_index(drop=True)
    dates = pd.to_datetime(work["date"])

    def _window_mean(scores):
        # Re-index one group's scores by date so an offset window can be used.
        by_date = pd.Series(
            scores.to_numpy(),
            index=pd.DatetimeIndex(dates.loc[scores.index]),
        )
        return by_date.rolling(f"{window_days}D", min_periods=1).mean().to_numpy()

    rolled = work.groupby(group_col)["avg_sentiment_score"].transform(_window_mean)
    out[rolling_col] = rolled.to_numpy()
    return out

# Attach the rolling sentiment column to both daily tables.
ticker_stats = add_rolling(ticker_daily, "ticker")
sector_stats = add_rolling(sector_daily, "sector")

# === Save outputs ===
ticker_out = r"D:\CSE 6242\Project\data\merged_ticker_sentiment_stats_90d.csv"
sector_out = r"D:\CSE 6242\Project\data\merged_sector_sentiment_stats_90d.csv"

# index=False: the row index carries no information worth persisting.
ticker_stats.to_csv(ticker_out, index=False)
sector_stats.to_csv(sector_out, index=False)

print(f"Saved ticker stats to {ticker_out}")
print(f"Saved sector stats to {sector_out}")

# Quick visual check of the first rows of each table.
print("Ticker sentiment sample:")
print(ticker_stats.head(10))

print("Sector sentiment sample:")
print(sector_stats.head(10))
