# In [None]:
# ----------------------------
# 03 - Preprocessing: Merge FNSPID Stocks + News
# ----------------------------
import pandas as pd
import os

# Raw FNSPID inputs are read from DATA_DIR; processed outputs go to OUT_DIR.
DATA_DIR = "../data/raw/FNSPID"
OUT_DIR = "../data/processed"
# Create the output directory up front; exist_ok makes re-runs harmless.
os.makedirs(name=OUT_DIR, exist_ok=True)

# ----------------------------
# 1. Load Stock Data
# ----------------------------
# Price history for a single symbol (Apple is used as the example).
aapl_path = os.path.join(DATA_DIR, "full_history", "AAPL.csv")
aapl = pd.read_csv(aapl_path)

print("Stock sample:", aapl.head(), sep="\n")

# Parse the date column and tag each row with its symbol; these two columns
# are the join keys used later (adjust the column name if the file differs).
aapl = aapl.assign(Date=pd.to_datetime(aapl["Date"]), Ticker="AAPL")

# ----------------------------
# 2. Load News Data
# ----------------------------
news_path = os.path.join(DATA_DIR, "nasdaq_exteral_data.csv")
news = pd.read_csv(news_path)

print("News sample:")
print(news.head())

# Reduce every article timestamp to a tz-naive calendar day. The later steps
# group and join on "Date", so a leftover time-of-day or timezone would stop
# articles from being pooled per day and from matching the stock rows.
# Values that are already plain dates pass through unchanged.
# NOTE(review): the day is taken in UTC — confirm that matches the calendar
# used by the stock file.
news["Date"] = (
    pd.to_datetime(news["date"], utc=True)  # adjust column name if needed
    .dt.tz_localize(None)
    .dt.normalize()
)
# Ensure ticker column matches stock symbols (check if it's "stock" or "symbol")
news.rename(columns={"stock": "Ticker"}, inplace=True)

# ----------------------------
# 3. Clean / Encode Sentiment
# ----------------------------
# Numeric sentiment is copied as-is; anything else is treated as a text label
# and mapped to a number. The branch tests for a numeric dtype rather than
# comparing against "object", so labels stored under pandas' dedicated string
# dtypes are still mapped.
if pd.api.types.is_numeric_dtype(news["sentiment"]):
    news["Sentiment"] = news["sentiment"]
else:
    sentiment_map = {"positive": 1, "neutral": 0, "negative": -1}
    # Normalize case and surrounding whitespace so variants such as
    # "Positive" or " negative " are recognized; labels outside the map
    # still become NaN.
    news["Sentiment"] = (
        news["sentiment"].str.strip().str.lower().map(sentiment_map)
    )

# ----------------------------
# 4. Aggregate Daily Sentiment
# ----------------------------
# One row per (Date, Ticker) pair holding the mean of that pair's scores;
# reset_index turns the group keys back into ordinary columns for the merge.
news_daily = (
    news
    .groupby(["Date", "Ticker"])["Sentiment"]
    .mean()
    .reset_index()
)
print("Aggregated news:", news_daily.head(), sep="\n")

# ----------------------------
# 5. Merge Stock + News
# ----------------------------
# Left join: every stock row is kept, and rows without a matching
# (Date, Ticker) entry in the news table get a missing Sentiment.
merged = pd.merge(aapl, news_daily, on=["Date", "Ticker"], how="left")
# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: that form is deprecated and, under Copy-on-Write, only
# modifies a temporary object, leaving the NaNs in `merged`.
merged["Sentiment"] = merged["Sentiment"].fillna(0)  # assume neutral if no news

print("Merged sample:")
print(merged.head())

# ----------------------------
# 6. Save Final Dataset
# ----------------------------
# Write the merged frame as CSV under OUT_DIR, leaving out the row index.
out_path = os.path.join(OUT_DIR, "AAPL_stock_news.csv")
merged.to_csv(path_or_buf=out_path, index=False)

# Report where the file was written and the (rows, columns) of the frame.
print(
    "✅ Saved merged dataset to:",
    out_path,
    "with shape:",
    merged.shape,
)
