📦 Cell 1: Install dependencies (skip if local)

In [None]:
!pip install pandas matplotlib seaborn scikit-learn


📚 Cell 2: Import libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler


📁 Cell 3: Load the dataset

In [None]:
# Read the merged sentiment + stock price CSV into the working frame.
df = pd.read_csv(
    filepath_or_buffer="../backend/outputs/results/merged_sentiment_price.csv",
)

# Last expression of the cell: render the first five rows as a preview.
df.head(n=5)


🧹 Cell 4: Clean and preprocess

In [None]:
# Keep only rows that have every field the analysis needs. The explicit
# .copy() makes `df` an independent frame, so the column assignment below is
# unambiguous and cannot raise pandas' SettingWithCopyWarning.
required_columns = ["sentiment", "score", "NextDayReturn"]
df = df.dropna(subset=required_columns).copy()

# Encode the sentiment label as a number. Labels that are not keys of the
# mapping become NaN in the new column.
sentiment_map = {"Positive": 1, "Neutral": 0, "Negative": -1}
df["SentimentEncoded"] = df["sentiment"].map(sentiment_map)

# Quick overview of the numeric columns (cell output).
df.describe()


📊 Cell 5: Plot sentiment distribution

In [None]:
# Bar chart of how many rows carry each sentiment label.
fig, ax = plt.subplots(figsize=(6, 4))
# Recent seaborn releases deprecate `palette` without `hue`, so colour the
# bars by assigning the x variable to `hue` as well. dodge=False keeps one
# full-width bar per label on older releases that would otherwise dodge.
sns.countplot(
    x="sentiment",
    hue="sentiment",
    data=df,
    palette="Set2",
    dodge=False,
    ax=ax,
)
# Colouring by the x variable makes a legend redundant; drop it if one was drawn.
legend = ax.get_legend()
if legend is not None:
    legend.remove()
ax.set_title("Sentiment Distribution")
plt.show()


📈 Cell 6: Correlation between sentiment score and stock return

In [None]:
# Scatter of sentiment score against next-day return, coloured by sentiment
# label, with dashed guide lines through zero on both axes.
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(data=df, x="score", y="NextDayReturn", hue="sentiment", ax=ax)
ax.set_title("Sentiment Score vs. Next Day Return")
for draw_guide in (ax.axhline, ax.axvline):
    draw_guide(0, color='gray', linestyle='--')
plt.show()


📅 Cell 7: Sentiment over time

In [None]:
# Parse the dates and order the rows chronologically. Building a new frame in
# one chain (instead of assigning a column and sorting with inplace=True)
# avoids mutating the existing frame and the chained-assignment warnings that
# can come with it.
df = df.assign(date=pd.to_datetime(df["date"])).sort_values("date")

# Draw both series against the same date axis.
fig, ax = plt.subplots(figsize=(10, 5))
sns.lineplot(x="date", y="score", data=df, label="Sentiment Score", ax=ax)
sns.lineplot(x="date", y="NextDayReturn", data=df, label="Next Day Return", ax=ax)
# Two different quantities share this axis, so label it neutrally rather than
# with the name of only one of the plotted columns.
ax.set_xlabel("Date")
ax.set_ylabel("Value")
ax.set_title("Sentiment Score and Return Over Time")
ax.legend()
plt.show()


🧠 (Optional) Cell 8: Feature importance preview

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Prepare input features and the binary target (1 when NextDayReturn > 0, else 0).
feature_columns = ["score", "SentimentEncoded"]
X = df[feature_columns]
y = (df["NextDayReturn"] > 0).astype(int)

# Fix the seed: a random forest is stochastic, so without random_state the
# importances plotted below would differ from run to run.
model = RandomForestClassifier(random_state=42)
model.fit(X, y)

# Feature importances, labelled from the same column list used to build X so
# the bar labels cannot drift out of sync with the model inputs.
importance = model.feature_importances_
fig, ax = plt.subplots()
sns.barplot(x=feature_columns, y=importance, ax=ax)
ax.set_title("Feature Importance")
ax.set_ylabel("Importance")
plt.show()
