 Cell 1: Install dependencies (skip this cell when running locally with the packages already installed)

In [None]:
!pip install scikit-learn pandas matplotlib seaborn


Cell 2: Import libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt


Cell 3: Load and preprocess the data

In [None]:
# Load merged sentiment + stock price data.
df = pd.read_csv("../outputs/results/merged_sentiment_price.csv")

# Drop rows missing any column used as a feature or to derive the target.
df = df.dropna(subset=["score", "sentiment", "NextDayReturn"])

# Target: 1 when the next-day return is strictly positive, otherwise 0.
df["Target"] = (df["NextDayReturn"] > 0).astype(int)

# Encode the sentiment label as an ordinal value. Labels are normalised first
# (surrounding whitespace stripped, first letter capitalised) so that variants
# such as "positive" or " NEGATIVE " map the same way as the canonical spelling.
SENTIMENT_CODES = {"Positive": 1, "Neutral": 0, "Negative": -1}
sentiment_labels = df["sentiment"].astype(str).str.strip().str.capitalize()
df["SentimentEncoded"] = sentiment_labels.map(SENTIMENT_CODES)

# Series.map yields NaN for any label absent from the mapping. Fail here with
# an explicit message instead of letting NaN reach the feature matrix, where it
# would only surface later as a model-fitting error.
unmapped = df["SentimentEncoded"].isna()
if unmapped.any():
    unknown_labels = sorted(df.loc[unmapped, "sentiment"].astype(str).unique())
    raise ValueError(f"Unrecognised sentiment labels: {unknown_labels}")

# Feature matrix (sentiment score + encoded label) and target vector.
X = df[["score", "SentimentEncoded"]]
y = df["Target"]


Cell 4: Split the data

In [None]:
# Hold out 30% of the rows for evaluation. The split is seeded so it is
# reproducible, and stratified on the target so both partitions keep roughly
# the same class proportions.
# NOTE(review): train_test_split shuffles rows by default; if the rows are
# time-ordered this mixes later observations into the training set - confirm
# whether a chronological split is intended.
split_options = {"test_size": 0.3, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


Cell 5: Train and evaluate model

In [None]:
# Fit a logistic-regression classifier with library-default settings on the
# training split (fit returns the estimator itself).
model = LogisticRegression().fit(X_train, y_train)

# Predict class labels for the held-out rows.
y_pred = model.predict(X_test)

# Report overall accuracy, per-class precision/recall/F1, and the raw
# confusion matrix for the test split.
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

report_text = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report_text)

confusion_counts = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", confusion_counts)

Cell 6: Visualize confusion matrix

In [None]:
# Draw the test-set confusion matrix as an annotated heatmap; fmt="d" renders
# each cell's count as an integer.
cm_counts = confusion_matrix(y_test, y_pred)
ax = sns.heatmap(cm_counts, annot=True, cmap="Blues", fmt="d")

# Label the axes returned by seaborn so the figure stands on its own.
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()
