In [0]:
%sql
-- Make stock_prediction.default the current schema so the unqualified table
-- names below (e.g. `stocks`) resolve against it.
USE stock_prediction.default;

-- stock_moves: one output row per row of `stocks`, carrying the current close,
-- the close of the NEXT row for the same company, and the relative change
-- between the two.
--
-- NOTE(review): LEAD() returns the next row in timestamp order, whatever the
-- gap is. It is a "30 minutes ahead" price only if `stocks` holds evenly
-- spaced 30-minute bars with no gaps (overnight, weekends, missing bars) --
-- confirm against the table before relying on that interpretation.
CREATE OR REPLACE TEMP VIEW stock_moves AS
WITH next_close AS (
  SELECT
    company_id,
    timestamp,
    close_price,
    -- Next observed close for the same company (the "effect").
    -- NULL on each company's last row, which has no successor.
    LEAD(close_price) OVER (PARTITION BY company_id ORDER BY timestamp) AS future_price
  FROM stocks
)
SELECT
  company_id,
  timestamp,
  close_price,
  future_price,
  -- Fractional change from close_price to future_price (0.01 means +1%; this
  -- is not multiplied by 100). NULLIF maps a zero close to NULL so the
  -- division yields NULL instead of raising a divide-by-zero error when ANSI
  -- mode is enabled.
  (future_price - close_price) / NULLIF(close_price, 0) AS future_return
FROM next_close;

In [0]:
%sql
-- correlation_dataset: one row per distinct (company, timestamp, future_return)
-- in stock_moves that has at least one matching news item, pairing the forward
-- return (the "effect") with the news published in the preceding 60 minutes
-- (the "cause").
--
-- stock_moves exposes company_id / timestamp (it has no ticker / time columns),
-- so those are the columns joined and grouped on; they are re-exported under
-- the names ticker / decision_time.
CREATE OR REPLACE TEMP VIEW correlation_dataset AS
SELECT
  s.company_id AS ticker,
  s.timestamp AS decision_time,
  s.future_return,                        -- the effect (price movement)
  -- the cause (aggregated news)
  COUNT(n.id) AS news_count,              -- counts news rows with a non-NULL id
  AVG(n.sentiment_score) AS avg_sentiment
FROM stock_moves AS s
-- Inner join: price points with no news in the window produce no row at all.
INNER JOIN sentiment_scores AS n
  ON s.company_id = n.company_id          -- match the company
  -- News from the 60 minutes leading up to the price point; both bounds are
  -- inclusive, so an item published exactly at the price point is counted.
  AND n.published_at >= s.timestamp - INTERVAL 60 MINUTES
  AND n.published_at <= s.timestamp
GROUP BY
  s.company_id,
  s.timestamp,
  s.future_return;

In [0]:
%python
import seaborn as sns
import matplotlib.pyplot as plt

# Pull the whole correlation_dataset view into a pandas DataFrame.
df = spark.table("correlation_dataset").toPandas()

# Keep only rows backed by more than two news items; thinner samples are
# treated as noise.
has_enough_news = df['news_count'] > 2
df_filtered = df.loc[has_enough_news]

# Correlation (pandas' default method) between average sentiment and the
# forward return over the filtered rows.
sentiment = df_filtered['avg_sentiment']
correlation = sentiment.corr(df_filtered['future_return'])
print(f"Correlation between Sentiment and Future Return: {correlation:.4f}")

# Scatter of sentiment against forward return, with dashed guide lines at zero
# on both axes to split the plot into quadrants.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df_filtered, x='avg_sentiment', y='future_return', alpha=0.5, ax=ax)
ax.set_title(f"Does News Sentiment Predict Price? (Corr: {correlation:.2f})")
ax.set_xlabel("Average Sentiment (Last 60 Mins)")
ax.set_ylabel("Future Price Return (Next 30 Mins)")
ax.axhline(0, color='grey', linestyle='--')
ax.axvline(0, color='grey', linestyle='--')
plt.show()