In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime
import random

In [2]:
# Paths
# Folder with the cleaned input CSVs; the path is relative, so it is resolved
# against the directory the notebook kernel is started from.
PROCESSED_DATA_DIR = Path("../data/processed")
# Folder for generated feature files; currently the same folder as the
# processed inputs. Kept as a separate name so it can be re-pointed on its own.
FEATURES_DATA_DIR = PROCESSED_DATA_DIR

In [3]:
# Load cleaned data
def _read_csv_or_empty(csv_path):
    """Read `csv_path` into a DataFrame, or return an empty DataFrame if the file is absent."""
    if csv_path.exists():
        return pd.read_csv(csv_path)
    return pd.DataFrame()


tweets_file = PROCESSED_DATA_DIR.joinpath("tweet_finance_clean.csv")
news_file = PROCESSED_DATA_DIR.joinpath("news_data_clean.csv")

# A missing input is tolerated: the frame stays empty and later cells skip it.
tweets_df = _read_csv_or_empty(tweets_file)
news_df = _read_csv_or_empty(news_file)

print(f"Tweets loaded: {tweets_df.shape}")
print(f"News loaded: {news_df.shape}")

Tweets loaded: (100, 6)
News loaded: (1247, 7)


In [4]:
# Prepare datetime columns
def _normalize_date_column(frame, source_name):
    """Return `frame` with its `date` column reduced to calendar days.

    Values that cannot be parsed as dates are coerced to NaT and their rows
    are dropped. The original row index and column order are kept.

    Parameters
    ----------
    frame : pd.DataFrame
        Source data; may be empty.
    source_name : str
        Label used in the warning printed when the `date` column is missing.

    Returns
    -------
    pd.DataFrame
        A new frame with `date` holding `datetime.date` objects. An empty
        frame is returned unchanged. A frame without a `date` column is
        returned with zero rows so that the `.empty` guards in later cells
        skip this source instead of failing with a KeyError.
    """
    if frame.empty:
        return frame
    if 'date' not in frame.columns:
        print(f"Warning: {source_name} data has no 'date' column; skipping this source")
        return frame.iloc[0:0]

    parsed_dates = pd.to_datetime(frame['date'], errors='coerce')
    is_valid = parsed_dates.notna()
    # Build a new frame rather than mutating in place, so re-running the cell is safe.
    return frame.loc[is_valid].assign(date=parsed_dates[is_valid].dt.date)


tweets_df = _normalize_date_column(tweets_df, "tweets")
news_df = _normalize_date_column(news_df, "news")

In [5]:
# Dummy sentiment scoring (placeholder until next notebook)
# Fixed seed so that Restart & Run All reproduces the same placeholder scores
# (and therefore the same saved feature file) on every run.
SENTIMENT_SEED = 42
# Dedicated generator: re-running this cell restarts the sequence, and other
# users of the global `random` state cannot perturb it.
_sentiment_rng = random.Random(SENTIMENT_SEED)


def dummy_sentiment():
    """Return a placeholder sentiment score.

    Returns
    -------
    int
        One of -1 (negative), 0 (neutral) or 1 (positive), drawn from a
        generator seeded with `SENTIMENT_SEED`, so the sequence of scores is
        reproducible across kernel restarts.
    """
    return _sentiment_rng.choice([-1, 0, 1])

def _attach_placeholder_scores(frame, text_column):
    """Add a `sentiment_score` column to `frame` in place, one placeholder score per row.

    Does nothing when `frame` is empty. The text in `text_column` is not
    inspected; it only determines how many scores are drawn.
    """
    if frame.empty:
        return
    frame['sentiment_score'] = frame[text_column].map(lambda _text: dummy_sentiment())


# Tweets are scored before news, so the order of random draws is fixed.
_attach_placeholder_scores(tweets_df, 'clean_text')
_attach_placeholder_scores(news_df, 'clean_title')

In [6]:
# Aggregate tweets: one row per day with mean, std and count of the scores
tweet_aggregations = {
    'tweet_sentiment_mean': ('sentiment_score', 'mean'),
    'tweet_sentiment_std': ('sentiment_score', 'std'),
    'tweet_count': ('sentiment_score', 'count'),
}
tweet_sentiment_daily = (
    tweets_df.groupby('date').agg(**tweet_aggregations).reset_index()
    if not tweets_df.empty
    else pd.DataFrame()
)

# Aggregate news: same statistics, with news-prefixed column names
news_aggregations = {
    'news_sentiment_mean': ('sentiment_score', 'mean'),
    'news_sentiment_std': ('sentiment_score', 'std'),
    'news_count': ('sentiment_score', 'count'),
}
news_sentiment_daily = (
    news_df.groupby('date').agg(**news_aggregations).reset_index()
    if not news_df.empty
    else pd.DataFrame()
)

In [7]:
# Merge tweet and news features
has_tweet_features = not tweet_sentiment_daily.empty
has_news_features = not news_sentiment_daily.empty

if has_tweet_features and has_news_features:
    # Outer join keeps days that appear in only one source; the other
    # source's columns are NaN on those days.
    sentiment_features = tweet_sentiment_daily.merge(
        news_sentiment_daily, on='date', how='outer'
    )
elif has_tweet_features:
    sentiment_features = tweet_sentiment_daily
elif has_news_features:
    sentiment_features = news_sentiment_daily
else:
    sentiment_features = pd.DataFrame()

# Sort by date
if not sentiment_features.empty:
    sentiment_features = (
        sentiment_features
        .sort_values(by='date')
        .reset_index(drop=True)
    )

In [8]:
# Save features
# Write the merged daily features to CSV; nothing is written when no source
# produced any features.
if not sentiment_features.empty:
    # Create the target folder first: `to_csv` does not create missing
    # directories and would fail if FEATURES_DATA_DIR does not exist yet.
    FEATURES_DATA_DIR.mkdir(parents=True, exist_ok=True)
    output_path = FEATURES_DATA_DIR / "sentiment_features.csv"
    sentiment_features.to_csv(output_path, index=False)
    print(f"Saved sentiment features to {output_path}")
else:
    print("No sentiment features generated")

Saved sentiment features to ..\data\processed\sentiment_features.csv


In [9]:
# Quick check: display the first 10 rows of the merged daily features.
# Because the merge is an outer join on `date`, days covered by only one
# source show NaN in the other source's columns.
sentiment_features.head(10)

Unnamed: 0,date,tweet_sentiment_mean,tweet_sentiment_std,tweet_count,news_sentiment_mean,news_sentiment_std,news_count
0,2025-07-02,,,,0.101695,0.864928,59
1,2025-07-03,,,,0.025,0.76753,40
2,2025-07-04,,,,0.238095,0.830949,21
3,2025-07-05,,,,-0.416667,0.792961,12
4,2025-07-06,,,,0.333333,0.723747,15
5,2025-07-07,,,,0.060606,0.762159,66
6,2025-07-08,,,,-0.108434,0.855571,83
7,2025-07-09,,,,-0.040541,0.834844,74
8,2025-07-10,,,,0.075,0.858965,40
9,2025-07-11,,,,-0.148148,0.810483,54
