In [1]:
# Install yfinance if not already installed
#!pip install yfinance

# Import necessary libraries
import yfinance as yahooFinance
import pandas as pd
import numpy as np

# Disable all warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Fetch historical stock data
def get_stock_data(symbol, start_date, end_date):
    stock_data = yahooFinance.download(symbol, start=start_date, end=end_date)
    if stock_data.empty:
        raise ValueError(f'No data found for symbol {symbol} between {start_date} and {end_date}.')
    return stock_data

In [3]:
# Specify stock and date range
stock_symbol = 'NVDA'
stock_start_date = '2024-11-01'
stock_end_date = '2025-02-07'

# Get stock data
stock_data = get_stock_data(stock_symbol, stock_start_date, stock_end_date)

YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  1 of 1 completed


In [4]:
stock_data

Price,Close,High,Low,Open,Volume
Ticker,NVDA,NVDA,NVDA,NVDA,NVDA
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2024-11-01,135.390671,137.300543,134.560741,134.690722,207127800
2024-11-04,136.040634,138.950437,135.560671,137.200558,187528200
2024-11-05,139.900360,140.360320,137.320536,137.440523,160537400
2024-11-06,145.599960,146.479905,141.950218,142.950149,242043900
2024-11-07,148.869751,148.919735,146.159931,146.379917,207323300
...,...,...,...,...,...
2025-01-31,120.070000,127.849998,119.190002,123.779999,390372900
2025-02-03,116.660004,118.570000,113.010002,114.750000,371235700
2025-02-04,118.650002,121.199997,116.699997,116.959999,256550000
2025-02-05,124.830002,125.000000,120.760002,121.760002,262230800


In [5]:
# Data Preprocessing
# Move "Date" from index to a column
stock_data = stock_data.reset_index()

# Ensure Date is sorted
stock_data = stock_data.sort_values(by='Date')

# Ensure Data is from November to January
stock_data = stock_data[(stock_data['Date'] >= '2024-11-01') & (stock_data['Date'] <= '2025-02-08')]

# Drop the first row (Ticker row)
stock_data.columns = stock_data.columns.droplevel(1)

# Removes 'Price' as a column index name
stock_data.columns.name = None

# Feature engineering
stock_data['Return'] = stock_data['Close'].pct_change()                 # Daily return
stock_data['Log_Return'] = np.log(stock_data['Close'] / stock_data['Close'].shift(1))  # Log return
stock_data['MA_7'] = stock_data['Close'].rolling(window=7).mean()     # 7-day moving average
stock_data['Volatility'] = stock_data['Close'].rolling(window=10).std() # 10-day volatility

# Impute Missing Values
stock_data['MA_7'].bfill(inplace=True)  # Fill backward
stock_data['Volatility'].bfill(inplace=True)
stock_data['Return'].fillna(0, inplace=True)  # Replace NaN returns with 0
stock_data['Log_Return'].fillna(0, inplace=True)


# handle missing data
stock_data.ffill(inplace=True)

# Reset index
stock_data.reset_index(drop=True, inplace=True)

In [6]:
# Print Stock data
stock_data = stock_data[['Date', 'Close', 'Return', 'Log_Return', 'MA_7', 'Volatility']]
stock_data

Unnamed: 0,Date,Close,Return,Log_Return,MA_7,Volatility
0,2024-11-01,135.390671,0.000000,0.000000,142.667313,5.013364
1,2024-11-04,136.040634,0.004801,0.004789,142.667313,5.013364
2,2024-11-05,139.900360,0.028372,0.027977,142.667313,5.013364
3,2024-11-06,145.599960,0.040740,0.039932,142.667313,5.013364
4,2024-11-07,148.869751,0.022457,0.022209,142.667313,5.013364
...,...,...,...,...,...,...
60,2025-01-31,120.070000,-0.036743,-0.037435,129.381428,11.195738
61,2025-02-03,116.660004,-0.028400,-0.028811,125.015714,12.174614
62,2025-02-04,118.650002,0.017058,0.016914,121.591430,12.209785
63,2025-02-05,124.830002,0.052086,0.050775,122.507144,10.405330


In [7]:
# Read in news data
news_data = pd.read_csv('data/avg_news_sentiment.csv')
# Filter to November - January
news_data = news_data[(news_data['date'] >= '2024-11-01') & (news_data['date'] <= '2025-02-07')]
news_data['date'] = pd.to_datetime(news_data['date']).dt.date

# Reset index
news_data.drop(columns=[col for col in news_data.columns if 'Unnamed' in col], axis=1, inplace=True)
news_data.reset_index(drop=True, inplace=True)

# Show news data
news_data.head()

Unnamed: 0,date,ticker_sentiment_score,avg_1d,avg_3d,avg_5d,avg_7d,avg_10d
0,2024-11-01,0.296796,0.296796,0.327983,0.258469,0.283578,0.303098
1,2024-11-02,0.464111,0.464111,0.406224,0.304364,0.309111,0.310695
2,2024-11-03,0.344659,0.344659,0.368522,0.358544,0.30016,0.30928
3,2024-11-04,0.27201,0.27201,0.36026,0.367068,0.305498,0.306583
4,2024-11-05,0.323438,0.323438,0.313369,0.340203,0.341167,0.310389


In [8]:
# Read in reddit training data
reddit_train_data = pd.read_csv('data/train_reddit_df_sentiment.csv')
# Read in reddit test data
reddit_test_data = pd.read_csv('data/test_reddit_df_sentiment.csv')
# Concatenate into one
reddit_data = pd.concat([reddit_train_data, reddit_test_data])

# Fill NaN Upvotes with 1 (if any)
reddit_data["Upvotes"] = reddit_data["Upvotes"].fillna(1)

# Convert 'Date' column to datetime format
reddit_data['Date_Posted'] = pd.to_datetime(reddit_data['Date_Posted']).dt.date
# Ensure Date is sorted
reddit_data = reddit_data.sort_values(by='Date_Posted')

In [9]:
# Compute weighted average sentiment for Reddit (Upvotes as weight)
# Organize into daily averages
daily_avg_reddit = reddit_data.groupby("Date_Posted").apply(
    lambda x: (x['Sentiment'] * x["Upvotes"]).sum() / x["Upvotes"].sum()
).reset_index(name="Sentiment")

# Apply rolling averages
daily_avg_reddit['avg_1d_reddit'] = daily_avg_reddit['Sentiment'].rolling(1).mean()
daily_avg_reddit['avg_3d_reddit'] = daily_avg_reddit['Sentiment'].rolling(3).mean()
daily_avg_reddit['avg_5d_reddit'] = daily_avg_reddit['Sentiment'].rolling(5).mean()
daily_avg_reddit['avg_7d_reddit'] = daily_avg_reddit['Sentiment'].rolling(7).mean()
daily_avg_reddit['avg_10d_reddit'] = daily_avg_reddit['Sentiment'].rolling(10).mean()

# Fill NaNs Instead of Dropping
daily_avg_reddit.bfill(inplace=True)  # Backfill missing values
daily_avg_reddit.ffill(inplace=True)  # Forward fill as backup

daily_avg_reddit.head()

Unnamed: 0,Date_Posted,Sentiment,avg_1d_reddit,avg_3d_reddit,avg_5d_reddit,avg_7d_reddit,avg_10d_reddit
0,2024-11-01,0.872352,0.872352,0.756182,0.554164,0.631125,0.434983
1,2024-11-02,0.573693,0.573693,0.756182,0.554164,0.631125,0.434983
2,2024-11-04,0.8225,0.8225,0.756182,0.554164,0.631125,0.434983
3,2024-11-05,0.88706,0.88706,0.761084,0.554164,0.631125,0.434983
4,2024-11-06,-0.384782,-0.384782,0.441592,0.554164,0.631125,0.434983


In [10]:
# Join news and reddit data on date
news_reddit_df = pd.merge(news_data, daily_avg_reddit, left_on='date', right_on='Date_Posted', how='outer')
news_reddit_df['date'] = pd.to_datetime(news_reddit_df['date'])
news_reddit_df.head()

Unnamed: 0,date,ticker_sentiment_score,avg_1d,avg_3d,avg_5d,avg_7d,avg_10d,Date_Posted,Sentiment,avg_1d_reddit,avg_3d_reddit,avg_5d_reddit,avg_7d_reddit,avg_10d_reddit
0,2024-11-01,0.296796,0.296796,0.327983,0.258469,0.283578,0.303098,2024-11-01,0.872352,0.872352,0.756182,0.554164,0.631125,0.434983
1,2024-11-02,0.464111,0.464111,0.406224,0.304364,0.309111,0.310695,2024-11-02,0.573693,0.573693,0.756182,0.554164,0.631125,0.434983
2,2024-11-03,0.344659,0.344659,0.368522,0.358544,0.30016,0.30928,,,,,,,
3,2024-11-04,0.27201,0.27201,0.36026,0.367068,0.305498,0.306583,2024-11-04,0.8225,0.8225,0.756182,0.554164,0.631125,0.434983
4,2024-11-05,0.323438,0.323438,0.313369,0.340203,0.341167,0.310389,2024-11-05,0.88706,0.88706,0.761084,0.554164,0.631125,0.434983


In [11]:
sentiment_df = news_reddit_df.copy()
# Merge 'date' and 'Date_Posted' columns, prioritizing non-null values
sentiment_df['Date'] = sentiment_df['date'].fillna(sentiment_df['Date_Posted'])
sentiment_df.drop(['date', 'Date_Posted'], axis=1, inplace=True)

sentiment_df.rename(
    columns={
        'ticker_sentiment_score': 'news_sentiment', 'Sentiment': 'reddit_sentiment',
        'avg_1d': 'news_avg_1d', 'avg_3d': 'news_avg_3d', 
        'avg_5d': 'news_avg_5d', 'avg_7d': 'news_avg_7d', 'avg_10d': 'news_avg_10d',
        'avg_1d_reddit': 'reddit_avg_1d', 'avg_3d_reddit': 'reddit_avg_3d',
        'avg_5d_reddit': 'reddit_avg_5d', 'avg_7d_reddit': 'reddit_avg_7d', 'avg_10d_reddit': 'reddit_avg_10d'
    },
    inplace=True
)

In [12]:
# Function to categorize sentiment using separate thresholds
def categorize_sentiment(score, mean, std, threshold_factor=0.75):
    threshold = threshold_factor * std  # Dynamic threshold
    if score > mean + threshold:
        return "Positive"
    elif score < mean - threshold:
        return "Negative"
    else:
        return "Neutral"

In [13]:
# Compute mean and standard deviation separately for news and Reddit sentiment
news_mean, news_std = sentiment_df["news_sentiment"].mean(), sentiment_df["news_sentiment"].std()
reddit_mean, reddit_std = sentiment_df["reddit_sentiment"].mean(), sentiment_df["reddit_sentiment"].std()

# Compute mean and std for each moving average separately
stats = {
    "news_avg_1d": (sentiment_df["news_avg_1d"].mean(), sentiment_df["news_avg_1d"].std()),
    "news_avg_3d": (sentiment_df["news_avg_3d"].mean(), sentiment_df["news_avg_3d"].std()),
    "news_avg_5d": (sentiment_df["news_avg_5d"].mean(), sentiment_df["news_avg_5d"].std()),
    "news_avg_7d": (sentiment_df["news_avg_7d"].mean(), sentiment_df["news_avg_7d"].std()),
    "news_avg_10d": (sentiment_df["news_avg_10d"].mean(), sentiment_df["news_avg_10d"].std()),
    
    "reddit_avg_1d": (sentiment_df["reddit_avg_1d"].mean(), sentiment_df["reddit_avg_1d"].std()),
    "reddit_avg_3d": (sentiment_df["reddit_avg_3d"].mean(), sentiment_df["reddit_avg_3d"].std()),
    "reddit_avg_5d": (sentiment_df["reddit_avg_5d"].mean(), sentiment_df["reddit_avg_5d"].std()),
    "reddit_avg_7d": (sentiment_df["reddit_avg_7d"].mean(), sentiment_df["reddit_avg_7d"].std()),
    "reddit_avg_10d": (sentiment_df["reddit_avg_10d"].mean(), sentiment_df["reddit_avg_10d"].std()),
}

# Fill NaNs before applying categorization to avoid errors
sentiment_df.bfill(inplace=True)
sentiment_df.ffill(inplace=True)

# Apply separate categorization for news and Reddit sentiment
sentiment_df["news_sentiment_label"] = sentiment_df["news_sentiment"].apply(lambda x: categorize_sentiment(x, news_mean, news_std))
sentiment_df["reddit_sentiment_label"] = sentiment_df["reddit_sentiment"].apply(lambda x: categorize_sentiment(x, reddit_mean, reddit_std))

# Apply categorization for moving averages (separately for news and Reddit)
for col in stats.keys():
    mean, std = stats[col]  # Get the correct mean and std for this column
    sentiment_df[f"{col}_sentiment"] = sentiment_df[col].apply(lambda x: categorize_sentiment(x, mean, std))

sentiment_df.head()

Unnamed: 0,news_sentiment,news_avg_1d,news_avg_3d,news_avg_5d,news_avg_7d,news_avg_10d,reddit_sentiment,reddit_avg_1d,reddit_avg_3d,reddit_avg_5d,...,news_avg_1d_sentiment,news_avg_3d_sentiment,news_avg_5d_sentiment,news_avg_7d_sentiment,news_avg_10d_sentiment,reddit_avg_1d_sentiment,reddit_avg_3d_sentiment,reddit_avg_5d_sentiment,reddit_avg_7d_sentiment,reddit_avg_10d_sentiment
0,0.296796,0.296796,0.327983,0.258469,0.283578,0.303098,0.872352,0.872352,0.756182,0.554164,...,Neutral,Neutral,Negative,Neutral,Neutral,Positive,Positive,Neutral,Positive,Neutral
1,0.464111,0.464111,0.406224,0.304364,0.309111,0.310695,0.573693,0.573693,0.756182,0.554164,...,Positive,Positive,Neutral,Neutral,Neutral,Neutral,Positive,Neutral,Positive,Neutral
2,0.344659,0.344659,0.368522,0.358544,0.30016,0.30928,0.8225,0.8225,0.756182,0.554164,...,Neutral,Positive,Positive,Neutral,Neutral,Positive,Positive,Neutral,Positive,Neutral
3,0.27201,0.27201,0.36026,0.367068,0.305498,0.306583,0.8225,0.8225,0.756182,0.554164,...,Neutral,Neutral,Positive,Neutral,Neutral,Positive,Positive,Neutral,Positive,Neutral
4,0.323438,0.323438,0.313369,0.340203,0.341167,0.310389,0.88706,0.88706,0.761084,0.554164,...,Neutral,Neutral,Neutral,Neutral,Neutral,Positive,Positive,Neutral,Positive,Neutral


In [14]:
# Join stock data on date
stock_news_reddit_df = pd.merge(stock_data, sentiment_df, left_on='Date', right_on='Date', how='outer')

# Fill NaNs Instead of Dropping
stock_news_reddit_df.bfill(inplace=True)  # Backfill missing values
stock_news_reddit_df.ffill(inplace=True)  # Forward fill as backup

stock_news_reddit_df.head(15)

Unnamed: 0,Date,Close,Return,Log_Return,MA_7,Volatility,news_sentiment,news_avg_1d,news_avg_3d,news_avg_5d,...,news_avg_1d_sentiment,news_avg_3d_sentiment,news_avg_5d_sentiment,news_avg_7d_sentiment,news_avg_10d_sentiment,reddit_avg_1d_sentiment,reddit_avg_3d_sentiment,reddit_avg_5d_sentiment,reddit_avg_7d_sentiment,reddit_avg_10d_sentiment
0,2024-11-01,135.390671,0.0,0.0,142.667313,5.013364,0.296796,0.296796,0.327983,0.258469,...,Neutral,Neutral,Negative,Neutral,Neutral,Positive,Positive,Neutral,Positive,Neutral
1,2024-11-02,136.040634,0.004801,0.004789,142.667313,5.013364,0.464111,0.464111,0.406224,0.304364,...,Positive,Positive,Neutral,Neutral,Neutral,Neutral,Positive,Neutral,Positive,Neutral
2,2024-11-03,136.040634,0.004801,0.004789,142.667313,5.013364,0.344659,0.344659,0.368522,0.358544,...,Neutral,Positive,Positive,Neutral,Neutral,Positive,Positive,Neutral,Positive,Neutral
3,2024-11-04,136.040634,0.004801,0.004789,142.667313,5.013364,0.27201,0.27201,0.36026,0.367068,...,Neutral,Neutral,Positive,Neutral,Neutral,Positive,Positive,Neutral,Positive,Neutral
4,2024-11-05,139.90036,0.028372,0.027977,142.667313,5.013364,0.323438,0.323438,0.313369,0.340203,...,Neutral,Neutral,Neutral,Neutral,Neutral,Positive,Positive,Neutral,Positive,Neutral
5,2024-11-06,145.59996,0.04074,0.039932,142.667313,5.013364,0.350788,0.350788,0.315412,0.351001,...,Neutral,Neutral,Neutral,Positive,Neutral,Negative,Neutral,Neutral,Positive,Neutral
6,2024-11-07,148.869751,0.022457,0.022209,142.667313,5.013364,0.337153,0.337153,0.337126,0.32561,...,Neutral,Neutral,Neutral,Neutral,Neutral,Positive,Neutral,Neutral,Positive,Neutral
7,2024-11-08,147.619827,-0.008396,-0.008432,142.667313,5.013364,0.430341,0.430341,0.372761,0.342746,...,Positive,Positive,Neutral,Positive,Positive,Neutral,Neutral,Positive,Positive,Neutral
8,2024-11-09,145.249985,-0.016054,-0.016184,142.667313,5.013364,0.329232,0.329232,0.365575,0.35419,...,Neutral,Neutral,Neutral,Neutral,Positive,Positive,Positive,Positive,Positive,Neutral
9,2024-11-10,145.249985,-0.016054,-0.016184,142.667313,5.013364,0.332782,0.332782,0.364118,0.356059,...,Neutral,Neutral,Positive,Neutral,Positive,Negative,Neutral,Neutral,Neutral,Neutral


In [15]:
# Save dataset as csv
stock_news_reddit_df.to_csv('data/stock_news_reddit_df_v2.csv', index=False)