### Stock Price Prediction- Multiple Sources

In [1]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import yfinance as yahooFinance
import xgboost

In [2]:
# Read in stock data
def get_stock_data(symbol, start_date, end_date):
    stock_data = yahooFinance.download(symbol, start=start_date, end=end_date)
    if stock_data.empty:
        raise ValueError(f'No data found for symbol {symbol} between {start_date} and {end_date}.')
    return stock_data

In [5]:
# Specify stock and date range (October through first week of February)
stock_symbol = 'NVDA'
stock_start_date = '2024-11-01'
stock_end_date = '2025-02-07'

# Get stock data
stock_data = get_stock_data(stock_symbol, stock_start_date, stock_end_date)

[*********************100%***********************]  1 of 1 completed


In [6]:
stock_data = stock_data.reset_index()

In [7]:
stock_data.columns = stock_data.columns.droplevel('Ticker')
stock_data.head()

Price,Date,Close,High,Low,Open,Volume
0,2024-11-01,135.390671,137.300543,134.560741,134.690722,207127800
1,2024-11-04,136.040634,138.950437,135.560671,137.200558,187528200
2,2024-11-05,139.90036,140.36032,137.320536,137.440523,160537400
3,2024-11-06,145.59996,146.479905,141.950218,142.950149,242043900
4,2024-11-07,148.869751,148.919735,146.159931,146.379917,207323300


In [8]:
# Data Preprocessing
# Handle Missing Data
stock_data = stock_data.ffill()

# Feature engineering
stock_data['Return'] = stock_data['Close'].pct_change()                 # Daily return
stock_data['Log_Return'] = np.log(stock_data['Close'] / stock_data['Close'].shift(1))  # Log return
stock_data['MA_7'] = stock_data['Close'].rolling(window=7).mean()     # 7-day moving average
stock_data['Volatility'] = stock_data['Close'].rolling(window=10).std() # 10-day volatility

# Drop rows with NaN values
stock_data.dropna(inplace=True)

In [9]:
# Print Stock data
stock_data = stock_data[['Date', 'Close', 'Return', 'Log_Return', 'MA_7', 'Volatility']]
stock_data.head()


Price,Date,Close,Return,Log_Return,MA_7,Volatility
9,2024-11-14,146.749878,0.00335,0.003344,146.947013,5.013364
10,2024-11-15,141.970215,-0.03257,-0.033112,146.428478,4.108615
11,2024-11-18,140.140335,-0.012889,-0.012973,145.181418,3.273268
12,2024-11-19,146.999863,0.048948,0.047787,145.092852,2.758274
13,2024-11-20,145.879944,-0.007619,-0.007648,145.182846,2.757732


In [10]:
# Read in news data
news_data = pd.read_csv('avg_news_sentiment.csv')
# Filter to November - January
news_data = news_data[(news_data['date'] >= '2024-11-01') & (news_data['date'] <= '2025-02-07')]
# Show news data
news_data.head()

Unnamed: 0.1,Unnamed: 0,date,ticker_sentiment_score,avg_1d,avg_3d,avg_5d,avg_7d,avg_10d
2,11,2024-11-01,0.296796,0.296796,0.327983,0.258469,0.283578,0.303098
3,12,2024-11-02,0.464111,0.464111,0.406224,0.304364,0.309111,0.310695
4,13,2024-11-03,0.344659,0.344659,0.368522,0.358544,0.30016,0.30928
5,14,2024-11-04,0.27201,0.27201,0.36026,0.367068,0.305498,0.306583
6,15,2024-11-05,0.323438,0.323438,0.313369,0.340203,0.341167,0.310389


In [20]:
# Read in reddit training data
reddit_train_data = pd.read_csv('train_reddit_df_sentiment.csv')
# Read in reddit test data
reddit_test_data = pd.read_csv('test_reddit_df_sentiment.csv')
# Concatenate into one
reddit_data = pd.concat([reddit_train_data, reddit_test_data])
reddit_data.head()

Unnamed: 0,Post_Title,Post_URL,Post_Text,Date_Posted,Upvotes,Comments,Subreddit,Sentiment,Sentiment Category
0,Intel's revenue forecast disappoints as invest...,https://www.reddit.com/r/stocks/comments/1idxs...,"Intel's (INTC.O), opens new tab first-quarter ...",2025-01-30 21:17:18,238,79,stocks,-0.5106,Negative
1,Nvidia’s Prime time to buy,https://www.reddit.com/r/stocks/comments/1idqh...,\nStocks are emotional in nature. The Nvidia i...,2025-01-30 16:12:44,9,42,stocks,0.9874,Positive
2,These are the stocks on my watchlist (01/30),https://www.reddit.com/r/stocks/comments/1ido0...,This is a daily watchlist for short-term tradi...,2025-01-30 14:20:34,25,15,stocks,0.8167,Positive
3,1/30) - Thursday's Pre-Market News & Stock Movers,https://www.reddit.com/r/stocks/comments/1idni...,#Good morning traders and investors of the r/s...,2025-01-30 13:57:10,11,2,stocks,0.9979,Positive
4,Meta's CAPEX Spending Exceeds the Combined Net...,https://www.reddit.com/r/stocks/comments/1id9r...,**META** plans to spend **$60-$65 billion** in...,2025-01-30 00:50:17,239,87,stocks,0.4574,Positive


In [21]:
# Organize into daily averages
daily_avg_reddit = reddit_data.groupby('Date_Posted')['Sentiment'].mean().reset_index()

# Apply rolling averages
daily_avg_reddit['avg_1d_reddit'] = daily_avg_reddit['Sentiment'].rolling(1).mean()
daily_avg_reddit['avg_3d_reddit'] = daily_avg_reddit['Sentiment'].rolling(3).mean()
daily_avg_reddit['avg_5d_reddit'] = daily_avg_reddit['Sentiment'].rolling(5).mean()
daily_avg_reddit['avg_7d_reddit'] = daily_avg_reddit['Sentiment'].rolling(7).mean()
daily_avg_reddit['avg_10d_reddit'] = daily_avg_reddit['Sentiment'].rolling(10).mean()

daily_avg_reddit = daily_avg_reddit.dropna()
daily_avg_reddit.head()

Unnamed: 0,Date_Posted,Sentiment,avg_1d_reddit,avg_3d_reddit,avg_5d_reddit,avg_7d_reddit,avg_10d_reddit
9,2024-11-06 17:04:12,-0.9223,-0.9223,0.134333,0.44454,0.538043,0.65901
10,2024-11-06 23:27:53,0.9908,0.9908,0.3178,0.4782,0.597886,0.65935
11,2024-11-07 00:18:22,0.9996,0.9996,0.356033,0.47868,0.601871,0.66158
12,2024-11-07 23:07:49,0.9364,0.9364,0.9756,0.57788,0.618143,0.66931
13,2024-11-08 04:08:36,0.954,0.954,0.963333,0.5917,0.611971,0.70752


In [22]:
# Categorize nltk sentiment score
def categorize_sentiment(score):
    if score >= 0.05:
        return 'Positive'
    elif score <= -0.05:
        return 'Negative'
    else:
        return 'Neutral'

In [23]:
# Add Reddit averages sentiment label
daily_avg_reddit['avg_1d_reddit_sentiment'] = daily_avg_reddit['avg_1d_reddit'].apply(categorize_sentiment)
daily_avg_reddit['avg_3d_reddit_sentiment'] = daily_avg_reddit['avg_3d_reddit'].apply(categorize_sentiment)
daily_avg_reddit['avg_5d_reddit_sentiment'] = daily_avg_reddit['avg_5d_reddit'].apply(categorize_sentiment)
daily_avg_reddit['avg_7d_reddit_sentiment'] = daily_avg_reddit['avg_7d_reddit'].apply(categorize_sentiment)
daily_avg_reddit['avg_10d_reddit_sentiment'] = daily_avg_reddit['avg_10d_reddit'].apply(categorize_sentiment)

daily_avg_reddit['Date_Posted'] = pd.to_datetime(daily_avg_reddit['Date_Posted'])
daily_avg_reddit['Date_Posted'] = daily_avg_reddit['Date_Posted'].dt.strftime('%Y-%m-%d')
daily_avg_reddit.head()


Unnamed: 0,Date_Posted,Sentiment,avg_1d_reddit,avg_3d_reddit,avg_5d_reddit,avg_7d_reddit,avg_10d_reddit,avg_1d_reddit_sentiment,avg_3d_reddit_sentiment,avg_5d_reddit_sentiment,avg_7d_reddit_sentiment,avg_10d_reddit_sentiment
9,2024-11-06,-0.9223,-0.9223,0.134333,0.44454,0.538043,0.65901,Negative,Positive,Positive,Positive,Positive
10,2024-11-06,0.9908,0.9908,0.3178,0.4782,0.597886,0.65935,Positive,Positive,Positive,Positive,Positive
11,2024-11-07,0.9996,0.9996,0.356033,0.47868,0.601871,0.66158,Positive,Positive,Positive,Positive,Positive
12,2024-11-07,0.9364,0.9364,0.9756,0.57788,0.618143,0.66931,Positive,Positive,Positive,Positive,Positive
13,2024-11-08,0.954,0.954,0.963333,0.5917,0.611971,0.70752,Positive,Positive,Positive,Positive,Positive


In [24]:
# Join news and reddit data on date
news_reddit_df = pd.merge(news_data, daily_avg_reddit, left_on='date', right_on='Date_Posted', how='inner')
news_reddit_df['date'] = pd.to_datetime(news_reddit_df['date'])
news_reddit_df.head()

Unnamed: 0.1,Unnamed: 0,date,ticker_sentiment_score,avg_1d,avg_3d,avg_5d,avg_7d,avg_10d,Date_Posted,Sentiment,avg_1d_reddit,avg_3d_reddit,avg_5d_reddit,avg_7d_reddit,avg_10d_reddit,avg_1d_reddit_sentiment,avg_3d_reddit_sentiment,avg_5d_reddit_sentiment,avg_7d_reddit_sentiment,avg_10d_reddit_sentiment
0,16,2024-11-06,0.350788,0.350788,0.315412,0.351001,0.358509,0.304735,2024-11-06,-0.9223,-0.9223,0.134333,0.44454,0.538043,0.65901,Negative,Positive,Positive,Positive,Positive
1,16,2024-11-06,0.350788,0.350788,0.315412,0.351001,0.358509,0.304735,2024-11-06,0.9908,0.9908,0.3178,0.4782,0.597886,0.65935,Positive,Positive,Positive,Positive,Positive
2,17,2024-11-07,0.337153,0.337153,0.337126,0.32561,0.341279,0.314987,2024-11-07,0.9996,0.9996,0.356033,0.47868,0.601871,0.66158,Positive,Positive,Positive,Positive,Positive
3,17,2024-11-07,0.337153,0.337153,0.337126,0.32561,0.341279,0.314987,2024-11-07,0.9364,0.9364,0.9756,0.57788,0.618143,0.66931,Positive,Positive,Positive,Positive,Positive
4,18,2024-11-08,0.430341,0.430341,0.372761,0.342746,0.360357,0.350645,2024-11-08,0.954,0.954,0.963333,0.5917,0.611971,0.70752,Positive,Positive,Positive,Positive,Positive


In [25]:
# Join stock data on date
stock_news_reddit_df = pd.merge(stock_data, news_reddit_df, left_on='Date', right_on='date', how='inner')
stock_news_reddit_df.head()


Unnamed: 0.1,Date,Close,Return,Log_Return,MA_7,Volatility,Unnamed: 0,date,ticker_sentiment_score,avg_1d,...,avg_1d_reddit,avg_3d_reddit,avg_5d_reddit,avg_7d_reddit,avg_10d_reddit,avg_1d_reddit_sentiment,avg_3d_reddit_sentiment,avg_5d_reddit_sentiment,avg_7d_reddit_sentiment,avg_10d_reddit_sentiment
0,2024-11-15,141.970215,-0.03257,-0.033112,146.428478,4.108615,25,2024-11-15,0.346622,0.346622,...,0.9991,0.990667,0.6407,0.7124,0.67765,Positive,Positive,Positive,Positive,Positive
1,2024-11-15,141.970215,-0.03257,-0.033112,146.428478,4.108615,25,2024-11-15,0.346622,0.346622,...,-0.3544,0.548033,0.7214,0.530114,0.5938,Negative,Positive,Positive,Positive,Positive
2,2024-11-15,141.970215,-0.03257,-0.033112,146.428478,4.108615,25,2024-11-15,0.346622,0.346622,...,0.9333,0.526,0.71018,0.540343,0.62362,Positive,Positive,Positive,Positive,Positive
3,2024-11-15,141.970215,-0.03257,-0.033112,146.428478,4.108615,25,2024-11-15,0.346622,0.346622,...,0.7566,0.445167,0.6668,0.7567,0.63223,Positive,Positive,Positive,Positive,Positive
4,2024-11-18,140.140335,-0.012889,-0.012973,145.181418,3.273268,28,2024-11-18,0.303204,0.303204,...,0.4394,0.346,0.5575,0.480914,0.63384,Positive,Positive,Positive,Positive,Positive


In [26]:
# Organize dataset
stock_news_reddit_df = stock_news_reddit_df[['Date', 'Close', 'Return', 'Log_Return',
                                             'MA_7', 'Volatility', 'ticker_sentiment_score',
                                             'avg_1d', 'avg_3d','avg_5d', 'avg_7d', 'avg_10d',
                                             'avg_1d_reddit', 'avg_3d_reddit', 'avg_5d_reddit',
                                             'avg_7d_reddit', 'avg_10d_reddit', 'avg_1d_reddit_sentiment',
                                             'avg_3d_reddit_sentiment','avg_5d_reddit_sentiment',
                                             'avg_7d_reddit_sentiment', 'avg_10d_reddit_sentiment' ]]
stock_news_reddit_df.head()

Unnamed: 0,Date,Close,Return,Log_Return,MA_7,Volatility,ticker_sentiment_score,avg_1d,avg_3d,avg_5d,...,avg_1d_reddit,avg_3d_reddit,avg_5d_reddit,avg_7d_reddit,avg_10d_reddit,avg_1d_reddit_sentiment,avg_3d_reddit_sentiment,avg_5d_reddit_sentiment,avg_7d_reddit_sentiment,avg_10d_reddit_sentiment
0,2024-11-15,141.970215,-0.03257,-0.033112,146.428478,4.108615,0.346622,0.346622,0.330836,0.290071,...,0.9991,0.990667,0.6407,0.7124,0.67765,Positive,Positive,Positive,Positive,Positive
1,2024-11-15,141.970215,-0.03257,-0.033112,146.428478,4.108615,0.346622,0.346622,0.330836,0.290071,...,-0.3544,0.548033,0.7214,0.530114,0.5938,Negative,Positive,Positive,Positive,Positive
2,2024-11-15,141.970215,-0.03257,-0.033112,146.428478,4.108615,0.346622,0.346622,0.330836,0.290071,...,0.9333,0.526,0.71018,0.540343,0.62362,Positive,Positive,Positive,Positive,Positive
3,2024-11-15,141.970215,-0.03257,-0.033112,146.428478,4.108615,0.346622,0.346622,0.330836,0.290071,...,0.7566,0.445167,0.6668,0.7567,0.63223,Positive,Positive,Positive,Positive,Positive
4,2024-11-18,140.140335,-0.012889,-0.012973,145.181418,3.273268,0.303204,0.303204,0.310892,0.318424,...,0.4394,0.346,0.5575,0.480914,0.63384,Positive,Positive,Positive,Positive,Positive


In [27]:
# Save dataset as csv
stock_news_reddit_df.to_csv('stock_news_reddit_df.csv', index=False)