In [1]:
import pandas as pd
import xgboost
from sklearn.metrics import mean_squared_error,  mean_absolute_error, r2_score
from math import sqrt

In [2]:
# Load the stock / news / Reddit table from CSV and preview its first rows
nvidia_data = pd.read_csv(
    filepath_or_buffer='stock_news_reddit_df.csv',
)
nvidia_data.head(n=5)

Unnamed: 0,Date,Close,Return,Log_Return,MA_7,Volatility,ticker_sentiment_score,avg_1d,avg_3d,avg_5d,...,avg_1d_reddit,avg_3d_reddit,avg_5d_reddit,avg_7d_reddit,avg_10d_reddit,avg_1d_reddit_sentiment,avg_3d_reddit_sentiment,avg_5d_reddit_sentiment,avg_7d_reddit_sentiment,avg_10d_reddit_sentiment
0,2024-11-15,141.970215,-0.03257,-0.033112,146.428478,4.108615,0.346622,0.346622,0.330836,0.290071,...,0.9991,0.990667,0.6407,0.7124,0.67765,Positive,Positive,Positive,Positive,Positive
1,2024-11-15,141.970215,-0.03257,-0.033112,146.428478,4.108615,0.346622,0.346622,0.330836,0.290071,...,-0.3544,0.548033,0.7214,0.530114,0.5938,Negative,Positive,Positive,Positive,Positive
2,2024-11-15,141.970215,-0.03257,-0.033112,146.428478,4.108615,0.346622,0.346622,0.330836,0.290071,...,0.9333,0.526,0.71018,0.540343,0.62362,Positive,Positive,Positive,Positive,Positive
3,2024-11-15,141.970215,-0.03257,-0.033112,146.428478,4.108615,0.346622,0.346622,0.330836,0.290071,...,0.7566,0.445167,0.6668,0.7567,0.63223,Positive,Positive,Positive,Positive,Positive
4,2024-11-18,140.140335,-0.012889,-0.012973,145.181418,3.273268,0.303204,0.303204,0.310892,0.318424,...,0.4394,0.346,0.5575,0.480914,0.63384,Positive,Positive,Positive,Positive,Positive


In [3]:
# Partition the rows chronologically into a training window and a held-out test window.
# Window bounds are ISO date strings and are inclusive on both ends.
print(len(nvidia_data))

# Training window: November through the first weeks of January
in_train_window = nvidia_data['Date'].between('2024-11-01', '2025-01-25')
train_data = nvidia_data[in_train_window]
print(len(train_data))

# Test window: the last week of January
in_test_window = nvidia_data['Date'].between('2025-01-26', '2025-01-31')
test_data = nvidia_data[in_test_window]
print(len(test_data))

340
252
88


In [4]:
# Hyperparameters for the XGBoost regressor
PERCENTAGE_OF_FEATURES_USED = 1      # fraction of columns sampled per tree (colsample_bytree)
LEARNING_RATE = 0.01                 # shrinkage applied to each boosting step
MAX_DEPTH = 10                       # maximum tree depth; original note: "6 8" (presumably other values tried)
NUMBER_OF_BOOSTING_ROUNDS = 100_000  # number of trees (n_estimators)

In [5]:
# Build the XGBoost regressor from the hyperparameter constants defined above
xgb_params = {
    'colsample_bytree': PERCENTAGE_OF_FEATURES_USED,
    'learning_rate': LEARNING_RATE,
    'max_depth': MAX_DEPTH,
    'n_estimators': NUMBER_OF_BOOSTING_ROUNDS,
}
model = xgboost.XGBRegressor(**xgb_params)

In [6]:
def _encode_sentiment(frame, column, categories):
    """One-hot encode ``column`` of ``frame`` against a fixed category list.

    Parameters
    ----------
    frame : pd.DataFrame
        Data containing the raw categorical ``column``.
    column : str
        Name of the categorical column to encode.
    categories : list
        Ordered category values. The first one is the dropped baseline
        (``drop_first=True``); every other one gets a ``{column}_{value}``
        dummy column, whether or not it occurs in ``frame``.

    Returns
    -------
    pd.DataFrame
        A new frame in which ``column`` is replaced by its dummy columns.
        ``frame`` itself is not modified.
    """
    typed = frame.assign(**{column: pd.Categorical(frame[column], categories=categories)})
    return pd.get_dummies(typed, columns=[column], drop_first=True)


# Averaging windows (in days) to evaluate; one model fit per window
day_list = ['1', '3', '5', '7', '10']
for day in day_list:
    sentiment_column = f'avg_{day}d_reddit_sentiment'

    # Encode both splits against the categories seen in the TRAINING data, so that
    # train and test always get the same dummy columns and the same dropped
    # baseline. Encoding each split on its own lets drop_first remove a different
    # category per split. A value that appears only in the test split becomes
    # all-zero dummies (i.e. it is treated like the baseline category).
    sentiment_categories = sorted(train_data[sentiment_column].dropna().unique())

    # Work on per-iteration copies: train_data / test_data keep their raw
    # sentiment columns, so this cell can be re-run without a KeyError.
    train_encoded = _encode_sentiment(train_data, sentiment_column, sentiment_categories)
    test_encoded = _encode_sentiment(test_data, sentiment_column, sentiment_categories)

    # Get features and labels
    feature_columns = ['MA_7', 'Volatility','ticker_sentiment_score',
                       f'avg_{day}d', f'avg_{day}d_reddit', f'avg_{day}d_reddit_sentiment_Neutral',
                        f'avg_{day}d_reddit_sentiment_Positive', f'avg_{day}d_reddit_sentiment_Negative']

    label_column = ['Close']

    # The baseline dummy (and any category absent from training) does not exist
    # as a column; keep only the candidate features the training frame provides.
    existing_columns = [col for col in feature_columns if col in train_encoded.columns]

    # Get train features
    train_features = train_encoded[existing_columns]
    train_labels = train_encoded[label_column]

    # Get test features (same columns, same order as the training features)
    test_features = test_encoded[existing_columns]
    test_labels = test_encoded[label_column]

    # Train model (fit() retrains the shared estimator from scratch each iteration)
    model.fit(train_features, train_labels)

    # Get predictions
    prediction = model.predict(test_features)

    # Report regression metrics on the held-out test window
    r2xgb = r2_score(test_labels, prediction)
    print(f'The R squared of the xgboost method for day {day} is: {r2xgb:.2f}')

    mae = mean_absolute_error(test_labels, prediction)
    print(f'The Mean Absolute Error (MAE) of the xgboost method for day {day} is: {mae:.2f}')

    rmse = sqrt(mean_squared_error(test_labels, prediction))
    print(f'The Root Mean Squared Error (RMSE) of the xgboost method for day {day} is: {rmse:.2f}')

The R squared of the xgboost method for day 1 is: -12.39
The Mean Absolute Error (MAE) of the xgboost method for day 1 is: 14.54
The Root Mean Squared Error (RMSE) of the xgboost method for day 1 is: 15.21
The R squared of the xgboost method for day 3 is: -18.42
The Mean Absolute Error (MAE) of the xgboost method for day 3 is: 16.10
The Root Mean Squared Error (RMSE) of the xgboost method for day 3 is: 18.32
The R squared of the xgboost method for day 5 is: -12.26
The Mean Absolute Error (MAE) of the xgboost method for day 5 is: 12.58
The Root Mean Squared Error (RMSE) of the xgboost method for day 5 is: 15.14
The R squared of the xgboost method for day 7 is: -21.91
The Mean Absolute Error (MAE) of the xgboost method for day 7 is: 18.15
The Root Mean Squared Error (RMSE) of the xgboost method for day 7 is: 19.90
The R squared of the xgboost method for day 10 is: -23.79
The Mean Absolute Error (MAE) of the xgboost method for day 10 is: 20.00
The Root Mean Squared Error (RMSE) of the xgb