In [118]:
import yfinance as yf
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score

# Load sentiment data
sentiment_data = pd.read_csv("predictions2.csv")
unique_stocks = sentiment_data["stock"].unique()

# Initialize LabelEncoder
encoder = LabelEncoder()
encoder.fit(unique_stocks)

# Function to fetch stock data and compute technical indicators
def get_stock_data(ticker):
    stock = yf.download(ticker, start="2024-01-01", end="2025-02-22", interval="1d")
    stock.columns = stock.columns.droplevel(1) if isinstance(stock.columns, pd.MultiIndex) else stock.columns
    stock["Stock"] = ticker  # Add stock identifier
    
    # Calculate EMA
    stock["EMA_10"] = stock["Close"].ewm(span=10, adjust=False).mean()
    stock["EMA_25"] = stock["Close"].ewm(span=25, adjust=False).mean()

    # MACD Calculation
    short_ema = stock["Close"].ewm(span=12, adjust=False).mean()
    long_ema = stock["Close"].ewm(span=26, adjust=False).mean()
    stock["MACD_12_26_9"] = short_ema - long_ema
    stock["MACDs_12_26_9"] = stock["MACD_12_26_9"].ewm(span=9, adjust=False).mean()
    stock["MACDh_12_26_9"] = stock["MACD_12_26_9"] - stock["MACDs_12_26_9"]

    # Bollinger Bands
    stock["BBM_5"] = stock["Close"].rolling(window=5).mean()
    stock["BBU_5"] = stock["BBM_5"] + (stock["Close"].rolling(window=5).std() * 1)
    stock["BBL_5"] = stock["BBM_5"] - (stock["Close"].rolling(window=5).std() * 1)
    stock["BBB_5"] = (stock["BBU_5"] - stock["BBL_5"]) / stock["BBM_5"]
    stock["BBP_5"] = (stock["Close"] - stock["BBL_5"]) / (stock["BBU_5"] - stock["BBL_5"])
    stock["target_1"] = stock["Close"].shift(-1)
    
    # Encode stock name
    stock["Stock"] = encoder.transform([ticker])[0]
    
    return stock.iloc[:-1]  # Remove last row

# Fetch and combine data for all stocks
all_data = pd.concat([get_stock_data(stock) for stock in unique_stocks])
all_data = all_data.reset_index()
all_data['Date'] = pd.to_datetime(all_data['Date'])
all_data = all_data[all_data['Date'] >= '2025-01-22']
sen_data = all_data.copy()
all_data = all_data.drop(columns=["Date"])
all_data = all_data.dropna()
sentiment_data = sentiment_data.rename(columns={'stock': 'Stock', 'date': 'Date'})
sentiment_data["Stock"] = encoder.transform(sentiment_data["Stock"])
sen_data['Date'] = pd.to_datetime(sen_data['Date'])
sentiment_data['Date'] = pd.to_datetime(sentiment_data['Date'])
merged_data = sen_data.merge(sentiment_data, on=['Date', 'Stock'], how='left')
merged_data = merged_data.drop(columns=["Date"])
merged_data = merged_data.dropna()

# Split features and target
features = all_data.drop(columns=["target_1"])
target = all_data["target_1"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.1, random_state=1)

# Train the model
model = XGBRegressor(objective="reg:squarederror", n_estimators=100)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Calculate MSE, MAPE, and R² for test set
mse = mean_squared_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print results
print(f"\nModel Performance on Test Set:")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Mean Absolute Percentage Error (MAPE): {mape * 100:.2f}%")
print(f"R-squared (R²): {r2:.4f}")

def predict_next_days(model, last_row, feature_columns, days=5):
    predictions = []
    last_row = last_row.copy()
    
    for _ in range(days):
        feature_data = pd.DataFrame([last_row[feature_columns]])

        # Predict the next price
        predicted_price = model.predict(feature_data)[0]
        predictions.append(predicted_price)

        # Update the last row with new predicted price
        last_row["Close"] = predicted_price

        # Recalculate EMA
        last_row["EMA_10"] = (last_row["EMA_10"] * 9 + predicted_price) / 10
        last_row["EMA_25"] = (last_row["EMA_25"] * 24 + predicted_price) / 25

        # Recalculate MACD components
        short_ema = (last_row["MACD_12_26_9"] * 11 + predicted_price) / 12
        long_ema = (last_row["MACD_12_26_9"] * 25 + predicted_price) / 26
        last_row["MACD_12_26_9"] = short_ema - long_ema
        last_row["MACDs_12_26_9"] = (last_row["MACDs_12_26_9"] * 8 + last_row["MACD_12_26_9"]) / 9
        last_row["MACDh_12_26_9"] = last_row["MACD_12_26_9"] - last_row["MACDs_12_26_9"]

        # Recalculate Bollinger Bands
        last_row["BBM_5"] = (last_row["BBM_5"] * 4 + predicted_price) / 5
        last_row["BBU_5"] = last_row["BBM_5"] + (last_row["BBB_5"] * last_row["BBM_5"])
        last_row["BBL_5"] = last_row["BBM_5"] - (last_row["BBB_5"] * last_row["BBM_5"])
        last_row["BBP_5"] = (predicted_price - last_row["BBL_5"]) / (last_row["BBU_5"] - last_row["BBL_5"])

    return predictions

feature_columns = X_train.columns
future_predictions = {}

for stock in unique_stocks:
    stock_encoded = encoder.transform([stock])[0]
    last_row = all_data[all_data["Stock"] == stock_encoded].iloc[-1]
    future_predictions[stock] = predict_next_days(model, last_row, feature_columns, days=5)

# Display predictions
for stock, predictions in future_predictions.items():
    print(f"\nPredicted prices for {stock}:")
    for i, price in enumerate(predictions, start=1):
        print(f"Day {i}: {price:.2f}")



[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********


Model Performance on Test Set:
Mean Squared Error (MSE): 70.4969
Mean Absolute Percentage Error (MAPE): 2.27%
R-squared (R²): 0.9978

Predicted prices for ISRG:
Day 1: 591.90
Day 2: 586.67
Day 3: 586.75
Day 4: 587.15
Day 5: 587.26

Predicted prices for IDXX:
Day 1: 452.91
Day 2: 455.90
Day 3: 459.24
Day 4: 459.47
Day 5: 457.15

Predicted prices for BSX:
Day 1: 104.65
Day 2: 101.56
Day 3: 102.70
Day 4: 102.40
Day 5: 103.00

Predicted prices for LLY:
Day 1: 873.65
Day 2: 912.32
Day 3: 924.49
Day 4: 924.49
Day 5: 914.11

Predicted prices for EW:
Day 1: 73.21
Day 2: 75.37
Day 3: 73.41
Day 4: 72.96
Day 5: 72.80

Predicted prices for ZTS:
Day 1: 160.52
Day 2: 162.11
Day 3: 161.98
Day 4: 162.06
Day 5: 161.85

Predicted prices for SYK:
Day 1: 382.77
Day 2: 389.18
Day 3: 391.49
Day 4: 400.24
Day 5: 400.60

Predicted prices for DHR:
Day 1: 210.27
Day 2: 215.46
Day 3: 219.34
Day 4: 224.88
Day 5: 226.64

Predicted prices for ABT:
Day 1: 134.51
Day 2: 132.59
Day 3: 130.14
Day 4: 128.02
Day 5: 127.

In [119]:
# Feature selection
features = merged_data.drop(columns=["target_1"])
target = merged_data["target_1"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.1, random_state=1)
model = XGBRegressor(objective="reg:squarederror", n_estimators=100)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Calculate MSE, MAPE, and R² for test set
mse = mean_squared_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print results
print(f"\nModel Performance on Test Set:")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Mean Absolute Percentage Error (MAPE): {mape * 100:.2f}%")
print(f"R-squared (R²): {r2:.4f}")

# Predict next 5 days
def predict_next_days(model, last_row, feature_columns, days=5):
    predictions = []
    last_row = last_row.copy()
    
    for _ in range(days):
        feature_data = pd.DataFrame([last_row[feature_columns]])

        # Predict the next price
        predicted_price = model.predict(feature_data)[0]
        predictions.append(predicted_price)

        # Update the last row with new predicted price
        last_row["Close"] = predicted_price

        # Recalculate EMA
        last_row["EMA_10"] = (last_row["EMA_10"] * 9 + predicted_price) / 10
        last_row["EMA_25"] = (last_row["EMA_25"] * 24 + predicted_price) / 25

        # Recalculate MACD components
        short_ema = (last_row["MACD_12_26_9"] * 11 + predicted_price) / 12
        long_ema = (last_row["MACD_12_26_9"] * 25 + predicted_price) / 26
        last_row["MACD_12_26_9"] = short_ema - long_ema
        last_row["MACDs_12_26_9"] = (last_row["MACDs_12_26_9"] * 8 + last_row["MACD_12_26_9"]) / 9
        last_row["MACDh_12_26_9"] = last_row["MACD_12_26_9"] - last_row["MACDs_12_26_9"]

        # Recalculate Bollinger Bands
        last_row["BBM_5"] = (last_row["BBM_5"] * 4 + predicted_price) / 5
        last_row["BBU_5"] = last_row["BBM_5"] + (last_row["BBB_5"] * last_row["BBM_5"])
        last_row["BBL_5"] = last_row["BBM_5"] - (last_row["BBB_5"] * last_row["BBM_5"])
        last_row["BBP_5"] = (predicted_price - last_row["BBL_5"]) / (last_row["BBU_5"] - last_row["BBL_5"])

    return predictions

feature_columns = X_train.columns
future_predictions = {}

for stock in unique_stocks:
    stock_encoded = encoder.transform([stock])[0]
    last_row = merged_data[merged_data["Stock"] == stock_encoded].iloc[-1]
    future_predictions[stock] = predict_next_days(model, last_row, feature_columns, days=5)

# Display predictions
for stock, predictions in future_predictions.items():
    print(f"\nPredicted prices for {stock}:")
    for i, price in enumerate(predictions, start=1):
        print(f"Day {i}: {price:.2f}")


Model Performance on Test Set:
Mean Squared Error (MSE): 86.1385
Mean Absolute Percentage Error (MAPE): 1.70%
R-squared (R²): 0.9978

Predicted prices for ISRG:
Day 1: 592.38
Day 2: 644.14
Day 3: 698.65
Day 4: 698.65
Day 5: 698.65

Predicted prices for IDXX:
Day 1: 452.80
Day 2: 453.95
Day 3: 453.81
Day 4: 453.99
Day 5: 453.75

Predicted prices for BSX:
Day 1: 104.50
Day 2: 104.42
Day 3: 102.58
Day 4: 102.68
Day 5: 102.68

Predicted prices for LLY:
Day 1: 873.74
Day 2: 871.50
Day 3: 871.44
Day 4: 871.68
Day 5: 871.04

Predicted prices for EW:
Day 1: 73.38
Day 2: 71.22
Day 3: 72.47
Day 4: 70.92
Day 5: 71.93

Predicted prices for ZTS:
Day 1: 160.00
Day 2: 162.37
Day 3: 161.29
Day 4: 160.70
Day 5: 159.31

Predicted prices for SYK:
Day 1: 382.69
Day 2: 384.95
Day 3: 388.11
Day 4: 390.29
Day 5: 391.19

Predicted prices for DHR:
Day 1: 208.76
Day 2: 211.43
Day 3: 210.47
Day 4: 211.22
Day 5: 211.54

Predicted prices for ABT:
Day 1: 134.59
Day 2: 136.48
Day 3: 136.72
Day 4: 136.43
Day 5: 136.