# Investment Optimized Portfolio Project

## Data Preparation for Evaluation

### Evaluation based on P/E, P/B, ROE, D/E, and EPS

In [None]:
import pandas as pd
import yfinance as yf

# Load the CSV file, skipping the first four rows
tickers_df = pd.read_csv('Yahoo_tickers_full.csv', skiprows=4)  # Skip the first four rows to start from row 5

# Extract stock symbols from the first column
symbols = tickers_df.iloc[:, 0].tolist()  # Assuming the first column contains the symbols

# Function to fetch stock data
def fetch_stock_data(stock_symbols):
    data = {}
    for symbol in stock_symbols:
        try:
            ticker = yf.Ticker(symbol)
            info = ticker.info
            data[symbol] = {
                'P/E Ratio': info.get('trailingPE', None),
                'P/B Ratio': info.get('priceToBook', None),
                'ROE': info.get('returnOnEquity', None),
                'D/E Ratio': info.get('debtToEquity', None),
                'EPS': info.get('trailingEps', None)
            }
        except Exception as e:
            print(f"Failed to fetch data for {symbol}: {str(e)}")
    return pd.DataFrame.from_dict(data, orient='index')

# Fetch and process stock data for a subset of symbols to ensure the process works correctly
subset_symbols = symbols[:500]  # Process a smaller subset to avoid overwhelming the API
stock_data = fetch_stock_data(subset_symbols)

  tickers_df = pd.read_csv('Yahoo_tickers_full.csv', skiprows=4)  # Skip the first four rows to start from row 5
ERROR:yfinance:404 Client Error: Not Found for url: https://query2.finance.yahoo.com/v10/finance/quoteSummary/ATVI?modules=financialData%2CquoteType%2CdefaultKeyStatistics%2CassetProfile%2CsummaryDetail&corsDomain=finance.yahoo.com&formatted=false&symbol=ATVI&crumb=4R8DxtQ0nwi
ERROR:yfinance:404 Client Error: Not Found for url: https://query2.finance.yahoo.com/v10/finance/quoteSummary/RAD?modules=financialData%2CquoteType%2CdefaultKeyStatistics%2CassetProfile%2CsummaryDetail&corsDomain=finance.yahoo.com&formatted=false&symbol=RAD&crumb=4R8DxtQ0nwi
ERROR:yfinance:404 Client Error: Not Found for url: https://query2.finance.yahoo.com/v10/finance/quoteSummary/ABC?modules=financialData%2CquoteType%2CdefaultKeyStatistics%2CassetProfile%2CsummaryDetail&corsDomain=finance.yahoo.com&formatted=false&symbol=ABC&crumb=4R8DxtQ0nwi
ERROR:yfinance:404 Client Error: Not Found for url: https

In [None]:
stock_data

Unnamed: 0,P/E Ratio,P/B Ratio,ROE,D/E Ratio,EPS
AAPL,29.531832,39.318794,1.47250,140.968,6.44
BAC,13.555172,1.166296,0.08724,,2.90
AMZN,51.901688,8.871657,0.20305,74.107,3.56
T,9.314516,1.188272,0.13035,126.105,1.86
GOOG,27.10107,7.481926,0.29764,9.690,6.53
...,...,...,...,...,...
AUPH,,2.284172,-0.16686,24.629,-0.43
ATV,,,,,
ATLC,5.870588,0.887143,0.19623,376.013,4.25
AST,,,,,


Notes: There are some errors in retrieving some stocks on YF, this is due to either a symbol mismatch or the stock not being listed on YF. To solve this issue, we would rather need a list of all listed stock symbols retrieved directly from YF or we must cross check their presence.

### Ranking the Stocks

In [None]:
def rank_stocks(df):
    # Convert columns to numeric, coercing errors and filling NaNs with a specified method
    df['P/E Ratio'] = pd.to_numeric(df['P/E Ratio'], errors='coerce')
    df['P/B Ratio'] = pd.to_numeric(df['P/B Ratio'], errors='coerce')
    df['ROE'] = pd.to_numeric(df['ROE'], errors='coerce')
    df['D/E Ratio'] = pd.to_numeric(df['D/E Ratio'], errors='coerce')
    df['EPS'] = pd.to_numeric(df['EPS'], errors='coerce')

    # Handle NaNs by ranking them at the bottom or using a fill value
    df.fillna({'P/E Ratio': df['P/E Ratio'].max() + 1,
               'P/B Ratio': df['P/B Ratio'].max() + 1,
               'ROE': df['ROE'].min() - 1,
               'D/E Ratio': df['D/E Ratio'].max() + 1,
               'EPS': df['EPS'].min() - 1}, inplace=True)

    # Calculate a composite score
    df['Score'] = (
        -df['P/E Ratio'].rank(pct=True) +  # Invert P/E ratio ranking
        df['P/B Ratio'].rank(pct=True) +
        df['ROE'].rank(pct=True) +
        -df['D/E Ratio'].rank(pct=True) +  # Invert D/E ratio ranking
        df['EPS'].rank(pct=True)
    )
    return df.sort_values(by='Score', ascending=False)

In [None]:
ranked_stocks = pd.DataFrame(rank_stocks(stock_data))

In [None]:
ranked_stocks

Unnamed: 0,P/E Ratio,P/B Ratio,ROE,D/E Ratio,EPS,Score
ACLS,15.035607,4.120834,0.31201,8.255,7.58,2.058
ACGL,7.891870,2.017473,0.29050,14.088,12.67,2.030
AMP,14.889419,8.937179,0.69356,80.877,29.39,2.022
AMAT,25.198824,9.768322,0.45193,30.562,8.50,1.980
ACN,27.406166,7.008486,0.27381,11.298,11.03,1.970
...,...,...,...,...,...,...
ANCX,inf,127.768720,-12.16070,5142.882,-204.57,-0.331
BRK.AX,inf,0.722222,0.21463,5142.882,-204.57,-0.391
SAVE,inf,0.426636,-0.39334,714.408,-4.45,-0.393
ANTH,inf,0.000005,-5.68847,5142.882,-0.01,-0.570


## Model for Prediction

### Random Forest

The top 20 ranked stocks are fetched from Yahoo Finance and data on Logged Return, 10 and 30 days Moving Averages, RSI, and MACD is extracted. Then the model is trained on a 60 days rolling window.

In [None]:
import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

# Assuming 'ranked_stocks' is already defined and the stock symbols are used as the index
if isinstance(ranked_stocks.index, pd.Index):
    top_20_symbols = ranked_stocks.index[:20].tolist()  # Extract the top 20 symbols
else:
    raise ValueError("The 'ranked_stocks' DataFrame does not have symbols as index or column.")

# Fetch data for all stocks
def fetch_data(symbols):
    stock_data = {}
    for symbol in symbols:
        df = yf.download(symbol, start='2019-01-01', end='2022-01-01')
        df['Log Return'] = np.log(df['Adj Close'] / df['Adj Close'].shift(1))
        df['MA_10'] = df['Adj Close'].rolling(window=10).mean()
        df['MA_30'] = df['Adj Close'].rolling(window=30).mean()
        df['RSI'] = compute_rsi(df['Adj Close'], 14)
        df['MACD'], df['MACD_Signal'] = compute_macd(df['Adj Close'])
        df.dropna(inplace=True)
        stock_data[symbol] = df
    return stock_data

def compute_rsi(data, window):
    diff = data.diff(1)
    up = diff.where(diff > 0, 0)
    down = -diff.where(diff < 0, 0)
    ema_up = up.ewm(com=window - 1, adjust=False).mean()
    ema_down = down.ewm(com=window - 1, adjust=False).mean()
    rs = ema_up / ema_down
    rsi = 100 - (100 / (1 + rs))
    return rsi

def compute_macd(data, fast_period=12, slow_period=26, signal_period=9):
    exp1 = data.ewm(span=fast_period, adjust=False).mean()
    exp2 = data.ewm(span=slow_period, adjust=False).mean()
    macd = exp1 - exp2
    signal = macd.ewm(span=signal_period, adjust=False).mean()
    return macd, signal

# Train and predict using Random Forest
def train_and_predict(stock_data):
    model_results = {}
    for symbol, df in stock_data.items():
        X = df[['MA_10', 'MA_30', 'RSI', 'MACD', 'MACD_Signal']]
        y = df['Log Return']
        model = RandomForestRegressor(n_estimators=100, random_state=42)
        predictions = []
        actuals = []

        for t in range(60, len(df)):  # Start from day 60 to have enough data
            X_train, y_train = X.iloc[:t], y.iloc[:t]
            X_test = X.iloc[t:t+1]  # Ensure X_test is a DataFrame with proper headers
            y_test = y.iloc[t]  # Correctly define y_test as the actual return for day t

            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)  # X_test now retains its DataFrame structure

            model.fit(X_train_scaled, y_train)
            pred = model.predict(X_test_scaled)[0]
            predictions.append(pred)
            actuals.append(y_test)  # Append the actual return to the list

        mse = mean_squared_error(actuals, predictions)
        direction_accuracy = np.mean(np.sign(actuals) == np.sign(predictions))
        model_results[symbol] = {'MSE': mse, 'Directional Accuracy': direction_accuracy}
    return model_results



# Main execution
data = fetch_data(top_20_symbols)
results = train_and_predict(data)

# Display results
for symbol, result in results.items():
    print(f"{symbol}: MSE = {result['MSE']}, Directional Accuracy = {result['Directional Accuracy']}")


[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%*******

ACLS: MSE = 0.001420724893260663, Directional Accuracy = 0.6422155688622755
ACGL: MSE = 0.0005398955996170286, Directional Accuracy = 0.6661676646706587
AMP: MSE = 0.0008451903989077414, Directional Accuracy = 0.6332335329341318
AMAT: MSE = 0.0008847260405953365, Directional Accuracy = 0.6541916167664671
ACN: MSE = 0.00033938086198847855, Directional Accuracy = 0.6482035928143712
GOOGL: MSE = 0.00036372973486459363, Directional Accuracy = 0.6541916167664671
ALKS: MSE = 0.0006664913253973519, Directional Accuracy = 0.655688622754491
GOOG: MSE = 0.00036379623350204127, Directional Accuracy = 0.6332335329341318
LULU: MSE = 0.0005984280893512031, Directional Accuracy = 0.6541916167664671
ASML: MSE = 0.0006088141305769712, Directional Accuracy = 0.6437125748502994
APA: MSE = 0.0036087173570817587, Directional Accuracy = 0.6511976047904192
AOS: MSE = 0.00034603327543799016, Directional Accuracy = 0.6452095808383234
RS: MSE = 0.00045391143949959285, Directional Accuracy = 0.6541916167664671
O

### Enhanced Random Forest w/ Hyperparameter tuning (w/ GridSearch) and more Features

In [None]:
import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

# Assuming 'ranked_stocks' is already defined and the stock symbols are used as the index
if isinstance(ranked_stocks.index, pd.Index):
    top_20_symbols = ranked_stocks.index[:20].tolist()  # Extract the top 20 symbols
else:
    raise ValueError("The 'ranked_stocks' DataFrame does not have symbols as index or column.")

def fetch_data(symbols):
    stock_data = {}
    for symbol in symbols:
        df = yf.download(symbol, start='2019-01-01', end='2022-01-01')
        df['Log Return'] = np.log(df['Adj Close'] / df['Adj Close'].shift(1))
        df['MA_10'] = df['Adj Close'].rolling(window=10).mean()
        df['MA_30'] = df['Adj Close'].rolling(window=30).mean()
        df['RSI'] = compute_rsi(df['Adj Close'], 14)
        df['MACD'], df['MACD_Signal'] = compute_macd(df['Adj Close'])
        df['Upper_BB'], df['Lower_BB'] = compute_bollinger_bands(df['Adj Close'])
        df['ATR'] = compute_atr(df)
        df.dropna(inplace=True)
        stock_data[symbol] = df
    return stock_data

def compute_bollinger_bands(data, window=20, num_std=2):
    """Calculate Bollinger Bands."""
    rolling_mean = data.rolling(window=window).mean()
    rolling_std = data.rolling(window=window).std()
    upper_band = rolling_mean + (rolling_std * num_std)
    lower_band = rolling_mean - (rolling_std * num_std)
    return upper_band, lower_band

def compute_atr(data, window=14):
    """Calculate Average True Range (ATR)."""
    high_low = data['High'] - data['Low']
    high_close = np.abs(data['High'] - data['Close'].shift())
    low_close = np.abs(data['Low'] - data['Close'].shift())
    ranges = pd.concat([high_low, high_close, low_close], axis=1)
    true_range = np.max(ranges, axis=1)
    atr = true_range.rolling(window=window).mean()
    return atr

def compute_rsi(data, window):
    diff = data.diff(1)
    up = diff.where(diff > 0, 0)
    down = -diff.where(diff < 0, 0)
    ema_up = up.ewm(com=window - 1, adjust=False).mean()
    ema_down = down.ewm(com=window - 1, adjust=False).mean()
    rs = ema_up / ema_down
    rsi = 100 - (100 / (1 + rs))
    return rsi

def compute_macd(data, fast_period=12, slow_period=26, signal_period=9):
    exp1 = data.ewm(span=fast_period, adjust=False).mean()
    exp2 = data.ewm(span=slow_period, adjust=False).mean()
    macd = exp1 - exp2
    signal = macd.ewm(span=signal_period, adjust=False).mean()
    return macd, signal

# Train and predict using Random Forest

def train_and_predict(stock_data):
    model_results = {}
    for symbol, df in stock_data.items():
        X = df[['MA_10', 'MA_30', 'RSI', 'MACD', 'MACD_Signal', 'Upper_BB', 'Lower_BB', 'ATR']]
        y = df['Log Return']

        # Setup parameter grid for Random Forest
        param_grid = {
            'n_estimators': [50, 100, 200],
            'max_depth': [10, 20, None]
        }
        model = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=3)
        predictions = []
        actuals = []

        # Rolling window cross-validation
        for t in range(60, len(df)):  # Start from day 60 to have enough data
            X_train, y_train = X.iloc[:t], y.iloc[:t]
            X_test = X.iloc[t:t+1]
            y_test = y.iloc[t]

            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)

            model.fit(X_train_scaled, y_train)
            pred = model.predict(X_test_scaled)[0]
            predictions.append(pred)
            actuals.append(y_test)

        mse = mean_squared_error(actuals, predictions)
        direction_accuracy = np.mean(np.sign(actuals) == np.sign(predictions))
        model_results[symbol] = {
            'MSE': mse,
            'Directional Accuracy': direction_accuracy,
            'Best Parameters': model.best_params_
        }
    return model_results




# Main execution
data = fetch_data(top_20_symbols)
results = train_and_predict(data)

# Display results
for symbol, result in results.items():
    print(f"{symbol}: MSE = {result['MSE']}, Directional Accuracy = {result['Directional Accuracy']}, Best Params = {result['Best Parameters']}")

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%*******

KeyboardInterrupt: 

### Random Forest with Computation Reduction

In [None]:
import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

# Assuming 'ranked_stocks' is already defined and the stock symbols are used as the index
top_20_symbols = ranked_stocks.index[:20].tolist() if isinstance(ranked_stocks.index, pd.Index) else None
if top_20_symbols is None:
    raise ValueError("The 'ranked_stocks' DataFrame does not have symbols as index or column.")

def fetch_data(symbols):
    stock_data = {}
    for symbol in symbols:
        df = yf.download(symbol, start='2019-01-01', end='2022-01-01')
        df['Log Return'] = np.log(df['Adj Close'] / df['Adj Close'].shift(1))
        df['MA_10'] = df['Adj Close'].rolling(window=10).mean()
        df['MA_30'] = df['Adj Close'].rolling(window=30).mean()
        df['RSI'] = compute_rsi(df['Adj Close'], 14)
        df['MACD'], df['MACD_Signal'] = compute_macd(df['Adj Close'])
        df['Upper_BB'], df['Lower_BB'] = compute_bollinger_bands(df['Adj Close'])
        df['ATR'] = compute_atr(df)
        df.dropna(inplace=True)
        stock_data[symbol] = df
    return stock_data

def compute_bollinger_bands(data, window=20, num_std=2):
    """Calculate Bollinger Bands."""
    rolling_mean = data.rolling(window=window).mean()
    rolling_std = data.rolling(window=window).std()
    upper_band = rolling_mean + (rolling_std * num_std)
    lower_band = rolling_mean - (rolling_std * num_std)
    return upper_band, lower_band

def compute_atr(data, window=14):
    """Calculate Average True Range (ATR)."""
    high_low = data['High'] - data['Low']
    high_close = np.abs(data['High'] - data['Close'].shift())
    low_close = np.abs(data['Low'] - data['Close'].shift())
    ranges = pd.concat([high_low, high_close, low_close], axis=1)
    true_range = np.max(ranges, axis=1)
    atr = true_range.rolling(window=window).mean()
    return atr

def compute_rsi(data, window):
    diff = data.diff(1)
    up = diff.where(diff > 0, 0)
    down = -diff.where(diff < 0, 0)
    ema_up = up.ewm(com=window - 1, adjust=False).mean()
    ema_down = down.ewm(com=window - 1, adjust=False).mean()
    rs = ema_up / ema_down
    rsi = 100 - (100 / (1 + rs))
    return rsi

def compute_macd(data, fast_period=12, slow_period=26, signal_period=9):
    exp1 = data.ewm(span=fast_period, adjust=False).mean()
    exp2 = data.ewm(span=slow_period, adjust=False).mean()
    macd = exp1 - exp2
    signal = macd.ewm(span=signal_period, adjust=False).mean()
    return macd, signal

def train_and_predict(stock_data, retrain_frequency=10):
    model_results = {}
    for symbol, df in stock_data.items():
        X = df[['MA_10', 'MA_30', 'RSI', 'MACD', 'MACD_Signal', 'Upper_BB', 'Lower_BB', 'ATR']]
        y = df['Log Return']

        param_grid = {
            'n_estimators': [50, 100],
            'max_depth': [10, 20]
        }
        model = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=3, n_jobs=-1)
        predictions = []
        actuals = []

        for t in range(60, len(df), retrain_frequency):  # Retrain every 'retrain_frequency' days
            X_train, y_train = X.iloc[:t], y.iloc[:t]
            X_test = X.iloc[t:t+retrain_frequency]
            y_test = y.iloc[t:t+retrain_frequency]

            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)

            model.fit(X_train_scaled, y_train)
            preds = model.predict(X_test_scaled)
            predictions.extend(preds)
            actuals.extend(y_test)

        mse = mean_squared_error(actuals, predictions)
        direction_accuracy = np.mean(np.sign(actuals) == np.sign(predictions))
        model_results[symbol] = {'MSE': mse, 'Directional Accuracy': direction_accuracy, 'Best Parameters': model.best_params_}
    return model_results

data = fetch_data(top_20_symbols)
results = train_and_predict(data)

for symbol, result in results.items():
    print(f"{symbol}: MSE = {result['MSE']}, Directional Accuracy = {result['Directional Accuracy']}, Best Params = {result['Best Parameters']}")


[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%*******

ACLS: MSE = 0.001437561478133166, Directional Accuracy = 0.6377245508982036, Best Params = {'max_depth': 10, 'n_estimators': 100}
ACGL: MSE = 0.0005877557314821336, Directional Accuracy = 0.6631736526946108, Best Params = {'max_depth': 10, 'n_estimators': 50}
AMP: MSE = 0.0007757387659181219, Directional Accuracy = 0.6511976047904192, Best Params = {'max_depth': 10, 'n_estimators': 50}
AMAT: MSE = 0.0009224784220016895, Directional Accuracy = 0.6347305389221557, Best Params = {'max_depth': 10, 'n_estimators': 50}
ACN: MSE = 0.000321648254478857, Directional Accuracy = 0.657185628742515, Best Params = {'max_depth': 10, 'n_estimators': 100}
LULU: MSE = 0.0005637827122813877, Directional Accuracy = 0.6526946107784432, Best Params = {'max_depth': 10, 'n_estimators': 100}
GOOGL: MSE = 0.00035534995627881674, Directional Accuracy = 0.6347305389221557, Best Params = {'max_depth': 10, 'n_estimators': 50}
GOOG: MSE = 0.0003485346768457437, Directional Accuracy = 0.6287425149700598, Best Params 

### Random Forest with Macroeconomic Data

In [None]:
import pandas as pd

# Load the CSV file, specifying the date column and format
df = pd.read_csv('long_term_rates_2000_2023 (2).csv', index_col='Date', parse_dates=['Date'], date_format='%m/%d/%y')

# Sort the DataFrame to ensure the dates are in the correct order
df.sort_index(inplace=True)

# Attempt to filter the DataFrame for the specified range and columns
try:
    filtered_df = df.loc['2019-01-01':'2022-01-01', ['LT COMPOSITE (>10 Yrs)', 'TREASURY 20-Yr CMT']]
    print("\nFiltered DataFrame:")
    print(filtered_df)
except KeyError as e:
    print(f"\nKey error: {e}. Check if the column names are correct and present in the DataFrame.")
    print("Available columns in the DataFrame:", df.columns)
except Exception as e:
    print(f"\nAn error occurred: {e}")


filtered_df


Filtered DataFrame:
            LT COMPOSITE (>10 Yrs)  TREASURY 20-Yr CMT
Date                                                  
2019-01-02                    2.90                2.83
2019-01-03                    2.84                2.75
2019-01-04                    2.91                2.83
2019-01-07                    2.93                2.86
2019-01-08                    2.94                2.88
...                            ...                 ...
2021-12-27                    1.87                1.92
2021-12-28                    1.88                1.94
2021-12-29                    1.94                2.00
2021-12-30                    1.91                1.97
2021-12-31                    1.89                1.94

[752 rows x 2 columns]


Unnamed: 0_level_0,LT COMPOSITE (>10 Yrs),TREASURY 20-Yr CMT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-01-02,2.90,2.83
2019-01-03,2.84,2.75
2019-01-04,2.91,2.83
2019-01-07,2.93,2.86
2019-01-08,2.94,2.88
...,...,...
2021-12-27,1.87,1.92
2021-12-28,1.88,1.94
2021-12-29,1.94,2.00
2021-12-30,1.91,1.97


In [None]:
import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

# Assuming 'ranked_stocks' is already defined and the stock symbols are used as the index
if isinstance(ranked_stocks.index, pd.Index):
    top_20_symbols = ranked_stocks.index[:20].tolist()  # Extract the top 20 symbols
else:
    raise ValueError("The 'ranked_stocks' DataFrame does not have symbols as index or column.")

def fetch_data(symbols):
    stock_data = {}
    for symbol in symbols:
        df = yf.download(symbol, start='2019-01-01', end='2022-01-01')
        df['Log Return'] = np.log(df['Adj Close'] / df['Adj Close'].shift(1))
        df['MA_10'] = df['Adj Close'].rolling(window=10).mean()
        df['MA_30'] = df['Adj Close'].rolling(window=30).mean()
        df['RSI'] = compute_rsi(df['Adj Close'], 14)
        df['MACD'], df['MACD_Signal'] = compute_macd(df['Adj Close'])
        df['Upper_BB'], df['Lower_BB'] = compute_bollinger_bands(df['Adj Close'])
        df['ATR'] = compute_atr(df)
        df = df.join(filtered_df, how='left')
        df.dropna(inplace=True)
        stock_data[symbol] = df
    return stock_data

def compute_bollinger_bands(data, window=20, num_std=2):
    """Calculate Bollinger Bands."""
    rolling_mean = data.rolling(window=window).mean()
    rolling_std = data.rolling(window=window).std()
    upper_band = rolling_mean + (rolling_std * num_std)
    lower_band = rolling_mean - (rolling_std * num_std)
    return upper_band, lower_band

def compute_atr(data, window=14):
    """Calculate Average True Range (ATR)."""
    high_low = data['High'] - data['Low']
    high_close = np.abs(data['High'] - data['Close'].shift())
    low_close = np.abs(data['Low'] - data['Close'].shift())
    ranges = pd.concat([high_low, high_close, low_close], axis=1)
    true_range = np.max(ranges, axis=1)
    atr = true_range.rolling(window=window).mean()
    return atr

def compute_rsi(data, window):
    diff = data.diff(1)
    up = diff.where(diff > 0, 0)
    down = -diff.where(diff < 0, 0)
    ema_up = up.ewm(com=window - 1, adjust=False).mean()
    ema_down = down.ewm(com=window - 1, adjust=False).mean()
    rs = ema_up / ema_down
    rsi = 100 - (100 / (1 + rs))
    return rsi

def compute_macd(data, fast_period=12, slow_period=26, signal_period=9):
    exp1 = data.ewm(span=fast_period, adjust=False).mean()
    exp2 = data.ewm(span=slow_period, adjust=False).mean()
    macd = exp1 - exp2
    signal = macd.ewm(span=signal_period, adjust=False).mean()
    return macd, signal

# Train and predict using Random Forest
def train_and_predict(stock_data):
    model_results = {}
    for symbol, df in stock_data.items():
        X = df[['MA_10', 'MA_30', 'RSI', 'MACD', 'MACD_Signal', 'LT COMPOSITE (>10 Yrs)', 'TREASURY 20-Yr CMT']]
        y = df['Log Return']
        model = RandomForestRegressor(n_estimators=100, random_state=42)
        predictions = []
        actuals = []

        for t in range(60, len(df)):  # Start from day 60 to have enough data
            X_train, y_train = X.iloc[:t], y.iloc[:t]
            X_test = X.iloc[t:t+1]  # Ensure X_test is a DataFrame with proper headers
            y_test = y.iloc[t]  # Correctly define y_test as the actual return for day t

            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)  # X_test now retains its DataFrame structure

            model.fit(X_train_scaled, y_train)
            pred = model.predict(X_test_scaled)[0]
            predictions.append(pred)
            actuals.append(y_test)  # Append the actual return to the list

        mse = mean_squared_error(actuals, predictions)
        direction_accuracy = np.mean(np.sign(actuals) == np.sign(predictions))
        model_results[symbol] = {'MSE': mse, 'Directional Accuracy': direction_accuracy}
    return model_results



# Main execution
data = fetch_data(top_20_symbols)
results = train_and_predict(data)

# Display results
for symbol, result in results.items():
    print(f"{symbol}: MSE = {result['MSE']}, Directional Accuracy = {result['Directional Accuracy']}")



[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%*******

ACLS: MSE = 0.0014158340137435364, Directional Accuracy = 0.6268882175226587
ACGL: MSE = 0.0005543470714607405, Directional Accuracy = 0.6676737160120846
AMP: MSE = 0.0008526875894099036, Directional Accuracy = 0.6510574018126888
AMAT: MSE = 0.0008463313828946902, Directional Accuracy = 0.6450151057401813
ACN: MSE = 0.00035408531399140847, Directional Accuracy = 0.6419939577039275
LULU: MSE = 0.0006026484784817245, Directional Accuracy = 0.6510574018126888
GOOGL: MSE = 0.00036925366641242676, Directional Accuracy = 0.6404833836858006
GOOG: MSE = 0.00036604032526418184, Directional Accuracy = 0.6374622356495468
ALKS: MSE = 0.0006818062742527973, Directional Accuracy = 0.649546827794562
ASML: MSE = 0.0006246931523275714, Directional Accuracy = 0.6540785498489426
AOS: MSE = 0.000348191041720763, Directional Accuracy = 0.6450151057401813
APA: MSE = 0.0032215665085776344, Directional Accuracy = 0.6419939577039275
AIT: MSE = 0.0008442760129790056, Directional Accuracy = 0.6435045317220544
OD

### XGBoost

In [None]:
import xgboost as xgb

import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

# Assuming 'ranked_stocks' is already defined and the stock symbols are used as the index
top_20_symbols = ranked_stocks.index[:20].tolist() if isinstance(ranked_stocks.index, pd.Index) else None
if top_20_symbols is None:
    raise ValueError("The 'ranked_stocks' DataFrame does not have symbols as index or column.")

def fetch_data(symbols):
    stock_data = {}
    for symbol in symbols:
        df = yf.download(symbol, start='2019-01-01', end='2022-01-01')
        df['Log Return'] = np.log(df['Adj Close'] / df['Adj Close'].shift(1))
        df['MA_10'] = df['Adj Close'].rolling(window=10).mean()
        df['MA_30'] = df['Adj Close'].rolling(window=30).mean()
        df['RSI'] = compute_rsi(df['Adj Close'], 14)
        df['MACD'], df['MACD_Signal'] = compute_macd(df['Adj Close'])
        df['Upper_BB'], df['Lower_BB'] = compute_bollinger_bands(df['Adj Close'])
        df['ATR'] = compute_atr(df)
        df.dropna(inplace=True)
        stock_data[symbol] = df
    return stock_data

def compute_bollinger_bands(data, window=20, num_std=2):
    """Calculate Bollinger Bands."""
    rolling_mean = data.rolling(window=window).mean()
    rolling_std = data.rolling(window=window).std()
    upper_band = rolling_mean + (rolling_std * num_std)
    lower_band = rolling_mean - (rolling_std * num_std)
    return upper_band, lower_band

def compute_atr(data, window=14):
    """Calculate Average True Range (ATR)."""
    high_low = data['High'] - data['Low']
    high_close = np.abs(data['High'] - data['Close'].shift())
    low_close = np.abs(data['Low'] - data['Close'].shift())
    ranges = pd.concat([high_low, high_close, low_close], axis=1)
    true_range = np.max(ranges, axis=1)
    atr = true_range.rolling(window=window).mean()
    return atr

def compute_rsi(data, window):
    diff = data.diff(1)
    up = diff.where(diff > 0, 0)
    down = -diff.where(diff < 0, 0)
    ema_up = up.ewm(com=window - 1, adjust=False).mean()
    ema_down = down.ewm(com=window - 1, adjust=False).mean()
    rs = ema_up / ema_down
    rsi = 100 - (100 / (1 + rs))
    return rsi

def compute_macd(data, fast_period=12, slow_period=26, signal_period=9):
    exp1 = data.ewm(span=fast_period, adjust=False).mean()
    exp2 = data.ewm(span=slow_period, adjust=False).mean()
    macd = exp1 - exp2
    signal = macd.ewm(span=signal_period, adjust=False).mean()
    return macd, signal

def train_and_predict(stock_data, retrain_frequency=10):
    model_results = {}
    for symbol, df in stock_data.items():
        X = df[['MA_10', 'MA_30', 'RSI', 'MACD', 'MACD_Signal', 'Upper_BB', 'Lower_BB', 'ATR']]
        y = df['Log Return']

        param_grid = {
            'n_estimators': [100, 200],
            'max_depth': [3, 5, 7],
            'learning_rate': [0.01, 0.1]
        }
        model = GridSearchCV(xgb.XGBRegressor(objective='reg:squarederror', random_state=42), param_grid, cv=3, n_jobs=-1)
        predictions = []
        actuals = []

        for t in range(60, len(df), retrain_frequency):
            X_train, y_train = X.iloc[:t], y.iloc[:t]
            X_test = X.iloc[t:t+retrain_frequency]
            y_test = y.iloc[t:t+retrain_frequency]

            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)

            model.fit(X_train_scaled, y_train)
            preds = model.predict(X_test_scaled)
            predictions.extend(preds)
            actuals.extend(y_test)

        mse = mean_squared_error(actuals, predictions)
        direction_accuracy = np.mean(np.sign(actuals) == np.sign(predictions))
        model_results[symbol] = {
            'MSE': mse,
            'Directional Accuracy': direction_accuracy,
            'Best Parameters': model.best_params_
        }
    return model_results

# Fetch and train
data = fetch_data(top_20_symbols)
results = train_and_predict(data, retrain_frequency=5)  # Adjusted retrain frequency for demonstration

# Display results
for symbol, result in results.items():
    print(f"{symbol}: MSE = {result['MSE']}, Directional Accuracy = {result['Directional Accuracy']}, Best Params = {result['Best Parameters']}")


[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%*******

KeyboardInterrupt: 

### XGBoost with Computation Reduction

In [None]:
import yfinance as yf
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

top_20_symbols = ranked_stocks.index[:20].tolist() if isinstance(ranked_stocks.index, pd.Index) else None
if top_20_symbols is None:
    raise ValueError("The 'ranked_stocks' DataFrame does not have symbols as index or column.")

def fetch_data(symbols):
    stock_data = {}
    for symbol in symbols:
        df = yf.download(symbol, start='2019-01-01', end='2022-01-01')
        df['Log Return'] = np.log(df['Adj Close'] / df['Adj Close'].shift(1))
        df['MA_10'] = df['Adj Close'].rolling(window=10).mean()
        df['MA_30'] = df['Adj Close'].rolling(window=30).mean()
        df['RSI'] = compute_rsi(df['Adj Close'], 14)
        df['MACD'], df['MACD_Signal'] = compute_macd(df['Adj Close'])
        df['Upper_BB'], df['Lower_BB'] = compute_bollinger_bands(df['Adj Close'])
        df['ATR'] = compute_atr(df)
        df.dropna(inplace=True)
        stock_data[symbol] = df
    return stock_data

def compute_bollinger_bands(data, window=20, num_std=2):
    """Calculate Bollinger Bands."""
    rolling_mean = data.rolling(window=window).mean()
    rolling_std = data.rolling(window=window).std()
    upper_band = rolling_mean + (rolling_std * num_std)
    lower_band = rolling_mean - (rolling_std * num_std)
    return upper_band, lower_band

def compute_atr(data, window=14):
    """Calculate Average True Range (ATR)."""
    high_low = data['High'] - data['Low']
    high_close = np.abs(data['High'] - data['Close'].shift())
    low_close = np.abs(data['Low'] - data['Close'].shift())
    ranges = pd.concat([high_low, high_close, low_close], axis=1)
    true_range = np.max(ranges, axis=1)
    atr = true_range.rolling(window=window).mean()
    return atr

def compute_rsi(data, window):
    diff = data.diff(1)
    up = diff.where(diff > 0, 0)
    down = -diff.where(diff < 0, 0)
    ema_up = up.ewm(com=window - 1, adjust=False).mean()
    ema_down = down.ewm(com=window - 1, adjust=False).mean()
    rs = ema_up / ema_down
    rsi = 100 - (100 / (1 + rs))
    return rsi

def compute_macd(data, fast_period=12, slow_period=26, signal_period=9):
    exp1 = data.ewm(span=fast_period, adjust=False).mean()
    exp2 = data.ewm(span=slow_period, adjust=False).mean()
    macd = exp1 - exp2
    signal = macd.ewm(span=signal_period, adjust=False).mean()
    return macd, signal

def train_and_predict(stock_data, retrain_frequency=20):  # Increased retrain frequency
    model_results = {}
    for symbol, df in stock_data.items():
        X = df[['MA_10', 'MA_30', 'RSI', 'MACD', 'MACD_Signal', 'Upper_BB', 'Lower_BB', 'ATR']]
        y = df['Log Return']

        param_grid = {
            'n_estimators': [50],  # Reduced complexity
            'max_depth': [3],  # Reduced complexity
            'learning_rate': [0.1]  # Increased learning rate
        }
        model = GridSearchCV(xgb.XGBRegressor(objective='reg:squarederror', random_state=42, tree_method='hist'),
                             param_grid, cv=2, n_jobs=-1)  # Reduced CV folds
        predictions = []
        actuals = []

        for t in range(60, len(df), retrain_frequency):
            X_train, y_train = X.iloc[:t], y.iloc[:t]
            X_test = X.iloc[t:t+retrain_frequency]
            y_test = y.iloc[t:t+retrain_frequency]

            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)

            model.fit(X_train_scaled, y_train)
            preds = model.predict(X_test_scaled)
            predictions.extend(preds)
            actuals.extend(y_test)

        mse = mean_squared_error(actuals, predictions)
        direction_accuracy = np.mean(np.sign(actuals) == np.sign(predictions))
        model_results[symbol] = {
            'MSE': mse,
            'Directional Accuracy': direction_accuracy,
            'Best Parameters': model.best_params_
        }
    return model_results

data = fetch_data(top_20_symbols)
results = train_and_predict(data)

for symbol, result in results.items():
    print(f"{symbol}: MSE = {result['MSE']}, Directional Accuracy = {result['Directional Accuracy']}, Best Params = {result['Best Parameters']}")


[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%*******

ACLS: MSE = 0.0014780294724324756, Directional Accuracy = 0.6392215568862275, Best Params = {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}
ACGL: MSE = 0.0007942098670351811, Directional Accuracy = 0.6497005988023952, Best Params = {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}
AMP: MSE = 0.0009186034440342555, Directional Accuracy = 0.6437125748502994, Best Params = {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}
ALKS: MSE = 0.0006838962861329693, Directional Accuracy = 0.6437125748502994, Best Params = {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}
LULU: MSE = 0.0005867351919391669, Directional Accuracy = 0.6317365269461078, Best Params = {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}
AOS: MSE = 0.00032980003861956, Directional Accuracy = 0.6422155688622755, Best Params = {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}
ASML: MSE = 0.0006972201974289244, Directional Accuracy = 0.6407185628742516, Best Params = {'lear

### Neural Network (NN)

In [None]:
import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Assuming 'ranked_stocks' is already defined and the stock symbols are used as the index
top_20_symbols = ranked_stocks.index[:20].tolist() if isinstance(ranked_stocks.index, pd.Index) else None
if top_20_symbols is None:
    raise ValueError("The 'ranked_stocks' DataFrame does not have symbols as index or column.")

def fetch_data(symbols):
    stock_data = {}
    for symbol in symbols:
        df = yf.download(symbol, start='2019-01-01', end='2022-01-01')
        df['Log Return'] = np.log(df['Adj Close'] / df['Adj Close'].shift(1))
        df['MA_10'] = df['Adj Close'].rolling(window=10).mean()
        df['MA_30'] = df['Adj Close'].rolling(window=30).mean()
        df['RSI'] = compute_rsi(df['Adj Close'], 14)
        df['MACD'], df['MACD_Signal'] = compute_macd(df['Adj Close'])
        df['Upper_BB'], df['Lower_BB'] = compute_bollinger_bands(df['Adj Close'])
        df['ATR'] = compute_atr(df)
        df = df.join(filtered_df, how='left')
        df.dropna(inplace=True)
        stock_data[symbol] = df
    return stock_data

def compute_bollinger_bands(data, window=20, num_std=2):
    """Calculate Bollinger Bands."""
    rolling_mean = data.rolling(window=window).mean()
    rolling_std = data.rolling(window=window).std()
    upper_band = rolling_mean + (rolling_std * num_std)
    lower_band = rolling_mean - (rolling_std * num_std)
    return upper_band, lower_band

def compute_atr(data, window=14):
    """Calculate Average True Range (ATR)."""
    high_low = data['High'] - data['Low']
    high_close = np.abs(data['High'] - data['Close'].shift())
    low_close = np.abs(data['Low'] - data['Close'].shift())
    ranges = pd.concat([high_low, high_close, low_close], axis=1)
    true_range = np.max(ranges, axis=1)
    atr = true_range.rolling(window=window).mean()
    return atr

def compute_rsi(data, window):
    diff = data.diff(1)
    up = diff.where(diff > 0, 0)
    down = -diff.where(diff < 0, 0)
    ema_up = up.ewm(com=window - 1, adjust=False).mean()
    ema_down = down.ewm(com=window - 1, adjust=False).mean()
    rs = ema_up / ema_down
    rsi = 100 - (100 / (1 + rs))
    return rsi

def compute_macd(data, fast_period=12, slow_period=26, signal_period=9):
    exp1 = data.ewm(span=fast_period, adjust=False).mean()
    exp2 = data.ewm(span=slow_period, adjust=False).mean()
    macd = exp1 - exp2
    signal = macd.ewm(span=signal_period, adjust=False).mean()
    return macd, signal

def train_and_predict_nn(stock_data, retrain_frequency=10):
    model_results = {}
    for symbol, df in stock_data.items():
        X = df[['MA_10', 'MA_30', 'RSI', 'MACD', 'MACD_Signal', 'Upper_BB', 'Lower_BB', 'ATR', 'LT COMPOSITE (>10 Yrs)', 'TREASURY 20-Yr CMT']]
        y = df['Log Return']

        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)

        # Neural Network Model
        model = Sequential([
            Dense(64, input_dim=X.shape[1], activation='relu'),
            Dropout(0.2),
            Dense(32, activation='relu'),
            Dropout(0.2),
            Dense(1, activation='linear')
        ])
        model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

        predictions = []
        actuals = []

        for t in range(60, len(df), retrain_frequency):
            X_train, y_train = X_scaled[:t], y.iloc[:t]
            X_test = X_scaled[t:t+retrain_frequency]
            y_test = y.iloc[t:t+retrain_frequency]

            model.fit(X_train, y_train, epochs=10, verbose=0, batch_size=32)

            preds = model.predict(X_test).flatten()
            predictions.extend(preds)
            actuals.extend(y_test)

        mse = mean_squared_error(actuals, predictions)
        direction_accuracy = np.mean(np.sign(actuals) == np.sign(predictions))
        model_results[symbol] = {'MSE': mse, 'Directional Accuracy': direction_accuracy}
    return model_results

data = fetch_data(top_20_symbols)
results = train_and_predict_nn(data, retrain_frequency=5)  # Adjusted retrain frequency for demonstration

# Display results
for symbol, result in results.items():
    print(f"{symbol}: MSE = {result['MSE']}, Directional Accuracy = {result['Directional Accuracy']}")


[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%*******

ACLS: MSE = 0.0013688209963380199, Directional Accuracy = 0.6601208459214502
ACGL: MSE = 0.0006956657226052754, Directional Accuracy = 0.6510574018126888
AMP: MSE = 0.0008915837631617808, Directional Accuracy = 0.6419939577039275
AMAT: MSE = 0.0009288158291357263, Directional Accuracy = 0.6782477341389728
ACN: MSE = 0.0004187405895035336, Directional Accuracy = 0.6329305135951662
LULU: MSE = 0.0007594996774502444, Directional Accuracy = 0.6676737160120846
ALKS: MSE = 0.0008049781346626221, Directional Accuracy = 0.6827794561933535
GOOG: MSE = 0.0007032897765297655, Directional Accuracy = 0.6344410876132931
GOOGL: MSE = 0.00034437803306121083, Directional Accuracy = 0.6525679758308157
ASML: MSE = 0.0007001947993921355, Directional Accuracy = 0.6419939577039275
AOS: MSE = 0.00034858409260837234, Directional Accuracy = 0.6344410876132931
APA: MSE = 0.0033574215995302342, Directional Accuracy = 0.6525679758308157
RS: MSE = 0.0015340990917012745, Directional Accuracy = 0.6510574018126888
AI

### LSTM

In [None]:
import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

def fetch_data(symbols):
    stock_data = {}
    for symbol in symbols:
        df = yf.download(symbol, start='2019-01-01', end='2022-01-01')
        df['Log Return'] = np.log(df['Adj Close'] / df['Adj Close'].shift(1))
        df['MA_10'] = df['Adj Close'].rolling(window=10).mean()
        df['MA_30'] = df['Adj Close'].rolling(window=30).mean()
        df['RSI'] = compute_rsi(df['Adj Close'], 14)
        df['MACD'], df['MACD_Signal'] = compute_macd(df['Adj Close'])
        df['Upper_BB'], df['Lower_BB'] = compute_bollinger_bands(df['Adj Close'])
        df['ATR'] = compute_atr(df)
        df.dropna(inplace=True)
        stock_data[symbol] = df
    return stock_data

def compute_bollinger_bands(data, window=20, num_std=2):
    """Calculate Bollinger Bands."""
    rolling_mean = data.rolling(window=window).mean()
    rolling_std = data.rolling(window=window).std()
    upper_band = rolling_mean + (rolling_std * num_std)
    lower_band = rolling_mean - (rolling_std * num_std)
    return upper_band, lower_band

def compute_atr(data, window=14):
    """Calculate Average True Range (ATR)."""
    high_low = data['High'] - data['Low']
    high_close = np.abs(data['High'] - data['Close'].shift())
    low_close = np.abs(data['Low'] - data['Close'].shift())
    ranges = pd.concat([high_low, high_close, low_close], axis=1)
    true_range = np.max(ranges, axis=1)
    atr = true_range.rolling(window=window).mean()
    return atr

def compute_rsi(data, window):
    diff = data.diff(1)
    up = diff.where(diff > 0, 0)
    down = -diff.where(diff < 0, 0)
    ema_up = up.ewm(com=window - 1, adjust=False).mean()
    ema_down = down.ewm(com=window - 1, adjust=False).mean()
    rs = ema_up / ema_down
    rsi = 100 - (100 / (1 + rs))
    return rsi

def compute_macd(data, fast_period=12, slow_period=26, signal_period=9):
    exp1 = data.ewm(span=fast_period, adjust=False).mean()
    exp2 = data.ewm(span=slow_period, adjust=False).mean()
    macd = exp1 - exp2
    signal = macd.ewm(span=signal_period, adjust=False).mean()
    return macd, signal

def train_and_predict_lstm(stock_data, retrain_frequency=10, look_back=60):
    model_results = {}
    for symbol, df in stock_data.items():
        X = df[['MA_10', 'MA_30', 'RSI', 'MACD', 'MACD_Signal', 'Upper_BB', 'Lower_BB', 'ATR']].values
        y = df['Log Return'].values

        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)

        # Reshape input to be [samples, time steps, features]
        X_lstm = np.array([X_scaled[i - look_back:i, :] for i in range(look_back, len(X_scaled))])
        y_lstm = y[look_back:]

        # LSTM Model
        model = Sequential([
            LSTM(50, input_shape=(look_back, X_lstm.shape[2]), return_sequences=True),
            Dropout(0.2),
            LSTM(50),
            Dropout(0.2),
            Dense(1)
        ])
        model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

        predictions = []
        actuals = []

        for t in range(0, len(X_lstm), retrain_frequency):
            X_train, y_train = X_lstm[:t], y_lstm[:t]
            X_test = X_lstm[t:t+retrain_frequency]
            y_test = y_lstm[t:t+retrain_frequency]

            if len(X_train) > 0:  # Ensure there is training data
                model.fit(X_train, y_train, epochs=10, verbose=0, batch_size=32)

            if len(X_test) > 0:  # Ensure there is test data
                preds = model.predict(X_test).flatten()
                predictions.extend(preds)
                actuals.extend(y_test)

        mse = mean_squared_error(actuals, predictions)
        direction_accuracy = np.mean(np.sign(actuals) == np.sign(predictions))
        model_results[symbol] = {'MSE': mse, 'Directional Accuracy': direction_accuracy}
    return model_results

data = fetch_data(top_20_symbols)
results = train_and_predict_lstm(data, retrain_frequency=5, look_back=10)  # Adjusted frequency and look-back period

# Display results
for symbol, result in results.items():
    print(f"{symbol}: MSE = {result['MSE']}, Directional Accuracy = {result['Directional Accuracy']}")


[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%*******

ACLS: MSE = 0.003430493860920796, Directional Accuracy = 0.49442896935933145
ACGL: MSE = 0.0007741473013674566, Directional Accuracy = 0.5167130919220055
AMP: MSE = 0.0014433721821759272, Directional Accuracy = 0.5041782729805014
AMAT: MSE = 0.0012344128333562317, Directional Accuracy = 0.5013927576601671
ACN: MSE = 0.0007454020047834028, Directional Accuracy = 0.467966573816156
LULU: MSE = 0.0015668797672076578, Directional Accuracy = 0.5167130919220055
GOOGL: MSE = 0.000601808907210015, Directional Accuracy = 0.5111420612813371
GOOG: MSE = 0.0009767239545464992, Directional Accuracy = 0.479108635097493
ALKS: MSE = 0.001475763554953702, Directional Accuracy = 0.5041782729805014
ASML: MSE = 0.0009598993069653255, Directional Accuracy = 0.49303621169916434
AOS: MSE = 0.0007551533520984827, Directional Accuracy = 0.47493036211699163
APA: MSE = 0.005459128343872751, Directional Accuracy = 0.4805013927576602
AIT: MSE = 0.0013104054385982796, Directional Accuracy = 0.47493036211699163
ODC: 

### Transformer

In [None]:
import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LayerNormalization, MultiHeadAttention
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Layer

# Assuming 'ranked_stocks' is already defined and the stock symbols are used as the index
top_20_symbols = ranked_stocks.index[:20].tolist() if isinstance(ranked_stocks.index, pd.Index) else None
if top_20_symbols is None:
    raise ValueError("The 'ranked_stocks' DataFrame does not have symbols as index or column.")

# Helper function to create sequences for the Transformer
def create_sequences(X, y, time_steps=1):
    Xs, ys = [], []
    for i in range(len(X) - time_steps):
        Xs.append(X[i:(i + time_steps)])
        ys.append(y[(i + time_steps)])
    return np.array(Xs), np.array(ys)

def fetch_data(symbols):
    stock_data = {}
    for symbol in symbols:
        df = yf.download(symbol, start='2019-01-01', end='2022-01-01')
        df['Log Return'] = np.log(df['Adj Close'] / df['Adj Close'].shift(1))
        df['MA_10'] = df['Adj Close'].rolling(window=10).mean()
        df['MA_30'] = df['Adj Close'].rolling(window=30).mean()
        df['RSI'] = compute_rsi(df['Adj Close'], 14)
        df['MACD'], df['MACD_Signal'] = compute_macd(df['Adj Close'])
        df['Upper_BB'], df['Lower_BB'] = compute_bollinger_bands(df['Adj Close'])
        df['ATR'] = compute_atr(df)
        df.dropna(inplace=True)
        stock_data[symbol] = df
    return stock_data

def compute_bollinger_bands(data, window=20, num_std=2):
    """Calculate Bollinger Bands."""
    rolling_mean = data.rolling(window=window).mean()
    rolling_std = data.rolling(window=window).std()
    upper_band = rolling_mean + (rolling_std * num_std)
    lower_band = rolling_mean - (rolling_std * num_std)
    return upper_band, lower_band

def compute_atr(data, window=14):
    """Calculate Average True Range (ATR)."""
    high_low = data['High'] - data['Low']
    high_close = np.abs(data['High'] - data['Close'].shift())
    low_close = np.abs(data['Low'] - data['Close'].shift())
    ranges = pd.concat([high_low, high_close, low_close], axis=1)
    true_range = np.max(ranges, axis=1)
    atr = true_range.rolling(window=window).mean()
    return atr

def compute_rsi(data, window):
    diff = data.diff(1)
    up = diff.where(diff > 0, 0)
    down = -diff.where(diff < 0, 0)
    ema_up = up.ewm(com=window - 1, adjust=False).mean()
    ema_down = down.ewm(com=window - 1, adjust=False).mean()
    rs = ema_up / ema_down
    rsi = 100 - (100 / (1 + rs))
    return rsi

def compute_macd(data, fast_period=12, slow_period=26, signal_period=9):
    exp1 = data.ewm(span=fast_period, adjust=False).mean()
    exp2 = data.ewm(span=slow_period, adjust=False).mean()
    macd = exp1 - exp2
    signal = macd.ewm(span=signal_period, adjust=False).mean()
    return macd, signal

# Define Transformer Block as a layer
class TransformerBlock(Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        # Ensure the output of the ffn matches the embed_dim, which is the size of each input vector
        self.ffn = Sequential([
            Dense(ff_dim, activation="relu"),  # FFN with intermediate dimension
            Dense(embed_dim)  # Output must match embed_dim
        ])
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)  # MultiHeadAttention on the inputs
        attn_output = self.dropout1(attn_output, training=training)  # Apply dropout to the attention output
        out1 = self.layernorm1(inputs + attn_output)  # Add & normalize

        ffn_output = self.ffn(out1)  # Pass the normalized output to the FFN
        ffn_output = self.dropout2(ffn_output, training=training)  # Apply dropout to the FFN output
        return self.layernorm2(out1 + ffn_output)  # Add & normalize

# Build and train the Transformer model
def train_and_predict_transformer(stock_data, look_back=60):
    model_results = {}
    for symbol, df in stock_data.items():
        X = df[['MA_10', 'MA_30', 'RSI', 'MACD', 'MACD_Signal', 'Upper_BB', 'Lower_BB', 'ATR']].values
        y = df['Log Return'].values

        print(f"{symbol} - Initial X shape: {X.shape}, y shape: {y.shape}")

        # Scale features
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)

        print(f"{symbol} - Scaled X shape: {X_scaled.shape}")

        # Create sequences
        X_seq, y_seq = create_sequences(X_scaled, y, time_steps=look_back)

        print(f"{symbol} - X_seq shape: {X_seq.shape}, y_seq shape: {y_seq.shape}")

        # Ensure y_seq has the correct length
        if X_seq.shape[0] != y_seq.shape[0]:
            print(f"Mismatch in shapes for {symbol}: X_seq length {X_seq.shape[0]}, y_seq length {y_seq.shape[0]}")
            continue  # Skip this symbol if there's a mismatch

        # Define model
        model = Sequential([
            TransformerBlock(embed_dim=8, num_heads=2, ff_dim=32),
            Dense(10, activation="relu"),
            Dense(1, activation="linear")
        ])
        model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

        # Train the model
        print(f"Training model for {symbol} with X_seq shape: {X_seq.shape} and y_seq shape: {y_seq.shape}")
        model.fit(X_seq, y_seq, batch_size=32, epochs=10, verbose=1)

        # Evaluate the model
        print(f"Predicting with model for {symbol}")
        predictions = model.predict(X_seq).flatten()
        print(f"Predictions shape before trimming: {predictions.shape}")

        # Ensure predictions match the target length
        if predictions.shape[0] != y_seq.shape[0]:
            print(f"Mismatch in predictions for {symbol}: predictions length {predictions.shape[0]}, y_seq length {y_seq.shape[0]}")
            predictions = predictions[:y_seq.shape[0]]  # Trim predictions to match actuals if needed

        actuals = y_seq
        print(f"Predictions shape after trimming: {predictions.shape}, Actuals shape: {actuals.shape}")
        mse = mean_squared_error(actuals, predictions)
        direction_accuracy = np.mean(np.sign(actuals) == np.sign(predictions))

        model_results[symbol] = {'MSE': mse, 'Directional Accuracy': direction_accuracy}
    return model_results

# Example usage
data = fetch_data(top_20_symbols)
check_data_consistency(data)  # Check data consistency for all symbols
results = train_and_predict_transformer(data)

# Display results
for symbol, result in results.items():
    print(f"{symbol}: MSE = {result['MSE']}, Directional Accuracy = {result['Directional Accuracy']}")


[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%*******

ACLS - Initial X shape: (728, 8), y shape: (728,)
ACLS - Scaled X shape: (728, 8)
ACLS - X_seq shape: (718, 10, 8), y_seq shape: (718,)
Shapes match for ACLS: X_seq length 718, y_seq length 718
ACGL - Initial X shape: (728, 8), y shape: (728,)
ACGL - Scaled X shape: (728, 8)
ACGL - X_seq shape: (718, 10, 8), y_seq shape: (718,)
Shapes match for ACGL: X_seq length 718, y_seq length 718
AMP - Initial X shape: (728, 8), y shape: (728,)
AMP - Scaled X shape: (728, 8)
AMP - X_seq shape: (718, 10, 8), y_seq shape: (718,)
Shapes match for AMP: X_seq length 718, y_seq length 718
AMAT - Initial X shape: (728, 8), y shape: (728,)
AMAT - Scaled X shape: (728, 8)
AMAT - X_seq shape: (718, 10, 8), y_seq shape: (718,)
Shapes match for AMAT: X_seq length 718, y_seq length 718
ACN - Initial X shape: (728, 8), y shape: (728,)
ACN - Scaled X shape: (728, 8)
ACN - X_seq shape: (718, 10, 8), y_seq shape: (718,)
Shapes match for ACN: X_seq length 718, y_seq length 718
LULU - Initial X shape: (728, 8), y sh

## Testing the Model on Unseen Data

### Neural Network (NN) - Best Model

In [None]:
import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split

# Assuming 'ranked_stocks' is already defined and the stock symbols are used as the index
top_20_symbols = ranked_stocks.index[:20].tolist() if isinstance(ranked_stocks.index, pd.Index) else None
if top_20_symbols is None:
    raise ValueError("The 'ranked_stocks' DataFrame does not have symbols as index or column.")

def fetch_and_split_data(symbols):
    stock_data = {}
    test_data = {}
    for symbol in symbols:
        df = yf.download(symbol, start='2019-01-01', end='2022-01-01')
        df['Log Return'] = np.log(df['Adj Close'] / df['Adj Close'].shift(1))
        df['MA_10'] = df['Adj Close'].rolling(window=10).mean()
        df['MA_30'] = df['Adj Close'].rolling(window=30).mean()
        df['RSI'] = compute_rsi(df['Adj Close'], 14)
        df['MACD'], df['MACD_Signal'] = compute_macd(df['Adj Close'])
        df['Upper_BB'], df['Lower_BB'] = compute_bollinger_bands(df['Adj Close'])
        df['ATR'] = compute_atr(df)
        df = df[['Log Return', 'MA_10', 'MA_30', 'RSI', 'MACD', 'MACD_Signal', 'Upper_BB', 'Lower_BB', 'ATR']].dropna()

        # Split data into training and test datasets
        train_df, test_df = train_test_split(df, test_size=0.2, shuffle=False)  # Ensure no shuffling to maintain time series integrity
        stock_data[symbol] = train_df
        test_data[symbol] = test_df
    return stock_data, test_data

def compute_bollinger_bands(data, window=20, num_std=2):
    """Calculate Bollinger Bands."""
    rolling_mean = data.rolling(window=window).mean()
    rolling_std = data.rolling(window=window).std()
    upper_band = rolling_mean + (rolling_std * num_std)
    lower_band = rolling_mean - (rolling_std * num_std)
    return upper_band, lower_band

def compute_atr(data, window=14):
    """Calculate Average True Range (ATR)."""
    high_low = data['High'] - data['Low']
    high_close = np.abs(data['High'] - data['Close'].shift())
    low_close = np.abs(data['Low'] - data['Close'].shift())
    ranges = pd.concat([high_low, high_close, low_close], axis=1)
    true_range = np.max(ranges, axis=1)
    atr = true_range.rolling(window=window).mean()
    return atr

def compute_rsi(data, window):
    diff = data.diff(1)
    up = diff.where(diff > 0, 0)
    down = -diff.where(diff < 0, 0)
    ema_up = up.ewm(com=window - 1, adjust=False).mean()
    ema_down = down.ewm(com=window - 1, adjust=False).mean()
    rs = ema_up / ema_down
    rsi = 100 - (100 / (1 + rs))
    return rsi

def compute_macd(data, fast_period=12, slow_period=26, signal_period=9):
    exp1 = data.ewm(span=fast_period, adjust=False).mean()
    exp2 = data.ewm(span=slow_period, adjust=False).mean()
    macd = exp1 - exp2
    signal = macd.ewm(span=signal_period, adjust=False).mean()
    return macd, signal

def train_and_evaluate_nn(stock_data, test_data):
    model_results = {}
    for symbol, df in stock_data.items():
        X_train = df.drop('Log Return', axis=1)
        y_train = df['Log Return']
        X_test = test_data[symbol].drop('Log Return', axis=1)
        y_test = test_data[symbol]['Log Return']

        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        model = Sequential([
            Dense(64, input_dim=X_train.shape[1], activation='relu'),
            Dropout(0.2),
            Dense(32, activation='relu'),
            Dropout(0.2),
            Dense(1, activation='linear')
        ])
        model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

        model.fit(X_train_scaled, y_train, epochs=10, verbose=0, batch_size=32)
        preds = model.predict(X_test_scaled).flatten()

        mse = mean_squared_error(y_test, preds)
        direction_accuracy = np.mean(np.sign(y_test) == np.sign(preds))

        model_results[symbol] = {'MSE': mse, 'Directional Accuracy': direction_accuracy}
    return model_results

stock_data, test_data = fetch_and_split_data(top_20_symbols)
results = train_and_evaluate_nn(stock_data, test_data)

# Display results
for symbol, result in results.items():
    print(f"{symbol}: MSE = {result['MSE']}, Directional Accuracy = {result['Directional Accuracy']}")


[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%*******

ACLS: MSE = 0.0026692795286791934, Directional Accuracy = 0.5136986301369864
ACGL: MSE = 0.0004616105174793549, Directional Accuracy = 0.6301369863013698
AMP: MSE = 0.0014234585742465763, Directional Accuracy = 0.5
AMAT: MSE = 0.0029917790095302416, Directional Accuracy = 0.4726027397260274
ACN: MSE = 0.0008211226683832764, Directional Accuracy = 0.5547945205479452
LULU: MSE = 0.0011105100522139882, Directional Accuracy = 0.5753424657534246
ALKS: MSE = 0.0018555898703657067, Directional Accuracy = 0.5273972602739726
GOOG: MSE = 0.005823486702980438, Directional Accuracy = 0.4383561643835616
GOOGL: MSE = 0.0008674724422004268, Directional Accuracy = 0.4726027397260274
ASML: MSE = 0.002370540573427012, Directional Accuracy = 0.5136986301369864
AOS: MSE = 0.0018130825017287511, Directional Accuracy = 0.5068493150684932
APA: MSE = 0.001056440441136788, Directional Accuracy = 0.6027397260273972
RS: MSE = 0.0007363225371935431, Directional Accuracy = 0.4520547945205479
AIT: MSE = 0.001495461

#### Implement the model on the Future and Validate the Prediction

In [None]:
import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Assuming 'ranked_stocks' is defined and contains the stock symbols
top_20_symbols = ranked_stocks.index[:20].tolist() if isinstance(ranked_stocks.index, pd.Index) else None
if top_20_symbols is None:
    raise ValueError("The 'ranked_stocks' DataFrame does not have symbols as index or column.")

def fetch_data(symbols, start_date, end_date):
    stock_data = {}
    for symbol in symbols:
        df = yf.download(symbol, start=start_date, end=end_date)
        df['Log Return'] = np.log(df['Adj Close'] / df['Adj Close'].shift(1))
        df['MA_10'] = df['Adj Close'].rolling(window=10).mean()
        df['MA_30'] = df['Adj Close'].rolling(window=30).mean()
        df['RSI'] = compute_rsi(df['Adj Close'], 14)
        df['MACD'], df['MACD_Signal'] = compute_macd(df['Adj Close'])
        df['Upper_BB'], df['Lower_BB'] = compute_bollinger_bands(df['Adj Close'])
        df['ATR'] = compute_atr(df)
        df.dropna(inplace=True)
        stock_data[symbol] = df
    return stock_data

def compute_bollinger_bands(data, window=20, num_std=2):
    rolling_mean = data.rolling(window=window).mean()
    rolling_std = data.rolling(window=window).std()
    upper_band = rolling_mean + (rolling_std * num_std)
    lower_band = rolling_mean - (rolling_std * num_std)
    return upper_band, lower_band

def compute_atr(data, window=14):
    high_low = data['High'] - data['Low']
    high_close = np.abs(data['High'] - data['Close'].shift())
    low_close = np.abs(data['Low'] - data['Close'].shift())
    ranges = pd.concat([high_low, high_close, low_close], axis=1)
    true_range = np.max(ranges, axis=1)
    atr = true_range.rolling(window=window).mean()
    return atr

def compute_rsi(data, window):
    diff = data.diff(1)
    up = diff.where(diff > 0, 0)
    down = -diff.where(diff < 0, 0)
    ema_up = up.ewm(com=window - 1, adjust=False).mean()
    ema_down = down.ewm(com=window - 1, adjust=False).mean()
    rs = ema_up / ema_down
    rsi = 100 - (100 / (1 + rs))
    return rsi

def compute_macd(data, fast_period=12, slow_period=26, signal_period=9):
    exp1 = data.ewm(span=fast_period, adjust=False).mean()
    exp2 = data.ewm(span=slow_period, adjust=False).mean()
    macd = exp1 - exp2
    signal = macd.ewm(span=signal_period, adjust=False).mean()
    return macd, signal

def train_and_predict_nn(stock_data):
    model_results = {}
    scaler = StandardScaler()
    for symbol, df in stock_data.items():
        X = df[['MA_10', 'MA_30', 'RSI', 'MACD', 'MACD_Signal', 'Upper_BB', 'Lower_BB', 'ATR']]
        y = df['Log Return']
        X_scaled = scaler.fit_transform(X)

        model = Sequential([
            Dense(64, input_dim=X_scaled.shape[1], activation='relu'),
            Dropout(0.2),
            Dense(32, activation='relu'),
            Dropout(0.2),
            Dense(1, activation='linear')
        ])
        model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
        model.fit(X_scaled, y, epochs=10, batch_size=32, verbose=0)

        model_results[symbol] = model
    return model_results, scaler

# Fetch and train data
historical_data = fetch_data(top_20_symbols, '2019-01-01', '2022-01-01')
models, scaler = train_and_predict_nn(historical_data)

# Now predict for the next available period
future_data = fetch_data(top_20_symbols, '2022-01-02', '2023-01-01')
for symbol, model in models.items():
    df = future_data[symbol]
    X = df[['MA_10', 'MA_30', 'RSI', 'MACD', 'MACD_Signal', 'Upper_BB', 'Lower_BB', 'ATR']]
    X_scaled = scaler.transform(X)
    predicted_returns = model.predict(X_scaled).flatten()
    predicted_direction = np.sign(predicted_returns)
    actual_direction = np.sign(df['Log Return'])

    # Check if the prediction was correct
    correct_predictions = np.mean(predicted_direction == actual_direction)
    print(f"{symbol}: Predicted correctly {correct_predictions * 100:.2f}% of the time.")


[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%*******

ACLS: Predicted correctly 53.60% of the time.
ACGL: Predicted correctly 52.25% of the time.
AMP: Predicted correctly 52.25% of the time.
AMAT: Predicted correctly 57.21% of the time.
ACN: Predicted correctly 46.85% of the time.
LULU: Predicted correctly 45.95% of the time.
ALKS: Predicted correctly 50.00% of the time.
GOOG: Predicted correctly 54.05% of the time.
GOOGL: Predicted correctly 46.40% of the time.
ASML: Predicted correctly 53.60% of the time.
AOS: Predicted correctly 44.59% of the time.
APA: Predicted correctly 54.05% of the time.
RS: Predicted correctly 52.25% of the time.
AIT: Predicted correctly 49.10% of the time.
ASR: Predicted correctly 53.15% of the time.
ODC: Predicted correctly 47.75% of the time.
ADBE: Predicted correctly 50.00% of the time.
ADP: Predicted correctly 51.80% of the time.
AYI: Predicted correctly 53.60% of the time.
ARLP: Predicted correctly 49.55% of the time.


#### Enhanced NN Model

In [None]:
import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Assuming 'ranked_stocks' is defined and contains the stock symbols as the index
top_20_symbols = ranked_stocks.index[:20].tolist() if isinstance(ranked_stocks.index, pd.Index) else None
if top_20_symbols is None:
    raise ValueError("The 'ranked_stocks' DataFrame does not have symbols as index or column.")

def fetch_data(symbols, start_date, end_date):
    stock_data = {}
    for symbol in symbols:
        df = yf.download(symbol, start=start_date, end=end_date)
        df['Log Return'] = np.log(df['Adj Close'] / df['Adj Close'].shift(1))
        df['MA_10'] = df['Adj Close'].rolling(window=10).mean()
        df['MA_30'] = df['Adj Close'].rolling(window=30).mean()
        df['RSI'] = compute_rsi(df['Adj Close'], 14)
        df['MACD'], df['MACD_Signal'] = compute_macd(df['Adj Close'])
        df['Upper_BB'], df['Lower_BB'] = compute_bollinger_bands(df['Adj Close'])
        df['ATR'] = compute_atr(df)
        df['Volume Change'] = df['Volume'].pct_change()
        df['Volatility'] = df['Log Return'].rolling(window=30).std()

        # Clean infinite and NaN values
        df.replace([np.inf, -np.inf], np.nan, inplace=True)
        df.dropna(inplace=True)

        stock_data[symbol] = df
    return stock_data

def compute_rsi(data, window):
    diff = data.diff(1)
    up = diff.where(diff > 0, 0)
    down = -diff.where(diff < 0, 0)
    ema_up = up.ewm(com=window - 1, adjust=False).mean()
    ema_down = down.ewm(com=window - 1, adjust=False).mean()
    rs = ema_up / ema_down
    rsi = 100 - (100 / (1 + rs))
    return rsi

def compute_macd(data, fast_period=12, slow_period=26, signal_period=9):
    exp1 = data.ewm(span=fast_period, adjust=False).mean()
    exp2 = data.ewm(span=slow_period, adjust=False).mean()
    macd = exp1 - exp2
    signal = macd.ewm(span=signal_period, adjust=False).mean()
    return macd, signal

def compute_bollinger_bands(data, window=20, num_std=2):
    rolling_mean = data.rolling(window=window).mean()
    rolling_std = data.rolling(window=window).std()
    upper_band = rolling_mean + (rolling_std * num_std)
    lower_band = rolling_mean - (rolling_std * num_std)
    return upper_band, lower_band

def compute_atr(data, window=14):
    high_low = data['High'] - data['Low']
    high_close = np.abs(data['High'] - data['Close'].shift())
    low_close = np.abs(data['Low'] - data['Close'].shift())
    ranges = pd.concat([high_low, high_close, low_close], axis=1)
    true_range = np.max(ranges, axis=1)
    atr = true_range.rolling(window=window).mean()
    return atr

def train_and_predict_nn(stock_data):
    model_results = {}
    scaler = StandardScaler()
    for symbol, df in stock_data.items():
        X = df[['Log Return', 'MA_10', 'MA_30', 'RSI', 'MACD', 'MACD_Signal', 'Upper_BB', 'Lower_BB', 'ATR', 'Volume Change', 'Volatility']].values
        y = np.where(df['Log Return'].shift(-1) > 0, 1, 0)[:-1]  # Predict next day's direction

        X_scaled = scaler.fit_transform(X[:-1])  # Avoid last unlabelled instance

        model = Sequential([
            Dense(50, input_dim=X_scaled.shape[1], activation='relu'),
            Dropout(0.2),
            Dense(50, activation='relu'),
            Dropout(0.2),
            Dense(1, activation='sigmoid')
        ])

        model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
        model.fit(X_scaled, y, epochs=50, batch_size=32, verbose=0, validation_split=0.1,
                  callbacks=[EarlyStopping(monitor='val_accuracy', patience=5, restore_best_weights=True),
                             ReduceLROnPlateau(monitor='val_accuracy', factor=0.2, patience=2)])

        # Prediction and accuracy assessment
        predicted = (model.predict(X_scaled) > 0.5).flatten()
        accuracy = accuracy_score(y, predicted)
        model_results[symbol] = {'Model': model, 'Accuracy': accuracy}

    return model_results, scaler

# Load and train data
historical_data = fetch_data(top_20_symbols, '2012-01-01', '2022-01-01')
models, scaler = train_and_predict_nn(historical_data)

# Fetch future data
future_data = fetch_data(top_20_symbols, '2022-01-02', '2023-01-01')

# Predict and assess the future data
for symbol, result in models.items():
    model = result['Model']
    future_df = future_data[symbol]
    X_future = future_df[['Log Return', 'MA_10', 'MA_30', 'RSI', 'MACD', 'MACD_Signal', 'Upper_BB', 'Lower_BB', 'ATR', 'Volume Change', 'Volatility']].values

    # Scale future features
    X_future_scaled = scaler.transform(X_future)

    # Predict the direction
    predicted_future = (model.predict(X_future_scaled) > 0.5).flatten()
    actual_future = np.where(future_df['Log Return'].shift(-1) > 0, 1, 0)[:-1]  # Get actual future direction

    # Calculate accuracy
    future_accuracy = accuracy_score(actual_future, predicted_future[:-1])  # Exclude last because it has no actual value

    print(f"{symbol}: Future Prediction Accuracy = {future_accuracy:.2f}")

    # Predict the next day's direction after the last available data
    last_instance = X_future_scaled[-1].reshape(1, -1)
    predicted_next_return = model.predict(last_instance).flatten()[0]
    predicted_next_direction = 'up' if predicted_next_return > 0.5 else 'down'
    actual_next_direction = 'up' if future_df['Log Return'].iloc[-1] > 0 else 'down'

    correct_prediction = predicted_next_direction == actual_next_direction
    print(f"{symbol}: Predicted direction for the next period: {predicted_next_direction} (Actual: {actual_next_direction}) - Correct: {correct_prediction}")



[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%*******



[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%*******

ACLS: Future Prediction Accuracy = 0.52
ACLS: Predicted direction for the next period: down (Actual: down) - Correct: True
ACGL: Future Prediction Accuracy = 0.52
ACGL: Predicted direction for the next period: up (Actual: down) - Correct: False
AMP: Future Prediction Accuracy = 0.49
AMP: Predicted direction for the next period: up (Actual: down) - Correct: False
AMAT: Future Prediction Accuracy = 0.47
AMAT: Predicted direction for the next period: up (Actual: up) - Correct: True
ACN: Future Prediction Accuracy = 0.45
ACN: Predicted direction for the next period: up (Actual: down) - Correct: False
LULU: Future Prediction Accuracy = 0.54
LULU: Predicted direction for the next period: up (Actual: up) - Correct: True
ALKS: Future Prediction Accuracy = 0.50
ALKS: Predicted direction for the next period: down (Actual: down) - Correct: True
GOOG: Future Prediction Accuracy = 0.45
GOOG: Predicted direction for the next period: up (Actual: down) - Correct: False
GOOGL: Future Prediction Accurac

### Monthly Prediction

Now we try shifting from daily predictions to predicting for larger periods, such as a month, and training the model up until the last period before the future period, then evaluating the future period. This approach can indeed improve by smoothing out daily noise and focusing on more significant trends over longer periods.

In [None]:
import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Assuming 'ranked_stocks' is defined and contains the stock symbols as the index
top_20_symbols = ranked_stocks.index[:20].tolist() if isinstance(ranked_stocks.index, pd.Index) else None
if top_20_symbols is None:
    raise ValueError("The 'ranked_stocks' DataFrame does not have symbols as index or column.")

def fetch_data(symbols, start_date, end_date):
    stock_data = {}
    for symbol in symbols:
        df = yf.download(symbol, start=start_date, end=end_date)
        df['Log Return'] = np.log(df['Adj Close'] / df['Adj Close'].shift(1))
        df['MA_10'] = df['Adj Close'].rolling(window=10).mean()
        df['MA_30'] = df['Adj Close'].rolling(window=30).mean()
        df['RSI'] = compute_rsi(df['Adj Close'], 14)
        df['MACD'], df['MACD_Signal'] = compute_macd(df['Adj Close'])
        df['Upper_BB'], df['Lower_BB'] = compute_bollinger_bands(df['Adj Close'])
        df['ATR'] = compute_atr(df)
        df['Volume Change'] = df['Volume'].pct_change()
        df['Volatility'] = df['Log Return'].rolling(window=30).std()

        # Clean infinite and NaN values
        df.replace([np.inf, -np.inf], np.nan, inplace=True)
        df.dropna(inplace=True)

        stock_data[symbol] = df
    return stock_data

def compute_rsi(data, window):
    diff = data.diff(1)
    up = diff.where(diff > 0, 0)
    down = -diff.where(diff < 0, 0)
    ema_up = up.ewm(com=window - 1, adjust=False).mean()
    ema_down = down.ewm(com=window - 1, adjust=False).mean()
    rs = ema_up / ema_down
    rsi = 100 - (100 / (1 + rs))
    return rsi

def compute_macd(data, fast_period=12, slow_period=26, signal_period=9):
    exp1 = data.ewm(span=fast_period, adjust=False).mean()
    exp2 = data.ewm(span=slow_period, adjust=False).mean()
    macd = exp1 - exp2
    signal = macd.ewm(span=signal_period, adjust=False).mean()
    return macd, signal

def compute_bollinger_bands(data, window=20, num_std=2):
    rolling_mean = data.rolling(window=window).mean()
    rolling_std = data.rolling(window=window).std()
    upper_band = rolling_mean + (rolling_std * num_std)
    lower_band = rolling_mean - (rolling_std * num_std)
    return upper_band, lower_band

def compute_atr(data, window=14):
    high_low = data['High'] - data['Low']
    high_close = np.abs(data['High'] - data['Close'].shift())
    low_close = np.abs(data['Low'] - data['Close'].shift())
    ranges = pd.concat([high_low, high_close, low_close], axis=1)
    true_range = np.max(ranges, axis=1)
    atr = true_range.rolling(window=window).mean()
    return atr

def train_and_predict_nn(stock_data):
    model_results = {}
    scaler = StandardScaler()
    for symbol, df in stock_data.items():
        X = df[['Log Return', 'MA_10', 'MA_30', 'RSI', 'MACD', 'MACD_Signal', 'Upper_BB', 'Lower_BB', 'ATR', 'Volume Change', 'Volatility']].values
        y = np.where(df['Log Return'].shift(-1) > 0, 1, 0)[:-1]  # Predict next day's direction

        X_scaled = scaler.fit_transform(X[:-1])  # Avoid last unlabelled instance

        model = Sequential([
            Dense(50, input_dim=X_scaled.shape[1], activation='relu'),
            Dropout(0.2),
            Dense(50, activation='relu'),
            Dropout(0.2),
            Dense(1, activation='sigmoid')
        ])

        model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
        model.fit(X_scaled, y, epochs=50, batch_size=32, verbose=0, validation_split=0.1,
                  callbacks=[EarlyStopping(monitor='val_accuracy', patience=5, restore_best_weights=True),
                             ReduceLROnPlateau(monitor='val_accuracy', factor=0.2, patience=2)])

        # Prediction and accuracy assessment
        predicted = (model.predict(X_scaled) > 0.5).flatten()
        accuracy = accuracy_score(y, predicted)
        model_results[symbol] = {'Model': model, 'Accuracy': accuracy}

    return model_results, scaler

# Load and train data
historical_data = fetch_data(top_20_symbols, '2019-01-01', '2022-01-01')
models, scaler = train_and_predict_nn(historical_data)

# Fetch future data
future_data = fetch_data(top_20_symbols, '2022-01-02', '2023-01-01')

# Predict and assess the future data
for symbol, result in models.items():
    model = result['Model']
    future_df = future_data[symbol]
    X_future = future_df[['Log Return', 'MA_10', 'MA_30', 'RSI', 'MACD', 'MACD_Signal', 'Upper_BB', 'Lower_BB', 'ATR', 'Volume Change', 'Volatility']].values

    if len(X_future) == 0:
        print(f"{symbol}: No data available for the future period.")
        continue

    # Scale future features
    X_future_scaled = scaler.transform(X_future[:-1])  # Avoid last unlabelled instance

    # Predict the direction for each day
    predicted_future = (model.predict(X_future_scaled) > 0.5).flatten()
    actual_future = np.where(future_df['Log Return'].shift(-1) > 0, 1, 0)[:-1]  # Get actual future direction

    if len(actual_future) == 0:
        print(f"{symbol}: No actual future values available for comparison.")
        continue

    # Align lengths of predicted and actual future
    future_df = future_df.iloc[:-1].copy()
    future_df.loc[:, 'Predicted Direction'] = predicted_future
    future_df.loc[:, 'Actual Direction'] = actual_future

    # Aggregate daily predictions to monthly predictions
    monthly_predictions = future_df.resample('M').agg({'Predicted Direction': lambda x: x.mean() > 0.5, 'Actual Direction': 'last'})

    # Calculate monthly accuracy
    future_accuracy = accuracy_score(monthly_predictions['Actual Direction'], monthly_predictions['Predicted Direction'])

    print(f"{symbol}: Future Monthly Prediction Accuracy = {future_accuracy:.2f}")

    # Predict the next month's direction after the last available data
    if len(X_future_scaled) > 0:
        last_instance = X_future_scaled[-1].reshape(1, -1)
        predicted_next_return = model.predict(last_instance).flatten()[0]
        predicted_next_direction = 'up' if predicted_next_return > 0.5 else 'down'
        actual_next_direction = 'up' if future_df['Log Return'].iloc[-1] > 0 else 'down'

        correct_prediction = predicted_next_direction == actual_next_direction
        print(f"{symbol}: Predicted direction for the next period: {predicted_next_direction} (Actual: {actual_next_direction}) - Correct: {correct_prediction}")
    else:
        print(f"{symbol}: No data available to predict the next period's direction.")

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%*******



[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%*******

ACLS: Future Monthly Prediction Accuracy = 0.55
ACLS: Predicted direction for the next period: down (Actual: up) - Correct: False
ACGL: Future Monthly Prediction Accuracy = 0.36
ACGL: Predicted direction for the next period: up (Actual: up) - Correct: True
AMP: Future Monthly Prediction Accuracy = 0.45
AMP: Predicted direction for the next period: up (Actual: up) - Correct: True
AMAT: Future Monthly Prediction Accuracy = 0.45
AMAT: Predicted direction for the next period: up (Actual: up) - Correct: True
ACN: Future Monthly Prediction Accuracy = 0.64
ACN: Predicted direction for the next period: up (Actual: up) - Correct: True
LULU: Future Monthly Prediction Accuracy = 0.45
LULU: Predicted direction for the next period: down (Actual: up) - Correct: False
ALKS: Future Monthly Prediction Accuracy = 0.27
ALKS: Predicted direction for the next period: down (Actual: up) - Correct: False
GOOG: Future Monthly Prediction Accuracy = 0.45
GOOG: Predicted direction for the next period: up (Actual: