<a href="https://colab.research.google.com/github/elodie0778/pirhana/blob/main/goldfish5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# %% [markdown]
# # 1. Mount Google Drive
#
# Mount Google Drive to access and save data persistently.

# %%
# Colab-only step: mount Google Drive at /content/drive. The data and result
# paths configured later in this notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# %% [markdown]
# # 2. Install Necessary Libraries
#
# Install and upgrade necessary libraries. Some libraries might already be installed in Google Colab, but specifying them ensures compatibility.

# %%
# Install necessary libraries
!pip install --upgrade optuna ta catboost
!pip install optuna-integration

Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting ta
  Downloading ta-0.11.0.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.8-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.1.0-py3-none-any.whl (364 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m364.4/364.4 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.0-py

In [None]:
# %%
# Import Libraries
import pandas as pd
import numpy as np
import glob
import os
from datetime import datetime, timedelta

# Machine Learning Libraries
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from sklearn.preprocessing import StandardScaler

# For Technical Indicators
import ta

# For Optuna Hyperparameter Tuning
import optuna
from optuna.pruners import MedianPruner
from optuna.exceptions import TrialPruned

# For CatBoost
from catboost import CatBoostRegressor, Pool, cv as catboost_cv

# For Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# For saving/loading data
import pickle

# Suppress warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')

In [None]:
# %% [markdown]
# # 3. Import Libraries
#
# Import all necessary libraries for data processing, model training, evaluation, and visualization.

# %%
# Import Libraries
import pandas as pd
import numpy as np
import glob
import os
from datetime import datetime, timedelta

# Machine Learning Libraries
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler

# For Technical Indicators
import ta

# For Optuna Hyperparameter Tuning
import optuna
from optuna.pruners import MedianPruner
from optuna.exceptions import TrialPruned

# For CatBoost
from catboost import CatBoostRegressor, Pool, cv as catboost_cv

# For Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# For saving/loading data
import pickle

# Suppress warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')

In [None]:
# %% [markdown]
# # 4. Define Paths and Initialize Directories
#
# Define file paths for data storage and ensure necessary directories exist.

# %%
# Define paths
# DATA_PATH holds the per-stock '*_data.csv' inputs and every artefact this
# notebook persists (Optuna study databases and the three pickle files below).
DATA_PATH = '/content/drive/MyDrive/data3/'
# One SQLite database per ticker is written into this directory by the tuning cell.
STUDIES_DIR = os.path.join(DATA_PATH, 'optuna_studies3')
FORECASTS_PATH = os.path.join(DATA_PATH, 'stock_forecasts3.pkl')
PERFORMANCE_PATH = os.path.join(DATA_PATH, 'model_performance3.pkl')
BEST_PARAMS_PATH = os.path.join(DATA_PATH, 'best_params_dict3.pkl')
# NOTE: unlike the paths above, the rankings CSV sits directly in MyDrive,
# not inside DATA_PATH.
FINAL_RANKINGS_PATH = '/content/drive/MyDrive/final_stock_rankings3.csv'

# Create necessary directories if they don't exist
# (STUDIES_DIR is nested inside DATA_PATH, so this creates both when missing.)
os.makedirs(STUDIES_DIR, exist_ok=True)

In [None]:
# %% [markdown]
# # 5. Load Existing Data
#
# Load existing best parameters, forecasts, and model performance data if available. This ensures continuity in your workflow.

# %%
# Restore state persisted by earlier sessions so finished work is not repeated.
# Each artefact falls back to an empty container when its pickle file is absent.

# Best hyperparameters (dict keyed by ticker)
if not os.path.exists(BEST_PARAMS_PATH):
    best_params_dict = {}
    print("No existing best parameters found. Starting fresh.")
else:
    with open(BEST_PARAMS_PATH, 'rb') as f:
        best_params_dict = pickle.load(f)
    print("Loaded existing best parameters.")

# Forecasts (dict keyed by ticker)
if not os.path.exists(FORECASTS_PATH):
    stock_forecasts = {}
    print("No existing forecasts found. Starting fresh.")
else:
    with open(FORECASTS_PATH, 'rb') as f:
        stock_forecasts = pickle.load(f)
    print("Loaded existing forecasts.")

# Model performance records (list of dicts)
if not os.path.exists(PERFORMANCE_PATH):
    model_performance = []
    print("No existing model performance data found. Starting fresh.")
else:
    with open(PERFORMANCE_PATH, 'rb') as f:
        model_performance = pickle.load(f)
    print("Loaded existing model performance data.")

Loaded existing best parameters.
Loaded existing forecasts.
Loaded existing model performance data.


In [None]:
# %% [markdown]
# # 6. Load and Preprocess Stock Data
#
# Load all cleaned stock CSV files from the `data3` folder, merge them with `FinalScore` from `final_stock_rankings3.csv`, and discard any stocks that do not have a `FinalScore`.

# %%
# List all CSV files in the directory matching '*_data.csv'.
# glob returns paths in arbitrary filesystem order; sorting keeps the order in
# which stocks are loaded (and later processed) reproducible between sessions.
csv_files = sorted(glob.glob(os.path.join(DATA_PATH, '*_data.csv')))

print(f"Total CSV files found: {len(csv_files)}")

# Load FinalScore data. Fail fast, naming the path that was actually checked:
# the rankings file lives at FINAL_RANKINGS_PATH, which is not inside DATA_PATH.
if not os.path.exists(FINAL_RANKINGS_PATH):
    raise FileNotFoundError(f"final_stock_rankings3.csv not found at {FINAL_RANKINGS_PATH}.")
final_rankings = pd.read_csv(FINAL_RANKINGS_PATH)
print("Loaded final_stock_rankings3.csv.")

# Keep only 'Ticker' and 'FinalScore'; drop rows missing either value and keep
# the first row for any ticker that appears more than once.
final_rankings = final_rankings[['Ticker', 'FinalScore']].dropna()
final_rankings = final_rankings.drop_duplicates(subset='Ticker')

# Set of tickers with a FinalScore; used to filter the CSV files
valid_tickers = set(final_rankings['Ticker'].unique())

print(f"Number of valid tickers with FinalScore: {len(valid_tickers)}")

# Dictionary mapping ticker -> preprocessed DataFrame (filled by the loading loop)
stock_data = {}

# Feature Engineering Function
def add_technical_indicators(df):
    """Return a new DataFrame holding only the rows of ``df`` that contain no NaN.

    Despite the name, no indicator columns are computed at present: the input
    data already carries the price metrics used as features, so this is the
    hook where extra indicators would be added.

    Parameters:
    - df (pd.DataFrame): Raw per-stock data. It is not modified.

    Returns:
    - pd.DataFrame: A separate frame with every NaN-containing row removed
      (the original index labels of the surviving rows are kept).
    """
    # dropna() already returns a fresh frame, so the caller's data stays untouched.
    return df.dropna()

# Load and preprocess each CSV file.
# The ticker is the part of the file name before the first underscore; files
# whose ticker has no FinalScore are ignored.
for file in csv_files:
    ticker = os.path.basename(file).split('_')[0]
    if ticker in valid_tickers:
        df = pd.read_csv(file)
        df['Date'] = pd.to_datetime(df['Date'])
        # Chronological order, NaN rows removed, then a clean 0..n-1 index
        df = add_technical_indicators(df.sort_values('Date')).reset_index(drop=True)
        stock_data[ticker] = df

print(f"Total stocks loaded after filtering: {len(stock_data)}")

Total CSV files found: 282
Loaded final_stock_rankings3.csv.
Number of valid tickers with FinalScore: 210
Total stocks loaded after filtering: 210


In [None]:
# %% [markdown]
# # 7. Merge FinalScore and Additional Preprocessing
#
# Assign the `FinalScore` to each stock's DataFrame and prepare the data for modeling by defining features and the target variable.

# %%
# Lookup table: ticker -> FinalScore
final_score_dict = final_rankings.set_index('Ticker')['FinalScore'].to_dict()

# Stamp every row of each stock's DataFrame with that stock's FinalScore
# (0 is the fallback for a ticker missing from the lookup). The frames are the
# same objects stored in stock_data, so writing the column updates them in place.
for ticker, df in stock_data.items():
    df['FinalScore'] = final_score_dict.get(ticker, 0)

# Feature Engineering and Target Definition
# Columns fed to the model. These names are used verbatim to index each stock's
# DataFrame, so they have to match the column headers exactly.
# NOTE(review): 'Price_Last Quarter_EPS(SNL)' contains a space and
# 'Price_Adj_Funds_From_Opperations' is spelled with a double "p" — presumably
# these mirror the CSV headers; confirm before "correcting" them.
# 'FinalScore' is the per-stock value assigned to every row earlier in this cell.
feature_cols = ['Market_Cap_EBT_Excl_Unusual_Items', 'Market_Cap_Total_Revenue',
               'Market_Cap_Book_Value', 'Price_Last_Quarter_Levered_FCF',
               'Price_Funds_From_Operations', 'Price_Tangible_Book',
               'TEV_Excl_Opp_Leases_Total_Revenue', 'TEV_Employees',
               'Price_Last Quarter_EPS(SNL)', 'Price_Last_Quarter_Net_FCF(SNL)',
               'Price_Sales(SNL)', 'Price_Adj_Funds_From_Opperations',
               'Price_Forward_EPS', 'PEG_Ratio',
               'Market_Cap_Forward_Total_Revenue',
               'Diluted_Market_Cap_Outstanding_Forward_Total_Revenue',
               'Diluted_Market_Cap_Exercisable_Forward_Total_Revenue',
               'Short_Interest_Ratio', 'Day_Open_Price', 'Day_Close_Price',
               'Day_High_Price', 'Day_Low_Price', 'VWAP',
               'Shares_Outstanding', 'Volume', 'FinalScore']

# Define prediction horizon (30 days ahead)
# The horizon is applied as a row shift on the date-sorted frames, so it counts
# rows of data rather than calendar days.
TARGET_WINDOW_DAYS = 30

# Initialize lists to store features, targets, and sample weights
X_list = []
y_list = []
sample_weights = []

# Highest FinalScore across all ranked stocks; it is the same for every stock,
# so it is computed once instead of once per loop iteration.
max_score = final_rankings['FinalScore'].max()

# Iterate through each stock to prepare data.
# NOTE: the frames stored in stock_data are modified in place here (two new
# columns, rows without a target removed). Later cells read 'Target_Return'
# from those same frames, so this mutation is intentional; it also means that
# re-running this cell trims another TARGET_WINDOW_DAYS rows from each stock.
for ticker, df in stock_data.items():
    if len(df) < (TARGET_WINDOW_DAYS + 1):
        continue  # Skip stocks with insufficient data

    # Create target variable: percentage return over the next TARGET_WINDOW_DAYS rows
    df['Future_Close'] = df['Day_Close_Price'].shift(-TARGET_WINDOW_DAYS)
    df['Target_Return'] = ((df['Future_Close'] - df['Day_Close_Price']) / df['Day_Close_Price']) * 100
    # The last TARGET_WINDOW_DAYS rows have no future close and are dropped,
    # together with any row containing NaN in another column.
    df.dropna(inplace=True)

    if df.empty:
        continue  # Nothing usable left; avoids an IndexError on .iloc[0] below

    # Features and target
    X = df[feature_cols].values
    y = df['Target_Return'].values

    # Assign weights: higher weight for higher FinalScore.
    # weight = 1 + score / max_score, i.e. 1 for a zero score up to 2 for the
    # top-ranked stock (FinalScore is constant within a stock, so row 0 suffices).
    score = df['FinalScore'].iloc[0]
    weight = 1 + (score / max_score)  # Adjust as needed

    weights = np.full_like(y, weight, dtype=np.float32)

    X_list.append(X)
    y_list.append(y)
    sample_weights.append(weights)

# Combine all stocks' data
if not X_list:
    raise ValueError(
        f"No stock has enough rows to build a {TARGET_WINDOW_DAYS}-row-ahead target; nothing to combine."
    )
X = np.vstack(X_list)
y = np.hstack(y_list)
sample_weights = np.hstack(sample_weights)

print(f"Total samples after combining all stocks: {X.shape[0]}")

Total samples after combining all stocks: 754600


In [None]:
# %% [markdown]
# # 8. Define Helper Functions
#
# Define functions to check the existence of Optuna studies and to allocate investments based on expected profits.

# %%
# Function to determine if a study exists
def study_exists(study_name, storage_name):
    """Report whether an Optuna study called ``study_name`` is stored at ``storage_name``.

    Parameters:
    - study_name (str): Name of the study to look up.
    - storage_name (str): Storage URL handed to ``RDBStorage`` (the notebook
      uses ``sqlite:///...`` URLs).

    Returns:
    - bool: True when the lookup succeeds, False when it raises ``KeyError``.
      Any other exception propagates to the caller.
    """
    from optuna.storages import RDBStorage

    try:
        RDBStorage(url=storage_name).get_study_id_from_name(study_name)
    except KeyError:
        found = False
    else:
        found = True
    return found

# Function to allocate investment to the top-performing stock
def allocate_investment(profit_df, total_investment=30):
    """
    Put the whole budget on the single stock with the largest expected profit.

    Parameters:
    - profit_df (pd.DataFrame): One row per candidate stock, with at least the
      columns 'Stock' and 'Expected_Profit'.
    - total_investment (float): Amount to invest (default 30).

    Returns:
    - allocation (dict or None): {'Stock', 'Investment', 'Expected_Profit'} for
      the chosen stock, with both amounts rounded to 2 decimals; None when
      profit_df is empty.
    - total_profit (float): The chosen stock's rounded expected profit, or 0.0
      when profit_df is empty.
    """
    # Guard clause: nothing to choose from
    if profit_df.empty:
        print("No adequate stocks available for investment.")
        return None, 0.0

    # Rank candidates from highest to lowest expected profit and take the leader
    ranked = profit_df.sort_values(by='Expected_Profit', ascending=False).reset_index(drop=True)
    leader = ranked.iloc[0]
    leader_profit = round(leader['Expected_Profit'], 2)

    allocation = dict(
        Stock=leader['Stock'],
        Investment=round(total_investment, 2),
        Expected_Profit=leader_profit,
    )

    # With a single holding, the portfolio's expected profit is the leader's
    return allocation, leader_profit

In [None]:
# %%
import pandas as pd
import numpy as np

# 1. Define the replacement function
def replace_NM_with_prev_or_zero(df):
    """
    Replace every 'NM' cell with the value from the row above in the same column,
    or with 0.0 when there is no usable value above it.

    'NM' cells are first turned into NaN, then all NaN are forward-filled and
    any that remain are set to 0.0. Because the fill works on NaN, missing
    values that were already present in ``df`` are filled the same way.
    Finally, each column that can be parsed as numbers is converted to a
    numeric dtype; columns that cannot be parsed are left as they are.

    Parameters:
    - df (pd.DataFrame): The input DataFrame to process. It is not modified.

    Returns:
    - pd.DataFrame: A new DataFrame with 'NM' (and NaN) replaced as described.
    """
    # replace() returns a new frame, so the caller's DataFrame stays untouched
    cleaned = df.replace('NM', np.nan)

    # Carry the previous row's value down, then zero-fill leading gaps.
    # DataFrame.ffill() is the supported spelling of the deprecated
    # fillna(method='ffill').
    cleaned = cleaned.ffill().fillna(0.0)

    # Convert columns to numeric where possible. pd.to_numeric(errors='ignore')
    # is deprecated, so the conversion error is caught explicitly instead.
    # NOTE(review): pd.to_numeric appears to turn datetime64 columns into
    # integer nanoseconds, so an already-parsed 'Date' column would come out as
    # int64 — confirm whether that is intended.
    for col in cleaned.columns:
        try:
            cleaned[col] = pd.to_numeric(cleaned[col])
        except (ValueError, TypeError):
            pass  # not numeric (e.g. free text): keep the column unchanged

    return cleaned

# 2. Apply the function to all stocks in stock_data
# Walk a snapshot of the tickers and swap each stored frame for its cleaned copy.
for ticker in list(stock_data):
    df = stock_data[ticker]
    stock_data[ticker] = replace_NM_with_prev_or_zero(df)
    print(f"Processed stock: {ticker}")

# 3. Verify the replacement
def verify_replacement(stock_data):
    """
    Print, for every stock, whether any cell still reads 'NM'.

    Each frame is compared cell by cell after converting it to strings, so the
    check covers all columns regardless of dtype.

    Parameters:
    - stock_data (dict): Maps stock tickers to their DataFrames.

    Returns:
    - None
    """
    for symbol, frame in stock_data.items():
        # True as soon as a single cell's text is exactly 'NM'
        nm_present = frame.astype(str).eq('NM').to_numpy().any()
        if nm_present:
            print(f"'NM' still found in stock: {symbol}")
        else:
            print(f"No 'NM' values found in stock: {symbol}")

# Run the verification: prints one status line per ticker and returns nothing
verify_replacement(stock_data)

Processed stock: ANET
Processed stock: AXON
Processed stock: ANSS
Processed stock: AVGO
Processed stock: AMZN
Processed stock: AON
Processed stock: APD
Processed stock: AMGN
Processed stock: AMD
Processed stock: AMP
Processed stock: AKAM
Processed stock: AMAT
Processed stock: ALNY
Processed stock: ADP
Processed stock: AJG
Processed stock: AIZ
Processed stock: ADSK
Processed stock: A
Processed stock: ADBE
Processed stock: ADI
Processed stock: AXP
Processed stock: AAPL
Processed stock: ACGL
Processed stock: ABBV
Processed stock: BKNG
Processed stock: BK
Processed stock: BR
Processed stock: BAC
Processed stock: BKR
Processed stock: BLK
Processed stock: BXP
Processed stock: BBY
Processed stock: CRM
Processed stock: CRH
Processed stock: CSCO
Processed stock: CPRT
Processed stock: COR
Processed stock: COP
Processed stock: COST
Processed stock: CPB
Processed stock: CNH
Processed stock: COO
Processed stock: CMS
Processed stock: COF
Processed stock: CMI
Processed stock: CI
Processed stock: CMCS

In [None]:
# %% [markdown]
# # 9. Hyperparameter Tuning and Model Training
#
# Train CatBoost models for each stock using Optuna for hyperparameter tuning. Calculate R² and MAPE for each model, and ensure that progress is saved persistently.

# %%
# Number of Optuna trials to run per stock in this session
N_TRIALS_CB = 1  # Adjust based on required accuracy and available computation time

# Re-read the persisted state right before training so this cell starts from
# whatever is on disk; each artefact defaults to an empty container.

# Best parameters (dict keyed by ticker)
if not os.path.exists(BEST_PARAMS_PATH):
    best_params_dict = {}
else:
    with open(BEST_PARAMS_PATH, 'rb') as f:
        best_params_dict = pickle.load(f)

# Forecasts (dict keyed by ticker)
if not os.path.exists(FORECASTS_PATH):
    stock_forecasts = {}
else:
    with open(FORECASTS_PATH, 'rb') as f:
        stock_forecasts = pickle.load(f)

# Model performance (list of per-stock records)
if not os.path.exists(PERFORMANCE_PATH):
    model_performance = []
else:
    with open(PERFORMANCE_PATH, 'rb') as f:
        model_performance = pickle.load(f)

# Iterate through each stock to train models.
# For every ticker that has no stored CatBoost forecast yet:
#   1. tune hyperparameters with Optuna using time-series cross-validation,
#   2. refit a model with the best parameters on all of the stock's rows,
#   3. forecast the return from the most recent row,
#   4. persist parameters, forecasts and performance so a later session resumes
#      where this one stopped.
for ticker in stock_data:
    # Skip if the stock has already been processed
    if ticker in stock_forecasts and 'CatBoost' in stock_forecasts[ticker]:
        print(f"Skipping CatBoost for {ticker} as it has already been processed.")
        continue

    print(f"\nProcessing CatBoost model for stock: {ticker}")
    df = stock_data[ticker]

    # Define features and target
    X_stock = df[feature_cols]
    y_stock = df['Target_Return']

    # Per-stock sample weights, using the same formula as the pooled dataset:
    # 1 + FinalScore / max FinalScore. They are rebuilt from this stock's own
    # rows because the global `sample_weights` array is the concatenation of
    # every stock's weights; slicing its first len(X_stock) entries would give
    # each ticker the weights of whichever stocks happen to come first.
    w_stock = (1 + df['FinalScore'] / final_rankings['FinalScore'].max()).to_numpy(dtype=np.float32)

    # Define study name and storage (one SQLite file per ticker)
    study_name = f"cb_study_{ticker}"
    storage_name = f"sqlite:///{STUDIES_DIR}/cb_study_{ticker}.db"

    # Check if the study already exists
    if study_exists(study_name, storage_name):
        study = optuna.load_study(study_name=study_name, storage=storage_name)
        print(f"Loaded existing CatBoost study for {ticker}.")
    else:
        study = optuna.create_study(direction='minimize', study_name=study_name, storage=storage_name, load_if_exists=True, pruner=MedianPruner())
        print(f"Created new CatBoost study for {ticker}.")

    # Define the objective function within the loop to capture current X and y.
    # The data is bound through default arguments so each definition keeps the
    # arrays of the ticker it was created for.
    def objective_cb(trial, X=X_stock, y=y_stock, w=w_stock):
        """Optuna objective: mean validation MSE over a 3-fold time-series split.

        Also stores the mean R², the MAPE over all validation predictions, and
        the raw predictions / true values as user attributes on the trial.
        """
        param = {
            'iterations': trial.suggest_int('iterations', 500, 3000),
            'depth': trial.suggest_int('depth', 4, 12),
            'learning_rate': trial.suggest_float('learning_rate', 1e-4, 0.3, log=True),
            'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 15),
            'random_strength': trial.suggest_float('random_strength', 1, 10),
            'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 10),
            # NOTE(review): fit() below also passes early_stopping_rounds=100,
            # which presumably takes precedence over od_wait — confirm.
            'od_wait': 50,
            'random_seed': 42,
            'loss_function': 'RMSE',
            'verbose': False
        }

        model = CatBoostRegressor(**param)
        tscv = TimeSeriesSplit(n_splits=3)  # Reduced splits for faster tuning

        mse_scores = []
        r2_scores = []
        all_preds = []
        all_true = []

        for i, (train_index, test_index) in enumerate(tscv.split(X)):
            X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[test_index]
            y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[test_index]
            w_train_fold, w_val_fold = w[train_index], w[test_index]

            # Create Pool with sample weights (only the training fold is weighted)
            train_pool = Pool(X_train_fold, y_train_fold, weight=w_train_fold)

            model.fit(train_pool, eval_set=(X_val_fold, y_val_fold), early_stopping_rounds=100, use_best_model=True)
            preds = model.predict(X_val_fold)
            mse = mean_squared_error(y_val_fold, preds)
            r2 = r2_score(y_val_fold, preds)
            mse_scores.append(mse)
            r2_scores.append(r2)
            all_preds.extend(preds)
            all_true.extend(y_val_fold)

            # Report intermediate objective value
            trial.report(mse, i+1)

            # Prune trial if not promising
            if trial.should_prune():
                raise TrialPruned()

        # Store the average R² and MAPE as user attributes
        trial.set_user_attr("r2", np.mean(r2_scores))
        trial.set_user_attr("mape", mean_absolute_percentage_error(all_true, all_preds))
        trial.set_user_attr("preds", all_preds)
        trial.set_user_attr("true", all_true)

        return np.mean(mse_scores)

    try:
        # Optimize hyperparameters using Optuna
        study.optimize(objective_cb, n_trials=N_TRIALS_CB, timeout=1800)  # 30 minutes timeout

        best_params = study.best_params
        print(f"Best params for {ticker} (CatBoost): {best_params}")

        # Retrieve the best trial
        best_trial = study.best_trial

        # Best trial's metrics. NOTE: the 'RMSE' field holds the objective value,
        # which is the mean MSE across folds (not its square root). The key is
        # kept as-is so new records stay comparable with those already persisted.
        performance_record = {
            'Stock': ticker,
            'Model': 'CatBoost',
            'RMSE': best_trial.value,  # Mean MSE across folds
            'R2': best_trial.user_attrs.get("r2", None),
            'MAPE': best_trial.user_attrs.get("mape", None)
        }

        print(f"Best RMSE for {ticker}: {best_trial.value}")
        print(f"Best R² for {ticker}: {best_trial.user_attrs.get('r2', None)}")
        print(f"Best MAPE for {ticker}: {best_trial.user_attrs.get('mape', None)}")

        # Save best parameters
        best_params_dict.setdefault(ticker, {})
        best_params_dict[ticker]['CatBoost'] = best_params

        # Train the best model on the entire dataset.
        # The evaluation set here is the training data itself, so
        # use_best_model selects the iteration with the best training-set score.
        best_model = CatBoostRegressor(**best_params, loss_function='RMSE', random_seed=42, verbose=False)
        train_pool_full = Pool(X_stock, y_stock, weight=w_stock)
        best_model.fit(train_pool_full, eval_set=(X_stock, y_stock), use_best_model=True)

        # Forecasting: Predict the Target_Return on the last day
        latest_features = df[feature_cols].iloc[-1].values.reshape(1, -1)
        forecast_return = best_model.predict(latest_features)[0]

        # Store the forecast together with the performance record. The record
        # is appended only once the forecast exists, so a failure in the final
        # fit cannot leave a performance entry without a forecast (which would
        # be appended a second time when the stock is retried).
        stock_forecasts.setdefault(ticker, {})
        stock_forecasts[ticker]['CatBoost'] = forecast_return
        model_performance.append(performance_record)

        print(f"Completed CatBoost for {ticker}. Forecast Return: {forecast_return:.4f}%")

    except TrialPruned:
        # Kept as a safety net; study.optimize normally records pruned trials
        # itself rather than letting this exception escape.
        print(f"CatBoost trial for {ticker} was pruned.")
    except Exception as e:
        # Best effort: report the failure and move on to the next stock
        print(f"An error occurred while processing CatBoost for {ticker}: {e}")

    # Save the best parameters after each stock
    with open(BEST_PARAMS_PATH, 'wb') as f:
        pickle.dump(best_params_dict, f)

    # Save the forecasts and performance after each stock
    with open(FORECASTS_PATH, 'wb') as f:
        pickle.dump(stock_forecasts, f)

    with open(PERFORMANCE_PATH, 'wb') as f:
        pickle.dump(model_performance, f)

    print(f"Saved CatBoost parameters, forecasts, and performance data for {ticker}.")

Skipping CatBoost for ANET as it has already been processed.
Skipping CatBoost for AXON as it has already been processed.
Skipping CatBoost for ANSS as it has already been processed.
Skipping CatBoost for AVGO as it has already been processed.
Skipping CatBoost for AMZN as it has already been processed.
Skipping CatBoost for AON as it has already been processed.
Skipping CatBoost for APD as it has already been processed.
Skipping CatBoost for AMGN as it has already been processed.
Skipping CatBoost for AMD as it has already been processed.
Skipping CatBoost for AMP as it has already been processed.
Skipping CatBoost for AKAM as it has already been processed.
Skipping CatBoost for AMAT as it has already been processed.
Skipping CatBoost for ALNY as it has already been processed.
Skipping CatBoost for ADP as it has already been processed.
Skipping CatBoost for AJG as it has already been processed.
Skipping CatBoost for AIZ as it has already been processed.
Skipping CatBoost for ADSK as i

[W 2024-12-15 18:46:41,101] Trial 12 failed with parameters: {'iterations': 2479, 'depth': 10, 'learning_rate': 0.0016346142348773651, 'l2_leaf_reg': 10.575331255724434, 'random_strength': 7.579996665359973, 'bagging_temperature': 9.912066989087034} because of the following error: CatBoostError('catboost/private/libs/algo/tensor_search_helpers.cpp:563: Too few sampling units (subsample=0.8, bootstrap_type=MVS): please increase sampling rate or disable sampling').
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "<ipython-input-12-50e9b8b2f916>", line 89, in objective_cb
    model.fit(train_pool, eval_set=(X_val_fold, y_val_fold), early_stopping_rounds=100, use_best_model=True)
  File "/usr/local/lib/python3.10/dist-packages/catboost/core.py", line 5873, in fit
    return self._fit(X, y, cat_features, text_features, embedding_features, None, graph, sample_weight

An error occurred while processing CatBoost for FTV: catboost/private/libs/algo/tensor_search_helpers.cpp:563: Too few sampling units (subsample=0.8, bootstrap_type=MVS): please increase sampling rate or disable sampling
Saved CatBoost parameters, forecasts, and performance data for FTV.
Skipping CatBoost for DAL as it has already been processed.
Skipping CatBoost for DE as it has already been processed.
Skipping CatBoost for DFS as it has already been processed.
Skipping CatBoost for IEX as it has already been processed.
Skipping CatBoost for INTC as it has already been processed.
Skipping CatBoost for ISRG as it has already been processed.
Skipping CatBoost for IRM as it has already been processed.
Skipping CatBoost for HUM as it has already been processed.
Skipping CatBoost for IQV as it has already been processed.
Skipping CatBoost for IP as it has already been processed.
Skipping CatBoost for INTU as it has already been processed.
Skipping CatBoost for IDXX as it has already been 

[W 2024-12-15 18:46:41,790] Trial 10 failed with parameters: {'iterations': 2036, 'depth': 9, 'learning_rate': 0.008696016788586338, 'l2_leaf_reg': 3.4116411033731726, 'random_strength': 4.235705604709785, 'bagging_temperature': 3.458990543177589} because of the following error: CatBoostError('catboost/private/libs/algo/tensor_search_helpers.cpp:563: Too few sampling units (subsample=0.8, bootstrap_type=MVS): please increase sampling rate or disable sampling').
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "<ipython-input-12-50e9b8b2f916>", line 89, in objective_cb
    model.fit(train_pool, eval_set=(X_val_fold, y_val_fold), early_stopping_rounds=100, use_best_model=True)
  File "/usr/local/lib/python3.10/dist-packages/catboost/core.py", line 5873, in fit
    return self._fit(X, y, cat_features, text_features, embedding_features, None, graph, sample_weight, 

An error occurred while processing CatBoost for HPE: catboost/private/libs/algo/tensor_search_helpers.cpp:563: Too few sampling units (subsample=0.8, bootstrap_type=MVS): please increase sampling rate or disable sampling
Saved CatBoost parameters, forecasts, and performance data for HPE.
Skipping CatBoost for IBM as it has already been processed.
Skipping CatBoost for HON as it has already been processed.
Skipping CatBoost for HOLX as it has already been processed.
Skipping CatBoost for HES as it has already been processed.
Skipping CatBoost for HBAN as it has already been processed.
Skipping CatBoost for HIG as it has already been processed.
Skipping CatBoost for HD as it has already been processed.
Skipping CatBoost for HCA as it has already been processed.
Skipping CatBoost for GS as it has already been processed.
Skipping CatBoost for GWW as it has already been processed.
Skipping CatBoost for GOOGL as it has already been processed.
Skipping CatBoost for GE as it has already been p

[W 2024-12-15 18:46:42,702] Trial 9 failed with parameters: {'iterations': 2693, 'depth': 5, 'learning_rate': 0.007143893396071087, 'l2_leaf_reg': 3.427038565838897, 'random_strength': 6.195992462761646, 'bagging_temperature': 1.6433126948804855} because of the following error: CatBoostError('catboost/private/libs/algo/tensor_search_helpers.cpp:563: Too few sampling units (subsample=0.8, bootstrap_type=MVS): please increase sampling rate or disable sampling').
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "<ipython-input-12-50e9b8b2f916>", line 89, in objective_cb
    model.fit(train_pool, eval_set=(X_val_fold, y_val_fold), early_stopping_rounds=100, use_best_model=True)
  File "/usr/local/lib/python3.10/dist-packages/catboost/core.py", line 5873, in fit
    return self._fit(X, y, cat_features, text_features, embedding_features, None, graph, sample_weight, N

An error occurred while processing CatBoost for GDDY: catboost/private/libs/algo/tensor_search_helpers.cpp:563: Too few sampling units (subsample=0.8, bootstrap_type=MVS): please increase sampling rate or disable sampling
Saved CatBoost parameters, forecasts, and performance data for GDDY.
Skipping CatBoost for IT as it has already been processed.
Skipping CatBoost for KR as it has already been processed.
Skipping CatBoost for LDOS as it has already been processed.
Skipping CatBoost for LRCX as it has already been processed.
Skipping CatBoost for LLY as it has already been processed.
Skipping CatBoost for LII as it has already been processed.
Skipping CatBoost for LH as it has already been processed.
Skipping CatBoost for JNPR as it has already been processed.
Skipping CatBoost for LNG as it has already been processed.
Skipping CatBoost for LOW as it has already been processed.

Processing CatBoost model for stock: KHC
Loaded existing CatBoost study for KHC.


[W 2024-12-15 18:46:49,651] Trial 8 failed with parameters: {'iterations': 1134, 'depth': 9, 'learning_rate': 0.09911739520370491, 'l2_leaf_reg': 9.864938223369561, 'random_strength': 3.742646793884469, 'bagging_temperature': 6.656532381717348} because of the following error: CatBoostError('catboost/private/libs/algo/tensor_search_helpers.cpp:563: Too few sampling units (subsample=0.8, bootstrap_type=MVS): please increase sampling rate or disable sampling').
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "<ipython-input-12-50e9b8b2f916>", line 89, in objective_cb
    model.fit(train_pool, eval_set=(X_val_fold, y_val_fold), early_stopping_rounds=100, use_best_model=True)
  File "/usr/local/lib/python3.10/dist-packages/catboost/core.py", line 5873, in fit
    return self._fit(X, y, cat_features, text_features, embedding_features, None, graph, sample_weight, Non

An error occurred while processing CatBoost for KHC: catboost/private/libs/algo/tensor_search_helpers.cpp:563: Too few sampling units (subsample=0.8, bootstrap_type=MVS): please increase sampling rate or disable sampling
Saved CatBoost parameters, forecasts, and performance data for KHC.
Skipping CatBoost for JPM as it has already been processed.
Skipping CatBoost for KEYS as it has already been processed.
Skipping CatBoost for LULU as it has already been processed.
Skipping CatBoost for KMI as it has already been processed.
Skipping CatBoost for KMB as it has already been processed.
Skipping CatBoost for KKR as it has already been processed.
Skipping CatBoost for KMX as it has already been processed.
Skipping CatBoost for JBHT as it has already been processed.
Skipping CatBoost for J as it has already been processed.
Skipping CatBoost for NFLX as it has already been processed.
Skipping CatBoost for NOC as it has already been processed.
Skipping CatBoost for NEM as it has already been 

[W 2024-12-15 18:46:50,202] Trial 2 failed with parameters: {'iterations': 2679, 'depth': 5, 'learning_rate': 0.038276625852483544, 'l2_leaf_reg': 9.379099059772471, 'random_strength': 3.2192486108702933, 'bagging_temperature': 3.528655789321925} because of the following error: CatBoostError('catboost/private/libs/algo/tensor_search_helpers.cpp:563: Too few sampling units (subsample=0.8, bootstrap_type=MVS): please increase sampling rate or disable sampling').
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "<ipython-input-12-50e9b8b2f916>", line 89, in objective_cb
    model.fit(train_pool, eval_set=(X_val_fold, y_val_fold), early_stopping_rounds=100, use_best_model=True)
  File "/usr/local/lib/python3.10/dist-packages/catboost/core.py", line 5873, in fit
    return self._fit(X, y, cat_features, text_features, embedding_features, None, graph, sample_weight, N

An error occurred while processing CatBoost for TWLO: catboost/private/libs/algo/tensor_search_helpers.cpp:563: Too few sampling units (subsample=0.8, bootstrap_type=MVS): please increase sampling rate or disable sampling
Saved CatBoost parameters, forecasts, and performance data for TWLO.
Skipping CatBoost for WDC as it has already been processed.
Skipping CatBoost for SCHW as it has already been processed.
Skipping CatBoost for TRV as it has already been processed.
Skipping CatBoost for TXN as it has already been processed.
Skipping CatBoost for SLB as it has already been processed.
Skipping CatBoost for WELL as it has already been processed.
Skipping CatBoost for UNP as it has already been processed.
Skipping CatBoost for TT as it has already been processed.
Skipping CatBoost for SBUX as it has already been processed.
Skipping CatBoost for TFC as it has already been processed.
Skipping CatBoost for TRMB as it has already been processed.

Processing CatBoost model for stock: TEAM
Loa

[W 2024-12-15 18:46:50,762] Trial 4 failed with parameters: {'iterations': 2036, 'depth': 12, 'learning_rate': 0.011070470702221577, 'l2_leaf_reg': 1.8062244701046068, 'random_strength': 5.871133912254729, 'bagging_temperature': 6.818287458097709} because of the following error: CatBoostError('catboost/private/libs/algo/tensor_search_helpers.cpp:563: Too few sampling units (subsample=0.8, bootstrap_type=MVS): please increase sampling rate or disable sampling').
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "<ipython-input-12-50e9b8b2f916>", line 89, in objective_cb
    model.fit(train_pool, eval_set=(X_val_fold, y_val_fold), early_stopping_rounds=100, use_best_model=True)
  File "/usr/local/lib/python3.10/dist-packages/catboost/core.py", line 5873, in fit
    return self._fit(X, y, cat_features, text_features, embedding_features, None, graph, sample_weight, 

An error occurred while processing CatBoost for TEAM: catboost/private/libs/algo/tensor_search_helpers.cpp:563: Too few sampling units (subsample=0.8, bootstrap_type=MVS): please increase sampling rate or disable sampling
Saved CatBoost parameters, forecasts, and performance data for TEAM.
Skipping CatBoost for WFC as it has already been processed.
Skipping CatBoost for WBD as it has already been processed.
Skipping CatBoost for SYF as it has already been processed.
Skipping CatBoost for VZ as it has already been processed.
Skipping CatBoost for T as it has already been processed.
Skipping CatBoost for TRGP as it has already been processed.
Skipping CatBoost for TSLA as it has already been processed.
Skipping CatBoost for TSCO as it has already been processed.
Skipping CatBoost for ZBH as it has already been processed.
Skipping CatBoost for SNPS as it has already been processed.
Skipping CatBoost for WY as it has already been processed.
Skipping CatBoost for WST as it has already been 

In [None]:
#PPG and done

In [None]:
# %% [markdown]
# # 10. Calculate Investment Profits and Allocate Funds
#
# Calculate expected profits based on model forecasts and allocate investment funds accordingly. Ensure that only adequate stocks are considered.

# %%
# Split the stock universe into "adequate" (usable CatBoost result) and "inadequate".
all_stocks = list(stock_data)

adequate_stocks, inadequate_stocks = [], []

for stock in all_stocks:
    # Guard: without a stored CatBoost forecast the stock cannot be used.
    if not (stock in stock_forecasts and 'CatBoost' in stock_forecasts[stock]):
        inadequate_stocks.append(stock)
        continue

    # First CatBoost performance record for this stock, or None when there is none.
    performance = None
    for item in model_performance:
        if item['Stock'] == stock and item['Model'] == 'CatBoost':
            performance = item
            break

    # Both RMSE and R2 must be present for the stock to count as adequate.
    has_metrics = (
        bool(performance)
        and performance['RMSE'] is not None
        and performance['R2'] is not None
    )
    (adequate_stocks if has_metrics else inadequate_stocks).append(stock)

print(f"Total Stocks: {len(all_stocks)}")
print(f"Number of Adequate Stocks: {len(adequate_stocks)}")
print(f"Number of Inadequate Stocks: {len(inadequate_stocks)}")
print("\nList of Inadequate Stocks:")
print(inadequate_stocks)

Total Stocks: 210
Number of Adequate Stocks: 204
Number of Inadequate Stocks: 6

List of Inadequate Stocks:
['FTV', 'HPE', 'GDDY', 'KHC', 'TWLO', 'TEAM']


In [None]:
# %%
# Build one record per adequate stock: latest close price, forecasted return and
# the expected profit per share implied by that forecast.
profit_columns = ['Stock', 'Current_Price', 'Forecasted_Return', 'Expected_Profit']
profit_list = []

for stock in adequate_stocks:
    try:
        # Forecasted return, expressed in percent (it is divided by 100 below)
        forecast_return = stock_forecasts[stock]['CatBoost']

        # Most recent close price (last row of the stock's DataFrame)
        current_price = stock_data[stock]['Day_Close_Price'].iloc[-1]

        # Expected profit per share, in the same currency as the close price
        expected_profit = (forecast_return / 100) * current_price

        profit_list.append({
            'Stock': stock,
            'Current_Price': round(current_price, 2),
            'Forecasted_Return': round(forecast_return, 2),
            'Expected_Profit': round(expected_profit, 2)
        })
    except Exception as e:
        # Best-effort: report the failure and keep processing the remaining stocks
        print(f"Error processing stock {stock}: {e}")
        # Record the stock only once so re-running this cell does not add duplicates
        if stock not in inadequate_stocks:
            inadequate_stocks.append(stock)

# Explicit columns keep the frame well-formed (and sortable) even when every
# stock failed and profit_list is empty. Highest expected profit comes first.
profit_df = (
    pd.DataFrame(profit_list, columns=profit_columns)
    .sort_values(by='Expected_Profit', ascending=False)
    .reset_index(drop=True)
)

# Display the profit DataFrame
profit_df

Unnamed: 0,Stock,Current_Price,Forecasted_Return,Expected_Profit
0,BKNG,4943.27,5.45,269.17
1,FICO,2332.29,7.52,175.31
2,MSTR,270.42,43.95,118.86
3,NFLX,795.04,14.72,117.05
4,NOW,1008.08,11.48,115.73
...,...,...,...,...
199,MTD,1310.03,-3.69,-48.35
200,SNPS,561.55,-8.69,-48.78
201,AMGN,325.28,-15.37,-50.00
202,NOC,528.99,-9.67,-51.15


In [None]:
# %%
# Total Investment
total_investment = 30.0  # in GBP

# Sum of the per-share expected profit over all stocks (kept for reference)
total_expected_profit = profit_df['Expected_Profit'].sum()

# Long-only allocation: a stock forecast to lose money must not receive a
# negative weight, otherwise its "Investment" becomes negative and the positive
# positions add up to more than the available budget.
# NOTE(review): weighting by per-share Expected_Profit favours high-priced
# shares; weighting by Forecasted_Return may be what is intended - confirm.
positive_profit = profit_df['Expected_Profit'].clip(lower=0)
total_positive_profit = positive_profit.sum()

if total_positive_profit > 0:
    # Weights are proportional to positive expected profit and sum to 1
    profit_df['Weight'] = positive_profit / total_positive_profit
else:
    # No stock with a positive forecast (or an empty frame): allocate nothing
    profit_df['Weight'] = 0.0

# Money allocated to each stock
profit_df['Investment'] = profit_df['Weight'] * total_investment

# Fractional number of shares bought with that money
profit_df['Number_of_Shares'] = profit_df['Investment'] / profit_df['Current_Price']

# Expected profit on the allocated money. Forecasted_Return is a percentage, so
# it is divided by 100 and applied to the invested amount.
profit_df['Allocated_Profit'] = profit_df['Investment'] * profit_df['Forecasted_Return'] / 100

# Round the numerical columns for better readability
allocation_columns = ['Investment', 'Number_of_Shares', 'Allocated_Profit']
profit_df[allocation_columns] = profit_df[allocation_columns].round(2)

# Display the updated DataFrame with allocations
profit_df[['Stock', 'Current_Price', 'Forecasted_Return', 'Expected_Profit', 'Investment', 'Number_of_Shares', 'Allocated_Profit']]

Unnamed: 0,Stock,Current_Price,Forecasted_Return,Expected_Profit,Investment,Number_of_Shares,Allocated_Profit
0,BKNG,4943.27,5.45,269.17,9.38,0.00,0.01
1,FICO,2332.29,7.52,175.31,6.11,0.00,0.02
2,MSTR,270.42,43.95,118.86,4.14,0.02,0.67
3,NFLX,795.04,14.72,117.05,4.08,0.01,0.08
4,NOW,1008.08,11.48,115.73,4.03,0.00,0.05
...,...,...,...,...,...,...,...
199,MTD,1310.03,-3.69,-48.35,-1.68,-0.00,0.00
200,SNPS,561.55,-8.69,-48.78,-1.70,-0.00,0.03
201,AMGN,325.28,-15.37,-50.00,-1.74,-0.01,0.08
202,NOC,528.99,-9.67,-51.15,-1.78,-0.00,0.03


In [None]:
# %%
# Snapshot of the ranked allocation table, followed by portfolio-level totals.
ranking_columns = [
    'Stock',
    'Current_Price',
    'Forecasted_Return',
    'Expected_Profit',
    'Investment',
    'Number_of_Shares',
    'Allocated_Profit',
]
final_ranking = profit_df[ranking_columns].copy()

# Show the ranked table first
print("Final Ranked List of Stocks Based on Expected Profit:")
display(final_ranking)

# Portfolio totals: money allocated and the profit expected on it
total_allocated_investment = final_ranking['Investment'].sum()
total_allocated_profit = final_ranking['Allocated_Profit'].sum()

print(f"Total Allocated Investment: £{round(total_allocated_investment, 2)}")
print(f"Total Expected Profit: £{round(total_allocated_profit, 2)}")

Final Ranked List of Stocks Based on Expected Profit:


Unnamed: 0,Stock,Current_Price,Forecasted_Return,Expected_Profit,Investment,Number_of_Shares,Allocated_Profit
0,BKNG,4943.27,5.45,269.17,9.38,0.00,0.01
1,FICO,2332.29,7.52,175.31,6.11,0.00,0.02
2,MSTR,270.42,43.95,118.86,4.14,0.02,0.67
3,NFLX,795.04,14.72,117.05,4.08,0.01,0.08
4,NOW,1008.08,11.48,115.73,4.03,0.00,0.05
...,...,...,...,...,...,...,...
199,MTD,1310.03,-3.69,-48.35,-1.68,-0.00,0.00
200,SNPS,561.55,-8.69,-48.78,-1.70,-0.00,0.03
201,AMGN,325.28,-15.37,-50.00,-1.74,-0.01,0.08
202,NOC,528.99,-9.67,-51.15,-1.78,-0.00,0.03


Total Allocated Investment: £30.0
Total Expected Profit: £3.11


In [None]:
# %% [markdown]
# # 11. Display Inadequate Stocks
#
# Show a list of stocks that were excluded from the analysis because they have no CatBoost forecast or no valid RMSE/R² (i.e. the model evaluation failed).

# %%
# Tabulate the excluded stocks, or say so when there are none.
if not inadequate_stocks:
    print("No Inadequate Stocks found.")
else:
    # Single-column frame; also reused by the summary cell
    inadequate_df = pd.DataFrame(inadequate_stocks, columns=['Inadequate Stocks'])
    print("List of Inadequate Stocks (Excluded from Analysis):")
    display(inadequate_df)

List of Inadequate Stocks (Excluded from Analysis):


Unnamed: 0,Inadequate Stocks
0,FTV
1,HPE
2,GDDY
3,KHC
4,TWLO
5,TEAM


In [None]:
# %% [markdown]
# # 12. Summary of Investment Strategy
#
# Provide a summarized view of the investment strategy, including total investment, expected profit, portfolio allocation, and a list of excluded stocks.

# %%
# Headline figures, per-stock allocation, then the exclusion list.
print("### Investment Strategy Summary ###\n")

print(f"**Total Investment Available:** £{total_investment}\n")
print(f"**Total Expected Profit:** £{round(total_allocated_profit, 2)}\n")

print("**Portfolio Allocation:**")
portfolio_columns = ['Stock', 'Investment', 'Number_of_Shares', 'Allocated_Profit']
display(final_ranking[portfolio_columns])

# inadequate_df only exists when at least one stock was excluded
if not inadequate_stocks:
    print("\n**All stocks were adequate for the analysis.**")
else:
    print("\n**Inadequate Stocks (Excluded from Portfolio):**")
    display(inadequate_df)

### Investment Strategy Summary ###

**Total Investment Available:** £30.0

**Total Expected Profit:** £3.11

**Portfolio Allocation:**


Unnamed: 0,Stock,Investment,Number_of_Shares,Allocated_Profit
0,BKNG,9.38,0.00,0.01
1,FICO,6.11,0.00,0.02
2,MSTR,4.14,0.02,0.67
3,NFLX,4.08,0.01,0.08
4,NOW,4.03,0.00,0.05
...,...,...,...,...
199,MTD,-1.68,-0.00,0.00
200,SNPS,-1.70,-0.00,0.03
201,AMGN,-1.74,-0.01,0.08
202,NOC,-1.78,-0.00,0.03



**Inadequate Stocks (Excluded from Portfolio):**


Unnamed: 0,Inadequate Stocks
0,FTV
1,HPE
2,GDDY
3,KHC
4,TWLO
5,TEAM


In [None]:
# %%
# Allocate the budget equally across the TOP_N stocks with the highest forecasted return

import pandas as pd

# Number of top stocks to select
TOP_N = 6

# Budget to split, named instead of repeating the literal in the arithmetic
total_investment = 30.0  # Total investment in GBP

# Sort the profit DataFrame by 'Forecasted_Return' in descending order and select top N
top_n_stocks = profit_df.sort_values(by='Forecasted_Return', ascending=False).head(TOP_N).copy()

# Divide by the number of stocks actually selected: when fewer than TOP_N rows
# exist, dividing by TOP_N would leave part of the budget unallocated.
n_selected = len(top_n_stocks)
investment_per_stock = total_investment / n_selected if n_selected else 0.0
top_n_stocks['Investment'] = investment_per_stock

# Expected profit on each allocation (Forecasted_Return is a percentage)
top_n_stocks['Allocated_Profit'] = (top_n_stocks['Forecasted_Return'] / 100) * top_n_stocks['Investment']

# Fractional number of shares to purchase for each stock
top_n_stocks['Number_of_Shares'] = top_n_stocks['Investment'] / top_n_stocks['Current_Price']

# Round the numerical columns for better readability
top_n_stocks[['Investment', 'Number_of_Shares', 'Allocated_Profit']] = top_n_stocks[['Investment', 'Number_of_Shares', 'Allocated_Profit']].round(2)

# Reset index for clarity
top_n_stocks.reset_index(drop=True, inplace=True)

# Display the allocation table
print(f"### Investment Allocation for Top {TOP_N} Stocks ###\n")
display(top_n_stocks[['Stock', 'Current_Price', 'Forecasted_Return', 'Investment', 'Number_of_Shares', 'Allocated_Profit']])

# Optional: save the top-N allocation to a CSV file under DATA_PATH
top_n_stocks.to_csv(os.path.join(DATA_PATH, f'top_{TOP_N}_stocks_allocation.csv'), index=False)
print(f"Top {TOP_N} stocks allocation saved to 'top_{TOP_N}_stocks_allocation.csv'.")

### Investment Allocation for Top 6 Stocks ###



Unnamed: 0,Stock,Current_Price,Forecasted_Return,Investment,Number_of_Shares,Allocated_Profit
0,MSTR,270.42,43.95,5.0,0.02,2.2
1,TSLA,321.22,22.45,5.0,0.02,1.12
2,WBD,9.18,19.09,5.0,0.54,0.95
3,DIS,99.02,16.7,5.0,0.05,0.84
4,AXON,603.18,16.43,5.0,0.01,0.82
5,MRVL,93.8,16.21,5.0,0.05,0.81


Top 6 stocks allocation saved to 'top_6_stocks_allocation.csv'.


In [None]:
# %%
# Equal-weight allocation table for the five stocks with the highest forecasted return

import pandas as pd
import os

# How many of the best-forecast stocks to keep
TOP_N = 5

# 'profit_df' must already exist. If the kernel was restarted, reload the
# persisted forecasts / performance / best-parameter pickles (FORECASTS_PATH,
# PERFORMANCE_PATH, BEST_PARAMS_PATH) and re-run the profit cells first.
if 'profit_df' not in locals():
    raise ValueError("The DataFrame 'profit_df' is not loaded. Please ensure it is created before running this cell.")

# Best TOP_N rows by forecasted return
top_n_stocks = (
    profit_df
    .sort_values(by='Forecasted_Return', ascending=False)
    .head(TOP_N)
    .copy()
)

# Equal split of the budget; the per-stock amount is rounded to pence
total_investment = 30.0  # Total investment in GBP
investment_per_stock = total_investment / TOP_N
top_n_stocks['Allocated_Amount (£)'] = round(investment_per_stock, 2)

# Forecasted return as a fraction, shared by the two derived columns below
return_fraction = top_n_stocks['Forecasted_Return'] / 100

# Expected monetary return on the allocated amount
top_n_stocks['Expected_Return_on_Investment (£)'] = (
    return_fraction * top_n_stocks['Allocated_Amount (£)']
).round(2)

# Price implied by applying the forecasted return to the current price
top_n_stocks['Predicted_Price (£)'] = (
    top_n_stocks['Current_Price'] * (1 + return_fraction)
).round(2)

# Pick the presentation columns, relabel two of them and renumber the rows
table_columns = [
    'Stock',
    'Allocated_Amount (£)',
    'Expected_Return_on_Investment (£)',
    'Forecasted_Return',
    'Current_Price',
    'Predicted_Price (£)',
]
allocation_table = (
    top_n_stocks[table_columns]
    .rename(columns={
        'Forecasted_Return': 'Expected_Return_Percentage (%)',
        'Current_Price': 'Current_Price (£)',
    })
    .reset_index(drop=True)
)

# Show the table
print(f"### Investment Allocation for Top {TOP_N} Stocks ###\n")
display(allocation_table)

# Persist the table next to the other data files
output_path = os.path.join(DATA_PATH, f'top_{TOP_N}_stocks_allocation.csv')
allocation_table.to_csv(output_path, index=False)
print(f"Top {TOP_N} stocks allocation saved to '{output_path}'.")

### Investment Allocation for Top 5 Stocks ###



Unnamed: 0,Stock,Allocated_Amount (£),Expected_Return_on_Investment (£),Expected_Return_Percentage (%),Current_Price (£),Predicted_Price (£)
0,MSTR,6.0,2.64,43.95,270.42,389.27
1,TSLA,6.0,1.35,22.45,321.22,393.33
2,WBD,6.0,1.15,19.09,9.18,10.93
3,DIS,6.0,1.0,16.7,99.02,115.56
4,AXON,6.0,0.99,16.43,603.18,702.28


Top 5 stocks allocation saved to '/content/drive/MyDrive/data3/top_5_stocks_allocation.csv'.


In [None]:
# %%
# Allocate the budget equally across the TOP_N stocks with the highest forecasted return

import pandas as pd

# Number of top stocks to select
TOP_N = 10

# Budget to split, named instead of repeating the literal in the arithmetic
total_investment = 30.0  # Total investment in GBP

# Sort the profit DataFrame by 'Forecasted_Return' in descending order and select top N
top_n_stocks = profit_df.sort_values(by='Forecasted_Return', ascending=False).head(TOP_N).copy()

# Divide by the number of stocks actually selected: when fewer than TOP_N rows
# exist, dividing by TOP_N would leave part of the budget unallocated.
n_selected = len(top_n_stocks)
investment_per_stock = total_investment / n_selected if n_selected else 0.0
top_n_stocks['Investment'] = investment_per_stock

# Expected profit on each allocation (Forecasted_Return is a percentage)
top_n_stocks['Allocated_Profit'] = (top_n_stocks['Forecasted_Return'] / 100) * top_n_stocks['Investment']

# Fractional number of shares to purchase for each stock
top_n_stocks['Number_of_Shares'] = top_n_stocks['Investment'] / top_n_stocks['Current_Price']

# Round the numerical columns for better readability
top_n_stocks[['Investment', 'Number_of_Shares', 'Allocated_Profit']] = top_n_stocks[['Investment', 'Number_of_Shares', 'Allocated_Profit']].round(2)

# Reset index for clarity
top_n_stocks.reset_index(drop=True, inplace=True)

# Display the allocation table
print(f"### Investment Allocation for Top {TOP_N} Stocks ###\n")
display(top_n_stocks[['Stock', 'Current_Price', 'Forecasted_Return', 'Investment', 'Number_of_Shares', 'Allocated_Profit']])

# Optional: save the top-N allocation to a CSV file under DATA_PATH
top_n_stocks.to_csv(os.path.join(DATA_PATH, f'top_{TOP_N}_stocks_allocation.csv'), index=False)
print(f"Top {TOP_N} stocks allocation saved to 'top_{TOP_N}_stocks_allocation.csv'.")

### Investment Allocation for Top 10 Stocks ###



Unnamed: 0,Stock,Current_Price,Forecasted_Return,Investment,Number_of_Shares,Allocated_Profit
0,MSTR,270.42,43.95,3.0,0.01,1.32
1,TSLA,321.22,22.45,3.0,0.01,0.67
2,WBD,9.18,19.09,3.0,0.33,0.57
3,DIS,99.02,16.7,3.0,0.03,0.5
4,AXON,603.18,16.43,3.0,0.0,0.49
5,MRVL,93.8,16.21,3.0,0.03,0.49
6,RCL,225.27,14.89,3.0,0.01,0.45
7,NFLX,795.04,14.72,3.0,0.0,0.44
8,DE,394.06,13.36,3.0,0.01,0.4
9,CNH,10.66,12.88,3.0,0.28,0.39


Top 10 stocks allocation saved to 'top_10_stocks_allocation.csv'.


In [None]:
# %% [markdown]
# # 13. Visualize R² and MAPE Scores
#
# Create visualizations to understand the distribution and relationship between R² and MAPE scores across all models.

# %%
# This cell needs a DataFrame named performance_df. When an earlier cell has not
# created it in this kernel session, build it here from the model_performance
# records so the cell does not fail with a NameError.
# NOTE(review): assumes each model_performance record also carries a 'MAPE'
# entry alongside 'Stock', 'Model', 'RMSE' and 'R2' - TODO confirm.
if 'performance_df' not in globals():
    performance_df = pd.DataFrame(model_performance)
    if 'Model' in performance_df.columns:
        performance_df = performance_df[performance_df['Model'] == 'CatBoost'].copy()
    # Failed runs store None for their metrics; coerce so they become NaN
    for metric in ('R2', 'MAPE'):
        if metric in performance_df.columns:
            performance_df[metric] = pd.to_numeric(performance_df[metric], errors='coerce')

# Distribution of R² Scores
plt.figure(figsize=(10,6))
sns.histplot(performance_df['R2'].dropna(), bins=30, kde=True, color='skyblue')
plt.title('Distribution of R² Scores for CatBoost Models')
plt.xlabel('R² Score')
plt.ylabel('Frequency')
plt.show()

# Distribution of MAPE
plt.figure(figsize=(10,6))
sns.histplot(performance_df['MAPE'].dropna(), bins=30, kde=True, color='salmon')
plt.title('Distribution of MAPE for CatBoost Models')
plt.xlabel('MAPE (%)')
plt.ylabel('Frequency')
plt.show()

# Scatter Plot of R² vs MAPE (points coloured by their R² value)
plt.figure(figsize=(10,6))
sns.scatterplot(data=performance_df, x='R2', y='MAPE', hue='R2', palette='coolwarm', alpha=0.7)
plt.title('R² vs MAPE for CatBoost Models')
plt.xlabel('R² Score')
plt.ylabel('MAPE (%)')
plt.legend(title='R²')
plt.show()

# Boxplots for R² and MAPE, side by side in one figure
plt.figure(figsize=(14,6))

# Boxplot for R²
plt.subplot(1,2,1)
sns.boxplot(y=performance_df['R2'], color='lightblue')
plt.title('Boxplot of R² Scores')
plt.ylabel('R² Score')

# Boxplot for MAPE
plt.subplot(1,2,2)
sns.boxplot(y=performance_df['MAPE'], color='lightgreen')
plt.title('Boxplot of MAPE')
plt.ylabel('MAPE (%)')

plt.tight_layout()
plt.show()

# Summary Statistics for R² and MAPE
print("### Summary Statistics ###\n")
print(performance_df[['R2', 'MAPE']].describe())

NameError: name 'performance_df' is not defined

<Figure size 1000x600 with 0 Axes>

In [None]:
# %% [markdown]
# # 14. Plot Predicted vs. Actual Returns for Top Performing Stocks
#
# Visualize the accuracy of your models by plotting predicted vs. actual returns for top-performing stocks based on R² scores.

# %%
# This cell needs a DataFrame named performance_df. When an earlier cell has not
# created it in this kernel session, build it here from the model_performance
# records so the cell does not fail with a NameError.
if 'performance_df' not in globals():
    performance_df = pd.DataFrame(model_performance)
    if 'Model' in performance_df.columns:
        performance_df = performance_df[performance_df['Model'] == 'CatBoost'].copy()
    # Failed runs store None for R2; coerce so they become NaN and sort last
    if 'R2' in performance_df.columns:
        performance_df['R2'] = pd.to_numeric(performance_df['R2'], errors='coerce')

# Select top-performing stocks based on R²
TOP_N_STOCKS = 5  # Adjust as needed
top_performing_stocks = performance_df.sort_values(by='R2', ascending=False).head(TOP_N_STOCKS)['Stock'].tolist()
print(f"Top Performing Stocks based on R²: {top_performing_stocks}")

# Plot Predicted vs Actual for top-performing stocks
for stock in top_performing_stocks:
    # Derive the existence check and the storage URL from the same path, so the
    # file that is checked is exactly the file Optuna opens.
    # NOTE(review): assumes the studies were created under STUDIES_DIR - confirm.
    study_name = f"cb_study_{stock}"
    study_db_path = os.path.join(STUDIES_DIR, f"{study_name}.db")
    storage_name = f"sqlite:///{study_db_path}"

    if not os.path.exists(study_db_path):
        print(f"Study database for {stock} not found. Skipping plot.")
        continue

    try:
        study = optuna.load_study(study_name=study_name, storage=storage_name)
        best_trial = study.best_trial
        # Predictions / ground truth are read from the best trial's user attributes
        preds = best_trial.user_attrs.get("preds", [])
        true = best_trial.user_attrs.get("true", [])

        if preds and true:
            lowest, highest = min(true), max(true)
            plt.figure(figsize=(8,6))
            sns.scatterplot(x=true, y=preds, alpha=0.6)
            plt.plot([lowest, highest], [lowest, highest], 'r--')  # Diagonal line (perfect prediction)
            plt.title(f'Predicted vs. Actual Returns for {stock}')
            plt.xlabel('Actual Returns (%)')
            plt.ylabel('Predicted Returns (%)')
            plt.show()
        else:
            print(f"Predictions or true values not found for {stock}. Skipping plot.")
    except Exception as e:
        # Best-effort: one unreadable study must not stop the remaining plots
        print(f"Error loading study for {stock}: {e}")