# *__Working on BTCUSD predictions with GRU model(DynEx_CLoRA)__*

## __Check first before starting__

In [1]:
import os

# Change the working directory to the project root
Working_directory = os.path.normpath("C:/Users/james/OneDrive/文件/Continual_Learning")
os.chdir(Working_directory)
print(f"Working directory: {os.getcwd()}")

Working directory: C:\Users\james\OneDrive\文件\Continual_Learning


## __All imports__

In [2]:
# Operating system and file management
import os
import shutil
import contextlib
import traceback
import gc

# Jupyter notebook widgets and display
import ipywidgets as widgets
from IPython.display import display

# Data manipulation and analysis
import pandas as pd
import numpy as np

# Plotting and visualization
import matplotlib.pyplot as plt
from mpl_interactions import zoom_factory, panhandler

# Machine learning and preprocessing
from sklearn.model_selection import train_test_split
import pickle
from ta import trend, momentum, volatility, volume

# Mathematical and scientific computing
import math
from scipy.ndimage import gaussian_filter1d

# Type hinting
from typing import Callable, Tuple

# Deep learning with PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

## __All functions (For data processing)__

In [3]:
def ensure_folder(folder_path: str) -> None:
    """Ensure the given folder exists, create it if not."""
    os.makedirs(folder_path, exist_ok=True)

def plot_with_matplotlib(data: pd.DataFrame, 
                         title: str, 
                         interactive: bool = False, 
                         save_path: str = None, 
                         show_plot: bool = True, 
                         save_matplotlib_object: str = None) -> None:
    """
    Plot time-series data using Matplotlib with optional trend-based coloring.

    Args:
        - data (pd.DataFrame): Data containing a 'close' column (required).
        - title (str): Plot title.
        - interactive (bool): Enable zoom & pan if True.
        - save_path (str, optional): Path to save the figure.
        - show_plot (bool): Whether to display the plot.
        - save_matplotlib_object (str, optional): Path to save the Matplotlib object.

    Returns:
        - None: Displays or saves the plot as specified.
    """
    # Check if 'close' column exists
    if 'close' not in data.columns:
        raise ValueError("DataFrame must contain a 'close' column.")

    # Set default color from Matplotlib cycle
    default_blue = plt.rcParams['axes.prop_cycle'].by_key()['color'][0]
    
    # Define colors for different trends
    trend_colors = {
        0: 'black',
        1: 'yellow',
        2: 'red',
        3: 'green',
        4: default_blue
    }

    # Create figure and axis for plotting
    fig, ax = plt.subplots(figsize=(12, 6))

    # Plot with trend-based coloring if 'trend' column exists
    if 'trend' in data.columns:
        legend_added = set()
        prev_idx = data.index[0]
        for idx, row in data.iterrows():
            if idx != prev_idx:
                trend_key = int(row['trend'])
                label = f'Trend {trend_key}' if trend_key not in legend_added else None
                ax.plot([prev_idx, idx], 
                        [data.loc[prev_idx, 'close'], row['close']],
                        color=trend_colors[trend_key], 
                        linestyle='-', 
                        linewidth=1,
                        label=label)
                legend_added.add(trend_key)
            prev_idx = idx
        ax.set_title(f"{title} (Connected, Colored by Trend)")
    else:
        # Plot default line if no 'trend' column
        ax.plot(data.index, data['close'], label='Closing Price', linestyle='-', marker='o', 
                markersize=2, linewidth=1, color=default_blue, markerfacecolor='green', markeredgecolor='black')
        ax.set_title(title)
    
    # Set axis labels and add legend/grid
    ax.set_xlabel('Date')
    ax.set_ylabel('Closing Price (USD)')
    ax.legend()
    ax.grid()
    
    # Enable interactive features if requested
    if interactive:
        zoom_factory(ax)
        panhandler(fig)

    # Save the plot if a path is provided
    if save_path:
        fig.tight_layout()
        fig.savefig(save_path, dpi=300, bbox_inches='tight')

    # Save the Matplotlib object if requested
    if save_matplotlib_object:
        with open(save_matplotlib_object, 'wb') as f:
            pickle.dump(fig, f)

    # Display the plot if requested
    if show_plot:
        plt.show()

def load_and_show_pickle(pickle_file_path: str):
    """
    Load a pickled Matplotlib figure object and display it.

    Args:
        - pickle_file_path (str): Path to the pickled Matplotlib figure file.

    Returns:
        - None: Displays the loaded figure.
    """
    # Load and display the pickled figure
    try:
        with open(pickle_file_path, "rb") as f:
            loaded_fig = pickle.load(f)

        print(f"Figure successfully loaded and displayed from: {pickle_file_path}")
        plt.show(block=True)

    except FileNotFoundError:
        print(f"Error: File not found at {pickle_file_path}.")
    except Exception as e:
        print(f"Error loading the pickled figure: {e}")

def save_to_csv(df: pd.DataFrame, file_path: str) -> None:
    """
    Save DataFrame to CSV.
    """
    df.to_csv(file_path)
    print(f"\nSuccessfully saved data with moving average to CSV: \n\t{file_path}\n")

def read_csv_file(file_path: str, preview_rows: int = 5, 
                  days_towards_end: int = None, 
                  days_from_start: int = None, description: str = ""):
    """
    Reads a CSV file and returns a pandas DataFrame filtered by date range.

    Args:
        - file_path (str): The path to the CSV file.
        - preview_rows (int): Number of rows to preview (default is 5).
        - days_towards_end (int, optional): Number of days from the most recent date.
        - days_from_start (int, optional): Number of days from the oldest date of filtered data.
        - description (str): A brief description of the dataset.
                           Explanation:
                           - To retrieve data from the **end**: Use `days_towards_end`.
                           - To retrieve data from the **start of the filtered range**: Use `days_from_start`.
                           - To retrieve data from the **middle**: Use both:
                             For example, if `days_towards_end=100` and `days_from_start=50`,
                             the function will first filter the last 100 days of the dataset,
                             and then filter the first 50 days from this range.
                             This results in data between the last 100th and the last 50th day.

    Returns:
        - pd.DataFrame: The loaded and filtered data from the CSV file.
    """
    try:
        if description:
            print(f"\nDescription: {description}")
        print(f"\nFile path: {file_path}")
        
        # Read the CSV file
        data = pd.read_csv(file_path, parse_dates=['date'], index_col='date')
        
        # Filter by days towards the end
        if days_towards_end is not None:
            # Get the most recent date in the dataset
            last_date = data.index.max()
            end_cutoff_date = last_date - pd.Timedelta(days=days_towards_end)
            data = data[data.index >= end_cutoff_date]
            print(f"\nRetrieving data from the past {days_towards_end} days (from {end_cutoff_date.date()} onwards):")
        
        # Filter by days from the start (from the filtered data)
        if days_from_start is not None:
            # Get the earliest date in the filtered dataset
            first_date = data.index.min()
            start_cutoff_date = first_date + pd.Timedelta(days=days_from_start)
            data = data[data.index <= start_cutoff_date]
            print(f"\nRetrieving the first {days_from_start} days from the filtered data (up to {start_cutoff_date.date()}):")

        if preview_rows:
            # Print a preview of the data
            print(f"\nPreview of the first {preview_rows} rows:")
            display(data.head(preview_rows))
            print()

            print(f"\nPreview of the last {preview_rows} rows:")
            display(data.tail(preview_rows))
            print()

        return data
    
    except FileNotFoundError:
        print("Error: File not found.")
    except pd.errors.EmptyDataError:
        print("Error: The file is empty.")
    except pd.errors.ParserError:
        print("Error: File parsing failed.")
    except Exception as e:
        print(f"Unexpected error: {e}")

def downsample_minute_data(data: pd.DataFrame, n: int) -> pd.DataFrame:
    """
    Downsample minute data into N-minute intervals by retaining every Nth row.

    Args:
        - data (pd.DataFrame): The original DataFrame with a datetime index.
        - n (int): The number of minutes for the downsampling interval.

    Returns:
        - pd.DataFrame: Downsampled DataFrame.
    """
    print("\n========---> Downsampling the data! \n")
    data = data.copy()

    # Ensure index is a DatetimeIndex
    if not isinstance(data.index, pd.DatetimeIndex):
        try:
            data.index = pd.to_datetime(data.index)
        except Exception as e:
            raise ValueError("DataFrame index conversion to DatetimeIndex failed.") from e

    # Downsample by selecting rows where minute % N == 0
    return data[data.index.minute % n == 0]

def calculate_log_returns_all_columns(data: pd.DataFrame, exclude_columns: list = [], dropna: bool = True) -> pd.DataFrame:
    """
    Calculate log returns for all numeric columns in a pandas DataFrame,
    excluding specified columns, and removing excluded columns from the returned DataFrame.

    Args:
        - data (pd.DataFrame): Input DataFrame containing numeric data.
        - exclude_columns (list): List of columns to exclude from log return calculations and the result.
        - dropna (bool): Whether to drop rows with NaN values resulting from the calculation.

    Returns:
        - pd.DataFrame: DataFrame with log returns for numeric columns, excluding specified columns.
    """
    # Copy data and remove excluded columns
    data = data.copy().drop(columns=exclude_columns)
    
    # Select numeric columns for transformation
    columns_to_transform = data.select_dtypes(include=[np.number]).columns
    print(f"columns_to_transform = \n{columns_to_transform}, \nlen(columns_to_transform) = {len(columns_to_transform)}")

    # Calculate log returns for each numeric column
    for col in columns_to_transform:
        if (data[col] <= 0).any():
            raise ValueError(f"Column '{col}' contains non-positive values. Log returns require strictly positive values.")
        data[col] = np.log(data[col] / data[col].shift(1))

    # Return data with or without NaN rows based on dropna
    return data.dropna() if dropna else data

def created_sequences_2(data: pd.DataFrame, sequence_length: int = 60, sliding_interval: int = 60) -> list:
    """
    Divide the dataset into sequences based on the sequence_length.
    Each sequence must fully cover the window size.

    Args:
    - data (pd.DataFrame): The input DataFrame.
    - sequence_length (int): The window size for sequences.

    Returns:
    - sequences (list): A list of sequences (as DataFrames).
    """
    sequences = []
    
    # Iterate over the data with a sliding window to create sequences
    for i in range(0, len(data) - sequence_length + 1, sliding_interval):
        # Extract a sequence of specified length from the DataFrame
        seq = data.iloc[i:i + sequence_length].copy()
        sequences.append(seq)

    return sequences

def gaussian_smoothing(data: pd.DataFrame, sigma=2) -> pd.DataFrame:
    """
    Applies Gaussian smoothing to numeric columns in a DataFrame.

    Args:
        - data (pd.DataFrame): Input DataFrame.
        - sigma (float): Standard deviation for the Gaussian kernel (default is 2).

    Returns:
        - pd.DataFrame: Smoothed DataFrame with sorted index.
    """
    # Sort data by index in ascending order and create a copy
    data = data.sort_index(ascending=True).copy()
    
    # Apply Gaussian smoothing to numeric columns
    for column in data.columns:
        if pd.api.types.is_numeric_dtype(data[column]):
            data[column] = gaussian_filter1d(data[column].values, sigma=sigma)
    
    return data

def detect_trends_4(
    dataframe: pd.DataFrame, 
    column: str = 'close', 
    lower_threshold: float = 0.001, 
    upper_threshold: float = 0.02,
    reverse_steps: int = 7,
    trends_to_keep: set = {0, 1, 2, 3, 4}  # Default keeps all trends
) -> pd.DataFrame:
    """
    Detects trends based on log return data provided in a specified column and categorizes them into different strength levels.

    This function analyzes time-series data by evaluating cumulative trends in log return values provided in the input DataFrame. 
    It uses three dictionaries (`dic1`, `dic2`, `dic3`) to track different phases of trends, handles multi-step reversals, and 
    classifies trends dynamically based on cumulative product thresholds and specified thresholds for trend strengths.

    Args:
        - dataframe (pd.DataFrame): Input DataFrame with log return data.
        - column (str): Column name for log returns (default is 'close').
        - lower_threshold (float): Threshold for moderate trends (default is 0.001).
        - upper_threshold (float): Threshold for strong trends (default is 0.02).
        - reverse_steps (int): Steps to confirm trend reversal (default is 7).
        - trends_to_keep (set): Trends to retain, others set to 0 (default is {0, 1, 2, 3, 4}).

    Returns:
        pd.DataFrame: DataFrame with 'trend' column:
                        - 0: No trend
                        - 1: Moderate negative trend
                        - 2: Very strong negative trend
                        - 3: Moderate positive trend
                        - 4: Very strong positive trend
                      Any trends not included in `trends_to_keep` will be reset to 0.

    Function Details:
    1. **Input Assumption**:
    - The input DataFrame already contains log return data in the specified column (`column`).

    2. **Trend Tracking**:
    - Uses dictionaries to monitor trends:
        - `dic1`: Tracks the first phase of the trend.
        - `dic2`: Tracks the second phase if a reversal occurs.
        - `dic3`: Tracks the third phase if another reversal occurs.

    3. **Cumulative Product**:
    - Calculates the cumulative product of `(1 + log_return)` from the specified column to evaluate the strength of trends.

    4. **Reversal Handling**:
    - If a trend reversal persists beyond `reverse_steps`, labels are assigned based on the cumulative product tracked in `dic1`.
    - Subsequent reversals are merged or labeled independently if conditions are met.

    5. **Label Assignment**:
    - Labels are dynamically assigned based on cumulative product thresholds for positive and negative trends:
        - Positive trends are categorized as moderate (3) or strong (4).
        - Negative trends are categorized as moderate (1) or strong (2).

    6. **Trend Filtering**:
    - After detecting trends, only those specified in `trends_to_keep` remain unchanged.
    - Any trend category not included in `trends_to_keep` is reset to 0 (No Trend).

    7. **Edge Cases**:
    - Properly handles scenarios where data points are insufficient for trend analysis or when trend phases overlap, ensuring all data points are labeled.
    """
    # Copy to avoid modifying the original DataFrame
    df = dataframe.copy()
    df['trend'] = None  # Default value 

    dic1, dic2, dic3 = None, None, None # Initialize trend tracking dictionaries
    
    def assign_label(dictio_, lower_threshold, upper_threshold):
        cumulative = dictio_['cumulative']
        if cumulative > (1 + upper_threshold):
            df.iloc[dictio_['ids'], df.columns.get_loc('trend')] = 4  # Very strong positive
        elif (1 + lower_threshold) < cumulative <= (1 + upper_threshold):
            df.iloc[dictio_['ids'], df.columns.get_loc('trend')] = 3  # Moderate positive
        elif (1 - upper_threshold) < cumulative <= (1 - lower_threshold):
            df.iloc[dictio_['ids'], df.columns.get_loc('trend')] = 1  # Moderate negative
        elif cumulative <= (1 - upper_threshold):
            df.iloc[dictio_['ids'], df.columns.get_loc('trend')] = 2  # Very strong negative
        else:
            df.iloc[dictio_['ids'], df.columns.get_loc('trend')] = 0  # No trend
    
    # Process each log return to detect trends
    for idx, log_ret in enumerate(df[column]):
        sign = 1 if log_ret > 0 else -1

        if dic1 is None:  # Initialize dic1
            dic1 = {'ids': [idx], 'last_sign': sign, 'cumulative': (1 + log_ret)}
            continue

        last_sign = dic1['last_sign']
        if sign == last_sign and dic2 is None:  # Continue same trend
            dic1['ids'].append(idx)
            dic1['last_sign'] = sign
            dic1['cumulative'] *= (1 + log_ret)
            continue

        # 1st Reversal occuring
        if dic2 is None:  # Start dic2
            dic2 = {'ids': [idx], 'last_sign': sign, 'cumulative': (1 + log_ret)}
            continue

        last_sign = dic2['last_sign']
        if sign == last_sign and dic3 is None:  # Continue same trend
            dic2['ids'].append(idx)
            dic2['last_sign'] = sign
            dic2['cumulative'] *= (1 + log_ret)
            if len(dic2['ids']) == reverse_steps:
                assign_label(dic1, lower_threshold, upper_threshold) # Assign labels in the 'trend' column for ids of dic1
                dic1, dic2 = dic2, None
            continue

        # 2nd Reversal occuring
        if dic3 is None:  # Start dic3
            dic3 = {'ids': [idx], 'last_sign': sign, 'cumulative': (1 + log_ret)}
            continue

        last_sign = dic3['last_sign']
        if sign == last_sign: # Continue same trend, there is no dic4 to check if is None
            dic3['ids'].append(idx)
            dic3['last_sign'] = sign
            dic3['cumulative'] *= (1 + log_ret)
            dic_prod = dic2['cumulative'] * dic3['cumulative']
            if (sign == 1 and dic_prod > 1) or (sign == -1 and dic_prod < 1):
                dic1['ids'] += dic2['ids'] + dic3['ids']
                dic1['last_sign'] = dic3['last_sign']
                dic1['cumulative'] *= dic2['cumulative'] * dic3['cumulative']
                dic2, dic3 = None, None
                continue

            if len(dic3['ids']) == reverse_steps:      
                assign_label(dic1, lower_threshold, upper_threshold) # Assign labels in the 'trend' column for ids of dic1
                assign_label(dic2, lower_threshold, upper_threshold) # Assign labels in the 'trend' column for ids of dic1
                dic1, dic2, dic3 = dic3, None, None
            continue
            
        # 3rd Reversal occuring
        assign_label(dic1, lower_threshold, upper_threshold) # Assign labels in the 'trend' column for ids of dic1
        dic1, dic2, dic3 = dic2, dic3, {'ids': [idx], 'last_sign': sign, 'cumulative': (1 + log_ret)}

    # Assign remaining labels
    if dic1:
        assign_label(dic1, lower_threshold, upper_threshold)
    if dic2:
        assign_label(dic2, lower_threshold, upper_threshold)
    if dic3:
        assign_label(dic3, lower_threshold, upper_threshold)
    
    # Apply filtering: Keep only selected trends, set others to 0
    df['trend'] = df['trend'].where(df['trend'].isin(trends_to_keep), 0)

    return df

def split_X_y(sequences: list[pd.DataFrame], 
              target_column: str = 'trend',
              detect_trends_function: Callable[[pd.DataFrame, str, float, float, int, set], pd.DataFrame] = detect_trends_4, 
              column: str = 'close', 
              lower_threshold: float = 0.0009, 
              upper_threshold: float = 0.015,
              reverse_steps: int = 7,
              trends_to_keep: set = {0, 1, 2, 3, 4}) -> Tuple[np.ndarray, np.ndarray]:
    """
    Process sequences to generate features (X) and labels (y) with trend detection.

    Args:
        - sequences (list[pd.DataFrame]): List of DataFrame sequences.
        - target_column (str): Column name for labels (default is 'trend').
        - detect_trends_function (Callable): Trend detection function (default is detect_trends_4).
        - column (str): Column for trend detection (default is 'close').
        - lower_threshold (float): Lower threshold for trends (default is 0.0009).
        - upper_threshold (float): Upper threshold for trends (default is 0.015).
        - reverse_steps (int): Steps for trend reversal (default is 7).
        - trends_to_keep (set): Trends to retain (default is {0, 1, 2, 3, 4}).

    Returns:
        - Tuple[np.ndarray, np.ndarray]: X (features), y (labels) as NumPy arrays.
    """
    # Initialize lists for features and labels
    X, y = [], []
    
    # Process each sequence
    for seq in sequences:
        # Apply trend detection
        seq = detect_trends_function(seq, column, lower_threshold, upper_threshold, reverse_steps, trends_to_keep)
        
        # Extract features and labels
        X.append(seq.drop(columns=[target_column]).values)
        y.append(seq[target_column].values)
    
    # Convert to NumPy arrays
    return np.array(X), np.array(y)

def process_and_return_splits(
    with_indicators_file_path: str,
    downsampled_data_minutes: int,
    exclude_columns: list[str],
    lower_threshold: float,
    upper_threshold: float,
    reverse_steps: int,
    sequence_length: int,
    sliding_interval: int,
    trends_to_keep: set = {0, 1, 2, 3, 4}  # Default keeps all trends
) -> tuple[
    list[list[float]],  # X_train: List of sequences, each containing a list of features
    list[list[int]],    # y_train: List of sequences, each containing a list of labels
    list[list[float]],  # X_val: List of sequences, each containing a list of features
    list[list[int]],    # y_val: List of sequences, each containing a list of labels
    list[list[float]],  # X_test: List of sequences, each containing a list of features
    list[list[int]]     # y_test: List of sequences, each containing a list of labels
]:
    """
    Processes time-series data from a CSV file and prepares it for machine learning.

    This function performs the following steps:
        1. Reads data from the specified CSV file and sorts it by date in descending order.
        2. Optionally downsamples the data to a lower frequency (e.g., 5-minute intervals).
        3. Applies Gaussian smoothing to reduce noise in the data.
        4. Calculates log returns for all numeric columns, excluding specified columns.
        5. Detects trends based on defined thresholds (`lower_threshold`, `upper_threshold`, and `reverse_steps`).
        6. Filters trends to keep only those specified in `trends_to_keep`, setting others to 0 (No Trend).
        7. Converts the processed data into sequences of a fixed length (`sequence_length`) with a sliding interval.
        8. Splits the sequences into training (80%), validation (10%), and test (10%) sets.
        9. Further splits the sequences into features (`X`) and labels (`y`) for supervised learning.

    Args:
        - with_indicators_file_path (str): Path to the CSV file with time-series data.
        - downsampled_data_minutes (int): Frequency for downsampling (e.g., 1 for no downsampling).
        - exclude_columns (list[str]): Columns to exclude from log return calculations.
        - lower_threshold (float): Lower threshold for trend detection.
        - upper_threshold (float): Upper threshold for trend detection.
        - reverse_steps (int): Steps for reversing trends in trend detection.
        - sequence_length (int): Length of sequences to create.
        - sliding_interval (int): Interval for sliding the window.
        - trends_to_keep (set): Trends to retain, others set to 0 (default is {0, 1, 2, 3, 4}).

    Returns:
        - tuple: X_train, y_train, X_val, y_val, X_test, y_test as lists of sequences.
    """
    def check_missing_timestamps(data: pd.DataFrame, stage: str):
        """
        Checks for missing timestamps and prints diagnostic info.
        """
        missing_timestamps = pd.date_range(
            start=data.index.min(),
            end=data.index.max(),
            freq='1min',  # Checking 1-minute frequency
            tz=data.index.tz,
        ).difference(data.index)

        print(f"\n{stage} - Missing timestamps: \n{missing_timestamps}")

        if not missing_timestamps.empty:
            for timestamp in missing_timestamps[:5]:  # Show only first 5 missing timestamps
                print(f"\nMissing timestamp: {timestamp}")

                before = data[data.index < timestamp].tail(5)  # 5 data points before
                after = data[data.index > timestamp].head(5)  # 5 data points after

                print("\nData before missing timestamp:")
                display(before) if not before.empty else print("No data available before.")

                print("\nData after missing timestamp:")
                display(after) if not after.empty else print("No data available after.")

    print("\n======== Processing Time-Series Data ========")

    # Step 1: Read & Sort Data
    data = read_csv_file(with_indicators_file_path, preview_rows=0).sort_index(ascending=False)

    # Step 2: Downsample Data
    if downsampled_data_minutes != 1:
        print("\n---> Downsampling Data")
        data = downsample_minute_data(data, downsampled_data_minutes)

    check_missing_timestamps(data, "Data Retrieved")

    # Step 3: Gaussian Smoothing
    data = gaussian_smoothing(data, sigma=7)
    check_missing_timestamps(data, "Gaussian Smoothed Data")

    # Step 4: Compute Log Returns
    data = calculate_log_returns_all_columns(data, exclude_columns=exclude_columns)
    check_missing_timestamps(data, "Log Returns Computed")

    # Step 5: Create Sequences
    sequences = created_sequences_2(data, sequence_length, sliding_interval)

    # Step 6: Train / Validation / Test Split
    train_size = int(len(sequences) * 0.8)
    val_size = int(len(sequences) * 0.1)

    train_sequences = sequences[:train_size]
    val_sequences = sequences[train_size:train_size + val_size]
    test_sequences = sequences[train_size + val_size:]

    print(f"\nNumber of sequences:\n"
          f"  - Total sequences: {len(sequences)}\n"
          f"  - Train: {len(train_sequences)}\n"
          f"  - Validation: {len(val_sequences)}\n"
          f"  - Test: {len(test_sequences)}\n")

    # Step 7: Convert Sequences to X, y
    def split_and_format_data(sequences):
        X, y = split_X_y(
            sequences, target_column='trend',
            detect_trends_function=detect_trends_4,
            column='close', lower_threshold=lower_threshold,
            upper_threshold=upper_threshold, reverse_steps=reverse_steps,
            trends_to_keep=trends_to_keep
        )
        return np.array(X), np.array(y)

    X_train, y_train = split_and_format_data(train_sequences)
    X_val, y_val = split_and_format_data(val_sequences)
    X_test, y_test = split_and_format_data(test_sequences)

    # Step 8: Data Integrity Check (Ensuring Proper Types)
    def check_data_types(X: np.ndarray, y: np.ndarray, label: str):
        """
        Checks if all values in X are float and y are integer.
        """
        unexpected_X = [(i, j, k, type(v)) for i, seq in enumerate(X)
                        for j, row in enumerate(seq)
                        for k, v in enumerate(row) if not isinstance(v, (float, np.float32))]
        unexpected_y = [(i, j, type(v)) for i, seq in enumerate(y)
                        for j, v in enumerate(seq) if not isinstance(v, (int, np.int64))]

        if unexpected_X:
            print(f"\n⚠️ Unexpected type in {label} X:")
            for i, j, k, t in unexpected_X[:5]:  # Show first 5 errors
                print(f"  Sequence {i}, Row {j}, Feature {k}: {t}")

        if unexpected_y:
            print(f"\n⚠️ Unexpected type in {label} y:")
            for i, j, t in unexpected_y[:5]:  # Show first 5 errors
                print(f"  Sequence {i}, Label {j}: {t}")

    check_data_types(X_train, y_train, "Train")
    check_data_types(X_val, y_val, "Validation")
    check_data_types(X_test, y_test, "Test")

    # Step 9: Convert y types if needed
    def convert_dtype(y: np.ndarray):
        return np.array(y, dtype=np.int64) if isinstance(y, np.ndarray) and y.dtype == np.object_ else y

    y_train, y_val, y_test = map(convert_dtype, [y_train, y_val, y_test])

    # Get feature info
    Number_features = X_train.shape[-1]
    close_col_index = data.columns.get_loc('close')
    
    print(f"\nFeature Info:\n  - close_col_index = {close_col_index}\n  - Number_features = {Number_features}")

    return X_train, y_train, X_val, y_val, X_test, y_test, Number_features


## __All (Initial) parameters__

In [4]:
ticker = 'BTC-USD'
downsampled_data_minutes = 1 # No downsampling

# Step 0 (Again): Identify parameters for trend settings of the loaded data with 1,000 data points
lower_threshold = 0.0009 # 較小的價格變動門檻，代表 輕微的趨勢變化 也可能被識別為趨勢。
upper_threshold = 0.015  # 較大的價格變動門檻，當變動超過這個值，才會標記為強趨勢。
reverse_steps = 13       # 趨勢反轉的步數門檻，當價格變動連續 13 次反向時，才認為趨勢改變。

# Features not to be included in the analysis
exclude_columns= ['MACD', 'MACD_signal', 'ROC_10', 'OBV', 'AD_Line']

# Step 3, under ### Correlation Analysis
# Compute correlations with the 'trend' column
# corr = data_trends.corr()
# trend_corr = corr['trend'].sort_values(ascending=False)
strongly_correlated = ['close', 'open', 'SMA_5', 'high', 'low', 'EMA_10', 'SMA_10'] # Strongly correlated (correlation > 0.6)
moderately_correlated = ['BB_middle', 'BB_lower', 'BB_upper', 'RSI_14']             # Moderately correlated (correlation between 0.3 and 0.6)
weakly_correlated = ['SMA_50', 'volume', 'BBW', 'ATR_14']                           # Weakly correlated or negligible (correlation <~ 0.3)

# Add the weakly_correlated and moderately_correlated features to exclude_columns.
exclude_columns += weakly_correlated + moderately_correlated

sequence_length = 1000
sliding_interval = 60

## __Check GPU, CUDA, Pytorch__

### GPU Details

In [5]:
!nvidia-smi

Sat Mar 29 17:10:41 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 566.03                 Driver Version: 566.03         CUDA Version: 12.7     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4070      WDDM  |   00000000:01:00.0  On |                  N/A |
|  0%   36C    P8             10W /  200W |    1114MiB /  12282MiB |      3%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

### CUDA Details

In [6]:
def check_gpu_config():
    """
    Check GPU availability and display detailed configuration information.
    """
    # Check if GPU is available
    gpu_available = torch.cuda.is_available()
    
    # Print header
    print("=" * 50)
    print("GPU Configuration Check".center(50))
    print("=" * 50)
    
    # Basic GPU availability
    print(f"{'PyTorch Version':<25}: {torch.__version__}")
    print(f"{'GPU Available':<25}: {'Yes' if gpu_available else 'No'}")
    
    # If GPU is available, print detailed info
    if gpu_available:
        print("-" * 50)
        print("GPU Details".center(50))
        print("-" * 50)
        
        # Device info
        print(f"{'Device Name':<25}: {torch.cuda.get_device_name(0)}")
        print(f"{'Number of GPUs':<25}: {torch.cuda.device_count()}")
        print(f"{'Current Device Index':<25}: {torch.cuda.current_device()}")
        
        # Compute capability and CUDA cores
        props = torch.cuda.get_device_properties(0)
        print(f"{'Compute Capability':<25}: {props.major}.{props.minor}")
        print(f"{'Total CUDA Cores':<25}: {props.multi_processor_count * 128}")  # Approx. 128 cores per SM
        
        # Memory info
        total_memory = props.total_memory / (1024 ** 3)  # Convert to GB
        memory_allocated = torch.cuda.memory_allocated(0) / (1024 ** 3)
        memory_reserved = torch.cuda.memory_reserved(0) / (1024 ** 3)
        print(f"{'Total Memory (GB)':<25}: {total_memory:.2f}")
        print(f"{'Allocated Memory (GB)':<25}: {memory_allocated:.2f}")
        print(f"{'Reserved Memory (GB)':<25}: {memory_reserved:.2f}")
    else:
        print("-" * 50)
        print("No GPU detected. Running on CPU.".center(50))
        print("-" * 50)
    
    print("=" * 50)

if __name__ == "__main__":
    check_gpu_config()

             GPU Configuration Check              
PyTorch Version          : 2.4.1+cu124
GPU Available            : Yes
--------------------------------------------------
                   GPU Details                    
--------------------------------------------------
Device Name              : NVIDIA GeForce RTX 4070
Number of GPUs           : 1
Current Device Index     : 0
Compute Capability       : 8.9
Total CUDA Cores         : 5888
Total Memory (GB)        : 11.99
Allocated Memory (GB)    : 0.00
Reserved Memory (GB)     : 0.00


### PyTorch Details

In [7]:
def print_torch_config():
    """Print PyTorch and CUDA configuration in a formatted manner."""
    print("=" * 50)
    print("PyTorch Configuration".center(50))
    print("=" * 50)
    
    # Basic PyTorch and CUDA info
    print(f"{'PyTorch Version':<25}: {torch.__version__}")
    print(f"{'CUDA Compiled Version':<25}: {torch.version.cuda}")
    print(f"{'CUDA Available':<25}: {'Yes' if torch.cuda.is_available() else 'No'}")
    print(f"{'Number of GPUs':<25}: {torch.cuda.device_count()}")

    # GPU details if available
    if torch.cuda.is_available():
        print(f"{'GPU Name':<25}: {torch.cuda.get_device_name(0)}")

    print("-" * 50)
    
    # Seed setting
    torch.manual_seed(42)
    print(f"{'Random Seed':<25}: 42 (Seeding successful!)")
    
    print("=" * 50)

if __name__ == "__main__":
    print_torch_config()

              PyTorch Configuration               
PyTorch Version          : 2.4.1+cu124
CUDA Compiled Version    : 12.4
CUDA Available           : Yes
Number of GPUs           : 1
GPU Name                 : NVIDIA GeForce RTX 4070
--------------------------------------------------
Random Seed              : 42 (Seeding successful!)


## __Build the GRU Model__

### Method 0: Bi-Directional GRU with Attention without LoRA

In [None]:
# class BiGRUWithAttention(nn.Module):
#     def __init__(self, input_size: int, hidden_size: int, output_size: int, num_layers: int, dropout: float = 0.0):
#         """
#         Bi-directional GRU model with attention mechanism for sequence classification.

#         Args:
#             - input_size (int): Number of input features.
#             - hidden_size (int): Number of hidden units in GRU.
#             - output_size (int): Number of output classes or values.
#             - num_layers (int): Number of GRU layers.
#             - dropout (float): Dropout rate (default is 0.0).
#         """
#         super(BiGRUWithAttention, self).__init__()
#         self.hidden_size = hidden_size
#         self.num_layers = num_layers
        
#         # Bi-directional GRU layer
#         self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True, 
#                           bidirectional=True, dropout=dropout if num_layers > 1 else 0)
        
#         # Attention layer (original implementation)
#         self.attention_fc = nn.Linear(hidden_size * 2, hidden_size * 2)  # Hidden size * 2 for bi-directional
        
#         # Fully connected layer
#         self.fc = nn.Linear(hidden_size * 2, output_size)
#         self.dropout = nn.Dropout(dropout)
        
#         # Initialize weights
#         self.init_weights()
        
#     def init_weights(self):
#         for name, param in self.named_parameters():
#             if 'weight' in name:
#                 nn.init.xavier_uniform_(param)  # Xavier initialization for weights
#             elif 'bias' in name:
#                 nn.init.constant_(param, 0)     # Zero initialization for biases

#     def forward(self, x: torch.Tensor) -> torch.Tensor:
#         # Initialize hidden state
#         batch_size = x.size(0)
#         h0 = torch.zeros(self.num_layers * 2, batch_size, self.hidden_size, device=x.device) # Bi-directional: num_layers * 2

#         # GRU forward pass
#         out, _ = self.gru(x, h0)  # Shape: (batch_size, seq_length, hidden_size * 2)

#         # Attention mechanism
#         attn_weights = torch.tanh(self.attention_fc(out))  # Shape: (batch_size, seq_length, hidden_size * 2)
#         out = attn_weights * out    # Element-wise attention application
#         out = self.dropout(out)     # Apply dropout

#         # Fully connected layer
#         out = self.fc(out)  # Shape: (batch_size, seq_length, output_size)
#         return out
    
# # Example usage
# if __name__ == "__main__":
#     # Dummy data
#     batch_size, seq_length, input_size = 32, 60, 20
#     hidden_size, output_size, num_layers = 64, 2, 2
#     x = torch.randn(batch_size, seq_length, input_size)
    
#     # Initialize model
#     model = BiGRUWithAttention(input_size, hidden_size, output_size, num_layers, dropout=0.2)
#     output = model(x)
#     print(f"Output shape: {output.shape}")  # Expected: (32, 60, 2)

#     del batch_size, seq_length, input_size, hidden_size, output_size, num_layers, x, model, output

Output shape: torch.Size([32, 60, 2])


### Method 1: Attention Layer with LoRA

In [8]:
class LoRA(nn.Module):
    def __init__(self, linear_layer: nn.Linear, rank: int):
        """
        LoRA module applied to a specified linear layer.

        Args:
            linear_layer (nn.Linear): The linear layer to adapt (e.g., attention_fc or fc).
            rank (int): The rank of the LoRA adjustment matrices (e.g., 8).
        """
        super(LoRA, self).__init__()
        self.linear = linear_layer  # 保留對 linear_layer 的引用
        self.rank = rank
        
        # Get input and output dimensions from the linear layer
        in_features, out_features = linear_layer.weight.shape
        
        # Create LoRA matrices A and B
        self.A = nn.Parameter(torch.zeros(in_features, rank))  # Shape: (in_features, rank)
        self.B = nn.Parameter(torch.zeros(rank, out_features))  # Shape: (rank, out_features)
        
        # Initialize A with normal distribution, B with zeros
        nn.init.normal_(self.A, mean=0, std=1)
        nn.init.zeros_(self.B)
    
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Forward pass with LoRA adjustment applied to the linear layer.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            torch.Tensor: Output tensor with LoRA-adapted weights.
        """
        lora_delta = self.A @ self.B
        adapted_weight = self.linear.weight + lora_delta
        return nn.functional.linear(x, adapted_weight, self.linear.bias)
    
    def parameters(self, recurse=True):
        """
        Override parameters() to return only LoRA-specific parameters (A and B).

        Args:
            recurse (bool): Ignored, included for compatibility with nn.Module.

        Returns:
            list: List of LoRA parameters (self.A and self.B).
        """
        return [self.A, self.B]
    

class BiGRUWithAttention(nn.Module):
    def __init__(self, input_size: int, hidden_size: int, output_size: int, num_layers: int, dropout: float = 0.0, lora_rank: int = 8):
        """
        BiGRU model with attention mechanism and optional LoRA support.

        Args:
            input_size (int): Number of input features.
            hidden_size (int): Number of hidden units in GRU.
            output_size (int): Number of output classes or values.
            num_layers (int): Number of GRU layers.
            dropout (float): Dropout rate (default is 0.0).
            lora_rank (int): Rank for LoRA adapters (default is 8).
        """
        super(BiGRUWithAttention, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lora_rank = lora_rank
        
        # Bi-directional GRU layer
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True, 
                          bidirectional=True, dropout=dropout if num_layers > 1 else 0)
        
        # Attention layer
        self.attention_fc = nn.Linear(hidden_size * 2, hidden_size * 2)
        
        # List to hold multiple LoRA adapters
        self.lora_adapters = nn.ModuleList()
        
        # Fully connected layer
        self.fc = nn.Linear(hidden_size * 2, output_size)
        self.dropout = nn.Dropout(dropout)
        
        # Initialize base model weights
        self.init_weights()
    
    def init_weights(self):
        """Initialize weights of the base model (GRU, attention_fc, fc)."""
        for name, param in self.named_parameters():
            if 'weight' in name:
                nn.init.xavier_uniform_(param)
            elif 'bias' in name:
                nn.init.constant_(param, 0)
    
    def add_lora_adapter(self):
        """
        Add a new LoRA adapter to the attention layer.
        During training, only the latest LoRA adapter should be fine-tuned, while others are frozen.
        """
        new_lora = LoRA(self.attention_fc, self.lora_rank)
        # Move the new LoRA adapter to the same device as the model
        device = next(self.parameters()).device
        new_lora.to(device)
        self.lora_adapters.append(new_lora)
        print(f"Added LoRA adapter, total adapters: {len(self.lora_adapters)}, on device: {device}")
    
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Forward pass of the model.

        During inference, all LoRA adapters are applied by summing their adjustments to the attention_fc weights.
        During training, only the latest LoRA adapter should be fine-tuned, while others and the base attention_fc
        should be frozen.

        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, seq_length, input_size).

        Returns:
            torch.Tensor: Output tensor of shape (batch_size, seq_length, output_size).
        """
        # Initialize hidden state for GRU
        batch_size = x.size(0)
        h0 = torch.zeros(self.num_layers * 2, batch_size, self.hidden_size, device=x.device)
        
        # GRU forward pass
        out, _ = self.gru(x, h0)  # Shape: (batch_size, seq_length, hidden_size * 2)
        
        # Apply attention with all LoRA adapters (if any)
        if self.lora_adapters:
            # Sum adjustments from all LoRA adapters
            lora_delta = sum(lora.A @ lora.B for lora in self.lora_adapters)
            adapted_weight = self.attention_fc.weight + lora_delta
            attn_out = nn.functional.linear(out, adapted_weight, self.attention_fc.bias)
        else:
            attn_out = self.attention_fc(out)
        
        # Apply tanh activation and compute attention-weighted output
        attn_weights = torch.tanh(attn_out)
        out = attn_weights * out
        out = self.dropout(out)
        
        # Fully connected layer
        out = self.fc(out)  # Shape: (batch_size, seq_length, output_size)
        return out

## __Training and validation function__

### Analytical Function

In [9]:
def compute_classwise_accuracy(student_logits_flat, y_batch, class_correct, class_total):
    """
    Computes per-class accuracy by accumulating correct and total samples for each class using vectorized operations.
    
    Args:
        student_logits_flat (torch.Tensor): Model predictions (logits) in shape [batch_size * seq_len, output_size]
        y_batch (torch.Tensor): True labels in shape [batch_size * seq_len]
        class_correct (dict): Dictionary to store correct predictions per class
        class_total (dict): Dictionary to store total samples per class
    """
    # Ensure inputs are on the same device
    if student_logits_flat.device != y_batch.device:
        raise ValueError("student_logits_flat and y_batch must be on the same device")

    # Convert logits to predicted class indices
    predictions = torch.argmax(student_logits_flat, dim=-1)  # Shape: [batch_size * seq_len]

    # Compute correct predictions mask
    correct_mask = (predictions == y_batch)  # Shape: [batch_size * seq_len], boolean

    # Get unique labels in this batch
    unique_labels = torch.unique(y_batch)

    # Update class_total and class_correct using vectorized operations
    for label in unique_labels:
        label = label.item()  # Convert tensor to scalar
        if label not in class_total:
            class_total[label] = 0
            class_correct[label] = 0
        
        # Count total samples for this label
        label_mask = (y_batch == label)
        class_total[label] += label_mask.sum().item()
        
        # Count correct predictions for this label
        class_correct[label] += (label_mask & correct_mask).sum().item()

### Training and validation function for Period 1

In [10]:
def train_and_validate(model, output_size, criterion, optimizer, 
                       X_train, y_train, X_val, y_val, scheduler, 
                       use_scheduler=None, num_epochs=10, batch_size=64, 
                       model_saving_folder=None, model_name=None, stop_signal_file=None):
    """
    Training and validation function for Period 1.
    
    This function trains a model, evaluates on validation data, and saves:
    1. The top 5 best models based on validation accuracy.
    2. The single best model (`best_model.pth`).
    3. The final model at the last epoch (`final_model.pth`).
    """
    print("\n🚀 'train_and_validate' function started.\n")

    # Ensure the model saving folder exists (delete if it already exists)
    if model_saving_folder:
        if os.path.exists(model_saving_folder):
            shutil.rmtree(model_saving_folder)  # Remove old contents
            print(f"✅ Removed existing folder: {model_saving_folder}")
        os.makedirs(model_saving_folder, exist_ok=True)

    # Default model saving settings
    if not model_saving_folder:
        model_saving_folder = './saved_models'
    if not model_name:
        model_name = 'model'

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Convert data to tensors and move to device
    X_train = torch.tensor(X_train, dtype=torch.float32).to(device) # (seqs, seq_len, features)
    y_train = torch.tensor(y_train, dtype=torch.long).to(device)    # (seqs, seq_len)
    X_val = torch.tensor(X_val, dtype=torch.float32).to(device)
    y_val = torch.tensor(y_val, dtype=torch.long).to(device)

    # Create Dataset & DataLoader
    train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(TensorDataset(X_val, y_val), batch_size=batch_size, shuffle=False)

    # Print dataset information
    print("\n✅ Data Overview:")
    print(f"X_train Shape: {X_train.shape} | y_train Shape: {y_train.shape}")
    print(f"X_val Shape: {X_val.shape} | y_val Shape: {y_val.shape}")

    # Record best results
    global best_results
    best_results = []
    
    model.train()

    for epoch in range(num_epochs):
        epoch_loss = 0.0
        class_correct, class_total = {}, {}

        # Stop signal check
        if stop_signal_file and os.path.exists(stop_signal_file):
            print("\n🛑 Stop signal detected. Exiting training loop safely.\n")
            break

        model.train()
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            outputs = model(X_batch).view(-1, output_size)  # seqs * seq_len, output_size
            y_batch = y_batch.view(-1)                      # seqs * seq_len

            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item() * X_batch.size(0)

            # Compute class-wise accuracy
            compute_classwise_accuracy(outputs, y_batch, class_correct, class_total)

        train_loss = epoch_loss / len(train_loader.dataset)

        # Compute per-class training accuracy
        train_classwise_accuracy = {int(c): f"{(class_correct[c] / class_total[c]) * 100:.2f}%" if class_total[c] > 0 else "0.00%" 
                                    for c in sorted(class_total.keys())}

        # Validate the model
        model.eval()
        val_loss, val_correct, val_total = 0.0, 0, 0
        val_class_correct, val_class_total = {}, {}

        with torch.no_grad():
            for X_val_batch, y_val_batch in val_loader:
                val_outputs = model(X_val_batch).view(-1, output_size)
                val_labels = y_val_batch.view(-1)

                val_loss += criterion(val_outputs, val_labels).item() * X_val_batch.size(0)
                val_predictions = torch.argmax(val_outputs, dim=-1)
                val_correct += (val_predictions == val_labels).sum().item()
                val_total += val_labels.size(0)

                compute_classwise_accuracy(val_outputs, val_labels, val_class_correct, val_class_total)

        val_loss /= len(val_loader.dataset)
        val_accuracy = val_correct / val_total
        val_classwise_accuracy = {int(c): f"{(val_class_correct[c] / val_class_total[c]) * 100:.2f}%" if val_class_total[c] > 0 else "0.00%" 
                                  for c in sorted(val_class_total.keys())}

        print(f"Epoch {epoch+1}/{num_epochs}, "
              f"Train Loss: {train_loss:.9f}, "
              f"Train-Class-Acc: {train_classwise_accuracy}, "
              f"Val Loss: {val_loss:.9f}, "
              f"Val Accuracy: {val_accuracy * 100:.2f}%, "
              f"Val-Class-Acc: {val_classwise_accuracy}, "
              f"LR: {optimizer.param_groups[0]['lr']:.9f}")

        # Save current epoch model information
        current_epoch_info = {
            "epoch": epoch+1,
            "train_loss": train_loss,
            "train_classwise_accuracy": train_classwise_accuracy,
            "val_loss": val_loss,
            "val_accuracy": val_accuracy,
            "val_classwise_accuracy": val_classwise_accuracy,
            "model_state_dict": model.state_dict(),
            "optimizer_state_dict": optimizer.state_dict(),
            "learning_rate": optimizer.param_groups[0]['lr'],
            "model_path": os.path.join(model_saving_folder, f"{model_name}_epoch_{epoch+1}.pth")
        }
        
        # Save top 5 models
        if len(best_results) < 5 or val_accuracy > best_results[-1]["val_accuracy"]:
            if len(best_results) == 5:
                worst = best_results.pop()
                if os.path.exists(worst["model_path"]):
                    os.remove(worst["model_path"])
                    print(f"🗑 Removed old model: {worst['model_path']} (Acc: {worst['val_accuracy']*100:.2f}%)")

            best_results.append(current_epoch_info)
            best_results.sort(key=lambda x: (x["val_accuracy"], x["epoch"]), reverse=True)

            torch.save({
                'epoch': epoch+1,
                'train_loss': train_loss,
                'val_loss': val_loss,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'learning_rate': optimizer.param_groups[0]['lr']
            }, current_epoch_info["model_path"])
            print(f"✅ Model saved: {current_epoch_info['model_path']}")

        if use_scheduler:
            scheduler.step(val_loss)

    # Save best model
    if best_results:
        best_model_info = best_results[0]  
        best_model_path = os.path.join(model_saving_folder, f"{model_name}_best.pth")

        torch.save({
            'epoch': best_model_info["epoch"],
            'train_loss': best_model_info["train_loss"],
            'val_loss': best_model_info["val_loss"],
            'model_state_dict': best_model_info["model_state_dict"],
            'optimizer_state_dict': best_model_info["optimizer_state_dict"],
            'learning_rate': best_model_info["learning_rate"]
        }, best_model_path)
        print(f"\n🏆 Best model saved as: {best_model_path} (Val Accuracy: {best_model_info['val_accuracy'] * 100:.2f}%)")
    else:
        print("\n⚠️ No best model saved")

    # Save the final model
    if 'current_epoch_info' in locals():
        final_model_path = os.path.join(model_saving_folder, f"{model_name}_final.pth")
        torch.save({ # Save this model
            'epoch': epoch+1,  # Save the current epoch
            'train_loss': train_loss,
            'val_loss': val_loss,
            'model_state_dict': model.state_dict(),  # Model weights
            'optimizer_state_dict': optimizer.state_dict(),  # Optimizer state
            'learning_rate': optimizer.param_groups[0]['lr'] # Optimizer state
        }, final_model_path)
        print(f"\n📌 Final model saved as: {final_model_path}")
    else:
        print("\n⚠️ No final model saved")

    print("\n🎯 Top 5 Best Models by Validation Accuracy:")
    for res in best_results:        
        print(f"Epoch {res['epoch']}/{num_epochs}, "
              f"Train Loss: {res['train_loss']:.9f}, "
              f"Train-Class-Acc: {res['train_classwise_accuracy']},\n"  # Adjusted newline here
              f"Val Loss: {res['val_loss']:.9f}, "
              f"Val Accuracy: {res['val_accuracy'] * 100:.2f}%, "
              f"Val-Class-Acc: {res['val_classwise_accuracy']}, "
              f"Model Path: {res['model_path']}")

    del X_train, y_train, X_val, y_val, train_loader, val_loader
    torch.cuda.empty_cache()
    gc.collect()

### Training and validation function for Period 2 and beyond

In [None]:
# # Old Version
# def train_and_validate_lora(student_model, teacher_model, stable_classes, output_size, criterion, optimizer, 
#                             X_train, y_train, X_val, y_val, scheduler, 
#                             use_scheduler=None, num_epochs=10, batch_size=64, alpha=0.5,
#                             model_saving_folder=None, model_name=None, stop_signal_file=None):
#     """
#     student_model: The new LoRA-based student model (with output size 3).
#     teacher_model: Frozen teacher model from period 1 (with output size 2).
#     criterion: CrossEntropyLoss function.
#     optimizer: Optimizer for student model.
#     X_train, y_train, X_val, y_val: Training/validation data (as NumPy arrays or similar).
#     num_epochs: Number of epochs to train.
#     batch_size: Batch size for DataLoader.
#     alpha: Weighting factor for distillation loss (alpha * distill_loss + (1-alpha) * ce_loss).
#     """
#     print("\n🚀 'train_and_validate_lora' function started.\n")

#     # Ensure the model saving folder exists (delete if it already exists)
#     if model_saving_folder:
#         if os.path.exists(model_saving_folder):
#             shutil.rmtree(model_saving_folder)  # Remove old contents
#             print(f"✅ Removed existing folder: {model_saving_folder}")
#         os.makedirs(model_saving_folder, exist_ok=True)
        
#     # Default model saving settings
#     if not model_saving_folder:
#         model_saving_folder = './saved_models'
#     if not model_name:
#         model_name = 'model'

#     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#     student_model.to(device)
#     teacher_model.to(device)

#     # Convert data to tensors and move to device
#     X_train = torch.tensor(X_train, dtype=torch.float32).to(device)
#     y_train = torch.tensor(y_train, dtype=torch.long).to(device)
#     X_val = torch.tensor(X_val, dtype=torch.float32).to(device)
#     y_val = torch.tensor(y_val, dtype=torch.long).to(device)

#     # Create Dataset & DataLoader
#     train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=batch_size, shuffle=True)
#     val_loader = DataLoader(TensorDataset(X_val, y_val), batch_size=batch_size, shuffle=False)

#     print("\n✅ Data Overview:")
#     print(f"X_train Shape: {X_train.shape} | y_train Shape: {y_train.shape}")
#     print(f"X_val Shape: {X_val.shape} | y_val Shape: {y_val.shape}")

#     # Record best results
#     global best_results  # Ensure we can modify the external variable if defined outside.
#     best_results = []    # Start empty each training run
#     teacher_model.eval() # Ensure teacher is frozen

#     for epoch in range(num_epochs):
#         epoch_loss = 0.0
#         class_correct, class_total = {}, {}
        
#         # Stop signal check
#         if stop_signal_file and os.path.exists(stop_signal_file):
#             print("\n🛑 Stop signal detected. Exiting training loop safely.\n")
#             break

#         student_model.train()
#         i=0
#         for X_batch, y_batch in train_loader:
#             # Reset gradients before forward pass
#             optimizer.zero_grad()  # Best practice

#             # Forward pass: student model produces logits for output_size classes.
#             student_logits = student_model(X_batch)  # Shape: [batch, seq_len, output_size]
            
#             # Reshape for CE loss computation.
#             student_logits_flat = student_logits.view(-1, output_size)
#             y_batch = y_batch.view(-1)

#             # Compute Cross-Entropy loss
#             ce_loss = criterion(student_logits_flat, y_batch)

#             # Compute class-wise accuracy (Accumulates values in dict)
#             compute_classwise_accuracy(student_logits_flat, y_batch, class_correct, class_total)

#             if epoch == 1 and i < 3:
#                 i += 1
#                 print(f"\nUnique target values: {y_batch.unique()}")
#                 print(f"Target dtype: {y_batch.dtype}")
#                 print(f"Min target: {y_batch.min()}, Max target: {y_batch.max()}")
#                 print("Unique classes in y_train:", y_train.unique())
#                 print(f"Unique classes in y_val: {y_val.unique()}\n")

#             # Knowledge Distillation: Forward pass through teacher (pre-trained on previous period data).
#             with torch.no_grad():
#                 teacher_logits = teacher_model(X_batch)  # Shape: [batch, seq_len, teacher_output_size]
            
#             # Select stable classes for distillation
#             """
#             Use stable_classes (a list of indices) to extract the relevant logits.
#             We distill only the stable classes (class 1 if teacher is from period 1).
#             Teacher's class index 1 corresponds to student's class index 1.
#             It's safer to use index_select to ensure the operation works on GPU.
#             """
#             stable_indices = torch.tensor(stable_classes, device=teacher_logits.device)
#             teacher_stable = teacher_logits.index_select(dim=2, index=stable_indices)
#             student_stable = student_logits.index_select(dim=2, index=stable_indices)

#             # Compute KL Distillation Loss
#             distill_loss = F.mse_loss(student_stable, teacher_stable)

#             # Total loss: weighted sum of CE loss and distillation loss
#             total_loss = alpha * distill_loss + (1 - alpha) * ce_loss
#             total_loss.backward()
#             optimizer.step()
#             epoch_loss += total_loss.item() * X_batch.size(0)
            
#         train_loss = epoch_loss / len(train_loader.dataset)

#         # Compute per-class training accuracy
#         train_classwise_accuracy = {int(c): f"{(class_correct[c] / class_total[c]) * 100:.2f}%" if class_total[c] > 0 else "0.00%" 
#                                     for c in sorted(class_total.keys())}

#         # Perform validation at the end of each epoch (only CE loss and accuracy)
#         student_model.eval()
#         val_loss, val_correct, val_total = 0.0, 0, 0
#         val_class_correct, val_class_total = {}, {}

#         with torch.no_grad():
#             for X_val_batch, y_val_batch in val_loader:
#                 val_outputs = student_model(X_val_batch).view(-1, output_size)
#                 val_labels = y_val_batch.view(-1)

#                 val_loss += criterion(val_outputs, val_labels).item() * X_val_batch.size(0)  # Scale to total loss
#                 val_predictions = torch.argmax(val_outputs, dim=-1)
#                 val_correct += (val_predictions == val_labels).sum().item()
#                 val_total += val_labels.size(0)

#                 # Compute per-class validation accuracy
#                 compute_classwise_accuracy(val_outputs, val_labels, val_class_correct, val_class_total)

#         val_loss /= len(val_loader.dataset)
#         val_accuracy = val_correct / val_total

#         # Compute per-class validation accuracy
#         val_classwise_accuracy = {int(c): f"{(val_class_correct[c] / val_class_total[c]) * 100:.2f}%" if val_class_total[c] > 0 else "0.00%" 
#                                   for c in sorted(val_class_total.keys())}

#         print(f"Epoch {epoch+1}/{num_epochs}, "
#               f"Train Loss: {train_loss:.9f}, "
#               f"Train-Class-Acc: {train_classwise_accuracy}, "
#               f"Val Loss: {val_loss:.9f}, "
#               f"Val Accuracy: {val_accuracy * 100:.2f}%, "
#               f"Val-Class-Acc: {val_classwise_accuracy}, "
#               f"LR: {optimizer.param_groups[0]['lr']:.9f}")

#         # Save current model and update best results if applicable
#         current_epoch_info = {
#             "epoch": epoch+1,
#             "train_loss": train_loss,
#             "train_classwise_accuracy": train_classwise_accuracy,
#             "val_loss": val_loss,
#             "val_accuracy": val_accuracy,
#             "val_classwise_accuracy": val_classwise_accuracy,
#             "model_state_dict": student_model.state_dict(),
#             "optimizer_state_dict": optimizer.state_dict(),
#             "learning_rate": optimizer.param_groups[0]['lr'],
#             "model_path": os.path.join(model_saving_folder, f"{model_name}_epoch_{epoch+1}.pth")
#         }

#         # Save top 5 models
#         if len(best_results) < 5 or val_accuracy > best_results[-1]["val_accuracy"]:
#             if len(best_results) == 5:
#                 # Remove the worst model from the list, the last (lowest accuracy)
#                 worst = best_results.pop() 
#                 if os.path.exists(worst["model_path"]):
#                     os.remove(worst["model_path"])
#                     print(f"🗑 Removed old model: {worst['model_path']} (Acc: {worst['val_accuracy']*100:.2f}%)")

#             # Just insert and sort by val_accuracy descending
#             best_results.append(current_epoch_info) 
#             best_results.sort(key=lambda x: (x["val_accuracy"], x["epoch"]), reverse=True)

#             torch.save({ # Save this model
#                 'epoch': epoch+1,  # Save the current epoch
#                 'train_loss': train_loss,
#                 'val_loss': val_loss,
#                 'model_state_dict': student_model.state_dict(),  # Model weights
#                 'optimizer_state_dict': optimizer.state_dict(),  # Optimizer state
#                 'learning_rate': optimizer.param_groups[0]['lr'] # Optimizer state
#             }, current_epoch_info["model_path"])
#             print(f"✅ Model saved: {current_epoch_info['model_path']}")

#         if use_scheduler == True:
#             # Scheduler step should follow after considering the results (placed after otallher losses)
#             scheduler.step(val_loss)

#     # Save best model
#     if best_results:
#         best_model_info = best_results[0]  
#         best_model_path = os.path.join(model_saving_folder, f"{model_name}_best.pth")

#         torch.save({
#             'epoch': best_model_info["epoch"],
#             'train_loss': best_model_info["train_loss"],
#             'val_loss': best_model_info["val_loss"],
#             'model_state_dict': best_model_info["model_state_dict"],
#             'optimizer_state_dict': best_model_info["optimizer_state_dict"],
#             'learning_rate': best_model_info["learning_rate"]
#         }, best_model_path)
#         print(f"\n🏆 Best model saved as: {best_model_path} (Val Accuracy: {best_model_info['val_accuracy'] * 100:.2f}%)")
#     else:
#         print("\n⚠️ No best model saved")

#     # Save the final model
#     if 'current_epoch_info' in locals():
#         final_model_path = os.path.join(model_saving_folder, f"{model_name}_final.pth")
#         torch.save({ # Save this model
#             'epoch': epoch+1,  # Save the current epoch
#             'train_loss': train_loss,
#             'val_loss': val_loss,
#             'model_state_dict': student_model.state_dict(),  # Model weights
#             'optimizer_state_dict': optimizer.state_dict(),  # Optimizer state
#             'learning_rate': optimizer.param_groups[0]['lr'] # Optimizer state
#         }, final_model_path)
#         print(f"\n📌 Final model saved as: {final_model_path}")
#     else:
#         print("\n⚠️ No final model saved")

#     print("\n🎯 Top 5 Best Models by Validation Accuracy:")
#     for res in best_results:        
#         print(f"Epoch {res['epoch']}/{num_epochs}, "
#               f"Train Loss: {res['train_loss']:.9f}, "
#               f"Train-Class-Acc: {res['train_classwise_accuracy']},\n"
#               f"Val Loss: {res['val_loss']:.9f}, "
#               f"Val Accuracy: {res['val_accuracy'] * 100:.2f}%, "
#               f"Val-Class-Acc: {res['val_classwise_accuracy']}, "
#               f"Model Path: {res['model_path']}")
    
#     del X_train, y_train, X_val, y_val, train_loader, val_loader
#     torch.cuda.empty_cache()

In [11]:
# New Version
def train_and_validate_lora(student_model, teacher_model, stable_classes, output_size, criterion, optimizer, 
                            X_train, y_train, X_val, y_val, scheduler, 
                            use_scheduler=None, num_epochs=10, batch_size=64, alpha=0.5,
                            model_saving_folder=None, model_name=None, stop_signal_file=None,
                            class_features_dict=None, tau_high=0.5, tau_low=0.5, related_labels=None):
    """
    Training and validation function for Period 2 and beyond with LoRA adapters and custom strategy.

    Args:
        student_model: The student model (BiGRUWithAttention) to train with LoRA adapters.
        teacher_model: Frozen teacher model from the previous period.
        stable_classes (list): List of class indices to use for knowledge distillation.
        output_size (int): Number of output classes for the student model.
        criterion: Loss function (e.g., CrossEntropyLoss).
        optimizer: Optimizer for the student model.
        X_train, y_train, X_val, y_val: Training and validation data (as NumPy arrays).
        scheduler: Learning rate scheduler.
        use_scheduler (bool): Whether to use the scheduler.
        num_epochs (int): Number of epochs to train.
        batch_size (int): Batch size for DataLoader.
        alpha (float): Weighting factor for distillation loss (alpha * distill_loss + (1-alpha) * ce_loss).
        model_saving_folder (str): Folder to save models.
        model_name (str): Name of the model for saving.
        stop_signal_file (str): Path to a stop signal file to interrupt training.
        class_features_dict (dict): Dictionary to store class features from previous periods.
        tau_high (float): High similarity threshold for DynEx-CLoRA (set to 0.5 as per request).
        tau_low (float): Low similarity threshold for DynEx-CLoRA (set to 0.5 as per request).
        related_labels (dict): Dictionary recording related labels for attention_fc and LoRA adapters.
    
    Returns:
        class_features_dict: Updated dictionary of class features.
    """
    print("\n🚀 'train_and_validate_lora' function started.\n")

    # Initialize related_labels if not provided (for Period 2)
    if related_labels is None:
        related_labels = {'attention_fc': [0, 1]}
    print(f"Initial related_labels: {related_labels}")

    # Ensure the model saving folder exists (delete if it already exists)
    if model_saving_folder:
        if os.path.exists(model_saving_folder):
            shutil.rmtree(model_saving_folder)
            print(f"✅ Removed existing folder: {model_saving_folder}")
        os.makedirs(model_saving_folder, exist_ok=True)
        
    # Default model saving settings
    if not model_saving_folder:
        model_saving_folder = './saved_models'
    if not model_name:
        model_name = 'model'

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    student_model.to(device)
    teacher_model.to(device)

    # Convert data to tensors and move to device
    X_train = torch.tensor(X_train, dtype=torch.float32).to(device)
    y_train = torch.tensor(y_train, dtype=torch.long).to(device)
    X_val = torch.tensor(X_val, dtype=torch.float32).to(device)
    y_val = torch.tensor(y_val, dtype=torch.long).to(device)

    # Create DataLoader
    train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(TensorDataset(X_val, y_val), batch_size=batch_size, shuffle=False)

    print("\n✅ Data Overview:")
    print(f"X_train Shape: {X_train.shape} | y_train Shape: {y_train.shape}")
    print(f"X_val Shape: {X_val.shape} | y_val Shape: {y_val.shape}")

    # Record best results
    global best_results
    best_results = []
    teacher_model.eval()

    # Extract features for new classes
    student_model.eval()
    new_class_features = {}
    with torch.no_grad():
        for X_batch, y_batch in train_loader:
            features = student_model.gru(X_batch)[0]  # Shape: [batch, seq_len, hidden_size * 2]
            features = features.reshape(-1, features.size(-1))  # Flatten
            y_batch = y_batch.view(-1)
            for label in torch.unique(y_batch):
                label = label.item()
                if label not in new_class_features:
                    new_class_features[label] = []
                mask = (y_batch == label)
                new_class_features[label].append(features[mask])
    for label in new_class_features:
        new_class_features[label] = torch.cat(new_class_features[label], dim=0).mean(dim=0)
    student_model.train()

    # DynEx-CLoRA: Compare with existing class features
    if class_features_dict is None:
        class_features_dict = {}
    existing_class_features = class_features_dict

    # Calculate similarity
    cosine_sim = nn.CosineSimilarity(dim=0)
    similarity_scores = {}
    for new_label, new_feature in new_class_features.items():
        similarity_scores[new_label] = {}
        for existing_label, existing_feature in existing_class_features.items():
            s = cosine_sim(new_feature, existing_feature)
            similarity_scores[new_label][existing_label] = s.item()

    print("\nSimilarity Scores:")
    for new_label, scores in similarity_scores.items():
        print(f"New Class {new_label}:")
        if scores:
            for existing_label, s in scores.items():
                print(f"  - Existing Class {existing_label}: {s:.4f}")
        else:
            print("  - No existing classes to compare")
    
    # Define similarity threshold
    similarity_threshold = 0.5  # Set to 0.5 as per request
    print(f"Similarity threshold: {similarity_threshold}")

    # Classify old and new classes
    old_classes = [label for label in new_class_features.keys() if label in existing_class_features]
    new_classes = [label for label in new_class_features.keys() if label not in existing_class_features]
    print(f"Old classes: {old_classes}")
    print(f"New classes: {new_classes}")

    # Decision rules: Determine which networks to unfreeze
    to_unfreeze = set()

    # Handle old classes (except 0)
    for label in old_classes:
        if label != 0:
            s = similarity_scores[label][label]
            print(f"Old Class {label} similarity with itself: {s:.4f}")
            if s < similarity_threshold:
                # Find related networks and add to unfreeze set
                for key, labels in related_labels.items():
                    if label in labels:
                        to_unfreeze.add(key)
                        print(f"Unfreezing {key} due to low similarity of Class {label}")

    # Special case for Period 2: If no existing features, directly add a LoRA adapter
    if not existing_class_features:
        print("No existing class features found (Period 2). Adding a new LoRA adapter for new classes.")
        # Filter out classes already in related_labels['attention_fc']
        truly_new_classes = [label for label in new_classes if label not in related_labels['attention_fc']]
        if truly_new_classes:
            student_model.add_lora_adapter()
            new_lora_index = len(student_model.lora_adapters) - 1
            related_labels[new_lora_index] = truly_new_classes  # Associate only truly new classes to this LoRA
            to_unfreeze.add(new_lora_index)
            print(f"Added new LoRA adapter (index {new_lora_index}) for Classes {truly_new_classes}")
        else:
            print("No truly new classes to associate with a new LoRA adapter.")
    else:
        # Handle old classes (except 0)
        for label in old_classes:
            if label != 0:
                s = similarity_scores[label][label]
                print(f"Old Class {label} similarity with itself: {s:.4f}")
                if s < similarity_threshold:
                    # Find related networks and add to unfreeze set
                    for key, labels in related_labels.items():
                        if label in labels:
                            to_unfreeze.add(key)
                            print(f"Unfreezing {key} due to low similarity of Class {label}")

        # Handle new classes
        for label in new_classes:
            s_0 = similarity_scores[label][0] if 0 in existing_class_features else -1
            print(f"New Class {label} similarity with Class 0: {s_0:.4f}")
            
            if s_0 >= similarity_threshold:
                # Add new LoRA and associate new class
                student_model.add_lora_adapter()
                new_lora_index = len(student_model.lora_adapters) - 1
                related_labels[new_lora_index] = [label]
                to_unfreeze.add(new_lora_index)
                print(f"Added new LoRA adapter (index {new_lora_index}) for Class {label}")
            
            # Check similarity with other Existing Classes (except 0), regardless of s_0
            for existing_label in existing_class_features:
                if existing_label != 0:
                    s = similarity_scores[label][existing_label]
                    if s >= similarity_threshold:
                        for key, labels in related_labels.items():
                            if existing_label in labels:
                                to_unfreeze.add(key)
                                print(f"Unfreezing {key} due to high similarity between New Class {label} and Existing Class {existing_label}")
                                # If not adding new LoRA, associate new class to this LoRA
                                if s_0 < similarity_threshold and isinstance(key, int):
                                    related_labels[key].append(label)
                                    print(f"Associated New Class {label} to LoRA adapter {key}")
                                break  # Assume one association per new class

    # Default: Freeze all attention_fc and LoRA adapters
    for param in student_model.attention_fc.parameters():
        param.requires_grad = False
    for lora in student_model.lora_adapters:
        for param in lora.parameters():
            param.requires_grad = False
    print("\nDefault: All attention_fc and LoRA adapters are frozen")

    print("to_unfreeze:", to_unfreeze)
    
    print("After freezing attention_fc & LoRA adapter:")
    for param in student_model.attention_fc.parameters():
        print(f"attention_fc param.requires_grad: {param.requires_grad}")
    for lora in student_model.lora_adapters:
        for param in lora.parameters():
           print(f"lora param.requires_grad: {lora}: {param.requires_grad}")

    # Unfreeze networks that need to be updated
    if 'attention_fc' in to_unfreeze:
        for param in student_model.attention_fc.parameters():
            param.requires_grad = True
        print("Unfroze attention_fc")

    print("After recognizing attention_fc (default: False):")
    for param in student_model.attention_fc.parameters():
        print(f"param.requires_grad: {param.requires_grad}")
    for lora in student_model.lora_adapters:
        for param in lora.parameters():
           print(f"lora param.requires_grad: {lora}: {param.requires_grad}")
        
    for lora_index in to_unfreeze:
        if isinstance(lora_index, int):
            for param in student_model.lora_adapters[lora_index].parameters():
                param.requires_grad = True
            print(f"Unfroze LoRA adapter {lora_index}")

    print("After recognizing LoRA adapters (default: False):")
    for param in student_model.attention_fc.parameters():
        print(f"attention_fc param.requires_grad: {param.requires_grad}")
    for lora in student_model.lora_adapters:
        for param in lora.parameters():
           print(f"lora param.requires_grad: {lora}: {param.requires_grad}")

    # FC layer is always trainable
    for param in student_model.fc.parameters():
        param.requires_grad = True
    print("FC layer remains trainable")

    # Print freeze status before training
    print("\nFreeze Status Before Training:")

    # Check attention_fc
    for param in student_model.attention_fc.parameters():
        print(f"attention_fc param.requires_grad: {param.requires_grad}")
    for lora in student_model.lora_adapters:
        for param in lora.parameters():
           print(f"lora param.requires_grad: {lora}: {param.requires_grad}")
        
    # Check each LoRA adapter
    for i, lora in enumerate(student_model.lora_adapters):
        lora_frozen = all(not param.requires_grad for param in lora.parameters())
        print(f"LoRA adapter {i}: {'Frozen' if lora_frozen else 'Unfrozen'}")
        
    # Check fc layer
    fc_frozen = all(not param.requires_grad for param in student_model.fc.parameters())
    print(f"fc layer: {'Frozen' if fc_frozen else 'Unfrozen'}")

    # Update optimizer to only optimize trainable parameters
    optimizer = optim.Adam(
        filter(lambda p: p.requires_grad, student_model.parameters()),
        lr=optimizer.param_groups[0]['lr']
    )

    print(f"Current Related_labels: {related_labels}")

    for epoch in range(num_epochs):
        epoch_loss = 0.0
        class_correct, class_total = {}, {}

        if stop_signal_file and os.path.exists(stop_signal_file):
            print("\n🛑 Stop signal detected. Exiting training loop safely.\n")
            break

        student_model.train()
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            student_logits = student_model(X_batch)
            student_logits_flat = student_logits.view(-1, output_size)
            y_batch = y_batch.view(-1)
            ce_loss = criterion(student_logits_flat, y_batch)
            
            # Compute class-wise accuracy
            compute_classwise_accuracy(student_logits_flat, y_batch, class_correct, class_total)

            # Knowledge Distillation
            with torch.no_grad():
                teacher_logits = teacher_model(X_batch)
            
            # Select stable classes for distillation
            stable_indices = torch.tensor(stable_classes, device=teacher_logits.device)
            teacher_stable = teacher_logits.index_select(dim=2, index=stable_indices)
            student_stable = student_logits.index_select(dim=2, index=stable_indices)

            # Compute KL Distillation Loss
            distill_loss = F.mse_loss(student_stable, teacher_stable)

            # Total loss: weighted sum of CE loss and distillation loss
            total_loss = alpha * distill_loss + (1 - alpha) * ce_loss
            total_loss.backward()
            optimizer.step()
            epoch_loss += total_loss.item() * X_batch.size(0)

        train_loss = epoch_loss / len(train_loader.dataset)

        # Compute per-class training accuracy
        train_classwise_accuracy = {int(c): f"{(class_correct[c] / class_total[c]) * 100:.2f}%" if class_total[c] > 0 else "0.00%" 
                                    for c in sorted(class_total.keys())}
        
        # Perform validation at the end of each epoch (only CE loss and accuracy)
        student_model.eval()
        val_loss, val_correct, val_total = 0.0, 0, 0
        val_class_correct, val_class_total = {}, {}
        with torch.no_grad():
            for X_val_batch, y_val_batch in val_loader:
                val_outputs = student_model(X_val_batch).view(-1, output_size)
                val_labels = y_val_batch.view(-1)
                val_loss += criterion(val_outputs, val_labels).item() * X_val_batch.size(0)
                val_predictions = torch.argmax(val_outputs, dim=-1)
                val_correct += (val_predictions == val_labels).sum().item()
                val_total += val_labels.size(0)

                # Compute per-class validation accuracy
                compute_classwise_accuracy(val_outputs, val_labels, val_class_correct, val_class_total)

        val_loss /= len(val_loader.dataset)
        val_accuracy = val_correct / val_total

        # Compute per-class validation accuracy
        val_classwise_accuracy = {int(c): f"{(val_class_correct[c] / val_class_total[c]) * 100:.2f}%" if val_class_total[c] > 0 else "0.00%" 
                                  for c in sorted(val_class_total.keys())}
        
        print(f"Epoch {epoch+1}/{num_epochs}, "
              f"Train Loss: {train_loss:.9f}, "
              f"Train-Class-Acc: {train_classwise_accuracy}, "
              f"Val Loss: {val_loss:.9f}, "
              f"Val Accuracy: {val_accuracy * 100:.2f}%, "
              f"Val-Class-Acc: {val_classwise_accuracy}, "
              f"LR: {optimizer.param_groups[0]['lr']:.9f}")
        
        # Save current model and update best results if applicable
        current_epoch_info = {
            "epoch": epoch+1,
            "train_loss": train_loss,
            "train_classwise_accuracy": train_classwise_accuracy,
            "val_loss": val_loss,
            "val_accuracy": val_accuracy,
            "val_classwise_accuracy": val_classwise_accuracy,
            "model_state_dict": student_model.state_dict(),
            "optimizer_state_dict": optimizer.state_dict(),
            "learning_rate": optimizer.param_groups[0]['lr'],
            "num_lora_adapters": len(student_model.lora_adapters),
            "related_labels": related_labels,  # Save related_labels
            "model_path": os.path.join(model_saving_folder, f"{model_name}_epoch_{epoch+1}.pth")
        }

        # Save top 5 models
        if len(best_results) < 5 or val_accuracy > best_results[-1]["val_accuracy"]:
            if len(best_results) == 5:
                # Remove the worst model from the list (lowest accuracy)
                worst = best_results.pop()
                if os.path.exists(worst["model_path"]):
                    os.remove(worst["model_path"])
                    print(f"🗑 Removed old model: {worst['model_path']} (Acc: {worst['val_accuracy']*100:.2f}%)")
            
            # Insert and sort by val_accuracy descending
            best_results.append(current_epoch_info)
            best_results.sort(key=lambda x: (x["val_accuracy"], x["epoch"]), reverse=True)
            
            torch.save({
                'epoch': epoch+1,
                'train_loss': train_loss,
                'val_loss': val_loss,
                'model_state_dict': student_model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'learning_rate': optimizer.param_groups[0]['lr'],
                'num_lora_adapters': len(student_model.lora_adapters),
                'related_labels': related_labels  # Save related_labels
            }, current_epoch_info["model_path"])
            print(f"✅ Model saved: {current_epoch_info['model_path']}")

        if use_scheduler == True:
            scheduler.step(val_loss)

    # Save best model
    if best_results:
        best_model_info = best_results[0]  
        best_model_path = os.path.join(model_saving_folder, f"{model_name}_best.pth")

        torch.save({
            'epoch': best_model_info["epoch"],
            'train_loss': best_model_info["train_loss"],
            'val_loss': best_model_info["val_loss"],
            'model_state_dict': best_model_info["model_state_dict"],
            'optimizer_state_dict': best_model_info["optimizer_state_dict"],
            'learning_rate': best_model_info["learning_rate"],
            'num_lora_adapters': best_model_info["num_lora_adapters"],
            'related_labels': related_labels  # Save related_labels
        }, best_model_path)
        print(f"\n🏆 Best model saved as: {best_model_path} (Val Accuracy: {best_model_info['val_accuracy'] * 100:.2f}%)")

    # Save the final model
    if 'current_epoch_info' in locals():
        final_model_path = os.path.join(model_saving_folder, f"{model_name}_final.pth")
        torch.save({
            'epoch': epoch+1,
            'train_loss': train_loss,
            'val_loss': val_loss,
            'model_state_dict': student_model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'learning_rate': optimizer.param_groups[0]['lr'],
            'num_lora_adapters': len(student_model.lora_adapters),
            'related_labels': related_labels  # Save related_labels
        }, final_model_path)
        print(f"\n📌 Final model saved as: {final_model_path}")
    else:
        print("\n⚠️ No final model saved")

    print("\n🎯 Top 5 Best Models by Validation Accuracy:")
    for res in best_results:        
        print(f"Epoch {res['epoch']}/{num_epochs}, "
              f"Train Loss: {res['train_loss']:.9f}, "
              f"Train-Class-Acc: {res['train_classwise_accuracy']},\n"
              f"Val Loss: {res['val_loss']:.9f}, "
              f"Val Accuracy: {res['val_accuracy'] * 100:.2f}%, "
              f"Val-Class-Acc: {res['val_classwise_accuracy']}, "
              f"Model Path: {res['model_path']}")
    
    # Save class features
    for label, feature in new_class_features.items():
        class_features_dict[label] = feature
    
    del X_train, y_train, X_val, y_val, train_loader, val_loader
    torch.cuda.empty_cache()
    gc.collect()
    
    return class_features_dict

In [None]:
# # Training and validation function for Period 2 and beyond 
# def train_and_validate_KL_Div(student_model, teacher_model, stable_classes, output_size, criterion, optimizer, 
#                               X_train, y_train, X_val, y_val, scheduler, use_scheduler=None, 
#                               num_epochs=10, batch_size=64, alpha=0.5, temperature=2.0,
#                               model_saving_folder=None, model_name=None, stop_signal_file=None):
#     """
#     Training and validation function for Period 2+ with KL Divergence distillation.

#     Args:
#         student_model: The new LoRA-based student model (with output size 3).
#         teacher_model: Frozen teacher model from period 1 (with output size 2).
#         stable_classes: List of class indices to distill (e.g., [1]).
#         output_size: Number of output classes for student model.
#         criterion: CrossEntropyLoss function.
#         optimizer: Optimizer for student model.
#         X_train, y_train, X_val, y_val: Training/validation data (NumPy arrays or similar).
#         scheduler: Learning rate scheduler.
#         use_scheduler: Boolean to enable scheduler (default None).
#         num_epochs: Number of epochs to train (default 10).
#         batch_size: Batch size for DataLoader (default 64).
#         alpha: Weighting factor for distillation loss (default 0.5).
#         temperature: Temperature for softening probabilities in distillation (default 2.0).
#         model_saving_folder: Folder to save models (default None).
#         model_name: Base name for saved models (default None).
#         stop_signal_file: File path to check for early stopping (default None).
#     """
#     print("\n🚀 'train_and_validate_KL_Div' function started.\n")

#     # Ensure the model saving folder exists (delete if it already exists)
#     if model_saving_folder:
#         if os.path.exists(model_saving_folder):
#             shutil.rmtree(model_saving_folder)
#             print(f"✅ Removed existing folder: {model_saving_folder}")
#         os.makedirs(model_saving_folder, exist_ok=True)
        
#     # Default model saving settings
#     if not model_saving_folder:
#         model_saving_folder = './saved_models'
#     if not model_name:
#         model_name = 'model'

#     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#     student_model.to(device)
#     teacher_model.to(device)

#     # Convert data to tensors and move to device
#     X_train = torch.tensor(X_train, dtype=torch.float32).to(device)
#     y_train = torch.tensor(y_train, dtype=torch.long).to(device)
#     X_val = torch.tensor(X_val, dtype=torch.float32).to(device)
#     y_val = torch.tensor(y_val, dtype=torch.long).to(device)

#     # Create Dataset & DataLoader
#     train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=batch_size, shuffle=True)
#     val_loader = DataLoader(TensorDataset(X_val, y_val), batch_size=batch_size, shuffle=False)

#     # Print dataset information
#     print("\n✅ Data Overview:")
#     print(f"X_train Shape: {X_train.shape} | y_train Shape: {y_train.shape}")
#     print(f"X_val Shape: {X_val.shape} | y_val Shape: {y_val.shape}")
#     print(f"Unique values in y_train: {y_train.unique()}")
#     print(f"Unique values in y_val: {y_val.unique()}")

#     global best_results  # Ensure we can modify the external variable if defined outside.
#     best_results = []    # Start empty each training run
#     stable_indices = torch.tensor(stable_classes, device=device)
#     teacher_model.eval()  # Ensure teacher is frozen
    
#     for epoch in range(num_epochs):
#         epoch_loss = 0.0
#         class_correct, class_total = {}, {}
        
#         if stop_signal_file and os.path.exists(stop_signal_file):
#             print("\n🛑 Stop signal detected. Exiting training loop safely.\n")
#             break

#         student_model.train()
#         i = 0
#         for X_batch, y_batch in train_loader:
#             # Reset gradients before forward pass
#             optimizer.zero_grad()

#             # Forward pass: student model produces logits for output_size classes.
#             student_logits = student_model(X_batch)  # Shape: [batch, seq_len, output_size]

#             # Reshape for CE loss computation.
#             student_logits_flat = student_logits.view(-1, output_size)
#             y_batch_flat = y_batch.view(-1)

#             # Compute Cross-Entropy loss
#             ce_loss = criterion(student_logits_flat, y_batch_flat)

#             # Compute class-wise accuracy (Accumulates values in dict)
#             compute_classwise_accuracy(student_logits_flat, y_batch_flat, class_correct, class_total)
            
#             if epoch == 1 and i < 3:
#                 i += 1
#                 print(f"\nBatch {i} Debug Info:")
#                 print(f"Unique target values: {y_batch_flat.unique()}")
#                 print(f"Target dtype: {y_batch_flat.dtype}")
#                 print(f"Min target: {y_batch_flat.min()}, Max target: {y_batch_flat.max()}")

#             # Knowledge Distillation: Forward pass through teacher (pre-trained on previous period data).
#             with torch.no_grad():
#                 teacher_logits = teacher_model(X_batch)  # Shape: [batch, seq_len, teacher_output_size]

#             # Select stable classes for distillation
#             """
#             Use stable_classes (a list of indices) to extract the relevant logits.
#             We distill only the stable classes (class 1 if teacher is from period 1).
#             Teacher's class index 1 corresponds to student's class index 1.
#             It's safer to use index_select to ensure the operation works on GPU.
#             """
#             teacher_stable = teacher_logits.index_select(dim=2, index=stable_indices)
#             student_stable = student_logits.index_select(dim=2, index=stable_indices)

#             # Compute softened probabilities for KL divergence
#             teacher_probs = F.softmax(teacher_stable / temperature, dim=2)
#             student_log_probs = F.log_softmax(student_stable / temperature, dim=2)
            
#             # Compute the KL divergence loss; note: multiplying by temperature^2 as in common distillation practice.
#             distill_loss = F.kl_div(student_log_probs, teacher_probs, reduction='batchmean') * (temperature ** 2)
                        
#             # Total loss: balance between cross-entropy and distillation loss.
#             total_loss = alpha * distill_loss + (1 - alpha) * ce_loss
#             total_loss.backward()
#             optimizer.step()
#             epoch_loss += total_loss.item() * X_batch.size(0)
            
#         train_loss = epoch_loss / len(train_loader.dataset)
#         train_classwise_accuracy = {int(c): f"{(class_correct[c] / class_total[c]) * 100:.2f}%" if class_total[c] > 0 else "0.00%" 
#                                     for c in sorted(class_total.keys())}

#         # Perform validation at the end of each epoch (only CE loss and accuracy)
#         student_model.eval()
#         val_loss, val_correct, val_total = 0.0, 0, 0
#         val_class_correct, val_class_total = {}, {}

#         with torch.no_grad():
#             for X_val_batch, y_val_batch in val_loader:
#                 val_outputs = student_model(X_val_batch).view(-1, output_size)
#                 val_labels = y_val_batch.view(-1)
#                 val_loss += criterion(val_outputs, val_labels).item() * X_val_batch.size(0)
#                 val_predictions = torch.argmax(val_outputs, dim=-1)
#                 val_correct += (val_predictions == val_labels).sum().item()
#                 val_total += val_labels.size(0)
                
#                 # Compute per-class validation accuracy
#                 compute_classwise_accuracy(val_outputs, val_labels, val_class_correct, val_class_total)

#         val_loss /= len(val_loader.dataset)
#         val_accuracy = val_correct / val_total

#         # Compute per-class validation accuracy
#         val_classwise_accuracy = {int(c): f"{(val_class_correct[c] / val_class_total[c]) * 100:.2f}%" if val_class_total[c] > 0 else "0.00%" 
#                                   for c in sorted(val_class_total.keys())}

#         print(f"Epoch {epoch+1}/{num_epochs}, "
#               f"Train Loss: {train_loss:.9f}, "
#               f"Train-Class-Acc: {train_classwise_accuracy}, "
#               f"Val Loss: {val_loss:.9f}, "
#               f"Val Accuracy: {val_accuracy * 100:.2f}%, "
#               f"Val-Class-Acc: {val_classwise_accuracy}, "
#               f"LR: {optimizer.param_groups[0]['lr']:.9f}")

#         # Save current model and update best results if applicable
#         current_epoch_info = {
#             "epoch": epoch+1,
#             "train_loss": train_loss,
#             "train_classwise_accuracy": train_classwise_accuracy,
#             "val_loss": val_loss,
#             "val_accuracy": val_accuracy,
#             "val_classwise_accuracy": val_classwise_accuracy,
#             "model_state_dict": student_model.state_dict(),
#             "optimizer_state_dict": optimizer.state_dict(),
#             "learning_rate": optimizer.param_groups[0]['lr'],
#             "model_path": os.path.join(model_saving_folder, f"{model_name}_epoch_{epoch+1}.pth")
#         }

#         # Save top 5 models
#         if len(best_results) < 5 or val_accuracy > best_results[-1]["val_accuracy"]:
#             if len(best_results) == 5:
#                 # Remove the worst model from the list, the last (lowest accuracy)
#                 worst = best_results.pop()
#                 if os.path.exists(worst["model_path"]):
#                     os.remove(worst["model_path"])
#                     print(f"🗑 Removed old model: {worst['model_path']} (Acc: {worst['val_accuracy']*100:.2f}%)")

#             # Just insert and sort by val_accuracy descending
#             best_results.append(current_epoch_info)
#             best_results.sort(key=lambda x: (x["val_accuracy"], x["epoch"]), reverse=True)

#             try:
#                 torch.save({
#                     'epoch': epoch+1,
#                     'train_loss': train_loss,
#                     'val_loss': val_loss,
#                     'model_state_dict': student_model.state_dict(),
#                     'optimizer_state_dict': optimizer.state_dict(),
#                     'learning_rate': optimizer.param_groups[0]['lr']
#                 }, current_epoch_info["model_path"])
#                 print(f"✅ Model saved: {current_epoch_info['model_path']}")
#             except Exception as e:
#                 print(f"❌ Failed to save model {current_epoch_info['model_path']}: {e}")

#         if use_scheduler:
#             scheduler.step(val_loss)

#     # Save best model
#     if best_results:
#         best_model_info = best_results[0]
#         best_model_path = os.path.join(model_saving_folder, f"{model_name}_best.pth")
#         try:
#             torch.save({
#                 'epoch': best_model_info["epoch"],
#                 'train_loss': best_model_info["train_loss"],
#                 'val_loss': best_model_info["val_loss"],
#                 'model_state_dict': best_model_info["model_state_dict"],
#                 'optimizer_state_dict': best_model_info["optimizer_state_dict"],
#                 'learning_rate': best_model_info["learning_rate"]
#             }, best_model_path)
#             print(f"\n🏆 Best model saved as: {best_model_path} (Val Accuracy: {best_model_info['val_accuracy'] * 100:.2f}%)")
#         except Exception as e:
#             print(f"❌ Failed to save best model {best_model_path}: {e}")
#     else:
#         print("\n⚠️ No best model saved due to early termination or no epochs completed.")

#     # Save the final model
#     if 'current_epoch_info' in locals():
#         final_model_path = os.path.join(model_saving_folder, f"{model_name}_final.pth")
#         try:
#             torch.save({
#                 'epoch': epoch+1,
#                 'train_loss': train_loss,
#                 'val_loss': val_loss,
#                 'model_state_dict': student_model.state_dict(),
#                 'optimizer_state_dict': optimizer.state_dict(),
#                 'learning_rate': optimizer.param_groups[0]['lr']
#             }, final_model_path)
#             print(f"\n📌 Final model saved as: {final_model_path}")
#         except Exception as e:
#             print(f"❌ Failed to save final model {final_model_path}: {e}")
#     else:
#         print("\n⚠️ No final model saved due to early termination before first epoch.")

#     print("\n🎯 Top 5 Best Models by Validation Accuracy:")
#     for res in best_results:
#         print(f"Epoch {res['epoch']}/{num_epochs}, "
#               f"Train Loss: {res['train_loss']:.9f}, "
#               f"Train-Class-Acc: {res['train_classwise_accuracy']},\n"
#               f"Val Loss: {res['val_loss']:.9f}, "
#               f"Val Accuracy: {res['val_accuracy'] * 100:.2f}%, "
#               f"Val-Class-Acc: {res['val_classwise_accuracy']}, "
#               f"Model Path: {res['model_path']}")

#     del X_train, y_train, X_val, y_val, train_loader, val_loader
#     torch.cuda.empty_cache()

## __Setup before training__

### Define list_period_files_full_path

In [12]:
def setup_file_paths(pair='BTCUSD', base_dir='Data', days=190):
    """
    Set up file paths for cryptocurrency data across multiple periods.

    Args:
        pair (str): Trading pair (e.g., 'BTCUSD').
        base_dir (str): Base directory for data storage (default 'Data').
        days (int): Number of days for each period (default 190).

    Returns:
        tuple: (base_folder_path, with_indicators_file_path, list_period_files_full_path)
    """
    # Define base file name and folder structure
    file_name = f"Polygon_{pair}_4Y_1min"
    base_folder_path = os.path.normpath(os.path.join(base_dir, file_name))
    
    # Check if folder exists
    if not os.path.isdir(base_folder_path):
        raise FileNotFoundError(f"Directory '{base_folder_path}' does not exist.")

    # Define file path with indicators for Period 1
    with_indicators_file_path = os.path.normpath(
        os.path.join(base_folder_path, f"_{file_name}_{days}_days_with_indicators.csv")
    )

    # Define file paths for all periods
    list_period_files_full_path = [
        # Period 1
        with_indicators_file_path,
        # Period 2: 2020-11-11 to 2021-05-20
        os.path.normpath(os.path.join(
            base_folder_path, f"{file_name}_{days}_days__2020-11-11__2021-05-20__with_indicators.csv"
        )),
        # Period 3: 2021-05-20 to 2021-11-26
        os.path.normpath(os.path.join(
            base_folder_path, f"{file_name}_{days}_days__2021-05-20__2021-11-26__with_indicators.csv"
        )),
        # Period 4: 2021-11-26 to 2022-06-04
        os.path.normpath(os.path.join(
            base_folder_path, f"{file_name}_{days}_days__2021-11-26__2022-06-04__with_indicators.csv"
        )),
        # Period 5: 2022-06-04 to 2022-12-11
        os.path.normpath(os.path.join(
            base_folder_path, f"{file_name}_{days}_days__2022-06-04__2022-12-11__with_indicators.csv"
        )),
    ]

    return base_folder_path, with_indicators_file_path, list_period_files_full_path

def print_folder_contents(folder_path):
    """Print all files in the specified folder."""
    print("\n📂 Folder Contents:")
    for file in os.listdir(folder_path):
        print(f"Found file: {file}")

if __name__ == "__main__":
    # Set up paths
    base_folder_path, with_indicators_file_path, list_period_files_full_path = setup_file_paths()

    # Print results
    print("=" * 70)
    print("File Path Configuration".center(70))
    print("=" * 70)
    
    print(f"{'Base Folder Path':<25}: {base_folder_path}")
    print(f"{'Period 1 File Path':<25}: {with_indicators_file_path}")
    print("-" * 70)
    
    print("List of Period Files:")
    for i, path in enumerate(list_period_files_full_path, 1):
        print(f"{'Period ' + str(i):<25}: {path}")
    
    print("=" * 70)

    # Print folder contents
    print_folder_contents(base_folder_path)

                       File Path Configuration                        
Base Folder Path         : Data\Polygon_BTCUSD_4Y_1min
Period 1 File Path       : Data\Polygon_BTCUSD_4Y_1min\_Polygon_BTCUSD_4Y_1min_190_days_with_indicators.csv
----------------------------------------------------------------------
List of Period Files:
Period 1                 : Data\Polygon_BTCUSD_4Y_1min\_Polygon_BTCUSD_4Y_1min_190_days_with_indicators.csv
Period 2                 : Data\Polygon_BTCUSD_4Y_1min\Polygon_BTCUSD_4Y_1min_190_days__2020-11-11__2021-05-20__with_indicators.csv
Period 3                 : Data\Polygon_BTCUSD_4Y_1min\Polygon_BTCUSD_4Y_1min_190_days__2021-05-20__2021-11-26__with_indicators.csv
Period 4                 : Data\Polygon_BTCUSD_4Y_1min\Polygon_BTCUSD_4Y_1min_190_days__2021-11-26__2022-06-04__with_indicators.csv
Period 5                 : Data\Polygon_BTCUSD_4Y_1min\Polygon_BTCUSD_4Y_1min_190_days__2022-06-04__2022-12-11__with_indicators.csv

📂 Folder Contents:
Found file: Polyg

### __All periods data__
'trend': Categorized trend values based on the detected phases:
- 0: No trend
- 1: Moderate negative trend
- 2: Very strong negative trend
- 3: Moderate positive trend
- 4: Very strong positive trend


## __Train the Model__

---
### Period 1 (num_layers = 4, lora_r=4)
+ ##### BiGRUWithAttentionLoRA
+ ##### Training and saving in *'LoRA_v1/Rank_4_Period_1/1st_try'*
#### __Val Accuracy: 98.80%__
#### __Val-Class-Acc: {0: '99.15%', 1: '98.25%'}__

In [11]:
with contextlib.redirect_stdout(open(os.devnull, 'w')):
    X_train, y_train, X_val, y_val, X_test, y_test, Number_features = process_and_return_splits(
        with_indicators_file_path = list_period_files_full_path[0], # Change 
        downsampled_data_minutes = downsampled_data_minutes,
        exclude_columns = exclude_columns,
        lower_threshold = lower_threshold,
        upper_threshold = upper_threshold,
        reverse_steps = reverse_steps,
        sequence_length = sequence_length,
        sliding_interval = sliding_interval,
        trends_to_keep = {0, 1}  # Default keeps all trends : {0, 1, 2, 3, 4}
    )

print(f"\nNumber_features = {Number_features}")

unique_classes = np.unique(y_val)
num_classes = len(unique_classes)
print(f"unique_classes = {unique_classes}")
print(f"num_classes = {num_classes}")



Number_features = 7
unique_classes = [0 1]
num_classes = 2


In [12]:
# Model hyperparameters
input_size = Number_features  # Number of input features
hidden_size = 64  # Number of GRU units
output_size = num_classes  # Number of trend classes (2 for Period 1: {0, 1})
num_layers = 4  # Number of GRU layers
dropout = 0.0  # Dropout rate
lora_r = 4  # Rank of LoRA matrices (not used in Period 1, but defined for consistency)
num_epochs = 2000  # Number of training epochs
batch_size = 64  # Batch size for DataLoader
model_name = 'BiGRUWithAttention'  # Model name for saving
best_results = []  # List to store best results
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define paths for stop signal and model saving
stop_signal_file = os.path.normpath(os.path.join('Class_Incremental_CL', 'David_Classif_Bi_Dir_GRU_Model/stop_training.txt'))
model_saving_folder = os.path.normpath(os.path.join('Class_Incremental_CL', "David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_1/1st_try"))
ensure_folder(model_saving_folder)

# Instantiate the model (no LoRA adapters added for Period 1)
class_gru_model = BiGRUWithAttention(input_size, hidden_size, output_size, num_layers, dropout, lora_r).to(device)

# Define loss function, optimizer, and scheduler
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(class_gru_model.parameters(), lr=0.0001)  # Optimize all parameters for Period 1
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.9, patience=10)

# Start training
train_and_validate(class_gru_model, output_size, criterion, optimizer, X_train, y_train, X_val, y_val, scheduler, 
                   use_scheduler=False, num_epochs=num_epochs, batch_size=batch_size, 
                   model_saving_folder=model_saving_folder, model_name=model_name, stop_signal_file=stop_signal_file)

# Print model and class information after training
print(f"\nclass_gru_model: \n{class_gru_model}")
print(f"\nunique_classes = {unique_classes}")
print(f"num_classes = {num_classes}")

# Clean up memory
for var in ["X_train", "y_train", "X_val", "y_val", "X_test", "y_test", "Number_features", "unique_classes", "num_classes"]:
    if var in locals():
        del locals()[var]
gc.collect()
torch.cuda.empty_cache()



🚀 'train_and_validate' function started.

✅ Removed existing folder: Class_Incremental_CL\David_Classif_Bi_Dir_GRU_Model\Trained_models\LoRA_v2\Rank_4_Period_1\1st_try

✅ Data Overview:
X_train Shape: torch.Size([3634, 1000, 7]) | y_train Shape: torch.Size([3634, 1000])
X_val Shape: torch.Size([454, 1000, 7]) | y_val Shape: torch.Size([454, 1000])
Epoch 1/2000, Train Loss: 0.679526536, Train-Class-Acc: {0: '99.08%', 1: '0.91%'}, Val Loss: 0.670811959, Val Accuracy: 60.61%, Val-Class-Acc: {0: '100.00%', 1: '0.00%'}, LR: 0.000100000
✅ Model saved: Class_Incremental_CL\David_Classif_Bi_Dir_GRU_Model\Trained_models\LoRA_v2\Rank_4_Period_1\1st_try\BiGRUWithAttention_epoch_1.pth
Epoch 2/2000, Train Loss: 0.659214080, Train-Class-Acc: {0: '100.00%', 1: '0.00%'}, Val Loss: 0.671974270, Val Accuracy: 60.61%, Val-Class-Acc: {0: '100.00%', 1: '0.00%'}, LR: 0.000100000
✅ Model saved: Class_Incremental_CL\David_Classif_Bi_Dir_GRU_Model\Trained_models\LoRA_v2\Rank_4_Period_1\1st_try\BiGRUWithAttent

---
### Period 2 (num_layers = 4, lora_r=4, alpha = 0.5)
+ ##### BiGRUWithAttention
+ ##### Training and saving in *'LoRA_v2/Rank_4_Period_2/alpha_0.5'*
#### __Val Accuracy: 97.88%__
#### __Val-Class-Acc: {0: '99.10%', 1: '97.37%', 2: '93.64%'}__

In [13]:
with contextlib.redirect_stdout(open(os.devnull, 'w')):
    X_train, y_train, X_val, y_val, X_test, y_test, Number_features = process_and_return_splits(
        with_indicators_file_path = list_period_files_full_path[1], # Change 
        downsampled_data_minutes = downsampled_data_minutes,
        exclude_columns = exclude_columns,
        lower_threshold = lower_threshold,
        upper_threshold = upper_threshold,
        reverse_steps = reverse_steps,
        sequence_length = sequence_length,
        sliding_interval = sliding_interval,
        trends_to_keep = {0, 1, 2}  # Default keeps all trends : {0, 1, 2, 3, 4}
    )

print(f"\nNumber_features = {Number_features}")

unique_classes = np.unique(y_val)
num_classes = len(unique_classes)
print(f"unique_classes = {unique_classes}")
print(f"num_classes = {num_classes}")



Number_features = 7
unique_classes = [0 1 2]
num_classes = 3


In [14]:
# Model parameters
stable_classes = [1]    # From Period 1: Exclude class 0 because it changes
input_size = Number_features    # Number of input features
hidden_size = 64    # Number of GRU units
output_size = num_classes   # Number of trend classes (3 for Period 2: {0, 1, 2})
num_layers = 4  # Number of GRU layers
dropout = 0.0   # Dropout rate
lora_r = 4      # Rank of LoRA matrices
learning_rate = 0.0001  # Learning rate
alpha = 0.5  # Weighting factor for distillation loss
num_epochs = 2000   # Number of training epochs
batch_size = 64     # Batch size for DataLoader
model_name = 'BiGRUWithAttention'   # Model name for saving
best_results = []   # List to store best results
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define paths
stop_signal_file = os.path.normpath(os.path.join('Class_Incremental_CL', 'David_Classif_Bi_Dir_GRU_Model/stop_training.txt'))
model_saving_folder = os.path.normpath(os.path.join('Class_Incremental_CL', "David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_2/alpha_0.5"))
ensure_folder(model_saving_folder)

# Load class features from Period 1
class_features_path = os.path.normpath(os.path.join('Class_Incremental_CL', "David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_1/alpha_0.5/class_features.pkl"))
if os.path.exists(class_features_path):
    with open(class_features_path, 'rb') as f:
        class_features_dict = pickle.load(f)
    print(f"Loaded class features from: {class_features_path}")
else:
    class_features_dict = {}
    print("No previous class features found.")

# Instantiate models
student_model = BiGRUWithAttention(input_size, hidden_size, output_size, num_layers, dropout, lora_r).to(device)
teacher_model = BiGRUWithAttention(input_size, hidden_size, output_size - 1, num_layers, dropout, lora_r).to(device)

# Load teacher model from Period 1
teacher_checkpoint_path = "Class_Incremental_CL/David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_1/1st_try/BiGRUWithAttention_best.pth"
teacher_checkpoint = torch.load(teacher_checkpoint_path, map_location=device, weights_only=True)
num_lora_adapters = teacher_checkpoint.get('num_lora_adapters', 0)  # Period 1 may not have LoRA adapters
related_labels = {'attention_fc': [0, 1]}  # Initialize related_labels for Period 2
print(f"Teacher model checkpoint has {num_lora_adapters} LoRA adapters.")
print(f"Initialized related_labels: {related_labels}")

# Add LoRA adapters to teacher_model before loading state_dict
for _ in range(num_lora_adapters):
    teacher_model.add_lora_adapter()
teacher_model.load_state_dict(teacher_checkpoint['model_state_dict'])
print(f"Loaded teacher model from: \n\t{teacher_checkpoint_path}")
print(f"Teacher model now has {len(teacher_model.lora_adapters)} LoRA adapters.")
print(f"\n{teacher_model}\n")
del teacher_checkpoint
gc.collect()

# Add the same number of LoRA adapters to student_model
for _ in range(num_lora_adapters):
    student_model.add_lora_adapter()
print(f"Student model now has {len(student_model.lora_adapters)} LoRA adapters.")

# Copy teacher weights to student (excluding FC layer)
state_dict_teacher = teacher_model.state_dict()
state_dict_student = student_model.state_dict()
for name, param in state_dict_teacher.items():
    if 'fc' not in name:
        state_dict_student[name].copy_(param)
student_model.load_state_dict(state_dict_student)

# Define loss, optimizer, and scheduler
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(student_model.parameters(), lr=learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.9, patience=10)

class_features_dict = train_and_validate_lora(
    student_model, 
    teacher_model, 
    stable_classes, 
    output_size, 
    criterion, 
    optimizer, 
    X_train, 
    y_train, 
    X_val, 
    y_val, 
    scheduler, 
    use_scheduler=False, 
    num_epochs=num_epochs, 
    batch_size=batch_size, 
    alpha=alpha, 
    model_saving_folder=model_saving_folder, 
    model_name=model_name, 
    stop_signal_file=stop_signal_file,
    class_features_dict=class_features_dict,
    tau_high=0.5,  # Adjusted to 0.5
    tau_low=0.5,   # Adjusted to 0.5
    related_labels=related_labels  # Pass initialized related_labels
)

# Save class features
class_features_path = os.path.join(model_saving_folder, "class_features.pkl")
with open(class_features_path, 'wb') as f:
    pickle.dump(class_features_dict, f)
print(f"Saved class features to: {class_features_path}")

print(f"\nstudent_model: \n{student_model}\n")
print(f"unique_classes = {unique_classes}")
print(f"num_classes = {num_classes}")

# Clean up
for var in ["X_train", "y_train", "X_val", "y_val", "X_test", "y_test", "Number_features", "unique_classes", "num_classes"]:
    if var in locals():
        del locals()[var]
gc.collect()
torch.cuda.empty_cache()

No previous class features found.
Teacher model checkpoint has 0 LoRA adapters.
Initialized related_labels: {'attention_fc': [0, 1]}
Loaded teacher model from: 
	Class_Incremental_CL/David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_1/1st_try/BiGRUWithAttention_best.pth
Teacher model now has 0 LoRA adapters.

BiGRUWithAttention(
  (gru): GRU(7, 64, num_layers=4, batch_first=True, bidirectional=True)
  (attention_fc): Linear(in_features=128, out_features=128, bias=True)
  (lora_adapters): ModuleList()
  (fc): Linear(in_features=128, out_features=2, bias=True)
  (dropout): Dropout(p=0.0, inplace=False)
)

Student model now has 0 LoRA adapters.

🚀 'train_and_validate_lora' function started.

Initial related_labels: {'attention_fc': [0, 1]}
✅ Removed existing folder: Class_Incremental_CL\David_Classif_Bi_Dir_GRU_Model\Trained_models\LoRA_v2\Rank_4_Period_2\alpha_0.5

✅ Data Overview:
X_train Shape: torch.Size([3634, 1000, 7]) | y_train Shape: torch.Size([3634, 1000])
X_va

---
### Period 3 (num_layers = 4, lora_r=4, alpha = 0.5)
+ ##### BiGRUWithAttention
+ ##### Training and saving in *'LoRA_v2/Rank_4_Period_3/alpha_0.5'*
#### __Val Accuracy: 96.52%__
#### __Val-Class-Acc: {0: '89.71%', 1: '97.58%', 2: '98.98%', 3: '96.79%'}__

In [15]:
with contextlib.redirect_stdout(open(os.devnull, 'w')):
    X_train, y_train, X_val, y_val, X_test, y_test, Number_features = process_and_return_splits(
        with_indicators_file_path = list_period_files_full_path[2], # Change 
        downsampled_data_minutes = downsampled_data_minutes,
        exclude_columns = exclude_columns,
        lower_threshold = lower_threshold,
        upper_threshold = upper_threshold,
        reverse_steps = reverse_steps,
        sequence_length = sequence_length,
        sliding_interval = sliding_interval,
        trends_to_keep = {0, 1, 2, 3}  # Default keeps all trends : {0, 1, 2, 3, 4}
    )

print(f"\nNumber_features = {Number_features}")

unique_classes = np.unique(y_val)
num_classes = len(unique_classes)
print(f"unique_classes = {unique_classes}")
print(f"num_classes = {num_classes}")



Number_features = 7
unique_classes = [0 1 2 3]
num_classes = 4


In [16]:
# Model parameters
stable_classes = [1, 2]  # From Period 2: Exclude class 0 because it changes
input_size = Number_features    # Number of input features
hidden_size = 64    # Number of GRU units
output_size = num_classes   # Number of trend classes
num_layers = 4  # Number of GRU layers
dropout = 0.0   # Dropout rate
lora_r = 4      # Rank of LoRA matrices
learning_rate = 0.0001  # Learning rate
alpha = 0.5  # Weighting factor for distillation loss
num_epochs = 2000   # Number of training epochs
batch_size = 64     # Batch size for DataLoader
model_name = 'BiGRUWithAttention'   # Model name for saving
best_results = []   # List to store best results
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define paths
stop_signal_file = os.path.normpath(os.path.join('Class_Incremental_CL', 'David_Classif_Bi_Dir_GRU_Model/stop_training.txt'))
model_saving_folder = os.path.normpath(os.path.join('Class_Incremental_CL', "David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_3/alpha_0.5"))
ensure_folder(model_saving_folder)

# Load class features from Period 2
class_features_path = os.path.normpath(os.path.join('Class_Incremental_CL', "David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_2/alpha_0.5/class_features.pkl"))
if os.path.exists(class_features_path):
    with open(class_features_path, 'rb') as f:
        class_features_dict = pickle.load(f)
    print(f"Loaded class features from: {class_features_path}")
else:
    class_features_dict = {}
    print("No previous class features found.")

# Instantiate models
student_model = BiGRUWithAttention(input_size, hidden_size, output_size, num_layers, dropout, lora_r).to(device)
teacher_model = BiGRUWithAttention(input_size, hidden_size, output_size - 1, num_layers, dropout, lora_r).to(device)

# Load teacher model from Period 2
teacher_checkpoint_path = "Class_Incremental_CL/David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_2/alpha_0.5/BiGRUWithAttention_best.pth"
teacher_checkpoint = torch.load(teacher_checkpoint_path, map_location=device, weights_only=True)
num_lora_adapters = teacher_checkpoint.get('num_lora_adapters', 0)
related_labels = teacher_checkpoint.get('related_labels', {'attention_fc': [0, 1]})  # Load related_labels, default if not found
print(f"Teacher model checkpoint has {num_lora_adapters} LoRA adapters.")
print(f"Loaded related_labels: {related_labels}")

# Add LoRA adapters to teacher_model before loading state_dict
for _ in range(num_lora_adapters):
    teacher_model.add_lora_adapter()
teacher_model.load_state_dict(teacher_checkpoint['model_state_dict'])
print(f"Loaded teacher model from: \n\t{teacher_checkpoint_path}")
print(f"Teacher model now has {len(teacher_model.lora_adapters)} LoRA adapters.")
print(f"\n{teacher_model}\n")
del teacher_checkpoint
gc.collect()

# Add the same number of LoRA adapters to student_model
for _ in range(num_lora_adapters):
    student_model.add_lora_adapter()
print(f"Student model now has {len(student_model.lora_adapters)} LoRA adapters.")

# Copy teacher weights to student (excluding FC layer)
state_dict_teacher = teacher_model.state_dict()
state_dict_student = student_model.state_dict()
for name, param in state_dict_teacher.items():
    if 'fc' not in name:
        state_dict_student[name].copy_(param)
student_model.load_state_dict(state_dict_student)

# Define loss, optimizer, and scheduler
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(student_model.parameters(), lr=learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.9, patience=10)

# Train and validate with related_labels
class_features_dict = train_and_validate_lora(
    student_model, 
    teacher_model, 
    stable_classes, 
    output_size, 
    criterion, 
    optimizer, 
    X_train, 
    y_train, 
    X_val, 
    y_val, 
    scheduler, 
    use_scheduler=False, 
    num_epochs=num_epochs, 
    batch_size=batch_size, 
    alpha=alpha, 
    model_saving_folder=model_saving_folder, 
    model_name=model_name, 
    stop_signal_file=stop_signal_file,
    class_features_dict=class_features_dict,
    tau_high=0.5,  # Adjusted to 0.5
    tau_low=0.5,   # Adjusted to 0.5
    related_labels=related_labels  # Pass related_labels
)

# Save class features
class_features_path = os.path.join(model_saving_folder, "class_features.pkl")
with open(class_features_path, 'wb') as f:
    pickle.dump(class_features_dict, f)
print(f"Saved class features to: {class_features_path}")

print(f"\nstudent_model: \n{student_model}\n")
print(f"unique_classes = {unique_classes}")
print(f"num_classes = {num_classes}")

# Clean up
for var in ["X_train", "y_train", "X_val", "y_val", "X_test", "y_test", "Number_features", "unique_classes", "num_classes"]:
    if var in locals():
        del locals()[var]
gc.collect()
torch.cuda.empty_cache()

Loaded class features from: Class_Incremental_CL\David_Classif_Bi_Dir_GRU_Model\Trained_models\LoRA_v2\Rank_4_Period_2\alpha_0.5\class_features.pkl
Teacher model checkpoint has 1 LoRA adapters.
Loaded related_labels: {'attention_fc': [0, 1], 0: [2]}
Added LoRA adapter, total adapters: 1, on device: cuda:0
Loaded teacher model from: 
	Class_Incremental_CL/David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_2/alpha_0.5/BiGRUWithAttention_best.pth
Teacher model now has 1 LoRA adapters.

BiGRUWithAttention(
  (gru): GRU(7, 64, num_layers=4, batch_first=True, bidirectional=True)
  (attention_fc): Linear(in_features=128, out_features=128, bias=True)
  (lora_adapters): ModuleList(
    (0): LoRA(
      (linear): Linear(in_features=128, out_features=128, bias=True)
    )
  )
  (fc): Linear(in_features=128, out_features=3, bias=True)
  (dropout): Dropout(p=0.0, inplace=False)
)

Added LoRA adapter, total adapters: 1, on device: cuda:0
Student model now has 1 LoRA adapters.

🚀 'tr

---
### Period 4 (num_layers = 4, lora_r=4, alpha = 0.5)
+ ##### BiGRUWithAttention
+ ##### Training and saving in *'LoRA_v2/Rank_4_Period_4/alpha_0.5'*
#### __Val Accuracy: 96.07__
#### __Val-Class-Acc: {0: '92.27%', 1: '97.03%', 2: '92.97%', 3: '97.60%', 4: '93.94%'}__

In [17]:
with contextlib.redirect_stdout(open(os.devnull, 'w')):
    X_train, y_train, X_val, y_val, X_test, y_test, Number_features = process_and_return_splits(
        with_indicators_file_path = list_period_files_full_path[3], # Change 
        downsampled_data_minutes = downsampled_data_minutes,
        exclude_columns = exclude_columns,
        lower_threshold = lower_threshold,
        upper_threshold = upper_threshold,
        reverse_steps = reverse_steps,
        sequence_length = sequence_length,
        sliding_interval = sliding_interval,
        trends_to_keep = {0, 1, 2, 3, 4}  # Default keeps all trends : {0, 1, 2, 3, 4}
    )

print(f"\nNumber_features = {Number_features}")

unique_classes = np.unique(y_val)
num_classes = len(unique_classes)
print(f"unique_classes = {unique_classes}")
print(f"num_classes = {num_classes}")



Number_features = 7
unique_classes = [0 1 2 3 4]
num_classes = 5


In [18]:
# Model parameters
stable_classes = [1, 2, 3]  # From Period 3: Exclude class 0 because it changes
input_size = Number_features    # Number of input features
hidden_size = 64    # Number of GRU units
output_size = num_classes   # Number of trend classes (5 for Period 4: {0, 1, 2, 3, 4})
num_layers = 4  # Number of GRU layers
dropout = 0.0   # Dropout rate
lora_r = 4      # Rank of LoRA matrices
learning_rate = 0.0001  # Learning rate
alpha = 0.5  # Weighting factor for distillation loss
num_epochs = 2000   # Number of training epochs
batch_size = 64     # Batch size for DataLoader
model_name = 'BiGRUWithAttention'   # Model name for saving
best_results = []   # List to store best results
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define paths
stop_signal_file = os.path.normpath(os.path.join('Class_Incremental_CL', 'David_Classif_Bi_Dir_GRU_Model/stop_training.txt'))
model_saving_folder = os.path.normpath(os.path.join('Class_Incremental_CL', "David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_4/alpha_0.5"))
ensure_folder(model_saving_folder)

# Load class features from Period 3
class_features_path = os.path.normpath(os.path.join('Class_Incremental_CL', "David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_3/alpha_0.5/class_features.pkl"))
if os.path.exists(class_features_path):
    with open(class_features_path, 'rb') as f:
        class_features_dict = pickle.load(f)
    print(f"Loaded class features from: {class_features_path}")
else:
    class_features_dict = {}
    print("No previous class features found.")
    
# Instantiate models
student_model = BiGRUWithAttention(input_size, hidden_size, output_size, num_layers, dropout, lora_r).to(device)
teacher_model = BiGRUWithAttention(input_size, hidden_size, output_size - 1, num_layers, dropout, lora_r).to(device)

# Load teacher model from Period 3
teacher_checkpoint_path = "Class_Incremental_CL/David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_3/alpha_0.5/BiGRUWithAttention_best.pth"
teacher_checkpoint = torch.load(teacher_checkpoint_path, map_location=device, weights_only=True)
num_lora_adapters = teacher_checkpoint.get('num_lora_adapters', 0)
related_labels = teacher_checkpoint.get('related_labels', {'attention_fc': [0, 1]})  # Load related_labels, default if not found
print(f"Teacher model checkpoint has {num_lora_adapters} LoRA adapters.")
print(f"Loaded related_labels: {related_labels}")

# Add LoRA adapters to teacher_model before loading state_dict
for _ in range(num_lora_adapters):
    teacher_model.add_lora_adapter()
teacher_model.load_state_dict(teacher_checkpoint['model_state_dict'])
print(f"Loaded teacher model from: \n\t{teacher_checkpoint_path}")
print(f"Teacher model now has {len(teacher_model.lora_adapters)} LoRA adapters.")
print(f"\n{teacher_model}\n")
del teacher_checkpoint
gc.collect()

# Add the same number of LoRA adapters to student_model
for _ in range(num_lora_adapters):
    student_model.add_lora_adapter()
print(f"Student model now has {len(student_model.lora_adapters)} LoRA adapters.")

# Copy teacher weights to student (excluding FC layer)
state_dict_teacher = teacher_model.state_dict()
state_dict_student = student_model.state_dict()
for name, param in state_dict_teacher.items():
    if 'fc' not in name:
        state_dict_student[name].copy_(param)
student_model.load_state_dict(state_dict_student)

# Define loss, optimizer, and scheduler
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(student_model.parameters(), lr=learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.9, patience=10)

# Train and validate with related_labels
class_features_dict = train_and_validate_lora(
    student_model, 
    teacher_model, 
    stable_classes, 
    output_size, 
    criterion, 
    optimizer, 
    X_train, 
    y_train, 
    X_val, 
    y_val, 
    scheduler, 
    use_scheduler=False, 
    num_epochs=num_epochs, 
    batch_size=batch_size, 
    alpha=alpha, 
    model_saving_folder=model_saving_folder, 
    model_name=model_name, 
    stop_signal_file=stop_signal_file,
    class_features_dict=class_features_dict,
    tau_high=0.5,  # Adjusted to 0.5
    tau_low=0.5,   # Adjusted to 0.5
    related_labels=related_labels  # Pass related_labels
)

# Save class features
class_features_path = os.path.join(model_saving_folder, "class_features.pkl")
with open(class_features_path, 'wb') as f:
    pickle.dump(class_features_dict, f)
print(f"Saved class features to: {class_features_path}")

print(f"\nstudent_model: \n{student_model}\n")
print(f"unique_classes = {unique_classes}")
print(f"num_classes = {num_classes}")

# Clean up
for var in ["X_train", "y_train", "X_val", "y_val", "X_test", "y_test", "Number_features", "unique_classes", "num_classes"]:
    if var in locals():
        del locals()[var]
gc.collect()
torch.cuda.empty_cache()

Loaded class features from: Class_Incremental_CL\David_Classif_Bi_Dir_GRU_Model\Trained_models\LoRA_v2\Rank_4_Period_3\alpha_0.5\class_features.pkl
Teacher model checkpoint has 2 LoRA adapters.
Loaded related_labels: {'attention_fc': [0, 1], 0: [2], 1: [3]}
Added LoRA adapter, total adapters: 1, on device: cuda:0
Added LoRA adapter, total adapters: 2, on device: cuda:0
Loaded teacher model from: 
	Class_Incremental_CL/David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_3/alpha_0.5/BiGRUWithAttention_best.pth
Teacher model now has 2 LoRA adapters.

BiGRUWithAttention(
  (gru): GRU(7, 64, num_layers=4, batch_first=True, bidirectional=True)
  (attention_fc): Linear(in_features=128, out_features=128, bias=True)
  (lora_adapters): ModuleList(
    (0-1): 2 x LoRA(
      (linear): Linear(in_features=128, out_features=128, bias=True)
    )
  )
  (fc): Linear(in_features=128, out_features=4, bias=True)
  (dropout): Dropout(p=0.0, inplace=False)
)

Added LoRA adapter, total adapt

---
### Period 2 (num_layers = 4, lora_r=4, alpha = 0.4)
+ ##### BiGRUWithAttention
+ ##### Training and saving in *'LoRA_v2/Rank_4_Period_2/alpha_0.4'*
#### __Val Accuracy: 97.84%__
#### __Val-Class-Acc: {0: '99.22%', 1: '97.91%', 2: '90.95%'}__

In [37]:
with contextlib.redirect_stdout(open(os.devnull, 'w')):
    X_train, y_train, X_val, y_val, X_test, y_test, Number_features = process_and_return_splits(
        with_indicators_file_path = list_period_files_full_path[1], # Change 
        downsampled_data_minutes = downsampled_data_minutes,
        exclude_columns = exclude_columns,
        lower_threshold = lower_threshold,
        upper_threshold = upper_threshold,
        reverse_steps = reverse_steps,
        sequence_length = sequence_length,
        sliding_interval = sliding_interval,
        trends_to_keep = {0, 1, 2}  # Default keeps all trends : {0, 1, 2, 3, 4}
    )

print(f"\nNumber_features = {Number_features}")

unique_classes = np.unique(y_val)
num_classes = len(unique_classes)
print(f"unique_classes = {unique_classes}")
print(f"num_classes = {num_classes}")



Number_features = 7
unique_classes = [0 1 2]
num_classes = 3


In [38]:
# Model parameters
stable_classes = [1]    # From Period 1: Exclude class 0 because it changes
input_size = Number_features    # Number of input features
hidden_size = 64    # Number of GRU units
output_size = num_classes   # Number of trend classes (3 for Period 2: {0, 1, 2})
num_layers = 4  # Number of GRU layers
dropout = 0.0   # Dropout rate
lora_r = 4      # Rank of LoRA matrices
learning_rate = 0.0001  # Learning rate
alpha = 0.4  # Weighting factor for distillation loss
num_epochs = 2000   # Number of training epochs
batch_size = 64     # Batch size for DataLoader
model_name = 'BiGRUWithAttention'   # Model name for saving
best_results = []   # List to store best results
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define paths
stop_signal_file = os.path.normpath(os.path.join('Class_Incremental_CL', 'David_Classif_Bi_Dir_GRU_Model/stop_training.txt'))
model_saving_folder = os.path.normpath(os.path.join('Class_Incremental_CL', f"David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_2/alpha_{alpha}"))
ensure_folder(model_saving_folder)

# Load class features from Period 1
class_features_path = os.path.normpath(os.path.join('Class_Incremental_CL', f"David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_1/alpha_{alpha}/class_features.pkl"))
if os.path.exists(class_features_path):
    with open(class_features_path, 'rb') as f:
        class_features_dict = pickle.load(f)
    print(f"Loaded class features from: {class_features_path}")
else:
    class_features_dict = {}
    print("No previous class features found.")

# Instantiate models
student_model = BiGRUWithAttention(input_size, hidden_size, output_size, num_layers, dropout, lora_r).to(device)
teacher_model = BiGRUWithAttention(input_size, hidden_size, output_size - 1, num_layers, dropout, lora_r).to(device)

# Load teacher model from Period 1
teacher_checkpoint_path = "Class_Incremental_CL/David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_1/1st_try/BiGRUWithAttention_best.pth"
teacher_checkpoint = torch.load(teacher_checkpoint_path, map_location=device, weights_only=True)
num_lora_adapters = teacher_checkpoint.get('num_lora_adapters', 0)  # Period 1 may not have LoRA adapters
related_labels = {'attention_fc': [0, 1]}  # Initialize related_labels for Period 2
print(f"Teacher model checkpoint has {num_lora_adapters} LoRA adapters.")
print(f"Initialized related_labels: {related_labels}")

# Add LoRA adapters to teacher_model before loading state_dict
for _ in range(num_lora_adapters):
    teacher_model.add_lora_adapter()
teacher_model.load_state_dict(teacher_checkpoint['model_state_dict'])
print(f"Loaded teacher model from: \n\t{teacher_checkpoint_path}")
print(f"Teacher model now has {len(teacher_model.lora_adapters)} LoRA adapters.")
print(f"\n{teacher_model}\n")
del teacher_checkpoint
gc.collect()

# Add the same number of LoRA adapters to student_model
for _ in range(num_lora_adapters):
    student_model.add_lora_adapter()
print(f"Student model now has {len(student_model.lora_adapters)} LoRA adapters.")

# Copy teacher weights to student (excluding FC layer)
state_dict_teacher = teacher_model.state_dict()
state_dict_student = student_model.state_dict()
for name, param in state_dict_teacher.items():
    if 'fc' not in name:
        state_dict_student[name].copy_(param)
student_model.load_state_dict(state_dict_student)

# Define loss, optimizer, and scheduler
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(student_model.parameters(), lr=learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.9, patience=10)

class_features_dict = train_and_validate_lora(
    student_model, 
    teacher_model, 
    stable_classes, 
    output_size, 
    criterion, 
    optimizer, 
    X_train, 
    y_train, 
    X_val, 
    y_val, 
    scheduler, 
    use_scheduler=False, 
    num_epochs=num_epochs, 
    batch_size=batch_size, 
    alpha=alpha, 
    model_saving_folder=model_saving_folder, 
    model_name=model_name, 
    stop_signal_file=stop_signal_file,
    class_features_dict=class_features_dict,
    tau_high=0.5,  # Adjusted to 0.5
    tau_low=0.5,   # Adjusted to 0.5
    related_labels=related_labels  # Pass initialized related_labels
)

# Save class features
class_features_path = os.path.join(model_saving_folder, "class_features.pkl")
with open(class_features_path, 'wb') as f:
    pickle.dump(class_features_dict, f)
print(f"Saved class features to: {class_features_path}")

print(f"\nstudent_model: \n{student_model}\n")
print(f"unique_classes = {unique_classes}")
print(f"num_classes = {num_classes}")

# Clean up
for var in ["X_train", "y_train", "X_val", "y_val", "X_test", "y_test", "Number_features", "unique_classes", "num_classes"]:
    if var in locals():
        del locals()[var]
gc.collect()
torch.cuda.empty_cache()

No previous class features found.
Teacher model checkpoint has 0 LoRA adapters.
Initialized related_labels: {'attention_fc': [0, 1]}
Loaded teacher model from: 
	Class_Incremental_CL/David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_1/1st_try/BiGRUWithAttention_best.pth
Teacher model now has 0 LoRA adapters.

BiGRUWithAttention(
  (gru): GRU(7, 64, num_layers=4, batch_first=True, bidirectional=True)
  (attention_fc): Linear(in_features=128, out_features=128, bias=True)
  (lora_adapters): ModuleList()
  (fc): Linear(in_features=128, out_features=2, bias=True)
  (dropout): Dropout(p=0.0, inplace=False)
)

Student model now has 0 LoRA adapters.

🚀 'train_and_validate_lora' function started.

Initial related_labels: {'attention_fc': [0, 1]}
✅ Removed existing folder: Class_Incremental_CL\David_Classif_Bi_Dir_GRU_Model\Trained_models\LoRA_v2\Rank_4_Period_2\alpha_0.4

✅ Data Overview:
X_train Shape: torch.Size([3634, 1000, 7]) | y_train Shape: torch.Size([3634, 1000])
X_va

---
### Period 3 (num_layers = 4, lora_r=4, alpha = 0.4)
+ ##### BiGRUWithAttention
+ ##### Training and saving in *'LoRA_v2/Rank_4_Period_3/alpha_0.4'*
#### __Val Accuracy: 96.79%__
#### __Val-Class-Acc: {0: '86.29%', 1: '99.02%', 2: '93.40%', 3: '97.27%'}__

In [39]:
with contextlib.redirect_stdout(open(os.devnull, 'w')):
    X_train, y_train, X_val, y_val, X_test, y_test, Number_features = process_and_return_splits(
        with_indicators_file_path = list_period_files_full_path[2], # Change 
        downsampled_data_minutes = downsampled_data_minutes,
        exclude_columns = exclude_columns,
        lower_threshold = lower_threshold,
        upper_threshold = upper_threshold,
        reverse_steps = reverse_steps,
        sequence_length = sequence_length,
        sliding_interval = sliding_interval,
        trends_to_keep = {0, 1, 2, 3}  # Default keeps all trends : {0, 1, 2, 3, 4}
    )

print(f"\nNumber_features = {Number_features}")

unique_classes = np.unique(y_val)
num_classes = len(unique_classes)
print(f"unique_classes = {unique_classes}")
print(f"num_classes = {num_classes}")



Number_features = 7
unique_classes = [0 1 2 3]
num_classes = 4


In [40]:
# Model parameters
stable_classes = [1, 2]  # From Period 2: Exclude class 0 because it changes
input_size = Number_features    # Number of input features
hidden_size = 64    # Number of GRU units
output_size = num_classes   # Number of trend classes
num_layers = 4  # Number of GRU layers
dropout = 0.0   # Dropout rate
lora_r = 4      # Rank of LoRA matrices
learning_rate = 0.0001  # Learning rate
alpha = 0.4 # Weighting factor for distillation loss
num_epochs = 2000   # Number of training epochs
batch_size = 64     # Batch size for DataLoader
model_name = 'BiGRUWithAttention'   # Model name for saving
best_results = []   # List to store best results
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define paths
stop_signal_file = os.path.normpath(os.path.join('Class_Incremental_CL', 'David_Classif_Bi_Dir_GRU_Model/stop_training.txt'))
model_saving_folder = os.path.normpath(os.path.join('Class_Incremental_CL', f"David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_3/alpha_{alpha}"))
ensure_folder(model_saving_folder)

# Load class features from Period 2
class_features_path = os.path.normpath(os.path.join('Class_Incremental_CL', f"David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_2/alpha_{alpha}/class_features.pkl"))
if os.path.exists(class_features_path):
    with open(class_features_path, 'rb') as f:
        class_features_dict = pickle.load(f)
    print(f"Loaded class features from: {class_features_path}")
else:
    class_features_dict = {}
    print("No previous class features found.")

# Instantiate models
student_model = BiGRUWithAttention(input_size, hidden_size, output_size, num_layers, dropout, lora_r).to(device)
teacher_model = BiGRUWithAttention(input_size, hidden_size, output_size - 1, num_layers, dropout, lora_r).to(device)

# Load teacher model from Period 2
teacher_checkpoint_path = f"Class_Incremental_CL/David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_2/alpha_{alpha}/BiGRUWithAttention_best.pth"
teacher_checkpoint = torch.load(teacher_checkpoint_path, map_location=device, weights_only=True)
num_lora_adapters = teacher_checkpoint.get('num_lora_adapters', 0)
related_labels = teacher_checkpoint.get('related_labels', {'attention_fc': [0, 1]})  # Load related_labels, default if not found
print(f"Teacher model checkpoint has {num_lora_adapters} LoRA adapters.")
print(f"Loaded related_labels: {related_labels}")

# Add LoRA adapters to teacher_model before loading state_dict
for _ in range(num_lora_adapters):
    teacher_model.add_lora_adapter()
teacher_model.load_state_dict(teacher_checkpoint['model_state_dict'])
print(f"Loaded teacher model from: \n\t{teacher_checkpoint_path}")
print(f"Teacher model now has {len(teacher_model.lora_adapters)} LoRA adapters.")
print(f"\n{teacher_model}\n")
del teacher_checkpoint
gc.collect()

# Add the same number of LoRA adapters to student_model
for _ in range(num_lora_adapters):
    student_model.add_lora_adapter()
print(f"Student model now has {len(student_model.lora_adapters)} LoRA adapters.")

# Copy teacher weights to student (excluding FC layer)
state_dict_teacher = teacher_model.state_dict()
state_dict_student = student_model.state_dict()
for name, param in state_dict_teacher.items():
    if 'fc' not in name:
        state_dict_student[name].copy_(param)
student_model.load_state_dict(state_dict_student)

# Define loss, optimizer, and scheduler
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(student_model.parameters(), lr=learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.9, patience=10)

# Train and validate with related_labels
class_features_dict = train_and_validate_lora(
    student_model, 
    teacher_model, 
    stable_classes, 
    output_size, 
    criterion, 
    optimizer, 
    X_train, 
    y_train, 
    X_val, 
    y_val, 
    scheduler, 
    use_scheduler=False, 
    num_epochs=num_epochs, 
    batch_size=batch_size, 
    alpha=alpha, 
    model_saving_folder=model_saving_folder, 
    model_name=model_name, 
    stop_signal_file=stop_signal_file,
    class_features_dict=class_features_dict,
    tau_high=0.5,  # Adjusted to 0.5
    tau_low=0.5,   # Adjusted to 0.5
    related_labels=related_labels  # Pass related_labels
)

# Save class features
class_features_path = os.path.join(model_saving_folder, "class_features.pkl")
with open(class_features_path, 'wb') as f:
    pickle.dump(class_features_dict, f)
print(f"Saved class features to: {class_features_path}")

print(f"\nstudent_model: \n{student_model}\n")
print(f"unique_classes = {unique_classes}")
print(f"num_classes = {num_classes}")

# Clean up
for var in ["X_train", "y_train", "X_val", "y_val", "X_test", "y_test", "Number_features", "unique_classes", "num_classes"]:
    if var in locals():
        del locals()[var]
gc.collect()
torch.cuda.empty_cache()

Loaded class features from: Class_Incremental_CL\David_Classif_Bi_Dir_GRU_Model\Trained_models\LoRA_v2\Rank_4_Period_2\alpha_0.4\class_features.pkl
Teacher model checkpoint has 1 LoRA adapters.
Loaded related_labels: {'attention_fc': [0, 1], 0: [2]}
Added LoRA adapter, total adapters: 1, on device: cuda:0
Loaded teacher model from: 
	Class_Incremental_CL/David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_2/alpha_0.4/BiGRUWithAttention_best.pth
Teacher model now has 1 LoRA adapters.

BiGRUWithAttention(
  (gru): GRU(7, 64, num_layers=4, batch_first=True, bidirectional=True)
  (attention_fc): Linear(in_features=128, out_features=128, bias=True)
  (lora_adapters): ModuleList(
    (0): LoRA(
      (linear): Linear(in_features=128, out_features=128, bias=True)
    )
  )
  (fc): Linear(in_features=128, out_features=3, bias=True)
  (dropout): Dropout(p=0.0, inplace=False)
)

Added LoRA adapter, total adapters: 1, on device: cuda:0
Student model now has 1 LoRA adapters.

🚀 'tr

---
### Period 4 (num_layers = 4, lora_r=4, alpha = 0.4
+ ##### BiGRUWithAttention
+ ##### Training and saving in *'LoRA_v2/Rank_4_Period_4/alpha_0.4*
#### __Val Accuracy: 94.99%__
#### __Val-Class-Acc: {0: '90.64%', 1: '97.56%', 2: '88.25%', 3: '97.22%', 4: '90.25%'}__

In [41]:
with contextlib.redirect_stdout(open(os.devnull, 'w')):
    X_train, y_train, X_val, y_val, X_test, y_test, Number_features = process_and_return_splits(
        with_indicators_file_path = list_period_files_full_path[3], # Change 
        downsampled_data_minutes = downsampled_data_minutes,
        exclude_columns = exclude_columns,
        lower_threshold = lower_threshold,
        upper_threshold = upper_threshold,
        reverse_steps = reverse_steps,
        sequence_length = sequence_length,
        sliding_interval = sliding_interval,
        trends_to_keep = {0, 1, 2, 3, 4}  # Default keeps all trends : {0, 1, 2, 3, 4}
    )

print(f"\nNumber_features = {Number_features}")

unique_classes = np.unique(y_val)
num_classes = len(unique_classes)
print(f"unique_classes = {unique_classes}")
print(f"num_classes = {num_classes}")



Number_features = 7
unique_classes = [0 1 2 3 4]
num_classes = 5


In [42]:
# Model parameters
stable_classes = [1, 2, 3]  # From Period 3: Exclude class 0 because it changes
input_size = Number_features    # Number of input features
hidden_size = 64    # Number of GRU units
output_size = num_classes   # Number of trend classes (5 for Period 4: {0, 1, 2, 3, 4})
num_layers = 4  # Number of GRU layers
dropout = 0.0   # Dropout rate
lora_r = 4      # Rank of LoRA matrices
learning_rate = 0.0001  # Learning rate
alpha = 0.4 # Weighting factor for distillation loss
num_epochs = 2000   # Number of training epochs
batch_size = 64     # Batch size for DataLoader
model_name = 'BiGRUWithAttention'   # Model name for saving
best_results = []   # List to store best results
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define paths
stop_signal_file = os.path.normpath(os.path.join('Class_Incremental_CL', 'David_Classif_Bi_Dir_GRU_Model/stop_training.txt'))
model_saving_folder = os.path.normpath(os.path.join('Class_Incremental_CL', f"David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_4/alpha_{alpha}"))
ensure_folder(model_saving_folder)

# Load class features from Period 3
class_features_path = os.path.normpath(os.path.join('Class_Incremental_CL', f"David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_3/alpha_{alpha}/class_features.pkl"))
if os.path.exists(class_features_path):
    with open(class_features_path, 'rb') as f:
        class_features_dict = pickle.load(f)
    print(f"Loaded class features from: {class_features_path}")
else:
    class_features_dict = {}
    print("No previous class features found.")
    
# Instantiate models
student_model = BiGRUWithAttention(input_size, hidden_size, output_size, num_layers, dropout, lora_r).to(device)
teacher_model = BiGRUWithAttention(input_size, hidden_size, output_size - 1, num_layers, dropout, lora_r).to(device)

# Load teacher model from Period 3
teacher_checkpoint_path = f"Class_Incremental_CL/David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_3/alpha_{alpha}/BiGRUWithAttention_best.pth"
teacher_checkpoint = torch.load(teacher_checkpoint_path, map_location=device, weights_only=True)
num_lora_adapters = teacher_checkpoint.get('num_lora_adapters', 0)
related_labels = teacher_checkpoint.get('related_labels', {'attention_fc': [0, 1]})  # Load related_labels, default if not found
print(f"Teacher model checkpoint has {num_lora_adapters} LoRA adapters.")
print(f"Loaded related_labels: {related_labels}")

# Add LoRA adapters to teacher_model before loading state_dict
for _ in range(num_lora_adapters):
    teacher_model.add_lora_adapter()
teacher_model.load_state_dict(teacher_checkpoint['model_state_dict'])
print(f"Loaded teacher model from: \n\t{teacher_checkpoint_path}")
print(f"Teacher model now has {len(teacher_model.lora_adapters)} LoRA adapters.")
print(f"\n{teacher_model}\n")
del teacher_checkpoint
gc.collect()

# Add the same number of LoRA adapters to student_model
for _ in range(num_lora_adapters):
    student_model.add_lora_adapter()
print(f"Student model now has {len(student_model.lora_adapters)} LoRA adapters.")

# Copy teacher weights to student (excluding FC layer)
state_dict_teacher = teacher_model.state_dict()
state_dict_student = student_model.state_dict()
for name, param in state_dict_teacher.items():
    if 'fc' not in name:
        state_dict_student[name].copy_(param)
student_model.load_state_dict(state_dict_student)

# Define loss, optimizer, and scheduler
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(student_model.parameters(), lr=learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.9, patience=10)

# Train and validate with related_labels
class_features_dict = train_and_validate_lora(
    student_model, 
    teacher_model, 
    stable_classes, 
    output_size, 
    criterion, 
    optimizer, 
    X_train, 
    y_train, 
    X_val, 
    y_val, 
    scheduler, 
    use_scheduler=False, 
    num_epochs=num_epochs, 
    batch_size=batch_size, 
    alpha=alpha, 
    model_saving_folder=model_saving_folder, 
    model_name=model_name, 
    stop_signal_file=stop_signal_file,
    class_features_dict=class_features_dict,
    tau_high=0.5,  # Adjusted to 0.5
    tau_low=0.5,   # Adjusted to 0.5
    related_labels=related_labels  # Pass related_labels
)

# Save class features
class_features_path = os.path.join(model_saving_folder, "class_features.pkl")
with open(class_features_path, 'wb') as f:
    pickle.dump(class_features_dict, f)
print(f"Saved class features to: {class_features_path}")

print(f"\nstudent_model: \n{student_model}\n")
print(f"unique_classes = {unique_classes}")
print(f"num_classes = {num_classes}")

# Clean up
for var in ["X_train", "y_train", "X_val", "y_val", "X_test", "y_test", "Number_features", "unique_classes", "num_classes"]:
    if var in locals():
        del locals()[var]
gc.collect()
torch.cuda.empty_cache()

Loaded class features from: Class_Incremental_CL\David_Classif_Bi_Dir_GRU_Model\Trained_models\LoRA_v2\Rank_4_Period_3\alpha_0.4\class_features.pkl
Teacher model checkpoint has 2 LoRA adapters.
Loaded related_labels: {'attention_fc': [0, 1], 0: [2], 1: [3]}
Added LoRA adapter, total adapters: 1, on device: cuda:0
Added LoRA adapter, total adapters: 2, on device: cuda:0
Loaded teacher model from: 
	Class_Incremental_CL/David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_3/alpha_0.4/BiGRUWithAttention_best.pth
Teacher model now has 2 LoRA adapters.

BiGRUWithAttention(
  (gru): GRU(7, 64, num_layers=4, batch_first=True, bidirectional=True)
  (attention_fc): Linear(in_features=128, out_features=128, bias=True)
  (lora_adapters): ModuleList(
    (0-1): 2 x LoRA(
      (linear): Linear(in_features=128, out_features=128, bias=True)
    )
  )
  (fc): Linear(in_features=128, out_features=4, bias=True)
  (dropout): Dropout(p=0.0, inplace=False)
)

Added LoRA adapter, total adapt

---
### Period 2 (num_layers = 4, lora_r=4, alpha = 0.3)
+ ##### BiGRUWithAttention
+ ##### Training and saving in *'LoRA_v2/Rank_4_Period_2/alpha_0.3'*
#### __Val Accuracy: 97.36%__
#### __Val-Class-Acc: {0: '99.57%', 1: '97.72%', 2: '85.47%'}__

In [None]:
with contextlib.redirect_stdout(open(os.devnull, 'w')):
    X_train, y_train, X_val, y_val, X_test, y_test, Number_features = process_and_return_splits(
        with_indicators_file_path = list_period_files_full_path[1], # Change 
        downsampled_data_minutes = downsampled_data_minutes,
        exclude_columns = exclude_columns,
        lower_threshold = lower_threshold,
        upper_threshold = upper_threshold,
        reverse_steps = reverse_steps,
        sequence_length = sequence_length,
        sliding_interval = sliding_interval,
        trends_to_keep = {0, 1, 2}  # Default keeps all trends : {0, 1, 2, 3, 4}
    )

print(f"\nNumber_features = {Number_features}")

unique_classes = np.unique(y_val)
num_classes = len(unique_classes)
print(f"unique_classes = {unique_classes}")
print(f"num_classes = {num_classes}")


In [20]:
# Model parameters
stable_classes = [1]    # From Period 1: Exclude class 0 because it changes
input_size = Number_features    # Number of input features
hidden_size = 64    # Number of GRU units
output_size = num_classes   # Number of trend classes (3 for Period 2: {0, 1, 2})
num_layers = 4  # Number of GRU layers
dropout = 0.0   # Dropout rate
lora_r = 4      # Rank of LoRA matrices
learning_rate = 0.0001  # Learning rate
alpha = 0.3  # Weighting factor for distillation loss
num_epochs = 2000   # Number of training epochs
batch_size = 64     # Batch size for DataLoader
model_name = 'BiGRUWithAttention'   # Model name for saving
best_results = []   # List to store best results
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define paths
stop_signal_file = os.path.normpath(os.path.join('Class_Incremental_CL', 'David_Classif_Bi_Dir_GRU_Model/stop_training.txt'))
model_saving_folder = os.path.normpath(os.path.join('Class_Incremental_CL', f"David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_2/alpha_{alpha}"))
ensure_folder(model_saving_folder)

# Load class features from Period 1
class_features_path = os.path.normpath(os.path.join('Class_Incremental_CL', f"David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_1/alpha_{alpha}/class_features.pkl"))
if os.path.exists(class_features_path):
    with open(class_features_path, 'rb') as f:
        class_features_dict = pickle.load(f)
    print(f"Loaded class features from: {class_features_path}")
else:
    class_features_dict = {}
    print("No previous class features found.")

# Instantiate models
student_model = BiGRUWithAttention(input_size, hidden_size, output_size, num_layers, dropout, lora_r).to(device)
teacher_model = BiGRUWithAttention(input_size, hidden_size, output_size - 1, num_layers, dropout, lora_r).to(device)

# Load teacher model from Period 1
teacher_checkpoint_path = "Class_Incremental_CL/David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_1/1st_try/BiGRUWithAttention_best.pth"
teacher_checkpoint = torch.load(teacher_checkpoint_path, map_location=device, weights_only=True)
num_lora_adapters = teacher_checkpoint.get('num_lora_adapters', 0)  # Period 1 may not have LoRA adapters
related_labels = {'attention_fc': [0, 1]}  # Initialize related_labels for Period 2
print(f"Teacher model checkpoint has {num_lora_adapters} LoRA adapters.")
print(f"Initialized related_labels: {related_labels}")

# Add LoRA adapters to teacher_model before loading state_dict
for _ in range(num_lora_adapters):
    teacher_model.add_lora_adapter()
teacher_model.load_state_dict(teacher_checkpoint['model_state_dict'])
print(f"Loaded teacher model from: \n\t{teacher_checkpoint_path}")
print(f"Teacher model now has {len(teacher_model.lora_adapters)} LoRA adapters.")
print(f"\n{teacher_model}\n")
del teacher_checkpoint
gc.collect()

# Add the same number of LoRA adapters to student_model
for _ in range(num_lora_adapters):
    student_model.add_lora_adapter()
print(f"Student model now has {len(student_model.lora_adapters)} LoRA adapters.")

# Copy teacher weights to student (excluding FC layer)
state_dict_teacher = teacher_model.state_dict()
state_dict_student = student_model.state_dict()
for name, param in state_dict_teacher.items():
    if 'fc' not in name:
        state_dict_student[name].copy_(param)
student_model.load_state_dict(state_dict_student)

# Define loss, optimizer, and scheduler
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(student_model.parameters(), lr=learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.9, patience=10)

class_features_dict = train_and_validate_lora(
    student_model, 
    teacher_model, 
    stable_classes, 
    output_size, 
    criterion, 
    optimizer, 
    X_train, 
    y_train, 
    X_val, 
    y_val, 
    scheduler, 
    use_scheduler=False, 
    num_epochs=num_epochs, 
    batch_size=batch_size, 
    alpha=alpha, 
    model_saving_folder=model_saving_folder, 
    model_name=model_name, 
    stop_signal_file=stop_signal_file,
    class_features_dict=class_features_dict,
    tau_high=0.5,  # Adjusted to 0.5
    tau_low=0.5,   # Adjusted to 0.5
    related_labels=related_labels  # Pass initialized related_labels
)

# Save class features
class_features_path = os.path.join(model_saving_folder, "class_features.pkl")
with open(class_features_path, 'wb') as f:
    pickle.dump(class_features_dict, f)
print(f"Saved class features to: {class_features_path}")

print(f"\nstudent_model: \n{student_model}\n")
print(f"unique_classes = {unique_classes}")
print(f"num_classes = {num_classes}")

# Clean up
for var in ["X_train", "y_train", "X_val", "y_val", "X_test", "y_test", "Number_features", "unique_classes", "num_classes"]:
    if var in locals():
        del locals()[var]
gc.collect()
torch.cuda.empty_cache()

No previous class features found.
Teacher model checkpoint has 0 LoRA adapters.
Initialized related_labels: {'attention_fc': [0, 1]}
Loaded teacher model from: 
	Class_Incremental_CL/David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_1/1st_try/BiGRUWithAttention_best.pth
Teacher model now has 0 LoRA adapters.

BiGRUWithAttention(
  (gru): GRU(7, 64, num_layers=4, batch_first=True, bidirectional=True)
  (attention_fc): Linear(in_features=128, out_features=128, bias=True)
  (lora_adapters): ModuleList()
  (fc): Linear(in_features=128, out_features=2, bias=True)
  (dropout): Dropout(p=0.0, inplace=False)
)

Student model now has 0 LoRA adapters.

🚀 'train_and_validate_lora' function started.

Initial related_labels: {'attention_fc': [0, 1]}
✅ Removed existing folder: Class_Incremental_CL\David_Classif_Bi_Dir_GRU_Model\Trained_models\LoRA_v2\Rank_4_Period_2\alpha_0.3

✅ Data Overview:
X_train Shape: torch.Size([3634, 1000, 7]) | y_train Shape: torch.Size([3634, 1000])
X_va

---
### Period 3 (num_layers = 4, lora_r=4, alpha = 0.3)
+ ##### BiGRUWithAttention
+ ##### Training and saving in *'LoRA_v2/Rank_4_Period_3/alpha_0.3'*
#### __Val Accuracy: 97.23%__
#### __Val-Class-Acc: {0: '91.08%', 1: '98.10%', 2: '92.82%', 3: '98.27%'}__

In [21]:
with contextlib.redirect_stdout(open(os.devnull, 'w')):
    X_train, y_train, X_val, y_val, X_test, y_test, Number_features = process_and_return_splits(
        with_indicators_file_path = list_period_files_full_path[2], # Change 
        downsampled_data_minutes = downsampled_data_minutes,
        exclude_columns = exclude_columns,
        lower_threshold = lower_threshold,
        upper_threshold = upper_threshold,
        reverse_steps = reverse_steps,
        sequence_length = sequence_length,
        sliding_interval = sliding_interval,
        trends_to_keep = {0, 1, 2, 3}  # Default keeps all trends : {0, 1, 2, 3, 4}
    )

print(f"\nNumber_features = {Number_features}")

unique_classes = np.unique(y_val)
num_classes = len(unique_classes)
print(f"unique_classes = {unique_classes}")
print(f"num_classes = {num_classes}")



Number_features = 7
unique_classes = [0 1 2 3]
num_classes = 4


In [22]:
# Model parameters
stable_classes = [1, 2]  # From Period 2: Exclude class 0 because it changes
input_size = Number_features    # Number of input features
hidden_size = 64    # Number of GRU units
output_size = num_classes   # Number of trend classes
num_layers = 4  # Number of GRU layers
dropout = 0.0   # Dropout rate
lora_r = 4      # Rank of LoRA matrices
learning_rate = 0.0001  # Learning rate
alpha = 0.3  # Weighting factor for distillation loss
num_epochs = 2000   # Number of training epochs
batch_size = 64     # Batch size for DataLoader
model_name = 'BiGRUWithAttention'   # Model name for saving
best_results = []   # List to store best results
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define paths
stop_signal_file = os.path.normpath(os.path.join('Class_Incremental_CL', 'David_Classif_Bi_Dir_GRU_Model/stop_training.txt'))
model_saving_folder = os.path.normpath(os.path.join('Class_Incremental_CL', f"David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_3/alpha_{alpha}"))
ensure_folder(model_saving_folder)

# Load class features from Period 2
class_features_path = os.path.normpath(os.path.join('Class_Incremental_CL', f"David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_2/alpha_{alpha}/class_features.pkl"))
if os.path.exists(class_features_path):
    with open(class_features_path, 'rb') as f:
        class_features_dict = pickle.load(f)
    print(f"Loaded class features from: {class_features_path}")
else:
    class_features_dict = {}
    print("No previous class features found.")

# Instantiate models
student_model = BiGRUWithAttention(input_size, hidden_size, output_size, num_layers, dropout, lora_r).to(device)
teacher_model = BiGRUWithAttention(input_size, hidden_size, output_size - 1, num_layers, dropout, lora_r).to(device)

# Load teacher model from Period 2
teacher_checkpoint_path = f"Class_Incremental_CL/David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_2/alpha_{alpha}/BiGRUWithAttention_best.pth"
teacher_checkpoint = torch.load(teacher_checkpoint_path, map_location=device, weights_only=True)
num_lora_adapters = teacher_checkpoint.get('num_lora_adapters', 0)
related_labels = teacher_checkpoint.get('related_labels', {'attention_fc': [0, 1]})  # Load related_labels, default if not found
print(f"Teacher model checkpoint has {num_lora_adapters} LoRA adapters.")
print(f"Loaded related_labels: {related_labels}")

# Add LoRA adapters to teacher_model before loading state_dict
for _ in range(num_lora_adapters):
    teacher_model.add_lora_adapter()
teacher_model.load_state_dict(teacher_checkpoint['model_state_dict'])
print(f"Loaded teacher model from: \n\t{teacher_checkpoint_path}")
print(f"Teacher model now has {len(teacher_model.lora_adapters)} LoRA adapters.")
print(f"\n{teacher_model}\n")
del teacher_checkpoint
gc.collect()

# Add the same number of LoRA adapters to student_model
for _ in range(num_lora_adapters):
    student_model.add_lora_adapter()
print(f"Student model now has {len(student_model.lora_adapters)} LoRA adapters.")

# Copy teacher weights to student (excluding FC layer)
state_dict_teacher = teacher_model.state_dict()
state_dict_student = student_model.state_dict()
for name, param in state_dict_teacher.items():
    if 'fc' not in name:
        state_dict_student[name].copy_(param)
student_model.load_state_dict(state_dict_student)

# Define loss, optimizer, and scheduler
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(student_model.parameters(), lr=learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.9, patience=10)

# Train and validate with related_labels
class_features_dict = train_and_validate_lora(
    student_model, 
    teacher_model, 
    stable_classes, 
    output_size, 
    criterion, 
    optimizer, 
    X_train, 
    y_train, 
    X_val, 
    y_val, 
    scheduler, 
    use_scheduler=False, 
    num_epochs=num_epochs, 
    batch_size=batch_size, 
    alpha=alpha, 
    model_saving_folder=model_saving_folder, 
    model_name=model_name, 
    stop_signal_file=stop_signal_file,
    class_features_dict=class_features_dict,
    tau_high=0.5,  # Adjusted to 0.5
    tau_low=0.5,   # Adjusted to 0.5
    related_labels=related_labels  # Pass related_labels
)

# Save class features
class_features_path = os.path.join(model_saving_folder, "class_features.pkl")
with open(class_features_path, 'wb') as f:
    pickle.dump(class_features_dict, f)
print(f"Saved class features to: {class_features_path}")

print(f"\nstudent_model: \n{student_model}\n")
print(f"unique_classes = {unique_classes}")
print(f"num_classes = {num_classes}")

# Clean up
for var in ["X_train", "y_train", "X_val", "y_val", "X_test", "y_test", "Number_features", "unique_classes", "num_classes"]:
    if var in locals():
        del locals()[var]
gc.collect()
torch.cuda.empty_cache()

Loaded class features from: Class_Incremental_CL\David_Classif_Bi_Dir_GRU_Model\Trained_models\LoRA_v2\Rank_4_Period_2\alpha_0.3\class_features.pkl
Teacher model checkpoint has 1 LoRA adapters.
Loaded related_labels: {'attention_fc': [0, 1], 0: [2]}
Added LoRA adapter, total adapters: 1, on device: cuda:0
Loaded teacher model from: 
	Class_Incremental_CL/David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_2/alpha_0.3/BiGRUWithAttention_best.pth
Teacher model now has 1 LoRA adapters.

BiGRUWithAttention(
  (gru): GRU(7, 64, num_layers=4, batch_first=True, bidirectional=True)
  (attention_fc): Linear(in_features=128, out_features=128, bias=True)
  (lora_adapters): ModuleList(
    (0): LoRA(
      (linear): Linear(in_features=128, out_features=128, bias=True)
    )
  )
  (fc): Linear(in_features=128, out_features=3, bias=True)
  (dropout): Dropout(p=0.0, inplace=False)
)

Added LoRA adapter, total adapters: 1, on device: cuda:0
Student model now has 1 LoRA adapters.

🚀 'tr

---
### Period 4 (num_layers = 4, lora_r=4, alpha = 0.3)
+ ##### BiGRUWithAttention
+ ##### Training and saving in *'LoRA_v2/Rank_4_Period_4/alpha_0.3'*
#### __Val Accuracy: 96.23%__
#### __Val-Class-Acc: {0: '90.56%', 1: '96.84%', 2: '94.36%', 3: '97.37%', 4: '95.96%'}__

In [23]:
with contextlib.redirect_stdout(open(os.devnull, 'w')):
    X_train, y_train, X_val, y_val, X_test, y_test, Number_features = process_and_return_splits(
        with_indicators_file_path = list_period_files_full_path[3], # Change 
        downsampled_data_minutes = downsampled_data_minutes,
        exclude_columns = exclude_columns,
        lower_threshold = lower_threshold,
        upper_threshold = upper_threshold,
        reverse_steps = reverse_steps,
        sequence_length = sequence_length,
        sliding_interval = sliding_interval,
        trends_to_keep = {0, 1, 2, 3, 4}  # Default keeps all trends : {0, 1, 2, 3, 4}
    )

print(f"\nNumber_features = {Number_features}")

unique_classes = np.unique(y_val)
num_classes = len(unique_classes)
print(f"unique_classes = {unique_classes}")
print(f"num_classes = {num_classes}")



Number_features = 7
unique_classes = [0 1 2 3 4]
num_classes = 5


In [24]:
# Model parameters
stable_classes = [1, 2, 3]  # From Period 3: Exclude class 0 because it changes
input_size = Number_features    # Number of input features
hidden_size = 64    # Number of GRU units
output_size = num_classes   # Number of trend classes (5 for Period 4: {0, 1, 2, 3, 4})
num_layers = 4  # Number of GRU layers
dropout = 0.0   # Dropout rate
lora_r = 4      # Rank of LoRA matrices
learning_rate = 0.0001  # Learning rate
alpha = 0.3  # Weighting factor for distillation loss
num_epochs = 2000   # Number of training epochs
batch_size = 64     # Batch size for DataLoader
model_name = 'BiGRUWithAttention'   # Model name for saving
best_results = []   # List to store best results
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define paths
stop_signal_file = os.path.normpath(os.path.join('Class_Incremental_CL', 'David_Classif_Bi_Dir_GRU_Model/stop_training.txt'))
model_saving_folder = os.path.normpath(os.path.join('Class_Incremental_CL', f"David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_4/alpha_{alpha}"))
ensure_folder(model_saving_folder)

# Load class features from Period 3
class_features_path = os.path.normpath(os.path.join('Class_Incremental_CL', f"David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_3/alpha_{alpha}/class_features.pkl"))
if os.path.exists(class_features_path):
    with open(class_features_path, 'rb') as f:
        class_features_dict = pickle.load(f)
    print(f"Loaded class features from: {class_features_path}")
else:
    class_features_dict = {}
    print("No previous class features found.")
    
# Instantiate models
student_model = BiGRUWithAttention(input_size, hidden_size, output_size, num_layers, dropout, lora_r).to(device)
teacher_model = BiGRUWithAttention(input_size, hidden_size, output_size - 1, num_layers, dropout, lora_r).to(device)

# Load teacher model from Period 3
teacher_checkpoint_path = f"Class_Incremental_CL/David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_3/alpha_{alpha}/BiGRUWithAttention_best.pth"
teacher_checkpoint = torch.load(teacher_checkpoint_path, map_location=device, weights_only=True)
num_lora_adapters = teacher_checkpoint.get('num_lora_adapters', 0)
related_labels = teacher_checkpoint.get('related_labels', {'attention_fc': [0, 1]})  # Load related_labels, default if not found
print(f"Teacher model checkpoint has {num_lora_adapters} LoRA adapters.")
print(f"Loaded related_labels: {related_labels}")

# Add LoRA adapters to teacher_model before loading state_dict
for _ in range(num_lora_adapters):
    teacher_model.add_lora_adapter()
teacher_model.load_state_dict(teacher_checkpoint['model_state_dict'])
print(f"Loaded teacher model from: \n\t{teacher_checkpoint_path}")
print(f"Teacher model now has {len(teacher_model.lora_adapters)} LoRA adapters.")
print(f"\n{teacher_model}\n")
del teacher_checkpoint
gc.collect()

# Add the same number of LoRA adapters to student_model
for _ in range(num_lora_adapters):
    student_model.add_lora_adapter()
print(f"Student model now has {len(student_model.lora_adapters)} LoRA adapters.")

# Copy teacher weights to student (excluding FC layer)
state_dict_teacher = teacher_model.state_dict()
state_dict_student = student_model.state_dict()
for name, param in state_dict_teacher.items():
    if 'fc' not in name:
        state_dict_student[name].copy_(param)
student_model.load_state_dict(state_dict_student)

# Define loss, optimizer, and scheduler
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(student_model.parameters(), lr=learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.9, patience=10)

# Train and validate with related_labels
class_features_dict = train_and_validate_lora(
    student_model, 
    teacher_model, 
    stable_classes, 
    output_size, 
    criterion, 
    optimizer, 
    X_train, 
    y_train, 
    X_val, 
    y_val, 
    scheduler, 
    use_scheduler=False, 
    num_epochs=num_epochs, 
    batch_size=batch_size, 
    alpha=alpha, 
    model_saving_folder=model_saving_folder, 
    model_name=model_name, 
    stop_signal_file=stop_signal_file,
    class_features_dict=class_features_dict,
    tau_high=0.5,  # Adjusted to 0.5
    tau_low=0.5,   # Adjusted to 0.5
    related_labels=related_labels  # Pass related_labels
)

# Save class features
class_features_path = os.path.join(model_saving_folder, "class_features.pkl")
with open(class_features_path, 'wb') as f:
    pickle.dump(class_features_dict, f)
print(f"Saved class features to: {class_features_path}")

print(f"\nstudent_model: \n{student_model}\n")
print(f"unique_classes = {unique_classes}")
print(f"num_classes = {num_classes}")

# Clean up
for var in ["X_train", "y_train", "X_val", "y_val", "X_test", "y_test", "Number_features", "unique_classes", "num_classes"]:
    if var in locals():
        del locals()[var]
gc.collect()
torch.cuda.empty_cache()

Loaded class features from: Class_Incremental_CL\David_Classif_Bi_Dir_GRU_Model\Trained_models\LoRA_v2\Rank_4_Period_3\alpha_0.3\class_features.pkl
Teacher model checkpoint has 2 LoRA adapters.
Loaded related_labels: {'attention_fc': [0, 1], 0: [2], 1: [3]}
Added LoRA adapter, total adapters: 1, on device: cuda:0
Added LoRA adapter, total adapters: 2, on device: cuda:0
Loaded teacher model from: 
	Class_Incremental_CL/David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_3/alpha_0.3/BiGRUWithAttention_best.pth
Teacher model now has 2 LoRA adapters.

BiGRUWithAttention(
  (gru): GRU(7, 64, num_layers=4, batch_first=True, bidirectional=True)
  (attention_fc): Linear(in_features=128, out_features=128, bias=True)
  (lora_adapters): ModuleList(
    (0-1): 2 x LoRA(
      (linear): Linear(in_features=128, out_features=128, bias=True)
    )
  )
  (fc): Linear(in_features=128, out_features=4, bias=True)
  (dropout): Dropout(p=0.0, inplace=False)
)

Added LoRA adapter, total adapt

---
### Period 2 (num_layers = 4, lora_r=4, alpha = 0.2)
+ ##### BiGRUWithAttention
+ ##### Training and saving in *'LoRA_v2/Rank_4_Period_2/alpha_0.2'*
#### __Val Accuracy: 97.97%__
#### __Val-Class-Acc: {0: '99.40%', 1: '97.04%', 2: '94.18%'}__

In [25]:
with contextlib.redirect_stdout(open(os.devnull, 'w')):
    X_train, y_train, X_val, y_val, X_test, y_test, Number_features = process_and_return_splits(
        with_indicators_file_path = list_period_files_full_path[1], # Change 
        downsampled_data_minutes = downsampled_data_minutes,
        exclude_columns = exclude_columns,
        lower_threshold = lower_threshold,
        upper_threshold = upper_threshold,
        reverse_steps = reverse_steps,
        sequence_length = sequence_length,
        sliding_interval = sliding_interval,
        trends_to_keep = {0, 1, 2}  # Default keeps all trends : {0, 1, 2, 3, 4}
    )

print(f"\nNumber_features = {Number_features}")

unique_classes = np.unique(y_val)
num_classes = len(unique_classes)
print(f"unique_classes = {unique_classes}")
print(f"num_classes = {num_classes}")



Number_features = 7
unique_classes = [0 1 2]
num_classes = 3


In [26]:
# Model parameters
stable_classes = [1]    # From Period 1: Exclude class 0 because it changes
input_size = Number_features    # Number of input features
hidden_size = 64    # Number of GRU units
output_size = num_classes   # Number of trend classes (3 for Period 2: {0, 1, 2})
num_layers = 4  # Number of GRU layers
dropout = 0.0   # Dropout rate
lora_r = 4      # Rank of LoRA matrices
learning_rate = 0.0001  # Learning rate
alpha = 0.2  # Weighting factor for distillation loss
num_epochs = 2000   # Number of training epochs
batch_size = 64     # Batch size for DataLoader
model_name = 'BiGRUWithAttention'   # Model name for saving
best_results = []   # List to store best results
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define paths
stop_signal_file = os.path.normpath(os.path.join('Class_Incremental_CL', 'David_Classif_Bi_Dir_GRU_Model/stop_training.txt'))
model_saving_folder = os.path.normpath(os.path.join('Class_Incremental_CL', f"David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_2/alpha_{alpha}"))
ensure_folder(model_saving_folder)

# Load class features from Period 1
class_features_path = os.path.normpath(os.path.join('Class_Incremental_CL', f"David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_1/alpha_{alpha}/class_features.pkl"))
if os.path.exists(class_features_path):
    with open(class_features_path, 'rb') as f:
        class_features_dict = pickle.load(f)
    print(f"Loaded class features from: {class_features_path}")
else:
    class_features_dict = {}
    print("No previous class features found.")

# Instantiate models
student_model = BiGRUWithAttention(input_size, hidden_size, output_size, num_layers, dropout, lora_r).to(device)
teacher_model = BiGRUWithAttention(input_size, hidden_size, output_size - 1, num_layers, dropout, lora_r).to(device)

# Load teacher model from Period 1
teacher_checkpoint_path = "Class_Incremental_CL/David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_1/1st_try/BiGRUWithAttention_best.pth"
teacher_checkpoint = torch.load(teacher_checkpoint_path, map_location=device, weights_only=True)
num_lora_adapters = teacher_checkpoint.get('num_lora_adapters', 0)  # Period 1 may not have LoRA adapters
related_labels = {'attention_fc': [0, 1]}  # Initialize related_labels for Period 2
print(f"Teacher model checkpoint has {num_lora_adapters} LoRA adapters.")
print(f"Initialized related_labels: {related_labels}")

# Add LoRA adapters to teacher_model before loading state_dict
for _ in range(num_lora_adapters):
    teacher_model.add_lora_adapter()
teacher_model.load_state_dict(teacher_checkpoint['model_state_dict'])
print(f"Loaded teacher model from: \n\t{teacher_checkpoint_path}")
print(f"Teacher model now has {len(teacher_model.lora_adapters)} LoRA adapters.")
print(f"\n{teacher_model}\n")
del teacher_checkpoint
gc.collect()

# Add the same number of LoRA adapters to student_model
for _ in range(num_lora_adapters):
    student_model.add_lora_adapter()
print(f"Student model now has {len(student_model.lora_adapters)} LoRA adapters.")

# Copy teacher weights to student (excluding FC layer)
state_dict_teacher = teacher_model.state_dict()
state_dict_student = student_model.state_dict()
for name, param in state_dict_teacher.items():
    if 'fc' not in name:
        state_dict_student[name].copy_(param)
student_model.load_state_dict(state_dict_student)

# Define loss, optimizer, and scheduler
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(student_model.parameters(), lr=learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.9, patience=10)

class_features_dict = train_and_validate_lora(
    student_model, 
    teacher_model, 
    stable_classes, 
    output_size, 
    criterion, 
    optimizer, 
    X_train, 
    y_train, 
    X_val, 
    y_val, 
    scheduler, 
    use_scheduler=False, 
    num_epochs=num_epochs, 
    batch_size=batch_size, 
    alpha=alpha, 
    model_saving_folder=model_saving_folder, 
    model_name=model_name, 
    stop_signal_file=stop_signal_file,
    class_features_dict=class_features_dict,
    tau_high=0.5,  # Adjusted to 0.5
    tau_low=0.5,   # Adjusted to 0.5
    related_labels=related_labels  # Pass initialized related_labels
)

# Save class features
class_features_path = os.path.join(model_saving_folder, "class_features.pkl")
with open(class_features_path, 'wb') as f:
    pickle.dump(class_features_dict, f)
print(f"Saved class features to: {class_features_path}")

print(f"\nstudent_model: \n{student_model}\n")
print(f"unique_classes = {unique_classes}")
print(f"num_classes = {num_classes}")

# Clean up
for var in ["X_train", "y_train", "X_val", "y_val", "X_test", "y_test", "Number_features", "unique_classes", "num_classes"]:
    if var in locals():
        del locals()[var]
gc.collect()
torch.cuda.empty_cache()

No previous class features found.
Teacher model checkpoint has 0 LoRA adapters.
Initialized related_labels: {'attention_fc': [0, 1]}
Loaded teacher model from: 
	Class_Incremental_CL/David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_1/1st_try/BiGRUWithAttention_best.pth
Teacher model now has 0 LoRA adapters.

BiGRUWithAttention(
  (gru): GRU(7, 64, num_layers=4, batch_first=True, bidirectional=True)
  (attention_fc): Linear(in_features=128, out_features=128, bias=True)
  (lora_adapters): ModuleList()
  (fc): Linear(in_features=128, out_features=2, bias=True)
  (dropout): Dropout(p=0.0, inplace=False)
)

Student model now has 0 LoRA adapters.

🚀 'train_and_validate_lora' function started.

Initial related_labels: {'attention_fc': [0, 1]}
✅ Removed existing folder: Class_Incremental_CL\David_Classif_Bi_Dir_GRU_Model\Trained_models\LoRA_v2\Rank_4_Period_2\alpha_0.2

✅ Data Overview:
X_train Shape: torch.Size([3634, 1000, 7]) | y_train Shape: torch.Size([3634, 1000])
X_va

---
### Period 3 (num_layers = 4, lora_r=4, alpha = 0.2)
+ ##### BiGRUWithAttention
+ ##### Training and saving in *'LoRA_v2/Rank_4_Period_3/alpha_0.2'*
#### __Val Accuracy: 98.16%__
#### __Val-Class-Acc: {0: '95.13%', 1: '98.45%', 2: '98.34%', 3: '98.59%'}__

In [27]:
with contextlib.redirect_stdout(open(os.devnull, 'w')):
    X_train, y_train, X_val, y_val, X_test, y_test, Number_features = process_and_return_splits(
        with_indicators_file_path = list_period_files_full_path[2], # Change 
        downsampled_data_minutes = downsampled_data_minutes,
        exclude_columns = exclude_columns,
        lower_threshold = lower_threshold,
        upper_threshold = upper_threshold,
        reverse_steps = reverse_steps,
        sequence_length = sequence_length,
        sliding_interval = sliding_interval,
        trends_to_keep = {0, 1, 2, 3}  # Default keeps all trends : {0, 1, 2, 3, 4}
    )

print(f"\nNumber_features = {Number_features}")

unique_classes = np.unique(y_val)
num_classes = len(unique_classes)
print(f"unique_classes = {unique_classes}")
print(f"num_classes = {num_classes}")



Number_features = 7
unique_classes = [0 1 2 3]
num_classes = 4


In [28]:
# Model parameters
stable_classes = [1, 2]  # From Period 2: Exclude class 0 because it changes
input_size = Number_features    # Number of input features
hidden_size = 64    # Number of GRU units
output_size = num_classes   # Number of trend classes
num_layers = 4  # Number of GRU layers
dropout = 0.0   # Dropout rate
lora_r = 4      # Rank of LoRA matrices
learning_rate = 0.0001  # Learning rate
alpha = 0.2 # Weighting factor for distillation loss
num_epochs = 2000   # Number of training epochs
batch_size = 64     # Batch size for DataLoader
model_name = 'BiGRUWithAttention'   # Model name for saving
best_results = []   # List to store best results
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define paths
stop_signal_file = os.path.normpath(os.path.join('Class_Incremental_CL', 'David_Classif_Bi_Dir_GRU_Model/stop_training.txt'))
model_saving_folder = os.path.normpath(os.path.join('Class_Incremental_CL', f"David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_3/alpha_{alpha}"))
ensure_folder(model_saving_folder)

# Load class features from Period 2
class_features_path = os.path.normpath(os.path.join('Class_Incremental_CL', f"David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_2/alpha_{alpha}/class_features.pkl"))
if os.path.exists(class_features_path):
    with open(class_features_path, 'rb') as f:
        class_features_dict = pickle.load(f)
    print(f"Loaded class features from: {class_features_path}")
else:
    class_features_dict = {}
    print("No previous class features found.")

# Instantiate models
student_model = BiGRUWithAttention(input_size, hidden_size, output_size, num_layers, dropout, lora_r).to(device)
teacher_model = BiGRUWithAttention(input_size, hidden_size, output_size - 1, num_layers, dropout, lora_r).to(device)

# Load teacher model from Period 2
teacher_checkpoint_path = f"Class_Incremental_CL/David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_2/alpha_{alpha}/BiGRUWithAttention_best.pth"
teacher_checkpoint = torch.load(teacher_checkpoint_path, map_location=device, weights_only=True)
num_lora_adapters = teacher_checkpoint.get('num_lora_adapters', 0)
related_labels = teacher_checkpoint.get('related_labels', {'attention_fc': [0, 1]})  # Load related_labels, default if not found
print(f"Teacher model checkpoint has {num_lora_adapters} LoRA adapters.")
print(f"Loaded related_labels: {related_labels}")

# Add LoRA adapters to teacher_model before loading state_dict
for _ in range(num_lora_adapters):
    teacher_model.add_lora_adapter()
teacher_model.load_state_dict(teacher_checkpoint['model_state_dict'])
print(f"Loaded teacher model from: \n\t{teacher_checkpoint_path}")
print(f"Teacher model now has {len(teacher_model.lora_adapters)} LoRA adapters.")
print(f"\n{teacher_model}\n")
del teacher_checkpoint
gc.collect()

# Add the same number of LoRA adapters to student_model
for _ in range(num_lora_adapters):
    student_model.add_lora_adapter()
print(f"Student model now has {len(student_model.lora_adapters)} LoRA adapters.")

# Copy teacher weights to student (excluding FC layer)
state_dict_teacher = teacher_model.state_dict()
state_dict_student = student_model.state_dict()
for name, param in state_dict_teacher.items():
    if 'fc' not in name:
        state_dict_student[name].copy_(param)
student_model.load_state_dict(state_dict_student)

# Define loss, optimizer, and scheduler
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(student_model.parameters(), lr=learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.9, patience=10)

# Train and validate with related_labels
class_features_dict = train_and_validate_lora(
    student_model, 
    teacher_model, 
    stable_classes, 
    output_size, 
    criterion, 
    optimizer, 
    X_train, 
    y_train, 
    X_val, 
    y_val, 
    scheduler, 
    use_scheduler=False, 
    num_epochs=num_epochs, 
    batch_size=batch_size, 
    alpha=alpha, 
    model_saving_folder=model_saving_folder, 
    model_name=model_name, 
    stop_signal_file=stop_signal_file,
    class_features_dict=class_features_dict,
    tau_high=0.5,  # Adjusted to 0.5
    tau_low=0.5,   # Adjusted to 0.5
    related_labels=related_labels  # Pass related_labels
)

# Save class features
class_features_path = os.path.join(model_saving_folder, "class_features.pkl")
with open(class_features_path, 'wb') as f:
    pickle.dump(class_features_dict, f)
print(f"Saved class features to: {class_features_path}")

print(f"\nstudent_model: \n{student_model}\n")
print(f"unique_classes = {unique_classes}")
print(f"num_classes = {num_classes}")

# Clean up
for var in ["X_train", "y_train", "X_val", "y_val", "X_test", "y_test", "Number_features", "unique_classes", "num_classes"]:
    if var in locals():
        del locals()[var]
gc.collect()
torch.cuda.empty_cache()

Loaded class features from: Class_Incremental_CL\David_Classif_Bi_Dir_GRU_Model\Trained_models\LoRA_v2\Rank_4_Period_2\alpha_0.2\class_features.pkl
Teacher model checkpoint has 1 LoRA adapters.
Loaded related_labels: {'attention_fc': [0, 1], 0: [2]}
Added LoRA adapter, total adapters: 1, on device: cuda:0
Loaded teacher model from: 
	Class_Incremental_CL/David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_2/alpha_0.2/BiGRUWithAttention_best.pth
Teacher model now has 1 LoRA adapters.

BiGRUWithAttention(
  (gru): GRU(7, 64, num_layers=4, batch_first=True, bidirectional=True)
  (attention_fc): Linear(in_features=128, out_features=128, bias=True)
  (lora_adapters): ModuleList(
    (0): LoRA(
      (linear): Linear(in_features=128, out_features=128, bias=True)
    )
  )
  (fc): Linear(in_features=128, out_features=3, bias=True)
  (dropout): Dropout(p=0.0, inplace=False)
)

Added LoRA adapter, total adapters: 1, on device: cuda:0
Student model now has 1 LoRA adapters.

🚀 'tr

---
### Period 4 (num_layers = 4, lora_r=4, alpha = 0.2
+ ##### BiGRUWithAttention
+ ##### Training and saving in *'LoRA_v2/Rank_4_Period_4/alpha_0.2*
#### __Val Accuracy: 96.59%__
#### __Val-Class-Acc: {0: '92.66%', 1: '98.25%', 2: '88.84%', 3: '98.34%', 4: '98.29%'}__

In [29]:
with contextlib.redirect_stdout(open(os.devnull, 'w')):
    X_train, y_train, X_val, y_val, X_test, y_test, Number_features = process_and_return_splits(
        with_indicators_file_path = list_period_files_full_path[3], # Change 
        downsampled_data_minutes = downsampled_data_minutes,
        exclude_columns = exclude_columns,
        lower_threshold = lower_threshold,
        upper_threshold = upper_threshold,
        reverse_steps = reverse_steps,
        sequence_length = sequence_length,
        sliding_interval = sliding_interval,
        trends_to_keep = {0, 1, 2, 3, 4}  # Default keeps all trends : {0, 1, 2, 3, 4}
    )

print(f"\nNumber_features = {Number_features}")

unique_classes = np.unique(y_val)
num_classes = len(unique_classes)
print(f"unique_classes = {unique_classes}")
print(f"num_classes = {num_classes}")



Number_features = 7
unique_classes = [0 1 2 3 4]
num_classes = 5


In [30]:
# Model parameters
stable_classes = [1, 2, 3]  # From Period 3: Exclude class 0 because it changes
input_size = Number_features    # Number of input features
hidden_size = 64    # Number of GRU units
output_size = num_classes   # Number of trend classes (5 for Period 4: {0, 1, 2, 3, 4})
num_layers = 4  # Number of GRU layers
dropout = 0.0   # Dropout rate
lora_r = 4      # Rank of LoRA matrices
learning_rate = 0.0001  # Learning rate
alpha = 0.2 # Weighting factor for distillation loss
num_epochs = 2000   # Number of training epochs
batch_size = 64     # Batch size for DataLoader
model_name = 'BiGRUWithAttention'   # Model name for saving
best_results = []   # List to store best results
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define paths
stop_signal_file = os.path.normpath(os.path.join('Class_Incremental_CL', 'David_Classif_Bi_Dir_GRU_Model/stop_training.txt'))
model_saving_folder = os.path.normpath(os.path.join('Class_Incremental_CL', f"David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_4/alpha_{alpha}"))
ensure_folder(model_saving_folder)

# Load class features from Period 3
class_features_path = os.path.normpath(os.path.join('Class_Incremental_CL', f"David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_3/alpha_{alpha}/class_features.pkl"))
if os.path.exists(class_features_path):
    with open(class_features_path, 'rb') as f:
        class_features_dict = pickle.load(f)
    print(f"Loaded class features from: {class_features_path}")
else:
    class_features_dict = {}
    print("No previous class features found.")
    
# Instantiate models
student_model = BiGRUWithAttention(input_size, hidden_size, output_size, num_layers, dropout, lora_r).to(device)
teacher_model = BiGRUWithAttention(input_size, hidden_size, output_size - 1, num_layers, dropout, lora_r).to(device)

# Load teacher model from Period 3
teacher_checkpoint_path = f"Class_Incremental_CL/David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_3/alpha_{alpha}/BiGRUWithAttention_best.pth"
teacher_checkpoint = torch.load(teacher_checkpoint_path, map_location=device, weights_only=True)
num_lora_adapters = teacher_checkpoint.get('num_lora_adapters', 0)
related_labels = teacher_checkpoint.get('related_labels', {'attention_fc': [0, 1]})  # Load related_labels, default if not found
print(f"Teacher model checkpoint has {num_lora_adapters} LoRA adapters.")
print(f"Loaded related_labels: {related_labels}")

# Add LoRA adapters to teacher_model before loading state_dict
for _ in range(num_lora_adapters):
    teacher_model.add_lora_adapter()
teacher_model.load_state_dict(teacher_checkpoint['model_state_dict'])
print(f"Loaded teacher model from: \n\t{teacher_checkpoint_path}")
print(f"Teacher model now has {len(teacher_model.lora_adapters)} LoRA adapters.")
print(f"\n{teacher_model}\n")
del teacher_checkpoint
gc.collect()

# Add the same number of LoRA adapters to student_model
for _ in range(num_lora_adapters):
    student_model.add_lora_adapter()
print(f"Student model now has {len(student_model.lora_adapters)} LoRA adapters.")

# Copy teacher weights to student (excluding FC layer)
state_dict_teacher = teacher_model.state_dict()
state_dict_student = student_model.state_dict()
for name, param in state_dict_teacher.items():
    if 'fc' not in name:
        state_dict_student[name].copy_(param)
student_model.load_state_dict(state_dict_student)

# Define loss, optimizer, and scheduler
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(student_model.parameters(), lr=learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.9, patience=10)

# Train and validate with related_labels
class_features_dict = train_and_validate_lora(
    student_model, 
    teacher_model, 
    stable_classes, 
    output_size, 
    criterion, 
    optimizer, 
    X_train, 
    y_train, 
    X_val, 
    y_val, 
    scheduler, 
    use_scheduler=False, 
    num_epochs=num_epochs, 
    batch_size=batch_size, 
    alpha=alpha, 
    model_saving_folder=model_saving_folder, 
    model_name=model_name, 
    stop_signal_file=stop_signal_file,
    class_features_dict=class_features_dict,
    tau_high=0.5,  # Adjusted to 0.5
    tau_low=0.5,   # Adjusted to 0.5
    related_labels=related_labels  # Pass related_labels
)

# Save class features
class_features_path = os.path.join(model_saving_folder, "class_features.pkl")
with open(class_features_path, 'wb') as f:
    pickle.dump(class_features_dict, f)
print(f"Saved class features to: {class_features_path}")

print(f"\nstudent_model: \n{student_model}\n")
print(f"unique_classes = {unique_classes}")
print(f"num_classes = {num_classes}")

# Clean up
for var in ["X_train", "y_train", "X_val", "y_val", "X_test", "y_test", "Number_features", "unique_classes", "num_classes"]:
    if var in locals():
        del locals()[var]
gc.collect()
torch.cuda.empty_cache()

Loaded class features from: Class_Incremental_CL\David_Classif_Bi_Dir_GRU_Model\Trained_models\LoRA_v2\Rank_4_Period_3\alpha_0.2\class_features.pkl
Teacher model checkpoint has 2 LoRA adapters.
Loaded related_labels: {'attention_fc': [0, 1], 0: [2], 1: [3]}
Added LoRA adapter, total adapters: 1, on device: cuda:0
Added LoRA adapter, total adapters: 2, on device: cuda:0
Loaded teacher model from: 
	Class_Incremental_CL/David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_3/alpha_0.2/BiGRUWithAttention_best.pth
Teacher model now has 2 LoRA adapters.

BiGRUWithAttention(
  (gru): GRU(7, 64, num_layers=4, batch_first=True, bidirectional=True)
  (attention_fc): Linear(in_features=128, out_features=128, bias=True)
  (lora_adapters): ModuleList(
    (0-1): 2 x LoRA(
      (linear): Linear(in_features=128, out_features=128, bias=True)
    )
  )
  (fc): Linear(in_features=128, out_features=4, bias=True)
  (dropout): Dropout(p=0.0, inplace=False)
)

Added LoRA adapter, total adapt

---
### Period 2 (num_layers = 4, lora_r=4, alpha = 0.1)
+ ##### BiGRUWithAttention
+ ##### Training and saving in *'LoRA_v2/Rank_4_Period_2/alpha_0.1'*
#### __Val Accuracy: 97.74%__
#### __Val-Class-Acc: {0: '99.19%', 1: '97.32%', 2: '92.09%'}__

In [31]:
with contextlib.redirect_stdout(open(os.devnull, 'w')):
    X_train, y_train, X_val, y_val, X_test, y_test, Number_features = process_and_return_splits(
        with_indicators_file_path = list_period_files_full_path[1], # Change 
        downsampled_data_minutes = downsampled_data_minutes,
        exclude_columns = exclude_columns,
        lower_threshold = lower_threshold,
        upper_threshold = upper_threshold,
        reverse_steps = reverse_steps,
        sequence_length = sequence_length,
        sliding_interval = sliding_interval,
        trends_to_keep = {0, 1, 2}  # Default keeps all trends : {0, 1, 2, 3, 4}
    )

print(f"\nNumber_features = {Number_features}")

unique_classes = np.unique(y_val)
num_classes = len(unique_classes)
print(f"unique_classes = {unique_classes}")
print(f"num_classes = {num_classes}")



Number_features = 7
unique_classes = [0 1 2]
num_classes = 3


In [32]:
# Model parameters
stable_classes = [1]    # From Period 1: Exclude class 0 because it changes
input_size = Number_features    # Number of input features
hidden_size = 64    # Number of GRU units
output_size = num_classes   # Number of trend classes (3 for Period 2: {0, 1, 2})
num_layers = 4  # Number of GRU layers
dropout = 0.0   # Dropout rate
lora_r = 4      # Rank of LoRA matrices
learning_rate = 0.0001  # Learning rate
alpha = 0.1  # Weighting factor for distillation loss
num_epochs = 2000   # Number of training epochs
batch_size = 64     # Batch size for DataLoader
model_name = 'BiGRUWithAttention'   # Model name for saving
best_results = []   # List to store best results
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define paths
stop_signal_file = os.path.normpath(os.path.join('Class_Incremental_CL', 'David_Classif_Bi_Dir_GRU_Model/stop_training.txt'))
model_saving_folder = os.path.normpath(os.path.join('Class_Incremental_CL', f"David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_2/alpha_{alpha}"))
ensure_folder(model_saving_folder)

# Load class features from Period 1
class_features_path = os.path.normpath(os.path.join('Class_Incremental_CL', f"David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_1/alpha_{alpha}/class_features.pkl"))
if os.path.exists(class_features_path):
    with open(class_features_path, 'rb') as f:
        class_features_dict = pickle.load(f)
    print(f"Loaded class features from: {class_features_path}")
else:
    class_features_dict = {}
    print("No previous class features found.")

# Instantiate models
student_model = BiGRUWithAttention(input_size, hidden_size, output_size, num_layers, dropout, lora_r).to(device)
teacher_model = BiGRUWithAttention(input_size, hidden_size, output_size - 1, num_layers, dropout, lora_r).to(device)

# Load teacher model from Period 1
teacher_checkpoint_path = "Class_Incremental_CL/David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_1/1st_try/BiGRUWithAttention_best.pth"
teacher_checkpoint = torch.load(teacher_checkpoint_path, map_location=device, weights_only=True)
num_lora_adapters = teacher_checkpoint.get('num_lora_adapters', 0)  # Period 1 may not have LoRA adapters
related_labels = {'attention_fc': [0, 1]}  # Initialize related_labels for Period 2
print(f"Teacher model checkpoint has {num_lora_adapters} LoRA adapters.")
print(f"Initialized related_labels: {related_labels}")

# Add LoRA adapters to teacher_model before loading state_dict
for _ in range(num_lora_adapters):
    teacher_model.add_lora_adapter()
teacher_model.load_state_dict(teacher_checkpoint['model_state_dict'])
print(f"Loaded teacher model from: \n\t{teacher_checkpoint_path}")
print(f"Teacher model now has {len(teacher_model.lora_adapters)} LoRA adapters.")
print(f"\n{teacher_model}\n")
del teacher_checkpoint
gc.collect()

# Add the same number of LoRA adapters to student_model
for _ in range(num_lora_adapters):
    student_model.add_lora_adapter()
print(f"Student model now has {len(student_model.lora_adapters)} LoRA adapters.")

# Copy teacher weights to student (excluding FC layer)
state_dict_teacher = teacher_model.state_dict()
state_dict_student = student_model.state_dict()
for name, param in state_dict_teacher.items():
    if 'fc' not in name:
        state_dict_student[name].copy_(param)
student_model.load_state_dict(state_dict_student)

# Define loss, optimizer, and scheduler
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(student_model.parameters(), lr=learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.9, patience=10)

class_features_dict = train_and_validate_lora(
    student_model, 
    teacher_model, 
    stable_classes, 
    output_size, 
    criterion, 
    optimizer, 
    X_train, 
    y_train, 
    X_val, 
    y_val, 
    scheduler, 
    use_scheduler=False, 
    num_epochs=num_epochs, 
    batch_size=batch_size, 
    alpha=alpha, 
    model_saving_folder=model_saving_folder, 
    model_name=model_name, 
    stop_signal_file=stop_signal_file,
    class_features_dict=class_features_dict,
    tau_high=0.5,  # Adjusted to 0.5
    tau_low=0.5,   # Adjusted to 0.5
    related_labels=related_labels  # Pass initialized related_labels
)

# Save class features
class_features_path = os.path.join(model_saving_folder, "class_features.pkl")
with open(class_features_path, 'wb') as f:
    pickle.dump(class_features_dict, f)
print(f"Saved class features to: {class_features_path}")

print(f"\nstudent_model: \n{student_model}\n")
print(f"unique_classes = {unique_classes}")
print(f"num_classes = {num_classes}")

# Clean up
for var in ["X_train", "y_train", "X_val", "y_val", "X_test", "y_test", "Number_features", "unique_classes", "num_classes"]:
    if var in locals():
        del locals()[var]
gc.collect()
torch.cuda.empty_cache()

No previous class features found.
Teacher model checkpoint has 0 LoRA adapters.
Initialized related_labels: {'attention_fc': [0, 1]}
Loaded teacher model from: 
	Class_Incremental_CL/David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_1/1st_try/BiGRUWithAttention_best.pth
Teacher model now has 0 LoRA adapters.

BiGRUWithAttention(
  (gru): GRU(7, 64, num_layers=4, batch_first=True, bidirectional=True)
  (attention_fc): Linear(in_features=128, out_features=128, bias=True)
  (lora_adapters): ModuleList()
  (fc): Linear(in_features=128, out_features=2, bias=True)
  (dropout): Dropout(p=0.0, inplace=False)
)

Student model now has 0 LoRA adapters.

🚀 'train_and_validate_lora' function started.

Initial related_labels: {'attention_fc': [0, 1]}
✅ Removed existing folder: Class_Incremental_CL\David_Classif_Bi_Dir_GRU_Model\Trained_models\LoRA_v2\Rank_4_Period_2\alpha_0.1

✅ Data Overview:
X_train Shape: torch.Size([3634, 1000, 7]) | y_train Shape: torch.Size([3634, 1000])
X_va

---
### Period 3 (num_layers = 4, lora_r=4, alpha = 0.1)
+ ##### BiGRUWithAttention
+ ##### Training and saving in *'LoRA_v2/Rank_4_Period_3/alpha_0.1'*
#### __Val Accuracy: 97.73%__
#### __Val-Class-Acc: {0: '90.68%', 1: '98.89%', 2: '94.95%', 3: '98.51%'}__

In [33]:
with contextlib.redirect_stdout(open(os.devnull, 'w')):
    X_train, y_train, X_val, y_val, X_test, y_test, Number_features = process_and_return_splits(
        with_indicators_file_path = list_period_files_full_path[2], # Change 
        downsampled_data_minutes = downsampled_data_minutes,
        exclude_columns = exclude_columns,
        lower_threshold = lower_threshold,
        upper_threshold = upper_threshold,
        reverse_steps = reverse_steps,
        sequence_length = sequence_length,
        sliding_interval = sliding_interval,
        trends_to_keep = {0, 1, 2, 3}  # Default keeps all trends : {0, 1, 2, 3, 4}
    )

print(f"\nNumber_features = {Number_features}")

unique_classes = np.unique(y_val)
num_classes = len(unique_classes)
print(f"unique_classes = {unique_classes}")
print(f"num_classes = {num_classes}")



Number_features = 7
unique_classes = [0 1 2 3]
num_classes = 4


In [34]:
# Model parameters
stable_classes = [1, 2]  # From Period 2: Exclude class 0 because it changes
input_size = Number_features    # Number of input features
hidden_size = 64    # Number of GRU units
output_size = num_classes   # Number of trend classes
num_layers = 4  # Number of GRU layers
dropout = 0.0   # Dropout rate
lora_r = 4      # Rank of LoRA matrices
learning_rate = 0.0001  # Learning rate
alpha = 0.1 # Weighting factor for distillation loss
num_epochs = 2000   # Number of training epochs
batch_size = 64     # Batch size for DataLoader
model_name = 'BiGRUWithAttention'   # Model name for saving
best_results = []   # List to store best results
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define paths
stop_signal_file = os.path.normpath(os.path.join('Class_Incremental_CL', 'David_Classif_Bi_Dir_GRU_Model/stop_training.txt'))
model_saving_folder = os.path.normpath(os.path.join('Class_Incremental_CL', f"David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_3/alpha_{alpha}"))
ensure_folder(model_saving_folder)

# Load class features from Period 2
class_features_path = os.path.normpath(os.path.join('Class_Incremental_CL', f"David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_2/alpha_{alpha}/class_features.pkl"))
if os.path.exists(class_features_path):
    with open(class_features_path, 'rb') as f:
        class_features_dict = pickle.load(f)
    print(f"Loaded class features from: {class_features_path}")
else:
    class_features_dict = {}
    print("No previous class features found.")

# Instantiate models
student_model = BiGRUWithAttention(input_size, hidden_size, output_size, num_layers, dropout, lora_r).to(device)
teacher_model = BiGRUWithAttention(input_size, hidden_size, output_size - 1, num_layers, dropout, lora_r).to(device)

# Load teacher model from Period 2
teacher_checkpoint_path = f"Class_Incremental_CL/David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_2/alpha_{alpha}/BiGRUWithAttention_best.pth"
teacher_checkpoint = torch.load(teacher_checkpoint_path, map_location=device, weights_only=True)
num_lora_adapters = teacher_checkpoint.get('num_lora_adapters', 0)
related_labels = teacher_checkpoint.get('related_labels', {'attention_fc': [0, 1]})  # Load related_labels, default if not found
print(f"Teacher model checkpoint has {num_lora_adapters} LoRA adapters.")
print(f"Loaded related_labels: {related_labels}")

# Add LoRA adapters to teacher_model before loading state_dict
for _ in range(num_lora_adapters):
    teacher_model.add_lora_adapter()
teacher_model.load_state_dict(teacher_checkpoint['model_state_dict'])
print(f"Loaded teacher model from: \n\t{teacher_checkpoint_path}")
print(f"Teacher model now has {len(teacher_model.lora_adapters)} LoRA adapters.")
print(f"\n{teacher_model}\n")
del teacher_checkpoint
gc.collect()

# Add the same number of LoRA adapters to student_model
for _ in range(num_lora_adapters):
    student_model.add_lora_adapter()
print(f"Student model now has {len(student_model.lora_adapters)} LoRA adapters.")

# Copy teacher weights to student (excluding FC layer)
state_dict_teacher = teacher_model.state_dict()
state_dict_student = student_model.state_dict()
for name, param in state_dict_teacher.items():
    if 'fc' not in name:
        state_dict_student[name].copy_(param)
student_model.load_state_dict(state_dict_student)

# Define loss, optimizer, and scheduler
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(student_model.parameters(), lr=learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.9, patience=10)

# Train and validate with related_labels
class_features_dict = train_and_validate_lora(
    student_model, 
    teacher_model, 
    stable_classes, 
    output_size, 
    criterion, 
    optimizer, 
    X_train, 
    y_train, 
    X_val, 
    y_val, 
    scheduler, 
    use_scheduler=False, 
    num_epochs=num_epochs, 
    batch_size=batch_size, 
    alpha=alpha, 
    model_saving_folder=model_saving_folder, 
    model_name=model_name, 
    stop_signal_file=stop_signal_file,
    class_features_dict=class_features_dict,
    tau_high=0.5,  # Adjusted to 0.5
    tau_low=0.5,   # Adjusted to 0.5
    related_labels=related_labels  # Pass related_labels
)

# Save class features
class_features_path = os.path.join(model_saving_folder, "class_features.pkl")
with open(class_features_path, 'wb') as f:
    pickle.dump(class_features_dict, f)
print(f"Saved class features to: {class_features_path}")

print(f"\nstudent_model: \n{student_model}\n")
print(f"unique_classes = {unique_classes}")
print(f"num_classes = {num_classes}")

# Clean up
for var in ["X_train", "y_train", "X_val", "y_val", "X_test", "y_test", "Number_features", "unique_classes", "num_classes"]:
    if var in locals():
        del locals()[var]
gc.collect()
torch.cuda.empty_cache()

Loaded class features from: Class_Incremental_CL\David_Classif_Bi_Dir_GRU_Model\Trained_models\LoRA_v2\Rank_4_Period_2\alpha_0.1\class_features.pkl
Teacher model checkpoint has 1 LoRA adapters.
Loaded related_labels: {'attention_fc': [0, 1], 0: [2]}
Added LoRA adapter, total adapters: 1, on device: cuda:0
Loaded teacher model from: 
	Class_Incremental_CL/David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_2/alpha_0.1/BiGRUWithAttention_best.pth
Teacher model now has 1 LoRA adapters.

BiGRUWithAttention(
  (gru): GRU(7, 64, num_layers=4, batch_first=True, bidirectional=True)
  (attention_fc): Linear(in_features=128, out_features=128, bias=True)
  (lora_adapters): ModuleList(
    (0): LoRA(
      (linear): Linear(in_features=128, out_features=128, bias=True)
    )
  )
  (fc): Linear(in_features=128, out_features=3, bias=True)
  (dropout): Dropout(p=0.0, inplace=False)
)

Added LoRA adapter, total adapters: 1, on device: cuda:0
Student model now has 1 LoRA adapters.

🚀 'tr

---
### Period 4 (num_layers = 4, lora_r=4, alpha = 0.1
+ ##### BiGRUWithAttention
+ ##### Training and saving in *'LoRA_v2/Rank_4_Period_4/alpha_0.1*
#### __Val Accuracy: 97.48%__
#### __Val-Class-Acc: {0: '95.69%', 1: '97.72%', 2: '92.39%', 3: '99.31%', 4: '98.93%'}__

In [35]:
with contextlib.redirect_stdout(open(os.devnull, 'w')):
    X_train, y_train, X_val, y_val, X_test, y_test, Number_features = process_and_return_splits(
        with_indicators_file_path = list_period_files_full_path[3], # Change 
        downsampled_data_minutes = downsampled_data_minutes,
        exclude_columns = exclude_columns,
        lower_threshold = lower_threshold,
        upper_threshold = upper_threshold,
        reverse_steps = reverse_steps,
        sequence_length = sequence_length,
        sliding_interval = sliding_interval,
        trends_to_keep = {0, 1, 2, 3, 4}  # Default keeps all trends : {0, 1, 2, 3, 4}
    )

print(f"\nNumber_features = {Number_features}")

unique_classes = np.unique(y_val)
num_classes = len(unique_classes)
print(f"unique_classes = {unique_classes}")
print(f"num_classes = {num_classes}")



Number_features = 7
unique_classes = [0 1 2 3 4]
num_classes = 5


In [36]:
# Model parameters
stable_classes = [1, 2, 3]  # From Period 3: Exclude class 0 because it changes
input_size = Number_features    # Number of input features
hidden_size = 64    # Number of GRU units
output_size = num_classes   # Number of trend classes (5 for Period 4: {0, 1, 2, 3, 4})
num_layers = 4  # Number of GRU layers
dropout = 0.0   # Dropout rate
lora_r = 4      # Rank of LoRA matrices
learning_rate = 0.0001  # Learning rate
alpha = 0.1 # Weighting factor for distillation loss
num_epochs = 2000   # Number of training epochs
batch_size = 64     # Batch size for DataLoader
model_name = 'BiGRUWithAttention'   # Model name for saving
best_results = []   # List to store best results
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define paths
stop_signal_file = os.path.normpath(os.path.join('Class_Incremental_CL', 'David_Classif_Bi_Dir_GRU_Model/stop_training.txt'))
model_saving_folder = os.path.normpath(os.path.join('Class_Incremental_CL', f"David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_4/alpha_{alpha}"))
ensure_folder(model_saving_folder)

# Load class features from Period 3
class_features_path = os.path.normpath(os.path.join('Class_Incremental_CL', f"David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_3/alpha_{alpha}/class_features.pkl"))
if os.path.exists(class_features_path):
    with open(class_features_path, 'rb') as f:
        class_features_dict = pickle.load(f)
    print(f"Loaded class features from: {class_features_path}")
else:
    class_features_dict = {}
    print("No previous class features found.")
    
# Instantiate models
student_model = BiGRUWithAttention(input_size, hidden_size, output_size, num_layers, dropout, lora_r).to(device)
teacher_model = BiGRUWithAttention(input_size, hidden_size, output_size - 1, num_layers, dropout, lora_r).to(device)

# Load teacher model from Period 3
teacher_checkpoint_path = f"Class_Incremental_CL/David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_3/alpha_{alpha}/BiGRUWithAttention_best.pth"
teacher_checkpoint = torch.load(teacher_checkpoint_path, map_location=device, weights_only=True)
num_lora_adapters = teacher_checkpoint.get('num_lora_adapters', 0)
related_labels = teacher_checkpoint.get('related_labels', {'attention_fc': [0, 1]})  # Load related_labels, default if not found
print(f"Teacher model checkpoint has {num_lora_adapters} LoRA adapters.")
print(f"Loaded related_labels: {related_labels}")

# Add LoRA adapters to teacher_model before loading state_dict
for _ in range(num_lora_adapters):
    teacher_model.add_lora_adapter()
teacher_model.load_state_dict(teacher_checkpoint['model_state_dict'])
print(f"Loaded teacher model from: \n\t{teacher_checkpoint_path}")
print(f"Teacher model now has {len(teacher_model.lora_adapters)} LoRA adapters.")
print(f"\n{teacher_model}\n")
del teacher_checkpoint
gc.collect()

# Add the same number of LoRA adapters to student_model
for _ in range(num_lora_adapters):
    student_model.add_lora_adapter()
print(f"Student model now has {len(student_model.lora_adapters)} LoRA adapters.")

# Copy teacher weights to student (excluding FC layer)
state_dict_teacher = teacher_model.state_dict()
state_dict_student = student_model.state_dict()
for name, param in state_dict_teacher.items():
    if 'fc' not in name:
        state_dict_student[name].copy_(param)
student_model.load_state_dict(state_dict_student)

# Define loss, optimizer, and scheduler
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(student_model.parameters(), lr=learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.9, patience=10)

# Train and validate with related_labels
class_features_dict = train_and_validate_lora(
    student_model, 
    teacher_model, 
    stable_classes, 
    output_size, 
    criterion, 
    optimizer, 
    X_train, 
    y_train, 
    X_val, 
    y_val, 
    scheduler, 
    use_scheduler=False, 
    num_epochs=num_epochs, 
    batch_size=batch_size, 
    alpha=alpha, 
    model_saving_folder=model_saving_folder, 
    model_name=model_name, 
    stop_signal_file=stop_signal_file,
    class_features_dict=class_features_dict,
    tau_high=0.5,  # Adjusted to 0.5
    tau_low=0.5,   # Adjusted to 0.5
    related_labels=related_labels  # Pass related_labels
)

# Save class features
class_features_path = os.path.join(model_saving_folder, "class_features.pkl")
with open(class_features_path, 'wb') as f:
    pickle.dump(class_features_dict, f)
print(f"Saved class features to: {class_features_path}")

print(f"\nstudent_model: \n{student_model}\n")
print(f"unique_classes = {unique_classes}")
print(f"num_classes = {num_classes}")

# Clean up
for var in ["X_train", "y_train", "X_val", "y_val", "X_test", "y_test", "Number_features", "unique_classes", "num_classes"]:
    if var in locals():
        del locals()[var]
gc.collect()
torch.cuda.empty_cache()

Loaded class features from: Class_Incremental_CL\David_Classif_Bi_Dir_GRU_Model\Trained_models\LoRA_v2\Rank_4_Period_3\alpha_0.1\class_features.pkl
Teacher model checkpoint has 2 LoRA adapters.
Loaded related_labels: {'attention_fc': [0, 1], 0: [2], 1: [3]}
Added LoRA adapter, total adapters: 1, on device: cuda:0
Added LoRA adapter, total adapters: 2, on device: cuda:0
Loaded teacher model from: 
	Class_Incremental_CL/David_Classif_Bi_Dir_GRU_Model/Trained_models/LoRA_v2/Rank_4_Period_3/alpha_0.1/BiGRUWithAttention_best.pth
Teacher model now has 2 LoRA adapters.

BiGRUWithAttention(
  (gru): GRU(7, 64, num_layers=4, batch_first=True, bidirectional=True)
  (attention_fc): Linear(in_features=128, out_features=128, bias=True)
  (lora_adapters): ModuleList(
    (0-1): 2 x LoRA(
      (linear): Linear(in_features=128, out_features=128, bias=True)
    )
  )
  (fc): Linear(in_features=128, out_features=4, bias=True)
  (dropout): Dropout(p=0.0, inplace=False)
)

Added LoRA adapter, total adapt

---

## __Getting into Transfer Learning for 5 periods__

In [None]:
# def custom_evaluattion_function(model, list_period_files_full_path, criterion, output_size, batch_size=64, model_number=100):
#     # 1- With the given model, for each period in the list, predict and print accuracy
#     # 2- With the given model, predict and print accuracy for all data combined.
#     # For (2), you can do it by saving in a dictionary the accuracy and sample number as you go through each period

#     print(f"\nUsing model {model_number}: \n{model}\n")
#     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#     model.to(device)
#     model.eval()
#     # Dictionary to save predictions and details.
#     store_preds = {}

#     for i, path_ in enumerate(list_period_files_full_path):
#         # Suppress output by redirecting to os.devnull
#         with contextlib.redirect_stdout(open(os.devnull, 'w')):
#             X_train_, y_train_, X_val_, y_val_, X_test_, y_test_ = process_and_return_splits(
#                 with_indicators_file_path = path_,
#                 downsampled_data_minutes = downsampled_data_minutes,
#                 exclude_columns = exclude_columns,
#                 lower_threshold = lower_threshold,
#                 upper_threshold = upper_threshold,
#                 reverse_steps = reverse_steps,
#                 sequence_length = sequence_length,
#                 sliding_interval = sliding_interval
#             )

#         val_loader_ = DataLoader(TensorDataset(torch.tensor(X_val_, dtype=torch.float32).to(device),  # (seqs, seq_len, features),
#                                                        torch.tensor(y_val_, dtype=torch.long).to(device)    # (seqs, seq_len)
#                                                        ), 
#                                                        batch_size=batch_size)
#         del X_train_, y_train_, X_val_, y_val_, X_test_, y_test_
#         gc.collect()
#         torch.cuda.empty_cache()

#         # Perform validation at the end of each epoch
#         val_loss = 0.0
#         val_correct = 0
#         val_total = 0
#         with torch.no_grad():
#             for X_val_batch, y_val_batch in val_loader_:
#                 val_outputs = model(X_val_batch).view(-1, output_size)
#                 val_labels = y_val_batch.view(-1)
#                 val_loss += criterion(val_outputs, val_labels).item()
#                 val_predictions = torch.argmax(val_outputs, dim=-1)
#                 val_correct += (val_predictions == val_labels).sum().item()
#                 val_total += val_labels.size(0)
#         val_loss /= len(val_loader_.dataset)
#         val_accuracy = val_correct / val_total

#         store_preds[i+1] = {'val_loss' : val_loss, 
#                             'val_accuracy' : val_accuracy,
#                             'val_correct' : val_correct,
#                             'val_total' : val_total}
        
#         print(f"Period {i+1}/{len(list_period_files_full_path)}, "
#               f"Val Loss: {val_loss:.9f}, "
#               f"Val Accuracy: {val_accuracy * 100:.2f}%, ")
        
#         # Clean up DataLoader and clear cache
#         del val_loader_
#         gc.collect()
#         torch.cuda.empty_cache()

#     # Iterate through the stored predictions
#     print()
#     for period_key in sorted(store_preds.keys()):
#         print("#---------------------------------------------------------#")
#         # Get current period's accuracy and total
#         val_correct = store_preds[period_key]['val_correct']
#         val_total = store_preds[period_key]['val_total']
#         current_accuracy = store_preds[period_key]['val_accuracy']

#         # Print accuracy for the current period
#         print(f"Period {period_key}: Accuracy: {current_accuracy * 100:.2f}%")

#         # If not the first period, calculate and print combined accuracy
#         if period_key > 1:
#             combined_correct = sum(store_preds[key]['val_correct'] for key in range(1, period_key + 1))
#             combined_total = sum(store_preds[key]['val_total'] for key in range(1, period_key + 1))
#             combined_accuracy = combined_correct / combined_total
#             print(f"Combined Accuracy up to Period {period_key}: {combined_accuracy * 100:.2f}%")
#     print("#---------------------------------------------------------#")
#     print()
#     return

# def periods_evaluation_transfer_learning(model_number, best_epoch_number_dic, list_period_files_full_path, lr=0.00001):
#     """
#     There are many variables explicitely declared in this function, pay attention!
#     """
    
#     torch.manual_seed(42)
#     print("Seeding successful!\n")

#     # Model parameters
#     input_size = Number_features  # Number of features
#     hidden_size = 64  # Number of GRU units
#     output_size = 5  # Number of trend classes (0, 15, 25, -15, -25)
#     num_layers = 4  # Number of GRU layers
#     num_epochs= 2000 # Number of epochs/ go through entire data
#     batch_size= 64 # How many sequences passed at once to the model
#     model_name = 'BiGRUWithAttention' # Name of the model to use for saving
#     global best_results
#     best_results = [] # Initialize this outside the training function or at the beginning of training
#     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#     # Define a global stop signal
#     stop_signal_file = os.path.normpath(os.path.join(Working_directory, 'Classif_Bi_Dir_GRU_Model/stop_training.txt'))  # Create this file to stop training
#     model_saving_folder_init = os.path.normpath(os.path.join(Working_directory, "Classif_Bi_Dir_GRU_Model/Trained_models/2nd_try"))
#     ensure_folder(model_saving_folder_init)

#     # Instantiate the model
#     class_gru_model = BiGRUWithAttention(input_size, hidden_size, output_size, num_layers).to(device)

#     # Define the loss function, optimizer and scheduler
#     criterion = nn.CrossEntropyLoss()
#     optimizer = optim.Adam(class_gru_model.parameters(), lr=lr) # lr=0.00005
#     # optimizer = optim.Adam(class_gru_model.parameters(), lr=0.001, weight_decay=1e-5)
#     # scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)
#     scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.9, patience=10)

#     #---------------------------------------------------------
    
#     if model_number == 1:
#         # Load the best saved base model parameters
#         epoch_number = best_epoch_number_dic[model_number]
#         base_model_path = os.path.normpath(
#             os.path.join(model_saving_folder_init, f"{model_name}_epoch_{epoch_number}.pth"))

#         # Copy the file Destination directory for normalization
#         destination_directory = os.path.normpath(
#             os.path.join(model_saving_folder_init, f"Small_Final/period_{model_number}"))
#         ensure_folder(destination_directory)
#         destination_path = os.path.join(destination_directory, os.path.basename(base_model_path))
#         shutil.copy(base_model_path, destination_path)

#         #---------------------------------------------------------
#         # print(f"\n{class_gru_model}\n")
#         checkpoint = torch.load(destination_path, map_location=device, weights_only=True)
#         # print(f"\n{checkpoint}\n")
#         class_gru_model.load_state_dict(checkpoint['model_state_dict'])
#         del checkpoint
#         gc.collect()
#         print(f"Loaded 'base model / model {model_number}' from: \n\t{destination_path}")
#         # print(f"\n{class_gru_model}\n")

#     elif model_number > 1:
#         # Load the best saved base model parameters
#         epoch_number = best_epoch_number_dic[model_number-1]
#         previous_model_folder = os.path.normpath(os.path.join(model_saving_folder_init, f'Small_Final/period_{model_number-1}'))
#         previous_model_path = os.path.normpath(os.path.join(previous_model_folder, f"{model_name}_epoch_{epoch_number}.pth"))
#         # print(f"\n{class_gru_model}\n")
#         checkpoint = torch.load(previous_model_path, map_location=device, weights_only=True)
#         # print(f"\n{checkpoint}\n")
#         class_gru_model.load_state_dict(checkpoint['model_state_dict'])
#         del checkpoint
#         gc.collect()
#         print(f"Loaded base model from: \n\t{previous_model_path}")
#         # print(f"\n{class_gru_model}\n")

#         #---------------------------------------------------------
#         # Creating New Saving Folder
#         model_saving_folder = os.path.normpath(os.path.join(model_saving_folder_init, f'Small_Final/period_{model_number}'))
#         ensure_folder(model_saving_folder)

#         #---------------------------------------------------------
#         # New dataset to work with
#         X_train_, y_train_, X_val_, y_val_, X_test_, y_test_ = process_and_return_splits(
#             with_indicators_file_path = list_period_files_full_path[model_number-1], # Period data
#             # with_indicators_file_path = list_period_files_full_path[0], # Period data
#             downsampled_data_minutes = downsampled_data_minutes,
#             exclude_columns = exclude_columns,
#             lower_threshold = lower_threshold,
#             upper_threshold = upper_threshold,
#             reverse_steps = reverse_steps,
#             sequence_length = sequence_length,
#             sliding_interval = sliding_interval
#         )
#         del X_test_, y_test_
#         #---------------------------------------------------------

#         train_and_validate(class_gru_model, output_size, criterion, optimizer, X_train_, y_train_, X_val_, y_val_, scheduler, 
#                         False, num_epochs, batch_size, model_saving_folder, model_name, stop_signal_file)

#         best_epoch_number_dic[model_number] = best_results[0]['epoch']
#         del X_train_, y_train_, X_val_, y_val_
#         gc.collect()
#         torch.cuda.empty_cache()
#         #---------------------------------------------------------

#         for res in best_results:        
#             print(f"Epoch {res['epoch']}/{num_epochs}, "
#                     f"Train Loss: {res['train_loss']:.4f}, " 
#                     f"Val Loss: {res['val_loss']:.4f}, "
#                     f"Val Accuracy: {res['val_accuracy'] * 100:.2f}%, "
#                     f"Model Path: {res['model_path']}")      
#         print(f"\nclass_gru_model: \n{class_gru_model}")
#         del class_gru_model
#         gc.collect()
#         torch.cuda.empty_cache()
#         #---------------------------------------------------------

#         # Instantiate the model again
#         class_gru_model = BiGRUWithAttention(input_size, hidden_size, output_size, num_layers).to(device)

#         #---------------------------------------------------------
#         # Load the best saved base model parameters
#         epoch_number = best_epoch_number_dic[model_number]
#         curr_best_model_path = os.path.normpath(os.path.join(model_saving_folder, f"{model_name}_epoch_{epoch_number}.pth")) # File of the best epoch
#         # print(f"\n{class_gru_model}\n")
#         checkpoint = torch.load(curr_best_model_path, map_location=device, weights_only=True)
#         # print(f"\n{checkpoint}\n")
#         class_gru_model.load_state_dict(checkpoint['model_state_dict'])
#         del checkpoint
#         gc.collect()
#         print(f"Loaded model {model_number} from: \n\t{curr_best_model_path}")
#         # print(f"\n{class_gru_model}\n")

#     else:
#         print(f"Give an appropriate model_number (1, 2, ..., 5, ...). Passed model_number = {model_number}\n")
#         return -1
    
#     #---------------------------------------------------------
#     custom_evaluattion_function(class_gru_model, list_period_files_full_path, criterion, output_size, batch_size, model_number)
#     del class_gru_model
#     gc.collect()
#     torch.cuda.empty_cache()

#     #---------------------------------------------------------
#     return best_epoch_number_dic


## __Evaluate the Model__

### Testing function

In [None]:
# def test_model(model_class, model_path, X_test, y_test, criterion, input_size, hidden_size, output_size, num_layers):
#     """
#     Function to test a saved model on test data.
    
#     Parameters:
#         model_class (nn.Module): The class of the model to instantiate.
#         model_path (str): Path to the saved model file.
#         X_test (np.ndarray or torch.Tensor): Test features of shape (num_samples, seq_len, num_features).
#         y_test (np.ndarray or torch.Tensor): Test labels of shape (num_samples, seq_len).
#         output_size (int): Number of output classes.
#         criterion: Loss function.
        
#     Returns:
#         np.ndarray: Predicted classes for the test data.
#     """
#     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
#     print(f"Loading model from {model_path}...")
    
#     # Load the model
#     model = model_class(input_size=X_test.shape[-1], hidden_size=hidden_size, output_size=output_size, num_layers=num_layers)
    
#     checkpoint = torch.load(model_path, map_location=device, weights_only=True)
#     print("Checkpoint Keys:", checkpoint.keys() if isinstance(checkpoint, dict) else "State dict directly stored", '\n')
    
#     if isinstance(checkpoint, dict) and 'model_state_dict' in checkpoint:
#         print("Dictionaries stored \n")
#         model.load_state_dict(checkpoint['model_state_dict'])
#     else:
#         print("State dict directly stored \n")
#         model.load_state_dict(checkpoint)  # Assume it's directly the state dict

#     model.to(device)
#     model.eval()  # Set model to evaluation mode

#     # Convert test data to tensors
#     X_test = torch.tensor(X_test, dtype=torch.float32).to(device)
#     y_test = torch.tensor(y_test, dtype=torch.long).to(device)
    
#     with torch.no_grad():
#         # Forward pass
#         outputs = model(X_test)  # Shape: (batch_size, seq_len, output_size)
#         outputs = outputs.view(-1, output_size)  # Flatten for prediction and loss calculation
#         y_test_flat = y_test.view(-1)  # Flatten labels

#         # Calculate loss
#         test_loss = criterion(outputs, y_test_flat).item()
        
#         # Predictions
#         predictions = torch.argmax(outputs, dim=-1).cpu().numpy()  # Convert to NumPy array

#         # Calculate accuracy
#         test_accuracy = (predictions == y_test_flat.cpu().numpy()).mean() * 100

#     print(f"Test Loss: {test_loss:.4f}")
#     print(f"Test Accuracy: {test_accuracy:.2f}% \n")

#     return predictions.reshape(y_test.shape)  # Reshape to match the original test data
