In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
import random
import os

# --- 1. CONFIGURATION ---
# Input window geometry.
SEQUENCE_LENGTH: int = 500  # samples per segment: 10 s of data at 0.02 s per sample
INPUT_CHANNELS: int = 3  # lateral, vertical and longitudinal G-force channels

# BiGRU capacity.
HIDDEN_DIM: int = 64  # hidden-state width of the GRU
NUM_LAYERS: int = 2  # how many GRU layers are stacked

# Optimisation settings.
BATCH_SIZE: int = 32
LEARNING_RATE: float = 1e-4

# --- 2. DATA PREPARATION (MOCK DATASET) ---

# NOTE: This section uses a simplified list of 20 coasters with a mock score.
# Replace the body of the 'load_labeled_data' function below with your own logic
# for loading segmented, labeled data, where each segment is paired with its coaster's score.

class CoasterScoreDataset(Dataset):
    """Map-style dataset that pairs each acceleration segment with a score.

    Item ``i`` is the tuple ``(segment_i, score_i)``, both converted to
    ``torch.float32`` tensors on access.
    """

    def __init__(self, data_segments, scores):
        """
        Initializes the dataset. Each segment is paired with the coaster's score.
        :param data_segments: List of NumPy arrays, each (3, SEQUENCE_LENGTH)
        :param scores: List of corresponding coaster scores (0-100)
        :raises ValueError: If the two sequences do not have the same length
        """
        # Fail fast: a mismatch would otherwise only show up later as an
        # IndexError inside a DataLoader worker, or as silently unused scores.
        if len(data_segments) != len(scores):
            raise ValueError(
                "data_segments and scores must have the same length, "
                f"got {len(data_segments)} and {len(scores)}"
            )
        self.segments = data_segments
        self.scores = scores

    def __len__(self):
        """Return the number of (segment, score) pairs."""
        return len(self.segments)

    def __getitem__(self, idx):
        """
        Return the pair stored at position ``idx``.
        :param idx: Index into the segment/score lists
        :return: Tuple ``(segment, score)`` of float32 tensors; the score is 0-dimensional
                 when the stored score is a plain number
        """
        # as_tensor converts arrays, scalars and existing tensors alike and
        # only copies when a dtype conversion is actually required.
        segment = torch.as_tensor(self.segments[idx], dtype=torch.float32)
        score = torch.as_tensor(self.scores[idx], dtype=torch.float32)
        return segment, score

# Loads every labeled recording and cuts it into fixed-length, non-overlapping windows.
def load_labeled_data(file_paths_and_scores, sequence_length=None, sep=';',
                      header=2, usecols=(1, 2, 3)):
    """
    Read each CSV recording and split it into equally sized segments.

    Every segment inherits the score of the file it was cut from. Files that
    cannot be read are reported on stdout and skipped, so one bad recording
    does not abort the whole load.

    :param file_paths_and_scores: Dict mapping a CSV file path to that coaster's score
    :param sequence_length: Number of timesteps per segment; ``None`` (default) uses
                            the module-level ``SEQUENCE_LENGTH``
    :param sep: Column delimiter passed to ``pandas.read_csv``
    :param header: Zero-based row index of the header line passed to ``pandas.read_csv``
    :param usecols: Positional indices of the columns to load (one per input channel)
    :return: Tuple ``(all_segments, all_scores)``; segments are NumPy arrays of shape
             ``(len(usecols), sequence_length)`` and the two lists have equal length
    :raises ValueError: If ``sequence_length`` is smaller than 1
    """
    if sequence_length is None:
        # Resolved at call time so the default tracks the module configuration.
        sequence_length = SEQUENCE_LENGTH
    if sequence_length < 1:
        raise ValueError(f"sequence_length must be >= 1, got {sequence_length}")

    all_segments = []
    all_scores = []

    for file_path, coaster_score in file_paths_and_scores.items():
        try:
            df = pd.read_csv(file_path, sep=sep, header=header,
                             usecols=list(usecols), dtype=np.float32)
        except Exception as e:
            # Deliberate best-effort: report the unreadable file and move on.
            # A "Usecols do not match columns" error here usually means that
            # `sep`, `header` or `usecols` does not match the file's layout.
            print(f"Skipping file {file_path} due to error: {e}")
            continue

        # Rows are timesteps in the file; transpose to (channels, timesteps).
        data = df.values.T
        num_timesteps = data.shape[1]

        # The "+ 1" keeps the final window when the recording length is an
        # exact multiple of sequence_length; a trailing partial window is dropped.
        segments = [
            data[:, start:start + sequence_length]
            for start in range(0, num_timesteps - sequence_length + 1, sequence_length)
        ]

        all_segments.extend(segments)
        all_scores.extend([coaster_score] * len(segments))

    return all_segments, all_scores

# --- 3. THE BIGRU SEQUENCE-TO-VALUE MODEL ---

class BiGRURegressor(nn.Module):
    """Sequence-to-value model: a bidirectional GRU encoder plus a small dense head.

    The encoder reads the whole sequence in both directions; the final hidden
    states of the top layer (one per direction) are concatenated and mapped to
    the regression output.
    """

    def __init__(self, input_size=INPUT_CHANNELS, hidden_dim=HIDDEN_DIM, 
                 num_layers=NUM_LAYERS, output_dim=1):
        """
        :param input_size: Number of features per timestep
        :param hidden_dim: Hidden-state width of each GRU direction
        :param num_layers: Number of stacked GRU layers
        :param output_dim: Width of the regression output
        """
        super().__init__()

        # Encoder. batch_first=True makes the GRU consume (Batch, Seq_Len, Channels);
        # forward() moves the channel axis into place before calling it.
        self.gru = nn.GRU(
            input_size=input_size,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
        )

        # Regression head. Its input is the forward and backward final states
        # side by side, hence twice the hidden width.
        encoder_out_dim = 2 * hidden_dim
        self.regressor = nn.Sequential(
            nn.Linear(encoder_out_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
        )

    def forward(self, x):
        """
        :param x: Batch shaped (B, Channels, Seq_Len), e.g. (32, 3, 500)
        :return: Predictions with dimension 1 squeezed out: shape (B,) when
                 output_dim is 1, otherwise (B, output_dim)
        """
        # (B, Channels, Seq_Len) -> (B, Seq_Len, Channels)
        time_major_features = torch.transpose(x, 1, 2)

        # Only the final hidden states are needed, not the per-step outputs.
        # final_hidden has shape (2 * Num_Layers, B, Hidden_Dim).
        _, final_hidden = self.gru(time_major_features)

        # The last two entries belong to the top layer: forward, then backward.
        last_forward = final_hidden[-2]
        last_backward = final_hidden[-1]
        sequence_summary = torch.cat([last_forward, last_backward], dim=1)

        return self.regressor(sequence_summary).squeeze(1)


# --- 4. EXECUTION AND TRAINING LOOP ---

def main_regression_setup():
    """Load the labeled recordings and run a short demonstration training.

    Builds the path -> score table, loads and segments the files, wraps the
    result in a DataLoader and trains a BiGRURegressor with MSE loss for a few
    epochs, printing the average loss of each epoch. Returns None; if no
    segment could be loaded it prints a message and returns early.
    """
    # --- Labeled Data Setup: INPUT SCORES HERE ---
    # Each entry pairs a recording's file name with the coaster's overall user
    # score (0-100). You MUST update this table with the file names and scores
    # for all 20 coasters.
    data_dir = 'Raw acceleration data'
    coaster_files_and_scores = (
        # Format: ('file_name.csv', Score_Value)
        ('SteelVengeance.csv', 98.5),
        ('Anubis.csv', 85.2),
        ('AlpenFury.csv', 85.2),
        ('ElToro.csv', 85.2),
        ('Hyperia.csv', 85.2),
        ('Lightning.csv', 85.2),
        ('Pantheon.csv', 85.2),
        ('Shambhala.csv', 85.2),
        ('Skyrush.csv', 85.2),
        ('Taron.csv', 85.2),
        ('TwistedCo.csv', 85.2),
        ('WickedCyc.csv', 92.1),
    )
    labeled_coaster_data = {
        os.path.join(data_dir, file_name): coaster_score
        for file_name, coaster_score in coaster_files_and_scores
    }

    # Read every file and cut it into scored segments.
    all_segments, all_scores = load_labeled_data(labeled_coaster_data)

    # Nothing to train on if every file was skipped.
    if not all_segments:
        print("No valid labeled data loaded. Aborting setup.")
        return

    # Wrap the segments for batched, shuffled iteration.
    # Note: Splitting into train/test sets should happen here!
    dataset = CoasterScoreDataset(all_segments, all_scores)
    dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

    print(f"\nTotal labeled segments: {len(all_segments)}")
    print(f"DataLoader initialized with Batch Size: {BATCH_SIZE}")

    # --- Model Training Setup ---
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = BiGRURegressor().to(device)

    # Regression objective: mean squared error between prediction and score.
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

    print(f"Model initialized on device: {device}")

    # --- Example Training Loop (5 Epochs) ---
    num_epochs = 5
    model.train()

    for epoch_number in range(1, num_epochs + 1):
        summed_loss = 0
        for batch_segments, batch_scores in dataloader:
            batch_segments = batch_segments.to(device)
            batch_scores = batch_scores.to(device)

            # Forward pass and loss
            batch_predictions = model(batch_segments)
            batch_loss = criterion(batch_predictions, batch_scores)

            # Backward pass and parameter update
            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

            # Weight the batch mean by the batch size so that the epoch average
            # stays correct when the last batch is smaller.
            summed_loss += batch_loss.item() * batch_segments.size(0)

        avg_loss = summed_loss / len(dataset)
        print(f"Epoch {epoch_number}/{num_epochs}, Loss (MSE): {avg_loss:.4f}")

    print("\n--- BiGRU Regression Training Complete ---")

    # Follow-up for the final report (correlation / R^2):
    # Evaluate on a separate test set, not on the training data. A low MSE there
    # indicates that the features extracted by the BiGRU track the final score.

    # To find Feature Importance (Correlation):
    # 1. Run inference on the test set.
    # 2. Extract the intermediate feature vector (the output of the GRU before the regressor).
    # 3. Use standard statistical methods (e.g., Pearson correlation) to check the correlation
    #    between these learned features and the final score.

# Run the demonstration training only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main_regression_setup()

Skipping file Raw acceleration data\SteelVengeance.csv due to error: Usecols do not match columns, columns expected but not found: [1, 2, 3]
Skipping file Raw acceleration data\Anubis.csv due to error: Usecols do not match columns, columns expected but not found: [1, 2, 3]
No valid labeled data loaded. Aborting setup.
