In [9]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import numpy as np
import pandas as pd
import random
import os

# --- 1. CONFIGURATION ---
# Number of consecutive samples cut from a recording to form one segment.
SEQUENCE_LENGTH = 500
# One input channel per acceleration axis read from the CSV files.
INPUT_CHANNELS = 3
# Width of the GRU hidden state (per direction).
HIDDEN_DIM = 32
# Number of stacked GRU layers.
NUM_LAYERS = 2
# Segments per optimisation step.
BATCH_SIZE = 32
# Step size handed to the Adam optimiser.
LEARNING_RATE = 5e-4
# Fraction of the coasters held out for testing.
TEST_SPLIT_RATIO = 0.2
# Dropout probability used inside the GRU stack.
DROPOUT_RATE = 0.2

# --- 2. DATA PREPARATION  ---

class CoasterScoreDataset(Dataset):
    """Map-style dataset yielding ``(segment, score)`` tensor pairs.

    ``data_segments`` is an indexable collection of NumPy arrays and
    ``scores`` an indexable collection of numeric targets; entry ``i`` of one
    belongs to entry ``i`` of the other.
    """

    def __init__(self, data_segments, scores):
        self.segments = data_segments
        self.scores = scores

    def __len__(self):
        # The segment collection alone defines the dataset size.
        return len(self.segments)

    def __getitem__(self, idx):
        # Both members of the pair are cast to float32 for the model/loss.
        segment_tensor = torch.from_numpy(self.segments[idx]).float()
        score_tensor = torch.tensor(self.scores[idx]).float()
        return segment_tensor, score_tensor

def load_labeled_data(file_paths_and_scores, sequence_length=None):
    """Read labeled acceleration CSVs and cut each into fixed-length segments.

    Args:
        file_paths_and_scores: Mapping of CSV path -> score for that coaster.
        sequence_length: Number of samples per segment. Defaults to the
            module-level ``SEQUENCE_LENGTH`` when omitted.

    Returns:
        A list with one dict per successfully loaded file, holding
        ``'segments'`` (list of ``(3, sequence_length)`` float32 arrays in
        Lateral/Vertical/Longitudinal row order), ``'score'`` and
        ``'file_path'``. Files that cannot be read or lack a required column
        are reported with ``print`` and skipped.

    Raises:
        ValueError: If ``sequence_length`` is not a positive integer.
    """
    if sequence_length is None:
        sequence_length = SEQUENCE_LENGTH
    if sequence_length < 1:
        raise ValueError(f"sequence_length must be positive, got {sequence_length}")

    required_columns = ['Lateral', 'Vertical', 'Longitudinal']
    file_delimiter = ','
    all_coaster_data = []

    for file_path, coaster_score in file_paths_and_scores.items():
        # Only the parsing step is best-effort: one bad file must not abort
        # the whole load, so it is reported and skipped.
        try:
            # Line 0 is the header; file lines 1 and 2 are skipped before
            # parsing (presumably non-numeric metadata rows — verify against
            # the data files).
            df = pd.read_csv(file_path, sep=file_delimiter, header=0,
                             skiprows=[1, 2], dtype=np.float32)

            missing_cols = [col for col in required_columns if col not in df.columns]
            if missing_cols:
                raise ValueError(f"Missing required columns in CSV: {missing_cols}")

            # Transpose to (channels, time).
            data = df[required_columns].values.T
        except Exception as e:
            print(f"Skipping file {file_path} due to error: {e}")
            continue

        num_timesteps = data.shape[1]
        # Non-overlapping windows. The "+ 1" keeps the final window when the
        # recording length is an exact multiple of sequence_length; a trailing
        # partial window is still dropped.
        segments = [
            data[:, start:start + sequence_length]
            for start in range(0, num_timesteps - sequence_length + 1, sequence_length)
        ]

        all_coaster_data.append({
            'segments': segments,
            'score': coaster_score,
            'file_path': file_path
        })

    return all_coaster_data

# --- 3. THE BIGRU SEQUENCE-TO-VALUE MODEL (REVISED) ---

class BiGRURegressor(nn.Module):
    """Bidirectional GRU that maps an acceleration segment to one score.

    Args:
        input_size: Number of input channels per time step.
        hidden_dim: Hidden size of each GRU direction.
        num_layers: Number of stacked GRU layers.
        output_dim: Size of the regression output.
        dropout_rate: Dropout probability between stacked GRU layers.
    """

    def __init__(self, input_size=INPUT_CHANNELS, hidden_dim=HIDDEN_DIM, 
                 num_layers=NUM_LAYERS, output_dim=1, dropout_rate=DROPOUT_RATE):
        super().__init__()

        self.gru = nn.GRU(
            input_size=input_size,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
            # nn.GRU applies dropout only between stacked layers; passing a
            # non-zero value with a single layer just triggers a warning.
            dropout=dropout_rate if num_layers > 1 else 0.0,
        )

        # Regression head: consumes the concatenated forward/backward state,
        # hence the 2 * hidden_dim input width.
        self.regressor = nn.Sequential(
            nn.Linear(2 * hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim)
        )

    def forward(self, x):
        """Predict a score for each segment in the batch.

        Args:
            x: Tensor laid out as (batch, channels, time).

        Returns:
            Tensor of predictions with dimension 1 squeezed away, i.e. shape
            (batch,) for the default ``output_dim=1``.
        """
        # batch_first GRU expects (batch, time, channels).
        x = x.transpose(1, 2)

        _, h_n = self.gru(x)

        # h_n is ordered layer by layer, direction by direction, so the last
        # two entries are the top layer's forward and backward final states.
        final_forward_backward_state = torch.cat((h_n[-2], h_n[-1]), dim=1)

        return self.regressor(final_forward_backward_state).squeeze(1)

# --- 4. EVALUATION FUNCTION  ---

def evaluate_model(model, dataloader, device):
    """Score ``model`` on every batch of ``dataloader`` without tracking gradients.

    Args:
        model: Module returning one prediction per input segment.
        dataloader: Iterable of ``(segments, scores)`` tensor batches.
        device: Device the segments are moved to before the forward pass.

    Returns:
        Tuple ``(r2, mse, all_predictions, all_targets)`` where the last two
        are flat lists of per-sample values in dataloader order.

    Raises:
        ValueError: If the dataloader yields no samples.
    """
    # Remember the caller's mode so evaluation does not silently flip a model
    # that was already in eval mode back to training.
    was_training = model.training
    model.eval()
    all_predictions = []
    all_targets = []

    try:
        with torch.no_grad():
            for segments, scores in dataloader:
                predictions = model(segments.to(device))

                all_predictions.extend(predictions.cpu().numpy())
                all_targets.extend(scores.cpu().numpy())
    finally:
        # Restore the mode even if a batch raised.
        model.train(was_training)

    if not all_targets:
        raise ValueError("Cannot evaluate: the dataloader produced no samples.")

    r2 = r2_score(all_targets, all_predictions)
    mse = np.mean((np.array(all_targets) - np.array(all_predictions))**2)

    return r2, mse, all_predictions, all_targets

# --- 5. EXECUTION AND TRAINING LOOP  ---

def main_regression_setup():
    """Train and evaluate the BiGRU score regressor on the labeled coasters.

    Loads every labeled CSV, splits the coasters (not the individual
    segments) into train and test groups so no ride contributes to both,
    trains with MSE loss and prints train/test metrics.

    Returns:
        None. All results are reported through ``print``.
    """
    # --- Labeled Data Setup: INPUT SCORES HERE ---
    data_dir = 'Raw acceleration data'
    labeled_coaster_data = {
        os.path.join(data_dir, 'SteelVengeance.csv'): 100,
        os.path.join(data_dir, 'Anubis.csv'): 98.6,
        os.path.join(data_dir, 'AlpenFury.csv'): 98.3,
        os.path.join(data_dir, 'ElToro.csv'): 98.3,
        os.path.join(data_dir, 'Hyperia.csv'): 97.6,
        os.path.join(data_dir, 'Lightning.csv'): 98.1,
        os.path.join(data_dir, 'Pantheon.csv'): 97.4,
        os.path.join(data_dir, 'Shambhala.csv'): 97.7,
        os.path.join(data_dir, 'Skyrush.csv'): 97.7,
        os.path.join(data_dir, 'Taron.csv'): 98.3,
        os.path.join(data_dir, 'TwistedCo.csv'): 98,
        os.path.join(data_dir, 'WickedCyc.csv'): 97.8,
        # Add the remaining coaster files and their scores here
    }

    all_coaster_data = load_labeled_data(labeled_coaster_data)

    if not all_coaster_data:
        print("No valid labeled data loaded. Aborting setup.")
        return

    coaster_files = [d['file_path'] for d in all_coaster_data]

    # A train/test split needs at least one coaster on each side.
    if len(coaster_files) < 2:
        print("Need at least two loaded coasters to build a train/test split. Aborting setup.")
        return

    # Split the coaster files into training and testing lists
    train_coaster_files, test_coaster_files = train_test_split(
        coaster_files, test_size=TEST_SPLIT_RATIO, random_state=42
    )
    train_file_set = set(train_coaster_files)

    # Re-aggregate segments based on the split file lists
    train_segments, train_scores = [], []
    test_segments, test_scores = [], []

    for coaster_data in all_coaster_data:
        coaster_segments = coaster_data['segments']
        # Every segment inherits the score of the coaster it was cut from.
        coaster_scores = [coaster_data['score']] * len(coaster_segments)

        if coaster_data['file_path'] in train_file_set:
            train_segments.extend(coaster_segments)
            train_scores.extend(coaster_scores)
        else:
            test_segments.extend(coaster_segments)
            test_scores.extend(coaster_scores)

    # Recordings shorter than one window contribute no segments, so either
    # side can end up empty; a shuffling DataLoader rejects an empty dataset.
    if not train_segments or not test_segments:
        print("Train or test split contains no segments. Aborting setup.")
        return

    train_dataset = CoasterScoreDataset(train_segments, train_scores)
    test_dataset = CoasterScoreDataset(test_segments, test_scores)

    train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

    print(f"\nTotal Labeled Segments: {len(train_segments) + len(test_segments)}")
    # Report the coasters that actually loaded, so the total matches the
    # train/test counts below even when some files were skipped.
    print(f"Total Coasters: {len(all_coaster_data)}")
    print(f"Training Coasters: {len(train_coaster_files)} | Test Coasters: {len(test_coaster_files)}")
    print(f"Training Segments: {len(train_segments)} | Test Segments: {len(test_segments)}")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = BiGRURegressor().to(device)

    # NOTE(review): the targets sit near 98 and are not rescaled; a train MSE
    # on the order of 98**2 in early epochs would be consistent with the
    # network starting near zero. Standardising the scores is worth trying.
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

    NUM_EPOCHS = 20

    print(f"\nStarting training for {NUM_EPOCHS} epochs on {device}...")

    for epoch in range(NUM_EPOCHS):
        model.train()
        total_train_loss = 0

        for segments, scores in train_dataloader:
            segments = segments.to(device)
            scores = scores.to(device)

            predictions = model(segments)
            loss = criterion(predictions, scores)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # criterion returns the batch mean; weight it by batch size so the
            # epoch figure is a true per-sample average.
            total_train_loss += loss.item() * segments.size(0)

        avg_train_loss = total_train_loss / len(train_dataset)

        # Test metrics are only computed every fifth epoch and on the last one.
        if (epoch + 1) % 5 == 0 or epoch == NUM_EPOCHS - 1:
            test_r2, test_mse, _, _ = evaluate_model(model, test_dataloader, device)
            print(f"Epoch {epoch+1}/{NUM_EPOCHS} | Train MSE: {avg_train_loss:.2f} | Test MSE: {test_mse:.2f} | Test R2: {test_r2:.4f}")
        else:
            print(f"Epoch {epoch+1}/{NUM_EPOCHS} | Train MSE: {avg_train_loss:.2f}")

    print("\n--- BiGRU Regression Training Complete ---")

    final_test_r2, final_test_mse, _, _ = evaluate_model(model, test_dataloader, device)

    print(f"\nFinal Test R-squared (R²): {final_test_r2:.4f}")
    print(f"Final Test Mean Squared Error (MSE): {final_test_mse:.2f}")

if __name__ == '__main__':
    # Seed every RNG in play so weight initialisation and batch shuffling are
    # repeatable from run to run.
    torch.manual_seed(42)
    np.random.seed(42)
    random.seed(42)
    main_regression_setup()


Total Labeled Segments: 211
Total Coasters: 12
Training Coasters: 9 | Test Coasters: 3
Training Segments: 152 | Test Segments: 59

Starting training for 20 epochs on cpu...
Epoch 1/20 | Train MSE: 9620.71
Epoch 2/20 | Train MSE: 9596.07
Epoch 3/20 | Train MSE: 9569.90
Epoch 4/20 | Train MSE: 9539.32
Epoch 5/20 | Train MSE: 9501.32 | Test MSE: 9635.92 | Test R2: -11239.4608
Epoch 6/20 | Train MSE: 9453.27
Epoch 7/20 | Train MSE: 9390.81
Epoch 8/20 | Train MSE: 9311.02
Epoch 9/20 | Train MSE: 9208.95
Epoch 10/20 | Train MSE: 9086.70 | Test MSE: 9157.70 | Test R2: -10681.6094
Epoch 11/20 | Train MSE: 8950.30
Epoch 12/20 | Train MSE: 8808.94
Epoch 13/20 | Train MSE: 8666.73
Epoch 14/20 | Train MSE: 8525.24
Epoch 15/20 | Train MSE: 8384.56 | Test MSE: 8445.95 | Test R2: -9851.3366
Epoch 16/20 | Train MSE: 8243.97
Epoch 17/20 | Train MSE: 8102.95
Epoch 18/20 | Train MSE: 7960.86
Epoch 19/20 | Train MSE: 7817.30
Epoch 20/20 | Train MSE: 7672.14 | Test MSE: 7724.45 | Test R2: -9009.7036

--- 

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np
import pandas as pd
import random
import os

# --- 1. CONFIGURATION (REGRESSION) ---
# Number of consecutive samples cut from a recording to form one segment.
SEQUENCE_LENGTH = 500
# One input channel per acceleration axis read from the CSV files.
INPUT_CHANNELS = 3
# Width of the GRU hidden state (per direction).
HIDDEN_DIM = 16
# Number of stacked GRU layers.
NUM_LAYERS = 2
# Segments per optimisation step.
BATCH_SIZE = 32
# Step size handed to the Adam optimiser.
LEARNING_RATE = 5e-4
# Fraction of the coasters held out for testing.
TEST_SPLIT_RATIO = 0.2
# Dropout probability used in the GRU stack and the regression head.
DROPOUT_RATE = 0.4

# --- 2. DATA PREPARATION ---

class CoasterScoreDataset(Dataset):
    """Map-style dataset yielding ``(segment, label)`` float32 tensor pairs.

    Args:
        data_segments: Indexable collection of NumPy arrays, one per segment.
        scores: Target score for each segment; any array-like (list, tuple,
            ndarray) of numbers is accepted.
    """

    def __init__(self, data_segments, scores):
        self.segments = data_segments
        # np.array always copies and also accepts plain lists, which the
        # ndarray-only ``.astype`` call did not.
        self.labels = np.array(scores, dtype=np.float32)

    def __len__(self):
        return len(self.segments)

    def __getitem__(self, idx):
        segment = torch.from_numpy(self.segments[idx]).float()
        label = torch.tensor(self.labels[idx]).float()
        return segment, label

def trim_flat_track(data, vertical_g_index=1, tolerance=0.05, min_window=200):
    """Trim leading/trailing samples whose vertical acceleration stays near 1G.

    Args:
        data: Array laid out as (channels, time).
        vertical_g_index: Row of ``data`` holding the vertical channel.
        tolerance: Deviation from 1.0 above which a sample counts as active.
        min_window: Minimum number of samples the trimmed span must keep.

    Returns:
        ``data[:, first_active:last_active + 1]``. ``data`` is returned
        unchanged when no sample exceeds the tolerance (including empty
        input) or when the active span is shorter than ``min_window``.
    """
    g_deviation = np.abs(data[vertical_g_index, :] - 1.0)
    active = np.flatnonzero(g_deviation > tolerance)

    # Nothing to anchor the trim on: recording is empty or entirely flat.
    if active.size == 0:
        return data

    start_index = active[0]
    end_index = active[-1] + 1  # exclusive

    # Trimming would leave too little signal, so keep the full recording.
    if (end_index - start_index) < min_window:
        return data

    return data[:, start_index:end_index]


def load_labeled_data(file_paths_and_scores, sequence_length=None):
    """Read labeled acceleration CSVs, trim, rescale and segment them.

    Args:
        file_paths_and_scores: Mapping of CSV path -> score for that coaster.
        sequence_length: Number of samples per segment. Defaults to the
            module-level ``SEQUENCE_LENGTH`` when omitted.

    Returns:
        A list with one dict per successfully processed file, holding
        ``'segments'`` (list of ``(3, sequence_length)`` arrays in
        Lateral/Vertical/Longitudinal row order), ``'score'`` and
        ``'file_path'``. Files that fail to load or process are reported
        with ``print`` and skipped.

    Raises:
        ValueError: If ``sequence_length`` is not a positive integer.
    """
    if sequence_length is None:
        sequence_length = SEQUENCE_LENGTH
    if sequence_length < 1:
        raise ValueError(f"sequence_length must be positive, got {sequence_length}")

    all_coaster_data = []
    REQUIRED_COLUMNS = ['Lateral', 'Vertical', 'Longitudinal']
    FILE_DELIMITER = ','
    VERTICAL_G_INDEX = 1  # position of 'Vertical' in REQUIRED_COLUMNS

    for file_path, coaster_score in file_paths_and_scores.items():
        try:
            # Line 0 is the header; file lines 1 and 2 are skipped before
            # parsing (presumably non-numeric metadata rows — verify against
            # the data files).
            df = pd.read_csv(file_path, sep=FILE_DELIMITER, header=0, skiprows=[1, 2], dtype=np.float32)

            missing_cols = [col for col in REQUIRED_COLUMNS if col not in df.columns]
            if missing_cols:
                raise ValueError(f"Missing required columns in CSV: {missing_cols}")

            # Transpose to (channels, time).
            data = df[REQUIRED_COLUMNS].values.T
            trimmed_data = trim_flat_track(data, vertical_g_index=VERTICAL_G_INDEX)

            # Centre the vertical channel on 0 by removing 1G, then scale every
            # channel by 5.0 (treated as a typical max G-force). This builds a
            # new array rather than assigning in place, because the array
            # taken from the DataFrame may be a read-only view.
            channel_offset = np.zeros((trimmed_data.shape[0], 1), dtype=trimmed_data.dtype)
            channel_offset[VERTICAL_G_INDEX, 0] = 1.0
            normalized_data = (trimmed_data - channel_offset) / 5.0

            num_timesteps = normalized_data.shape[1]
            # Non-overlapping windows. The "+ 1" keeps the final window when
            # the length is an exact multiple of sequence_length; a trailing
            # partial window is still dropped.
            segments = [
                normalized_data[:, start:start + sequence_length]
                for start in range(0, num_timesteps - sequence_length + 1, sequence_length)
            ]

            all_coaster_data.append({
                'segments': segments,
                'score': coaster_score,
                'file_path': file_path
            })

        except Exception as e:
            # Best-effort loading: report the problem and move on.
            print(f"Skipping file {file_path} due to error: {e}")
            continue

    return all_coaster_data

# --- 3. THE BIGRU SEQUENCE-TO-VALUE MODEL (REGRESSION) ---

class BiGRURegressor(nn.Module):
    """Bidirectional GRU that maps an acceleration segment to a 0-100 score.

    Args:
        input_size: Number of input channels per time step.
        hidden_dim: Hidden size of each GRU direction.
        num_layers: Number of stacked GRU layers.
        output_dim: Size of the regression output.
        dropout_rate: Dropout probability used between stacked GRU layers
            and inside the regression head.
    """

    def __init__(self, input_size=INPUT_CHANNELS, hidden_dim=HIDDEN_DIM, 
                 num_layers=NUM_LAYERS, output_dim=1, dropout_rate=DROPOUT_RATE):
        super().__init__()

        self.gru = nn.GRU(
            input_size=input_size,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
            # nn.GRU applies dropout only between stacked layers; passing a
            # non-zero value with a single layer just triggers a warning.
            dropout=dropout_rate if num_layers > 1 else 0.0,
        )

        # Regression head producing an unbounded value; the scaling to the
        # score range is applied in forward(). Input width is 2 * hidden_dim
        # because forward and backward states are concatenated.
        self.head = nn.Sequential(
            nn.Linear(2 * hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_dim, output_dim)
        )

    def forward(self, x):
        """Predict a score for each segment in the batch.

        Args:
            x: Tensor laid out as (batch, channels, time).

        Returns:
            Tensor of predictions strictly inside (0, 100), with dimension 1
            squeezed away, i.e. shape (batch,) for the default
            ``output_dim=1``.
        """
        # batch_first GRU expects (batch, time, channels).
        x = x.transpose(1, 2)
        _, h_n = self.gru(x)

        # h_n is ordered layer by layer, direction by direction, so the last
        # two entries are the top layer's forward and backward final states.
        final_forward_backward_state = torch.cat((h_n[-2], h_n[-1]), dim=1)

        output = self.head(final_forward_backward_state)

        # Sigmoid bounds the raw value to (0, 1); multiply to get the 0-100 scale.
        output_scaled = torch.sigmoid(output) * 100.0

        return output_scaled.squeeze(1)

# --- 4. EVALUATION FUNCTION (REGRESSION) ---

def evaluate_model(model, dataloader, device):
    """Score ``model`` on every batch of ``dataloader`` without tracking gradients.

    Args:
        model: Module returning one prediction per input segment.
        dataloader: Iterable of ``(segments, labels)`` tensor batches.
        device: Device the segments are moved to before the forward pass.

    Returns:
        Tuple ``(r2, mse, all_predictions, all_targets)`` where the last two
        are flat lists of per-sample values in dataloader order.

    Raises:
        ValueError: If the dataloader yields no samples.
    """
    # Remember the caller's mode so evaluation does not silently flip a model
    # that was already in eval mode back to training.
    was_training = model.training
    model.eval()
    all_predictions = []
    all_targets = []

    try:
        with torch.no_grad():
            for segments, labels in dataloader:
                predictions = model(segments.to(device))

                all_predictions.extend(predictions.cpu().numpy())
                all_targets.extend(labels.cpu().numpy())
    finally:
        # Restore the mode even if a batch raised.
        model.train(was_training)

    if not all_targets:
        raise ValueError("Cannot evaluate: the dataloader produced no samples.")

    # Regression Metrics
    r2 = r2_score(all_targets, all_predictions)
    mse = mean_squared_error(all_targets, all_predictions)

    return r2, mse, all_predictions, all_targets

# --- 5. EXECUTION AND TRAINING LOOP ---

def main_regression_setup():
    """Train the BiGRU regressor and report a predicted score per test coaster.

    Loads every labeled CSV, splits the coasters (not the individual
    segments) into train and test groups so no ride contributes to both,
    trains with MSE loss, then averages the segment-level predictions of each
    held-out coaster into a single score.

    Returns:
        None. All results are reported through ``print``.
    """
    # --- Labeled Data Setup: FINAL CORRECTED SCORES ---
    data_dir = 'accel_data'
    labeled_coaster_data = {
        os.path.join(data_dir, 'SteelVengeance.csv'): 100.0,
        os.path.join(data_dir, 'Anubis.csv'): 98.6,
        os.path.join(data_dir, 'AlpenFury.csv'): 98.3,
        os.path.join(data_dir, 'ElToro.csv'): 98.3,
        os.path.join(data_dir, 'Hyperia.csv'): 97.6,
        os.path.join(data_dir, 'Lightning.csv'): 98.1,
        os.path.join(data_dir, 'Pantheon.csv'): 97.4,
        os.path.join(data_dir, 'Shambhala.csv'): 97.7,
        os.path.join(data_dir, 'Skyrush.csv'): 97.7,
        os.path.join(data_dir, 'Taron.csv'): 98.3,
        os.path.join(data_dir, 'TwistedCo.csv'): 98.0,
        os.path.join(data_dir, 'WickedCyc.csv'): 97.8,
    }

    all_coaster_data = load_labeled_data(labeled_coaster_data)

    if not all_coaster_data:
        print("No valid labeled data loaded. Aborting setup.")
        return

    coaster_files = [d['file_path'] for d in all_coaster_data]

    # A train/test split needs at least one coaster on each side.
    if len(coaster_files) < 2:
        print("Need at least two loaded coasters to build a train/test split. Aborting setup.")
        return

    # Split the coaster files into training and testing lists (Coaster-based split)
    train_coaster_files, test_coaster_files = train_test_split(
        coaster_files, test_size=TEST_SPLIT_RATIO, random_state=42
    )
    train_file_set = set(train_coaster_files)

    # Re-aggregate segments based on the split file lists
    train_segments, train_scores = [], []
    test_segments, test_scores = [], []
    # Coaster name for each test segment, kept parallel to test_segments.
    test_segment_names = []

    for coaster_data in all_coaster_data:
        coaster_segments = coaster_data['segments']
        # Every segment inherits the score of the coaster it was cut from.
        segment_scores = [coaster_data['score']] * len(coaster_segments)

        if coaster_data['file_path'] in train_file_set:
            train_segments.extend(coaster_segments)
            train_scores.extend(segment_scores)
        else:
            test_segments.extend(coaster_segments)
            test_scores.extend(segment_scores)
            coaster_name = os.path.splitext(os.path.basename(coaster_data['file_path']))[0]
            test_segment_names.extend([coaster_name] * len(coaster_segments))

    # Recordings shorter than one window contribute no segments, so either
    # side can end up empty; a shuffling DataLoader rejects an empty dataset.
    if not train_segments or not test_segments:
        print("Train or test split contains no segments. Aborting setup.")
        return

    train_scores = np.array(train_scores)
    test_scores = np.array(test_scores)

    train_dataset = CoasterScoreDataset(train_segments, train_scores)
    test_dataset = CoasterScoreDataset(test_segments, test_scores)

    train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    # shuffle=False keeps predictions aligned with test_segment_names.
    test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

    print(f"\nTotal Labeled Segments (After Trimming): {len(train_segments) + len(test_segments)}")
    # Report the coasters that actually loaded, so the total matches the
    # train/test counts below even when some files were skipped.
    print(f"Total Coasters: {len(all_coaster_data)}")
    print(f"Training Coasters: {len(train_coaster_files)} | Test Coasters: {len(test_coaster_files)}")
    print(f"Training Segments: {len(train_segments)} | Test Segments: {len(test_segments)}")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = BiGRURegressor().to(device)

    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

    NUM_EPOCHS = 50

    print(f"\nStarting training for {NUM_EPOCHS} epochs on {device}...")

    for epoch in range(NUM_EPOCHS):
        model.train()
        total_train_loss = 0

        for segments, labels in train_dataloader:
            segments = segments.to(device)
            labels = labels.to(device)

            predictions = model(segments)
            loss = criterion(predictions, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # criterion returns the batch mean; weight it by batch size so the
            # epoch figure is a true per-sample average.
            total_train_loss += loss.item() * segments.size(0)

        avg_train_loss = total_train_loss / len(train_dataset)

        # Test metrics are only computed every fifth epoch and on the last one.
        if (epoch + 1) % 5 == 0 or epoch == NUM_EPOCHS - 1:
            test_r2, test_mse, _, _ = evaluate_model(model, test_dataloader, device)
            print(f"Epoch {epoch+1}/{NUM_EPOCHS} | Train MSE: {avg_train_loss:.4f} | Test MSE: {test_mse:.4f} | Test R2: {test_r2:.4f}")
        else:
            print(f"Epoch {epoch+1}/{NUM_EPOCHS} | Train MSE: {avg_train_loss:.4f}")

    print("\n--- BiGRU Regression Training Complete ---")

    _, _, final_predictions, final_targets = evaluate_model(model, test_dataloader, device)

    # Aggregate predictions by coaster
    coaster_scores = {}
    for coaster_name, pred, target in zip(test_segment_names, final_predictions, final_targets):
        # All segments of a coaster share one target, so the first seen is kept.
        entry = coaster_scores.setdefault(coaster_name, {'predicted': [], 'actual': target})
        entry['predicted'].append(pred)

    print("\n--- Test Coaster Predictions ---")
    for coaster_name in sorted(coaster_scores):
        scores_info = coaster_scores[coaster_name]
        avg_pred = np.mean(scores_info['predicted'])
        actual_score = scores_info['actual']
        print(f"{coaster_name}: Predicted Score = {avg_pred:.2f}, Actual Score = {actual_score:.2f}")

if __name__ == '__main__':
    # Seed PyTorch, NumPy and the stdlib RNG (in that order) with the same
    # value so repeated runs draw the same random numbers.
    for set_seed in (torch.manual_seed, np.random.seed, random.seed):
        set_seed(42)
    main_regression_setup()



Total Labeled Segments (After Trimming): 162
Total Coasters: 12
Training Coasters: 9 | Test Coasters: 3
Training Segments: 110 | Test Segments: 52

Starting training for 50 epochs on cpu...
Epoch 1/50 | Train MSE: 2839.9680
Epoch 1/50 | Train MSE: 2839.9680
Epoch 2/50 | Train MSE: 2773.6731
Epoch 2/50 | Train MSE: 2773.6731
Epoch 3/50 | Train MSE: 2741.7015
Epoch 3/50 | Train MSE: 2741.7015
Epoch 4/50 | Train MSE: 2664.4811
Epoch 4/50 | Train MSE: 2664.4811
Epoch 5/50 | Train MSE: 2619.9230 | Test MSE: 2685.5144 | Test R2: -2955.7518
Epoch 5/50 | Train MSE: 2619.9230 | Test MSE: 2685.5144 | Test R2: -2955.7518
Epoch 6/50 | Train MSE: 2592.3286
Epoch 6/50 | Train MSE: 2592.3286
Epoch 7/50 | Train MSE: 2526.8424
Epoch 7/50 | Train MSE: 2526.8424
Epoch 8/50 | Train MSE: 2460.3215
Epoch 8/50 | Train MSE: 2460.3215
Epoch 9/50 | Train MSE: 2400.0373
Epoch 9/50 | Train MSE: 2400.0373
Epoch 10/50 | Train MSE: 2312.1481 | Test MSE: 2346.1270 | Test R2: -2582.0862
Epoch 10/50 | Train MSE: 2312.