In [1]:
import os
data_path = "../Data/data_angles/data1_angles.csv"
print("Attempting to access:", data_path)
print("Full path:", os.path.abspath(data_path))
if os.path.exists(data_path):
    print("File found!")
else:
    print("File not found!")

Attempting to access: ../Data/data_angles/data1_angles.csv
Full path: /home/exx/Desktop/quantum/Data/data_angles/data1_angles.csv
File found!


In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
print(device)

cuda


In [None]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.amp import GradScaler, autocast
import matplotlib.pyplot as plt
from typing import Tuple

# Verify GPU
print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("CUDA version:", torch.version.cuda)
    print("GPU count:", torch.cuda.device_count())
    print("GPU name:", torch.cuda.get_device_name(0))
else:
    print("No GPU detected, exiting!")
    exit(1)

class AngleDataset(Dataset):
    def __init__(self, data_path: str, window_size: int, indices: np.ndarray = None):
        self.df = pd.read_csv(data_path)
        if self.df[['Feature Phi (degrees)', 'Feature Theta (degrees)', 
                   'Target Phi (degrees)', 'Target Theta (degrees)']].isna().any().any():
            print("Warning: NaN values found in data!")
            self.df = self.df.dropna()
        if np.isinf(self.df[['Feature Phi (degrees)', 'Feature Theta (degrees)', 
                            'Target Phi (degrees)', 'Target Theta (degrees)']].values).any():
            print("Warning: Infinite values found in data!")
            self.df = self.df[~np.isinf(self.df[['Feature Phi (degrees)', 'Feature Theta (degrees)', 
                                                'Target Phi (degrees)', 'Target Theta (degrees)']]).any(axis=1)]
        
        self.features = self.df[['Feature Phi (degrees)', 'Feature Theta (degrees)']].values
        self.targets = self.df[['Target Phi (degrees)', 'Target Theta (degrees)']].values
        self.features[:, 0] /= 180.0
        self.features[:, 1] /= 180.0
        self.targets[:, 0] /= 180.0
        self.targets[:, 1] /= 180.0
        
        print("Normalized Feature Phi range:", self.features[:, 0].min(), "to", self.features[:, 0].max())
        print("Normalized Feature Theta range:", self.features[:, 1].min(), "to", self.features[:, 1].max())
        print("Normalized Target Phi range:", self.targets[:, 0].min(), "to", self.targets[:, 0].max())
        print("Normalized Target Theta range:", self.targets[:, 1].min(), "to", self.targets[:, 1].max())
        
        self.window_size = window_size
        self.indices = indices if indices is not None else np.arange(len(self.df) - window_size + 1)
        self.length = len(self.indices)

    def __len__(self) -> int:
        return self.length

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor, int]:
        data_idx = self.indices[idx]
        window = self.features[data_idx:data_idx + self.window_size]
        target = self.targets[data_idx + self.window_size - 1]
        time_index = data_idx + self.window_size - 1
        return torch.FloatTensor(window), torch.FloatTensor(target), time_index

class FlashAttention(nn.Module):
    def __init__(self, d_model: int, n_heads: int, dropout: float = 0.1):
        super().__init__()
        assert d_model % n_heads == 0, "d_model must be divisible by n_heads"
        self.d_model = d_model
        self.n_heads = n_heads
        self.d_k = d_model // n_heads
        self.dropout = nn.Dropout(dropout)
        
        self.qkv = nn.Linear(d_model, 3 * d_model)
        self.out = nn.Linear(d_model, d_model)
        
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        B, T, C = x.shape
        qkv = self.qkv(x).reshape(B, T, 3, self.n_heads, self.d_k).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]
        
        scale = 1.0 / torch.sqrt(torch.tensor(self.d_k, dtype=torch.float32))
        attn = torch.matmul(q, k.transpose(-2, -1)) * scale
        attn = F.softmax(attn, dim=-1)
        attn = self.dropout(attn)
        
        out = torch.matmul(attn, v)
        out = out.transpose(1, 2).reshape(B, T, C)
        out = self.out(out)
        return out

class AnglePredictionModel(nn.Module):
    def __init__(self, input_dim: int, d_model: int, n_heads: int, n_layers: int, dropout: float = 0.1):
        super().__init__()
        self.input_dim = input_dim
        self.d_model = d_model
        
        self.input_proj = nn.Linear(input_dim, d_model)
        self.pos_encoding = nn.Parameter(torch.randn(1, 128, d_model) * 0.1)
        self.attn_layers = nn.ModuleList([
            FlashAttention(d_model, n_heads, dropout) for _ in range(n_layers)
        ])
        self.norm_layers = nn.ModuleList([
            nn.LayerNorm(d_model) for _ in range(n_layers)
        ])
        self.output = nn.Linear(d_model, 2)
        
        self.apply(self._init_weights)
    
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.xavier_uniform_(module.weight)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
    
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.input_proj(x)
        x = x + self.pos_encoding[:, :x.size(1), :]
        
        for attn, norm in zip(self.attn_layers, self.norm_layers):
            residual = x
            x = attn(x)
            x = norm(x + residual)
        
        x = x[:, -1, :]
        return self.output(x)

def train_model(model: nn.Module, train_loader: DataLoader, test_loader: DataLoader, 
                epochs: int, device: torch.device, lr: float = 1e-4):
    print("Model device:", next(model.parameters()).device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)
    scaler = torch.amp.GradScaler('cuda')
    criterion = nn.MSELoss()
    
    train_losses, test_losses = [], []
    
    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for i, (batch_x, batch_y, _) in enumerate(train_loader):
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            if i == 0:
                print("Batch x device:", batch_x.device)
            
            optimizer.zero_grad()
            with torch.amp.autocast('cuda'):
                output = model(batch_x)
                if torch.isnan(output).any():
                    print(f"NaN in output at epoch {epoch+1}, batch {i}")
                    break
                loss = criterion(output, batch_y)
            
            scaler.scale(loss).backward()
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            scaler.step(optimizer)
            scaler.update()
            train_loss += loss.item()
        
        train_loss /= len(train_loader)
        train_losses.append(train_loss)
        
        model.eval()
        test_loss = 0
        with torch.no_grad():
            for batch_x, batch_y, _ in test_loader:
                batch_x, batch_y = batch_x.to(device), batch_y.to(device)
                with torch.amp.autocast('cuda'):
                    output = model(batch_x)
                test_loss += criterion(output, batch_y).item()
        
        test_loss /= len(test_loader)
        test_losses.append(test_loss)
        
        scheduler.step()
        
        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}")
    
    return train_losses, test_losses

def plot_predictions(predictions: np.ndarray, actuals: np.ndarray, time_indices: np.ndarray, 
                     rmse_phi: float, rmse_theta: float, filename: str, title_prefix: str):
    plt.figure(figsize=(12, 8))
    
    plt.subplot(2, 1, 1)
    plt.plot(time_indices, actuals[:, 0], label='Actual Phi (1551 nm, °)', color='blue', linewidth=2)
    plt.plot(time_indices, predictions[:, 0], label='Predicted Phi (from 1531 nm, °)', color='red', linestyle='--', linewidth=2)
    plt.title(f'{title_prefix} Phi Time Series (RMSE: {rmse_phi:.2f}°)')
    plt.xlabel('Time Index')
    plt.ylabel('Phi (degrees)')
    plt.legend()
    plt.grid(True)
    
    plt.subplot(2, 1, 2)
    plt.plot(time_indices, actuals[:, 1], label='Actual Theta (1551 nm, °)', color='blue', linewidth=2)
    plt.plot(time_indices, predictions[:, 1], label='Predicted Theta (from 1531 nm, °)', color='red', linestyle='--', linewidth=2)
    plt.title(f'{title_prefix} Theta Time Series (RMSE: {rmse_theta:.2f}°)')
    plt.xlabel('Time Index')
    plt.ylabel('Theta (degrees)')
    plt.legend()
    plt.grid(True)
    
    plt.tight_layout()
    plt.savefig(filename)
    plt.close()
    
    errors = np.abs(predictions - actuals)
    plt.figure(figsize=(12, 8))
    
    plt.subplot(2, 1, 1)
    plt.plot(time_indices, errors[:, 0], label='|Predicted - Actual| Phi', color='purple', linewidth=2)
    plt.title(f'{title_prefix} Phi Absolute Error')
    plt.xlabel('Time Index')
    plt.ylabel('Error (degrees)')
    plt.legend()
    plt.grid(True)
    
    plt.subplot(2, 1, 2)
    plt.plot(time_indices, errors[:, 1], label='|Predicted - Actual| Theta', color='purple', linewidth=2)
    plt.title(f'{title_prefix} Theta Absolute Error')
    plt.xlabel('Time Index')
    plt.ylabel('Error (degrees)')
    plt.legend()
    plt.grid(True)
    
    plt.tight_layout()
    plt.savefig(f'errors_{filename.split("_", 1)[1]}')
    plt.close()

def experiment_window_sizes(data_path: str, window_sizes: list, batch_size: int = 64, epochs: int = 50):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    results = {}
    
    for window_size in window_sizes:
        print(f"\nExperimenting with window size: {window_size}")
        
        dataset_full = AngleDataset(data_path, window_size)
        total_length = len(dataset_full)
        train_length = int(0.8 * total_length)
        test_length = total_length - train_length
        
        train_indices = np.arange(train_length)
        test_indices = np.arange(train_length, total_length)
        
        train_dataset = AngleDataset(data_path, window_size, train_indices)
        test_dataset = AngleDataset(data_path, window_size, test_indices)
        
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
        
        model = AnglePredictionModel(
            input_dim=2,
            d_model=140,
            n_heads=4,
            n_layers=4,
            dropout=0.1
        ).to(device)
        
        train_losses, test_losses = train_model(
            model, train_loader, test_loader, epochs, device
        )
        
        model.eval()
        predictions = []
        actuals = []
        time_indices = []
        with torch.no_grad():
            for batch_x, batch_y, batch_indices in test_loader:
                batch_x, batch_y = batch_x.to(device), batch_y.to(device)
                with torch.amp.autocast('cuda'):
                    output = model(batch_x)
                output = output.cpu().numpy() * 180.0
                batch_y = batch_y.cpu().numpy() * 180.0
                predictions.append(output)
                actuals.append(batch_y)
                time_indices.append(batch_indices.numpy())
        
        predictions = np.concatenate(predictions, axis=0)
        actuals = np.concatenate(actuals, axis=0)
        time_indices = np.concatenate(time_indices, axis=0)
        
        sort_idx = np.argsort(time_indices)
        time_indices = time_indices[sort_idx]
        predictions = predictions[sort_idx]
        actuals = actuals[sort_idx]
        
        rmse_phi = np.sqrt(np.mean((predictions[:, 0] - actuals[:, 0])**2))
        rmse_theta = np.sqrt(np.mean((predictions[:, 1] - actuals[:, 1])**2))
        
        plot_predictions(
            predictions, actuals, time_indices, rmse_phi, rmse_theta,
            f'predictions_window_{window_size}.png',
            f'Window Size {window_size}'
        )
        
        results[window_size] = {
            'train_losses': train_losses,
            'test_losses': test_losses,
            'model': model,
            'predictions': predictions,
            'actuals': actuals,
            'time_indices': time_indices,
            'rmse_phi': rmse_phi,
            'rmse_theta': rmse_theta
        }
        
        plt.figure(figsize=(10, 5))
        plt.plot(train_losses, label='Train Loss')
        plt.plot(test_losses, label='Test Loss')
        plt.title(f'Loss Curves (Window Size: {window_size})')
        plt.xlabel('Epoch')
        plt.ylabel('MSE Loss')
        plt.legend()
        plt.grid(True)
        plt.savefig(f'loss_window_{window_size}.png')
        plt.close()
    
    plt.figure(figsize=(12, 6))
    for window_size, result in results.items():
        plt.plot(result['test_losses'], label=f'Window {window_size}')
    plt.title('Test Loss Comparison Across Window Sizes')
    plt.xlabel('Epoch')
    plt.ylabel('MSE Loss')
    plt.legend()
    plt.grid(True)
    plt.savefig('window_size_comparison.png')
    plt.close()
    
    return results

def experiment_model_sizes(data_path: str, window_size: int = 64, batch_size: int = 64, epochs: int = 50):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_configs = [
        {'name': 'small', 'd_model': 16, 'n_heads': 4, 'n_layers': 1},
        {'name': 'medium', 'd_model': 32, 'n_heads': 4, 'n_layers': 2},
        {'name': 'large', 'd_model': 64, 'n_heads': 8, 'n_layers': 3},
        # Extra Large: Increase d_model to ~140 to get ~5x parameters
        {'name': 'extra_large', 'd_model': 140, 'n_heads': 10, 'n_layers': 4}
    ]
    results = {}
    
    dataset_full = AngleDataset(data_path, window_size)
    total_length = len(dataset_full)
    train_length = int(0.8 * total_length)
    test_length = total_length - train_length

    train_indices = np.arange(train_length)
    test_indices = np.arange(train_length, total_length)

    train_dataset = AngleDataset(data_path, window_size, train_indices)
    test_dataset = AngleDataset(data_path, window_size, test_indices)

    rmse_phi_list = []
    rmse_theta_list = []
    model_names = []

    for config in model_configs:
        model_name = config['name']
        print(f"\nExperimenting with model size: {model_name} (d_model={config['d_model']}, "
              f"n_layers={config['n_layers']}, n_heads={config['n_heads']})")
        
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
        
        model = AnglePredictionModel(
            input_dim=2,
            d_model=config['d_model'],
            n_heads=config['n_heads'],
            n_layers=config['n_layers'],
            dropout=0.1
        ).to(device)
        
        train_losses, test_losses = train_model(
            model, train_loader, test_loader, epochs, device
        )
        
        # Evaluation Phase
        model.eval()
        predictions = []
        actuals = []
        time_indices = []
        with torch.no_grad():
            for batch_x, batch_y, batch_indices in test_loader:
                batch_x, batch_y = batch_x.to(device), batch_y.to(device)
                with torch.amp.autocast('cuda'):
                    output = model(batch_x)
                output = output.cpu().numpy() * 180.0
                batch_y = batch_y.cpu().numpy() * 180.0
                predictions.append(output)
                actuals.append(batch_y)
                time_indices.append(batch_indices.numpy())
        
        predictions = np.concatenate(predictions, axis=0)
        actuals = np.concatenate(actuals, axis=0)
        time_indices = np.concatenate(time_indices, axis=0)
        
        sort_idx = np.argsort(time_indices)
        predictions = predictions[sort_idx]
        actuals = actuals[sort_idx]
        
        rmse_phi = np.sqrt(np.mean((predictions[:, 0] - actuals[:, 0])**2))
        rmse_theta = np.sqrt(np.mean((predictions[:, 1] - actuals[:, 1])**2))
        
        rmse_phi_list.append(rmse_phi)
        rmse_theta_list.append(rmse_theta)
        model_names.append(model_name.capitalize())
        
        # Existing plots
        plot_predictions(
            predictions, actuals, time_indices[sort_idx], rmse_phi, rmse_theta,
            f'predictions_model_{model_name}.png',
            f'Model {model_name.capitalize()} (Window Size {window_size})'
        )
        
        results[model_name] = {
            'train_losses': train_losses,
            'test_losses': test_losses,
            'rmse_phi': rmse_phi,
            'rmse_theta': rmse_theta
        }

        # Loss Curve Plot
        plt.figure(figsize=(10, 5))
        plt.plot(train_losses, label='Train Loss')
        plt.plot(test_losses, label='Test Loss')
        plt.title(f'Loss Curves (Model: {model_name}, Window Size: {window_size})')
        plt.xlabel('Epoch')
        plt.ylabel('MSE Loss')
        plt.legend()
        plt.grid(True)
        plt.savefig(f'loss_model_{model_name}.png')
        plt.close()
    
    # Plot RMSE Comparison Chart
    x = np.arange(len(model_names))
    width = 0.35

    plt.figure(figsize=(10, 6))
    plt.bar(x - width/2, rmse_phi_list, width, label='RMSE Phi')
    plt.bar(x + width/2, rmse_theta_list, width, label='RMSE Theta')
    plt.xticks(x, model_names)
    plt.ylabel('RMSE (degrees)')
    plt.title('RMSE Comparison Across Model Sizes')
    plt.legend()
    plt.grid(True, axis='y')
    plt.tight_layout()
    plt.savefig('rmse_model_comparison.png')
    plt.close()
    
    return results


if __name__ == "__main__":
    data_path = os.path.join("..", "Data", "data_angles", "data1_angles.csv")
    
    print("Attempting to access:", data_path)
    print("Full path:", os.path.abspath(data_path))
    if os.path.exists(data_path):
        print("File found!")
    else:
        print("File not found!")
        exit(1)
    
    # Experiment 1: Window sizes
    window_sizes = [8, 16, 32, 64, 128]
    window_results = experiment_window_sizes(data_path, window_sizes)
    
    # Experiment 2: Model sizes for window_size=64
    model_results = experiment_model_sizes(data_path, window_size=16)

PyTorch version: 2.5.1
CUDA available: True
CUDA version: 12.4
GPU count: 1
GPU name: NVIDIA H100 NVL
Attempting to access: ../Data/data_angles/data1_angles.csv
Full path: /home/exx/Desktop/quantum/Data/data_angles/data1_angles.csv
File found!

Experimenting with window size: 8
Normalized Feature Phi range: -0.9990245738912378 to 0.9996774504787057
Normalized Feature Theta range: 0.1473259301257695 to 0.8428363908826839
Normalized Target Phi range: -0.9727824167538099 to 0.9961645673765517
Normalized Target Theta range: 0.2620906844265113 to 0.9977737740563682
Normalized Feature Phi range: -0.9990245738912378 to 0.9996774504787057
Normalized Feature Theta range: 0.1473259301257695 to 0.8428363908826839
Normalized Target Phi range: -0.9727824167538099 to 0.9961645673765517
Normalized Target Theta range: 0.2620906844265113 to 0.9977737740563682
Normalized Feature Phi range: -0.9990245738912378 to 0.9996774504787057
Normalized Feature Theta range: 0.1473259301257695 to 0.8428363908826839
