In [1]:
import scipy.io as si
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torchvision.ops import DeformConv2d
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F
import os
import pywt

# NOTE(review): `fs` is not read anywhere in this cell; presumably a sampling
# rate -- confirm its meaning and units.
fs = 100

# Turn on the cuDNN benchmark flag (intended for fixed-size inputs).
torch.backends.cudnn.benchmark = True

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Read the MATLAB file and keep the entry stored under the key 'a'.
mat_Res = si.loadmat('u_lma.mat')
mat_results = mat_Res['a']

# Custom Dataset (if needed)
class CustomDataset(Dataset):
    """Map-style dataset over an indexable collection of samples.

    Every item is returned as a newly built float32 tensor.
    """

    def __init__(self, data):
        # Stored as given; conversion to a tensor happens per item.
        self.data = data

    def __len__(self):
        """Return the number of samples, i.e. ``len(data)``."""
        sample_count = len(self.data)
        return sample_count

    def __getitem__(self, idx):
        """Return the sample at ``idx`` converted to a float32 tensor."""
        sample = self.data[idx]
        return torch.tensor(sample, dtype=torch.float32)

dataset = CustomDataset(mat_results)

# Shuffled mini-batches of 32, loaded by 4 worker processes.
# Pinned (page-locked) host memory is only requested when CUDA is available:
# it exists to speed up host-to-GPU copies and has no use on a CPU-only run.
train_loader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
    pin_memory=torch.cuda.is_available(),
)

# Sample Model (Ensure it runs on multiple GPUs if available)
class SimpleModel(nn.Module):
    """Single fully connected layer used as a placeholder model.

    Args:
        in_features: Size of the last input dimension. Defaults to 10, the
            value that was previously hard-coded.
        out_features: Size of the last output dimension. Defaults to 1.
    """

    def __init__(self, in_features=10, out_features=1):
        super().__init__()
        # Attribute name `fc` is kept so parameter names stay `fc.weight` / `fc.bias`.
        self.fc = nn.Linear(in_features, out_features)  # Example layer

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        return self.fc(x)

# Build the model and move it to the selected device.
model = SimpleModel().to(device)

# Enable Multi-GPU Training: wrap in DataParallel when more than one GPU is visible.
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)

# Optimizer: Adam over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop: 100 passes over train_loader. Each batch is both the model
# input and the source of the regression target (its first column).
for epoch in range(100):
    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        output = model(batch)
        # The prediction has a trailing singleton dimension, (B, 1), while
        # batch[:, 0] is (B,). Passing both to mse_loss unchanged broadcasts
        # them to (B, B) and computes the wrong loss, so squeeze first.
        # Assumes batch is 2-D -- TODO confirm against the .mat contents.
        loss = F.mse_loss(output.squeeze(-1), batch[:, 0])  # Example loss
        loss.backward()
        optimizer.step()

    # Empty CUDA cache once per epoch to free memory.
    torch.cuda.empty_cache()


In [2]:
import scipy.io as si
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torchvision.ops import DeformConv2d
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F
import os
import pywt

# NOTE(review): `fs` is not read anywhere in this cell; presumably a sampling
# rate -- confirm its meaning and units.
fs = 100

# Turn on the cuDNN benchmark flag (intended for fixed-size inputs).
torch.backends.cudnn.benchmark = True

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Read the MATLAB file and keep the entry stored under the key 'a'.
mat_Res = si.loadmat('u_lma.mat')
mat_results = mat_Res['a']

# Custom Dataset (if needed)
class CustomDataset(Dataset):
    """Map-style dataset over an indexable collection of samples.

    Every item is returned as a newly built float32 tensor.
    """

    def __init__(self, data):
        # Stored as given; conversion to a tensor happens per item.
        self.data = data

    def __len__(self):
        """Return the number of samples, i.e. ``len(data)``."""
        sample_count = len(self.data)
        return sample_count

    def __getitem__(self, idx):
        """Return the sample at ``idx`` converted to a float32 tensor."""
        sample = self.data[idx]
        return torch.tensor(sample, dtype=torch.float32)

dataset = CustomDataset(mat_results)

# Shuffled mini-batches of 32, loaded by 4 worker processes.
# Pinned (page-locked) host memory is only requested when CUDA is available:
# it exists to speed up host-to-GPU copies and has no use on a CPU-only run.
train_loader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
    pin_memory=torch.cuda.is_available(),
)

# Sample Model (Ensure it runs on multiple GPUs if available)
class SimpleModel(nn.Module):
    """Single fully connected layer used as a placeholder model.

    Args:
        in_features: Size of the last input dimension. Defaults to 10, the
            value that was previously hard-coded.
        out_features: Size of the last output dimension. Defaults to 1.
    """

    def __init__(self, in_features=10, out_features=1):
        super().__init__()
        # Attribute name `fc` is kept so parameter names stay `fc.weight` / `fc.bias`.
        self.fc = nn.Linear(in_features, out_features)  # Example layer

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        return self.fc(x)

# Build the model and move it to the selected device.
model = SimpleModel().to(device)

# Enable Multi-GPU Training: wrap in DataParallel when more than one GPU is visible.
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)

# Optimizer: Adam over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop: 100 passes over train_loader. Each batch is both the model
# input and the source of the regression target (its first column).
for epoch in range(100):
    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        output = model(batch)
        # The prediction has a trailing singleton dimension, (B, 1), while
        # batch[:, 0] is (B,). Passing both to mse_loss unchanged broadcasts
        # them to (B, B) and computes the wrong loss, so squeeze first.
        # Assumes batch is 2-D -- TODO confirm against the .mat contents.
        loss = F.mse_loss(output.squeeze(-1), batch[:, 0])  # Example loss
        loss.backward()
        optimizer.step()

    # Empty CUDA cache once per epoch to free memory.
    torch.cuda.empty_cache()


In [3]:
import scipy.io as si
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torchvision.ops import DeformConv2d
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F
import os
import pywt

# NOTE(review): `fs` is not read anywhere in this cell; presumably a sampling
# rate -- confirm its meaning and units.
fs = 100

# Turn on the cuDNN benchmark flag (intended for fixed-size inputs).
torch.backends.cudnn.benchmark = True

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Read the MATLAB file and keep the entry stored under the key 'a'.
mat_Res = si.loadmat('u_lma.mat')
mat_results = mat_Res['a']

# Custom Dataset (if needed)
class CustomDataset(Dataset):
    """Map-style dataset over an indexable collection of samples.

    Every item is returned as a newly built float32 tensor.
    """

    def __init__(self, data):
        # Stored as given; conversion to a tensor happens per item.
        self.data = data

    def __len__(self):
        """Return the number of samples, i.e. ``len(data)``."""
        sample_count = len(self.data)
        return sample_count

    def __getitem__(self, idx):
        """Return the sample at ``idx`` converted to a float32 tensor."""
        sample = self.data[idx]
        return torch.tensor(sample, dtype=torch.float32)

dataset = CustomDataset(mat_results)

# Shuffled mini-batches of 32, loaded by 4 worker processes.
# Pinned (page-locked) host memory is only requested when CUDA is available:
# it exists to speed up host-to-GPU copies and has no use on a CPU-only run.
train_loader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
    pin_memory=torch.cuda.is_available(),
)

# Sample Model (Ensure it runs on multiple GPUs if available)
class SimpleModel(nn.Module):
    """Single fully connected layer used as a placeholder model.

    Args:
        in_features: Size of the last input dimension. Defaults to 10, the
            value that was previously hard-coded.
        out_features: Size of the last output dimension. Defaults to 1.
    """

    def __init__(self, in_features=10, out_features=1):
        super().__init__()
        # Attribute name `fc` is kept so parameter names stay `fc.weight` / `fc.bias`.
        self.fc = nn.Linear(in_features, out_features)  # Example layer

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        return self.fc(x)

# Build the model and move it to the selected device.
model = SimpleModel().to(device)

# Enable Multi-GPU Training: wrap in DataParallel when more than one GPU is visible.
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)

# Optimizer: Adam over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop: 100 passes over train_loader. Each batch is both the model
# input and the source of the regression target (its first column).
for epoch in range(100):
    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        output = model(batch)
        # The prediction has a trailing singleton dimension, (B, 1), while
        # batch[:, 0] is (B,). Passing both to mse_loss unchanged broadcasts
        # them to (B, B) and computes the wrong loss, so squeeze first.
        # Assumes batch is 2-D -- TODO confirm against the .mat contents.
        loss = F.mse_loss(output.squeeze(-1), batch[:, 0])  # Example loss
        loss.backward()
        optimizer.step()

    # Empty CUDA cache once per epoch to free memory.
    torch.cuda.empty_cache()


In [4]:
import scipy.io as si
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torchvision.ops import DeformConv2d
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F
import os
import pywt

# NOTE(review): `fs` is not read anywhere in this cell; presumably a sampling
# rate -- confirm its meaning and units.
fs = 100

# Turn on the cuDNN benchmark flag (intended for fixed-size inputs).
torch.backends.cudnn.benchmark = True

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Read the MATLAB file and keep the entry stored under the key 'a'.
mat_Res = si.loadmat('u_lma.mat')
mat_results = mat_Res['a']

# Custom Dataset (if needed)
class CustomDataset(Dataset):
    """Map-style dataset over an indexable collection of samples.

    Every item is returned as a newly built float32 tensor.
    """

    def __init__(self, data):
        # Stored as given; conversion to a tensor happens per item.
        self.data = data

    def __len__(self):
        """Return the number of samples, i.e. ``len(data)``."""
        sample_count = len(self.data)
        return sample_count

    def __getitem__(self, idx):
        """Return the sample at ``idx`` converted to a float32 tensor."""
        sample = self.data[idx]
        return torch.tensor(sample, dtype=torch.float32)

dataset = CustomDataset(mat_results)

# Shuffled mini-batches of 32, loaded by 4 worker processes.
# Pinned (page-locked) host memory is only requested when CUDA is available:
# it exists to speed up host-to-GPU copies and has no use on a CPU-only run.
train_loader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
    pin_memory=torch.cuda.is_available(),
)

# Sample Model (Ensure it runs on multiple GPUs if available)
class SimpleModel(nn.Module):
    """Single fully connected layer used as a placeholder model.

    Args:
        in_features: Size of the last input dimension. Defaults to 10, the
            value that was previously hard-coded.
        out_features: Size of the last output dimension. Defaults to 1.
    """

    def __init__(self, in_features=10, out_features=1):
        super().__init__()
        # Attribute name `fc` is kept so parameter names stay `fc.weight` / `fc.bias`.
        self.fc = nn.Linear(in_features, out_features)  # Example layer

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        return self.fc(x)

# Build the model and move it to the selected device.
model = SimpleModel().to(device)

# Enable Multi-GPU Training: wrap in DataParallel when more than one GPU is visible.
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)

# Optimizer: Adam over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop: 100 passes over train_loader. Each batch is both the model
# input and the source of the regression target (its first column).
for epoch in range(100):
    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        output = model(batch)
        # The prediction has a trailing singleton dimension, (B, 1), while
        # batch[:, 0] is (B,). Passing both to mse_loss unchanged broadcasts
        # them to (B, B) and computes the wrong loss, so squeeze first.
        # Assumes batch is 2-D -- TODO confirm against the .mat contents.
        loss = F.mse_loss(output.squeeze(-1), batch[:, 0])  # Example loss
        loss.backward()
        optimizer.step()

    # Empty CUDA cache once per epoch to free memory.
    torch.cuda.empty_cache()


In [5]:
import scipy.io as si
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torchvision.ops import DeformConv2d
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F
import os
import pywt

# NOTE(review): `fs` is not read anywhere in this cell; presumably a sampling
# rate -- confirm its meaning and units.
fs = 100

# Turn on the cuDNN benchmark flag (intended for fixed-size inputs).
torch.backends.cudnn.benchmark = True

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Read the MATLAB file and keep the entry stored under the key 'a'.
mat_Res = si.loadmat('u_lma.mat')
mat_results = mat_Res['a']

# Custom Dataset (if needed)
class CustomDataset(Dataset):
    """Map-style dataset over an indexable collection of samples.

    Every item is returned as a newly built float32 tensor.
    """

    def __init__(self, data):
        # Stored as given; conversion to a tensor happens per item.
        self.data = data

    def __len__(self):
        """Return the number of samples, i.e. ``len(data)``."""
        sample_count = len(self.data)
        return sample_count

    def __getitem__(self, idx):
        """Return the sample at ``idx`` converted to a float32 tensor."""
        sample = self.data[idx]
        return torch.tensor(sample, dtype=torch.float32)

dataset = CustomDataset(mat_results)

# Shuffled mini-batches of 32, loaded by 4 worker processes.
# Pinned (page-locked) host memory is only requested when CUDA is available:
# it exists to speed up host-to-GPU copies and has no use on a CPU-only run.
train_loader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
    pin_memory=torch.cuda.is_available(),
)

# Sample Model (Ensure it runs on multiple GPUs if available)
class SimpleModel(nn.Module):
    """Single fully connected layer used as a placeholder model.

    Args:
        in_features: Size of the last input dimension. Defaults to 10, the
            value that was previously hard-coded.
        out_features: Size of the last output dimension. Defaults to 1.
    """

    def __init__(self, in_features=10, out_features=1):
        super().__init__()
        # Attribute name `fc` is kept so parameter names stay `fc.weight` / `fc.bias`.
        self.fc = nn.Linear(in_features, out_features)  # Example layer

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        return self.fc(x)

# Build the model and move it to the selected device.
model = SimpleModel().to(device)

# Enable Multi-GPU Training: wrap in DataParallel when more than one GPU is visible.
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)

# Optimizer: Adam over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop: 100 passes over train_loader. Each batch is both the model
# input and the source of the regression target (its first column).
for epoch in range(100):
    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        output = model(batch)
        # The prediction has a trailing singleton dimension, (B, 1), while
        # batch[:, 0] is (B,). Passing both to mse_loss unchanged broadcasts
        # them to (B, B) and computes the wrong loss, so squeeze first.
        # Assumes batch is 2-D -- TODO confirm against the .mat contents.
        loss = F.mse_loss(output.squeeze(-1), batch[:, 0])  # Example loss
        loss.backward()
        optimizer.step()

    # Empty CUDA cache once per epoch to free memory.
    torch.cuda.empty_cache()


In [6]:
import scipy.io as si
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torchvision.ops import DeformConv2d
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F
import os
import pywt

# NOTE(review): `fs` is not read anywhere in this cell; presumably a sampling
# rate -- confirm its meaning and units.
fs = 100

# Turn on the cuDNN benchmark flag (intended for fixed-size inputs).
torch.backends.cudnn.benchmark = True

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Read the MATLAB file and keep the entry stored under the key 'a'.
mat_Res = si.loadmat('u_lma.mat')
mat_results = mat_Res['a']

# Custom Dataset (if needed)
class CustomDataset(Dataset):
    """Map-style dataset over an indexable collection of samples.

    Every item is returned as a newly built float32 tensor.
    """

    def __init__(self, data):
        # Stored as given; conversion to a tensor happens per item.
        self.data = data

    def __len__(self):
        """Return the number of samples, i.e. ``len(data)``."""
        sample_count = len(self.data)
        return sample_count

    def __getitem__(self, idx):
        """Return the sample at ``idx`` converted to a float32 tensor."""
        sample = self.data[idx]
        return torch.tensor(sample, dtype=torch.float32)

dataset = CustomDataset(mat_results)

# Shuffled mini-batches of 32, loaded by 4 worker processes.
# Pinned (page-locked) host memory is only requested when CUDA is available:
# it exists to speed up host-to-GPU copies and has no use on a CPU-only run.
train_loader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
    pin_memory=torch.cuda.is_available(),
)

# Sample Model (Ensure it runs on multiple GPUs if available)
class SimpleModel(nn.Module):
    """Single fully connected layer used as a placeholder model.

    Args:
        in_features: Size of the last input dimension. Defaults to 10, the
            value that was previously hard-coded.
        out_features: Size of the last output dimension. Defaults to 1.
    """

    def __init__(self, in_features=10, out_features=1):
        super().__init__()
        # Attribute name `fc` is kept so parameter names stay `fc.weight` / `fc.bias`.
        self.fc = nn.Linear(in_features, out_features)  # Example layer

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        return self.fc(x)

# Build the model and move it to the selected device.
model = SimpleModel().to(device)

# Enable Multi-GPU Training: wrap in DataParallel when more than one GPU is visible.
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)

# Optimizer: Adam over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop: 100 passes over train_loader. Each batch is both the model
# input and the source of the regression target (its first column).
for epoch in range(100):
    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        output = model(batch)
        # The prediction has a trailing singleton dimension, (B, 1), while
        # batch[:, 0] is (B,). Passing both to mse_loss unchanged broadcasts
        # them to (B, B) and computes the wrong loss, so squeeze first.
        # Assumes batch is 2-D -- TODO confirm against the .mat contents.
        loss = F.mse_loss(output.squeeze(-1), batch[:, 0])  # Example loss
        loss.backward()
        optimizer.step()

    # Empty CUDA cache once per epoch to free memory.
    torch.cuda.empty_cache()


In [7]:
import scipy.io as si
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torchvision.ops import DeformConv2d
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F
import os
import pywt

# NOTE(review): `fs` is not read anywhere in this cell; presumably a sampling
# rate -- confirm its meaning and units.
fs = 100

# Turn on the cuDNN benchmark flag (intended for fixed-size inputs).
torch.backends.cudnn.benchmark = True

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Read the MATLAB file and keep the entry stored under the key 'a'.
mat_Res = si.loadmat('u_lma.mat')
mat_results = mat_Res['a']

# Custom Dataset (if needed)
class CustomDataset(Dataset):
    """Map-style dataset over an indexable collection of samples.

    Every item is returned as a newly built float32 tensor.
    """

    def __init__(self, data):
        # Stored as given; conversion to a tensor happens per item.
        self.data = data

    def __len__(self):
        """Return the number of samples, i.e. ``len(data)``."""
        sample_count = len(self.data)
        return sample_count

    def __getitem__(self, idx):
        """Return the sample at ``idx`` converted to a float32 tensor."""
        sample = self.data[idx]
        return torch.tensor(sample, dtype=torch.float32)

dataset = CustomDataset(mat_results)

# Shuffled mini-batches of 32, loaded by 4 worker processes.
# Pinned (page-locked) host memory is only requested when CUDA is available:
# it exists to speed up host-to-GPU copies and has no use on a CPU-only run.
train_loader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
    pin_memory=torch.cuda.is_available(),
)

# Sample Model (Ensure it runs on multiple GPUs if available)
class SimpleModel(nn.Module):
    """Single fully connected layer used as a placeholder model.

    Args:
        in_features: Size of the last input dimension. Defaults to 10, the
            value that was previously hard-coded.
        out_features: Size of the last output dimension. Defaults to 1.
    """

    def __init__(self, in_features=10, out_features=1):
        super().__init__()
        # Attribute name `fc` is kept so parameter names stay `fc.weight` / `fc.bias`.
        self.fc = nn.Linear(in_features, out_features)  # Example layer

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        return self.fc(x)

# Build the model and move it to the selected device.
model = SimpleModel().to(device)

# Enable Multi-GPU Training: wrap in DataParallel when more than one GPU is visible.
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)

# Optimizer: Adam over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop: 100 passes over train_loader. Each batch is both the model
# input and the source of the regression target (its first column).
for epoch in range(100):
    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        output = model(batch)
        # The prediction has a trailing singleton dimension, (B, 1), while
        # batch[:, 0] is (B,). Passing both to mse_loss unchanged broadcasts
        # them to (B, B) and computes the wrong loss, so squeeze first.
        # Assumes batch is 2-D -- TODO confirm against the .mat contents.
        loss = F.mse_loss(output.squeeze(-1), batch[:, 0])  # Example loss
        loss.backward()
        optimizer.step()

    # Empty CUDA cache once per epoch to free memory.
    torch.cuda.empty_cache()


In [8]:
import scipy.io as si
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torchvision.ops import DeformConv2d
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F
import os
import pywt

# NOTE(review): `fs` is not read anywhere in this cell; presumably a sampling
# rate -- confirm its meaning and units.
fs = 100

# Turn on the cuDNN benchmark flag (intended for fixed-size inputs).
torch.backends.cudnn.benchmark = True

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Read the MATLAB file and keep the entry stored under the key 'a'.
mat_Res = si.loadmat('u_lma.mat')
mat_results = mat_Res['a']

# Custom Dataset (if needed)
class CustomDataset(Dataset):
    """Map-style dataset over an indexable collection of samples.

    Every item is returned as a newly built float32 tensor.
    """

    def __init__(self, data):
        # Stored as given; conversion to a tensor happens per item.
        self.data = data

    def __len__(self):
        """Return the number of samples, i.e. ``len(data)``."""
        sample_count = len(self.data)
        return sample_count

    def __getitem__(self, idx):
        """Return the sample at ``idx`` converted to a float32 tensor."""
        sample = self.data[idx]
        return torch.tensor(sample, dtype=torch.float32)

dataset = CustomDataset(mat_results)

# Shuffled mini-batches of 32, loaded by 4 worker processes.
# Pinned (page-locked) host memory is only requested when CUDA is available:
# it exists to speed up host-to-GPU copies and has no use on a CPU-only run.
train_loader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
    pin_memory=torch.cuda.is_available(),
)

# Sample Model (Ensure it runs on multiple GPUs if available)
class SimpleModel(nn.Module):
    """Single fully connected layer used as a placeholder model.

    Args:
        in_features: Size of the last input dimension. Defaults to 10, the
            value that was previously hard-coded.
        out_features: Size of the last output dimension. Defaults to 1.
    """

    def __init__(self, in_features=10, out_features=1):
        super().__init__()
        # Attribute name `fc` is kept so parameter names stay `fc.weight` / `fc.bias`.
        self.fc = nn.Linear(in_features, out_features)  # Example layer

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        return self.fc(x)

# Build the model and move it to the selected device.
model = SimpleModel().to(device)

# Enable Multi-GPU Training: wrap in DataParallel when more than one GPU is visible.
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)

# Optimizer: Adam over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop: 100 passes over train_loader. Each batch is both the model
# input and the source of the regression target (its first column).
for epoch in range(100):
    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        output = model(batch)
        # The prediction has a trailing singleton dimension, (B, 1), while
        # batch[:, 0] is (B,). Passing both to mse_loss unchanged broadcasts
        # them to (B, B) and computes the wrong loss, so squeeze first.
        # Assumes batch is 2-D -- TODO confirm against the .mat contents.
        loss = F.mse_loss(output.squeeze(-1), batch[:, 0])  # Example loss
        loss.backward()
        optimizer.step()

    # Empty CUDA cache once per epoch to free memory.
    torch.cuda.empty_cache()


In [9]:
import scipy.io as si
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torchvision.ops import DeformConv2d
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F
import os
import pywt

# NOTE(review): `fs` is not read anywhere in this cell; presumably a sampling
# rate -- confirm its meaning and units.
fs = 100

# Turn on the cuDNN benchmark flag (intended for fixed-size inputs).
torch.backends.cudnn.benchmark = True

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Read the MATLAB file and keep the entry stored under the key 'a'.
mat_Res = si.loadmat('u_lma.mat')
mat_results = mat_Res['a']

# Custom Dataset (if needed)
class CustomDataset(Dataset):
    """Map-style dataset over an indexable collection of samples.

    Every item is returned as a newly built float32 tensor.
    """

    def __init__(self, data):
        # Stored as given; conversion to a tensor happens per item.
        self.data = data

    def __len__(self):
        """Return the number of samples, i.e. ``len(data)``."""
        sample_count = len(self.data)
        return sample_count

    def __getitem__(self, idx):
        """Return the sample at ``idx`` converted to a float32 tensor."""
        sample = self.data[idx]
        return torch.tensor(sample, dtype=torch.float32)

dataset = CustomDataset(mat_results)

# Shuffled mini-batches of 32, loaded by 4 worker processes.
# Pinned (page-locked) host memory is only requested when CUDA is available:
# it exists to speed up host-to-GPU copies and has no use on a CPU-only run.
train_loader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
    pin_memory=torch.cuda.is_available(),
)

# Sample Model (Ensure it runs on multiple GPUs if available)
class SimpleModel(nn.Module):
    """Single fully connected layer used as a placeholder model.

    Args:
        in_features: Size of the last input dimension. Defaults to 10, the
            value that was previously hard-coded.
        out_features: Size of the last output dimension. Defaults to 1.
    """

    def __init__(self, in_features=10, out_features=1):
        super().__init__()
        # Attribute name `fc` is kept so parameter names stay `fc.weight` / `fc.bias`.
        self.fc = nn.Linear(in_features, out_features)  # Example layer

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        return self.fc(x)

# Build the model and move it to the selected device.
model = SimpleModel().to(device)

# Enable Multi-GPU Training: wrap in DataParallel when more than one GPU is visible.
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)

# Optimizer: Adam over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop: 100 passes over train_loader. Each batch is both the model
# input and the source of the regression target (its first column).
for epoch in range(100):
    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        output = model(batch)
        # The prediction has a trailing singleton dimension, (B, 1), while
        # batch[:, 0] is (B,). Passing both to mse_loss unchanged broadcasts
        # them to (B, B) and computes the wrong loss, so squeeze first.
        # Assumes batch is 2-D -- TODO confirm against the .mat contents.
        loss = F.mse_loss(output.squeeze(-1), batch[:, 0])  # Example loss
        loss.backward()
        optimizer.step()

    # Empty CUDA cache once per epoch to free memory.
    torch.cuda.empty_cache()


In [10]:
import scipy.io as si
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torchvision.ops import DeformConv2d
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F
import os
import pywt

# NOTE(review): `fs` is not read anywhere in this cell; presumably a sampling
# rate -- confirm its meaning and units.
fs = 100

# Turn on the cuDNN benchmark flag (intended for fixed-size inputs).
torch.backends.cudnn.benchmark = True

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Read the MATLAB file and keep the entry stored under the key 'a'.
mat_Res = si.loadmat('u_lma.mat')
mat_results = mat_Res['a']

# Custom Dataset (if needed)
class CustomDataset(Dataset):
    """Map-style dataset over an indexable collection of samples.

    Every item is returned as a newly built float32 tensor.
    """

    def __init__(self, data):
        # Stored as given; conversion to a tensor happens per item.
        self.data = data

    def __len__(self):
        """Return the number of samples, i.e. ``len(data)``."""
        sample_count = len(self.data)
        return sample_count

    def __getitem__(self, idx):
        """Return the sample at ``idx`` converted to a float32 tensor."""
        sample = self.data[idx]
        return torch.tensor(sample, dtype=torch.float32)

dataset = CustomDataset(mat_results)

# Shuffled mini-batches of 32, loaded by 4 worker processes.
# Pinned (page-locked) host memory is only requested when CUDA is available:
# it exists to speed up host-to-GPU copies and has no use on a CPU-only run.
train_loader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
    pin_memory=torch.cuda.is_available(),
)

# Sample Model (Ensure it runs on multiple GPUs if available)
class SimpleModel(nn.Module):
    """Single fully connected layer used as a placeholder model.

    Args:
        in_features: Size of the last input dimension. Defaults to 10, the
            value that was previously hard-coded.
        out_features: Size of the last output dimension. Defaults to 1.
    """

    def __init__(self, in_features=10, out_features=1):
        super().__init__()
        # Attribute name `fc` is kept so parameter names stay `fc.weight` / `fc.bias`.
        self.fc = nn.Linear(in_features, out_features)  # Example layer

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        return self.fc(x)

# Build the model and move it to the selected device.
model = SimpleModel().to(device)

# Enable Multi-GPU Training: wrap in DataParallel when more than one GPU is visible.
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)

# Optimizer: Adam over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop: 100 passes over train_loader. Each batch is both the model
# input and the source of the regression target (its first column).
for epoch in range(100):
    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        output = model(batch)
        # The prediction has a trailing singleton dimension, (B, 1), while
        # batch[:, 0] is (B,). Passing both to mse_loss unchanged broadcasts
        # them to (B, B) and computes the wrong loss, so squeeze first.
        # Assumes batch is 2-D -- TODO confirm against the .mat contents.
        loss = F.mse_loss(output.squeeze(-1), batch[:, 0])  # Example loss
        loss.backward()
        optimizer.step()

    # Empty CUDA cache once per epoch to free memory.
    torch.cuda.empty_cache()


In [11]:
import scipy.io as si
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torchvision.ops import DeformConv2d
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F
import os
import pywt

# NOTE(review): `fs` is not read anywhere in this cell; presumably a sampling
# rate -- confirm its meaning and units.
fs = 100

# Turn on the cuDNN benchmark flag (intended for fixed-size inputs).
torch.backends.cudnn.benchmark = True

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Read the MATLAB file and keep the entry stored under the key 'a'.
mat_Res = si.loadmat('u_lma.mat')
mat_results = mat_Res['a']

# Custom Dataset (if needed)
class CustomDataset(Dataset):
    """Map-style dataset over an indexable collection of samples.

    Every item is returned as a newly built float32 tensor.
    """

    def __init__(self, data):
        # Stored as given; conversion to a tensor happens per item.
        self.data = data

    def __len__(self):
        """Return the number of samples, i.e. ``len(data)``."""
        sample_count = len(self.data)
        return sample_count

    def __getitem__(self, idx):
        """Return the sample at ``idx`` converted to a float32 tensor."""
        sample = self.data[idx]
        return torch.tensor(sample, dtype=torch.float32)

dataset = CustomDataset(mat_results)

# Shuffled mini-batches of 32, loaded by 4 worker processes.
# Pinned (page-locked) host memory is only requested when CUDA is available:
# it exists to speed up host-to-GPU copies and has no use on a CPU-only run.
train_loader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
    pin_memory=torch.cuda.is_available(),
)

# Sample Model (Ensure it runs on multiple GPUs if available)
class SimpleModel(nn.Module):
    """Single fully connected layer used as a placeholder model.

    Args:
        in_features: Size of the last input dimension. Defaults to 10, the
            value that was previously hard-coded.
        out_features: Size of the last output dimension. Defaults to 1.
    """

    def __init__(self, in_features=10, out_features=1):
        super().__init__()
        # Attribute name `fc` is kept so parameter names stay `fc.weight` / `fc.bias`.
        self.fc = nn.Linear(in_features, out_features)  # Example layer

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        return self.fc(x)

# Build the model and move it to the selected device.
model = SimpleModel().to(device)

# Enable Multi-GPU Training: wrap in DataParallel when more than one GPU is visible.
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)

# Optimizer: Adam over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop: 100 passes over train_loader. Each batch is both the model
# input and the source of the regression target (its first column).
for epoch in range(100):
    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        output = model(batch)
        # The prediction has a trailing singleton dimension, (B, 1), while
        # batch[:, 0] is (B,). Passing both to mse_loss unchanged broadcasts
        # them to (B, B) and computes the wrong loss, so squeeze first.
        # Assumes batch is 2-D -- TODO confirm against the .mat contents.
        loss = F.mse_loss(output.squeeze(-1), batch[:, 0])  # Example loss
        loss.backward()
        optimizer.step()

    # Empty CUDA cache once per epoch to free memory.
    torch.cuda.empty_cache()


In [12]:
import scipy.io as si
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torchvision.ops import DeformConv2d
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F
import os
import pywt

# NOTE(review): `fs` is not read anywhere in this cell; presumably a sampling
# rate -- confirm its meaning and units.
fs = 100

# Turn on the cuDNN benchmark flag (intended for fixed-size inputs).
torch.backends.cudnn.benchmark = True

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Read the MATLAB file and keep the entry stored under the key 'a'.
mat_Res = si.loadmat('u_lma.mat')
mat_results = mat_Res['a']

# Custom Dataset (if needed)
class CustomDataset(Dataset):
    """Map-style dataset over an indexable collection of samples.

    Every item is returned as a newly built float32 tensor.
    """

    def __init__(self, data):
        # Stored as given; conversion to a tensor happens per item.
        self.data = data

    def __len__(self):
        """Return the number of samples, i.e. ``len(data)``."""
        sample_count = len(self.data)
        return sample_count

    def __getitem__(self, idx):
        """Return the sample at ``idx`` converted to a float32 tensor."""
        sample = self.data[idx]
        return torch.tensor(sample, dtype=torch.float32)

dataset = CustomDataset(mat_results)

# Shuffled mini-batches of 32, loaded by 4 worker processes.
# Pinned (page-locked) host memory is only requested when CUDA is available:
# it exists to speed up host-to-GPU copies and has no use on a CPU-only run.
train_loader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
    pin_memory=torch.cuda.is_available(),
)

# Sample Model (Ensure it runs on multiple GPUs if available)
class SimpleModel(nn.Module):
    """Single fully connected layer used as a placeholder model.

    Args:
        in_features: Size of the last input dimension. Defaults to 10, the
            value that was previously hard-coded.
        out_features: Size of the last output dimension. Defaults to 1.
    """

    def __init__(self, in_features=10, out_features=1):
        super().__init__()
        # Attribute name `fc` is kept so parameter names stay `fc.weight` / `fc.bias`.
        self.fc = nn.Linear(in_features, out_features)  # Example layer

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        return self.fc(x)

# Build the model and move it to the selected device.
model = SimpleModel().to(device)

# Enable Multi-GPU Training: wrap in DataParallel when more than one GPU is visible.
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)

# Optimizer: Adam over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop: 100 passes over train_loader. Each batch is both the model
# input and the source of the regression target (its first column).
for epoch in range(100):
    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        output = model(batch)
        # The prediction has a trailing singleton dimension, (B, 1), while
        # batch[:, 0] is (B,). Passing both to mse_loss unchanged broadcasts
        # them to (B, B) and computes the wrong loss, so squeeze first.
        # Assumes batch is 2-D -- TODO confirm against the .mat contents.
        loss = F.mse_loss(output.squeeze(-1), batch[:, 0])  # Example loss
        loss.backward()
        optimizer.step()

    # Empty CUDA cache once per epoch to free memory.
    torch.cuda.empty_cache()


In [13]:
import scipy.io as si
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torchvision.ops import DeformConv2d
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F
import os
import pywt

# NOTE(review): `fs` is not read anywhere in this cell; presumably a sampling
# rate -- confirm its meaning and units.
fs = 100

# Turn on the cuDNN benchmark flag (intended for fixed-size inputs).
torch.backends.cudnn.benchmark = True

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Read the MATLAB file and keep the entry stored under the key 'a'.
mat_Res = si.loadmat('u_lma.mat')
mat_results = mat_Res['a']

# Custom Dataset (if needed)
class CustomDataset(Dataset):
    """Map-style dataset over an indexable collection of samples.

    Every item is returned as a newly built float32 tensor.
    """

    def __init__(self, data):
        # Stored as given; conversion to a tensor happens per item.
        self.data = data

    def __len__(self):
        """Return the number of samples, i.e. ``len(data)``."""
        sample_count = len(self.data)
        return sample_count

    def __getitem__(self, idx):
        """Return the sample at ``idx`` converted to a float32 tensor."""
        sample = self.data[idx]
        return torch.tensor(sample, dtype=torch.float32)

dataset = CustomDataset(mat_results)

# Shuffled mini-batches of 32, loaded by 4 worker processes.
# Pinned (page-locked) host memory is only requested when CUDA is available:
# it exists to speed up host-to-GPU copies and has no use on a CPU-only run.
train_loader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
    pin_memory=torch.cuda.is_available(),
)

# Sample Model (Ensure it runs on multiple GPUs if available)
class SimpleModel(nn.Module):
    """Single fully connected layer used as a placeholder model.

    Args:
        in_features: Size of the last input dimension. Defaults to 10, the
            value that was previously hard-coded.
        out_features: Size of the last output dimension. Defaults to 1.
    """

    def __init__(self, in_features=10, out_features=1):
        super().__init__()
        # Attribute name `fc` is kept so parameter names stay `fc.weight` / `fc.bias`.
        self.fc = nn.Linear(in_features, out_features)  # Example layer

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        return self.fc(x)

# Build the model and move it to the selected device.
model = SimpleModel().to(device)

# Enable Multi-GPU Training: wrap in DataParallel when more than one GPU is visible.
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)

# Optimizer: Adam over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop: 100 passes over train_loader. Each batch is both the model
# input and the source of the regression target (its first column).
for epoch in range(100):
    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        output = model(batch)
        # The prediction has a trailing singleton dimension, (B, 1), while
        # batch[:, 0] is (B,). Passing both to mse_loss unchanged broadcasts
        # them to (B, B) and computes the wrong loss, so squeeze first.
        # Assumes batch is 2-D -- TODO confirm against the .mat contents.
        loss = F.mse_loss(output.squeeze(-1), batch[:, 0])  # Example loss
        loss.backward()
        optimizer.step()

    # Empty CUDA cache once per epoch to free memory.
    torch.cuda.empty_cache()


In [14]:
import scipy.io as si
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torchvision.ops import DeformConv2d
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F
import os
import pywt

# NOTE(review): `fs` is not read anywhere in this cell; presumably a sampling
# rate -- confirm its meaning and units.
fs = 100

# Turn on the cuDNN benchmark flag (intended for fixed-size inputs).
torch.backends.cudnn.benchmark = True

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Read the MATLAB file and keep the entry stored under the key 'a'.
mat_Res = si.loadmat('u_lma.mat')
mat_results = mat_Res['a']

# Custom Dataset (if needed)
class CustomDataset(Dataset):
    """Map-style dataset over an indexable collection of samples.

    Every item is returned as a newly built float32 tensor.
    """

    def __init__(self, data):
        # Stored as given; conversion to a tensor happens per item.
        self.data = data

    def __len__(self):
        """Return the number of samples, i.e. ``len(data)``."""
        sample_count = len(self.data)
        return sample_count

    def __getitem__(self, idx):
        """Return the sample at ``idx`` converted to a float32 tensor."""
        sample = self.data[idx]
        return torch.tensor(sample, dtype=torch.float32)

dataset = CustomDataset(mat_results)

# Shuffled mini-batches of 32, loaded by 4 worker processes.
# Pinned (page-locked) host memory is only requested when CUDA is available:
# it exists to speed up host-to-GPU copies and has no use on a CPU-only run.
train_loader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
    pin_memory=torch.cuda.is_available(),
)

# Sample Model (Ensure it runs on multiple GPUs if available)
class SimpleModel(nn.Module):
    """Single fully connected layer used as a placeholder model.

    Args:
        in_features: Size of the last input dimension. Defaults to 10, the
            value that was previously hard-coded.
        out_features: Size of the last output dimension. Defaults to 1.
    """

    def __init__(self, in_features=10, out_features=1):
        super().__init__()
        # Attribute name `fc` is kept so parameter names stay `fc.weight` / `fc.bias`.
        self.fc = nn.Linear(in_features, out_features)  # Example layer

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        return self.fc(x)

# Build the model and move it to the selected device.
model = SimpleModel().to(device)

# Enable Multi-GPU Training: wrap in DataParallel when more than one GPU is visible.
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)

# Optimizer: Adam over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop: 100 passes over train_loader. Each batch is both the model
# input and the source of the regression target (its first column).
for epoch in range(100):
    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        output = model(batch)
        # The prediction has a trailing singleton dimension, (B, 1), while
        # batch[:, 0] is (B,). Passing both to mse_loss unchanged broadcasts
        # them to (B, B) and computes the wrong loss, so squeeze first.
        # Assumes batch is 2-D -- TODO confirm against the .mat contents.
        loss = F.mse_loss(output.squeeze(-1), batch[:, 0])  # Example loss
        loss.backward()
        optimizer.step()

    # Empty CUDA cache once per epoch to free memory.
    torch.cuda.empty_cache()


In [15]:
import scipy.io as si
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torchvision.ops import DeformConv2d
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F
import os
import pywt

# NOTE(review): `fs` is not read anywhere in this cell; presumably a sampling
# rate -- confirm its meaning and units.
fs = 100

# Turn on the cuDNN benchmark flag (intended for fixed-size inputs).
torch.backends.cudnn.benchmark = True

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Read the MATLAB file and keep the entry stored under the key 'a'.
mat_Res = si.loadmat('u_lma.mat')
mat_results = mat_Res['a']

# Custom Dataset (if needed)
class CustomDataset(Dataset):
    """Map-style dataset over an indexable collection of samples.

    Every item is returned as a newly built float32 tensor.
    """

    def __init__(self, data):
        # Stored as given; conversion to a tensor happens per item.
        self.data = data

    def __len__(self):
        """Return the number of samples, i.e. ``len(data)``."""
        sample_count = len(self.data)
        return sample_count

    def __getitem__(self, idx):
        """Return the sample at ``idx`` converted to a float32 tensor."""
        sample = self.data[idx]
        return torch.tensor(sample, dtype=torch.float32)

dataset = CustomDataset(mat_results)

# Shuffled mini-batches of 32, loaded by 4 worker processes.
# Pinned (page-locked) host memory is only requested when CUDA is available:
# it exists to speed up host-to-GPU copies and has no use on a CPU-only run.
train_loader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
    pin_memory=torch.cuda.is_available(),
)

# Sample Model (Ensure it runs on multiple GPUs if available)
class SimpleModel(nn.Module):
    """Single fully connected layer used as a placeholder model.

    Args:
        in_features: Size of the last input dimension. Defaults to 10, the
            value that was previously hard-coded.
        out_features: Size of the last output dimension. Defaults to 1.
    """

    def __init__(self, in_features=10, out_features=1):
        super().__init__()
        # Attribute name `fc` is kept so parameter names stay `fc.weight` / `fc.bias`.
        self.fc = nn.Linear(in_features, out_features)  # Example layer

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        return self.fc(x)

# Build the model and move it to the selected device.
model = SimpleModel().to(device)

# Enable Multi-GPU Training: wrap in DataParallel when more than one GPU is visible.
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)

# Optimizer: Adam over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop: 100 passes over train_loader. Each batch is both the model
# input and the source of the regression target (its first column).
for epoch in range(100):
    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        output = model(batch)
        # The prediction has a trailing singleton dimension, (B, 1), while
        # batch[:, 0] is (B,). Passing both to mse_loss unchanged broadcasts
        # them to (B, B) and computes the wrong loss, so squeeze first.
        # Assumes batch is 2-D -- TODO confirm against the .mat contents.
        loss = F.mse_loss(output.squeeze(-1), batch[:, 0])  # Example loss
        loss.backward()
        optimizer.step()

    # Empty CUDA cache once per epoch to free memory.
    torch.cuda.empty_cache()


In [16]:
import scipy.io as si
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torchvision.ops import DeformConv2d
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F
import os
import pywt

# NOTE(review): `fs` is not read anywhere in this cell; presumably a sampling
# rate -- confirm its meaning and units.
fs = 100

# Turn on the cuDNN benchmark flag (intended for fixed-size inputs).
torch.backends.cudnn.benchmark = True

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Read the MATLAB file and keep the entry stored under the key 'a'.
mat_Res = si.loadmat('u_lma.mat')
mat_results = mat_Res['a']

# Custom Dataset (if needed)
class CustomDataset(Dataset):
    """Map-style dataset over an indexable collection of samples.

    Every item is returned as a newly built float32 tensor.
    """

    def __init__(self, data):
        # Stored as given; conversion to a tensor happens per item.
        self.data = data

    def __len__(self):
        """Return the number of samples, i.e. ``len(data)``."""
        sample_count = len(self.data)
        return sample_count

    def __getitem__(self, idx):
        """Return the sample at ``idx`` converted to a float32 tensor."""
        sample = self.data[idx]
        return torch.tensor(sample, dtype=torch.float32)

dataset = CustomDataset(mat_results)

# Shuffled mini-batches of 32, loaded by 4 worker processes.
# Pinned (page-locked) host memory is only requested when CUDA is available:
# it exists to speed up host-to-GPU copies and has no use on a CPU-only run.
train_loader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
    pin_memory=torch.cuda.is_available(),
)

# Sample Model (Ensure it runs on multiple GPUs if available)
class SimpleModel(nn.Module):
    """Placeholder network: a single affine layer mapping 10 features to 1 output."""

    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(in_features=10, out_features=1)  # Example layer

    def forward(self, x):
        # The linear layer acts on the last dimension of x.
        prediction = self.fc(x)
        return prediction

# Instantiate the network and move its parameters to the selected device.
model = SimpleModel().to(device)

# Enable Multi-GPU Training: wrap the model so batches are split across GPUs.
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)

# Optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training Loop (with memory management)
for epoch in range(100):
    for batch in train_loader:
        # non_blocking lets the copy overlap with compute when the loader
        # yields pinned host memory; otherwise it is an ordinary copy.
        batch = batch.to(device, non_blocking=True)
        optimizer.zero_grad()
        output = model(batch)
        # Example loss: regress the first input feature. The target is sliced
        # with `:1` (not `0`) so it keeps the trailing dimension of `output`;
        # a squeezed target would be broadcast against `output` inside
        # mse_loss and the loss would average over all prediction/target pairs.
        # NOTE(review): assumes 2-D batches of shape (batch, 10) — confirm.
        loss = torch.nn.functional.mse_loss(output, batch[:, :1])
        loss.backward()
        optimizer.step()

    # Empty CUDA cache to free memory
    torch.cuda.empty_cache()


  ecg_2=ecg_model.state_dict(checkpoint["model_state_dict"])


EncoderDecoder1D(
  (encoder): Encoder(
    (enc1): DeformableConv1D(
      (offsets): Conv1d(1, 6, kernel_size=(3,), stride=(1,), padding=(1,))
      (deform_conv): DeformConv2d(1, 8, kernel_size=(1, 3), stride=(1, 1), padding=(0, 1))
    )
    (bn1): BatchNorm1d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (enc2): DeformableConv1D(
      (offsets): Conv1d(8, 6, kernel_size=(3,), stride=(1,), padding=(1,))
      (deform_conv): DeformConv2d(8, 16, kernel_size=(1, 3), stride=(1, 1), padding=(0, 1))
    )
    (conv2): Conv1d(8, 8, kernel_size=(1,), stride=(1,))
    (bn2): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (enc3): DeformableConv1D(
      (offsets): Conv1d(16, 6, kernel_size=(3,), stride=(1,), padding=(1,))
      (deform_conv): DeformConv2d(16, 32, kernel_size=(1, 3), stride=(1, 1), padding=(0, 1))
    )
    (conv3): Conv1d(16, 16, kernel_size=(1,), stride=(1,))
    (bn3): BatchNorm1d(32, eps=1e-05, momentum=0.1, a

In [17]:
import scipy.io as si
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torchvision.ops import DeformConv2d
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F
import os
import pywt

# Ask cuDNN to auto-tune its kernels; this pays off when input shapes stay fixed.
torch.backends.cudnn.benchmark = True

# Prefer the GPU whenever CUDA is usable, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

fs = 100  # NOTE(review): presumably a sampling rate; unused in this cell — confirm

# Read the MATLAB file and keep the variable stored under key 'a'.
mat_Res = si.loadmat('u_lma.mat')
mat_results = mat_Res['a']

# Custom Dataset (if needed)
class CustomDataset(Dataset):
    """Map-style dataset serving entries of an indexable container as float32 tensors."""

    def __init__(self, data):
        # Only a reference is kept; conversion to a tensor happens per item.
        self.data = data

    def __len__(self):
        # Sample count is whatever len() reports for the wrapped container.
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        return torch.tensor(sample, dtype=torch.float32)

# Wrap the loaded array so it can be indexed sample by sample.
dataset = CustomDataset(mat_results)

# Shuffled mini-batches of 32, produced by 4 worker processes with pinned memory requested.
train_loader = DataLoader(
    dataset=dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
    pin_memory=True,
)

# Sample Model (Ensure it runs on multiple GPUs if available)
class SimpleModel(nn.Module):
    """Placeholder network: a single affine layer mapping 10 features to 1 output."""

    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(in_features=10, out_features=1)  # Example layer

    def forward(self, x):
        # The linear layer acts on the last dimension of x.
        prediction = self.fc(x)
        return prediction

# Instantiate the network and move its parameters to the selected device.
model = SimpleModel().to(device)

# Enable Multi-GPU Training: wrap the model so batches are split across GPUs.
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)

# Optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training Loop (with memory management)
for epoch in range(100):
    for batch in train_loader:
        # non_blocking lets the copy overlap with compute when the loader
        # yields pinned host memory; otherwise it is an ordinary copy.
        batch = batch.to(device, non_blocking=True)
        optimizer.zero_grad()
        output = model(batch)
        # Example loss: regress the first input feature. The target is sliced
        # with `:1` (not `0`) so it keeps the trailing dimension of `output`;
        # a squeezed target would be broadcast against `output` inside
        # mse_loss and the loss would average over all prediction/target pairs.
        # NOTE(review): assumes 2-D batches of shape (batch, 10) — confirm.
        loss = torch.nn.functional.mse_loss(output, batch[:, :1])
        loss.backward()
        optimizer.step()

    # Empty CUDA cache to free memory
    torch.cuda.empty_cache()


In [18]:
import scipy.io as si
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torchvision.ops import DeformConv2d
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F
import os
import pywt

# Ask cuDNN to auto-tune its kernels; this pays off when input shapes stay fixed.
torch.backends.cudnn.benchmark = True

# Prefer the GPU whenever CUDA is usable, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

fs = 100  # NOTE(review): presumably a sampling rate; unused in this cell — confirm

# Read the MATLAB file and keep the variable stored under key 'a'.
mat_Res = si.loadmat('u_lma.mat')
mat_results = mat_Res['a']

# Custom Dataset (if needed)
class CustomDataset(Dataset):
    """Map-style dataset serving entries of an indexable container as float32 tensors."""

    def __init__(self, data):
        # Only a reference is kept; conversion to a tensor happens per item.
        self.data = data

    def __len__(self):
        # Sample count is whatever len() reports for the wrapped container.
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        return torch.tensor(sample, dtype=torch.float32)

# Wrap the loaded array so it can be indexed sample by sample.
dataset = CustomDataset(mat_results)

# Shuffled mini-batches of 32, produced by 4 worker processes with pinned memory requested.
train_loader = DataLoader(
    dataset=dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
    pin_memory=True,
)

# Sample Model (Ensure it runs on multiple GPUs if available)
class SimpleModel(nn.Module):
    """Placeholder network: a single affine layer mapping 10 features to 1 output."""

    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(in_features=10, out_features=1)  # Example layer

    def forward(self, x):
        # The linear layer acts on the last dimension of x.
        prediction = self.fc(x)
        return prediction

# Instantiate the network and move its parameters to the selected device.
model = SimpleModel().to(device)

# Enable Multi-GPU Training: wrap the model so batches are split across GPUs.
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)

# Optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training Loop (with memory management)
for epoch in range(100):
    for batch in train_loader:
        # non_blocking lets the copy overlap with compute when the loader
        # yields pinned host memory; otherwise it is an ordinary copy.
        batch = batch.to(device, non_blocking=True)
        optimizer.zero_grad()
        output = model(batch)
        # Example loss: regress the first input feature. The target is sliced
        # with `:1` (not `0`) so it keeps the trailing dimension of `output`;
        # a squeezed target would be broadcast against `output` inside
        # mse_loss and the loss would average over all prediction/target pairs.
        # NOTE(review): assumes 2-D batches of shape (batch, 10) — confirm.
        loss = torch.nn.functional.mse_loss(output, batch[:, :1])
        loss.backward()
        optimizer.step()

    # Empty CUDA cache to free memory
    torch.cuda.empty_cache()


In [19]:
import scipy.io as si
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torchvision.ops import DeformConv2d
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F
import os
import pywt

# Ask cuDNN to auto-tune its kernels; this pays off when input shapes stay fixed.
torch.backends.cudnn.benchmark = True

# Prefer the GPU whenever CUDA is usable, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

fs = 100  # NOTE(review): presumably a sampling rate; unused in this cell — confirm

# Read the MATLAB file and keep the variable stored under key 'a'.
mat_Res = si.loadmat('u_lma.mat')
mat_results = mat_Res['a']

# Custom Dataset (if needed)
class CustomDataset(Dataset):
    """Map-style dataset serving entries of an indexable container as float32 tensors."""

    def __init__(self, data):
        # Only a reference is kept; conversion to a tensor happens per item.
        self.data = data

    def __len__(self):
        # Sample count is whatever len() reports for the wrapped container.
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        return torch.tensor(sample, dtype=torch.float32)

# Wrap the loaded array so it can be indexed sample by sample.
dataset = CustomDataset(mat_results)

# Shuffled mini-batches of 32, produced by 4 worker processes with pinned memory requested.
train_loader = DataLoader(
    dataset=dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
    pin_memory=True,
)

# Sample Model (Ensure it runs on multiple GPUs if available)
class SimpleModel(nn.Module):
    """Placeholder network: a single affine layer mapping 10 features to 1 output."""

    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(in_features=10, out_features=1)  # Example layer

    def forward(self, x):
        # The linear layer acts on the last dimension of x.
        prediction = self.fc(x)
        return prediction

# Instantiate the network and move its parameters to the selected device.
model = SimpleModel().to(device)

# Enable Multi-GPU Training: wrap the model so batches are split across GPUs.
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)

# Optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training Loop (with memory management)
for epoch in range(100):
    for batch in train_loader:
        # non_blocking lets the copy overlap with compute when the loader
        # yields pinned host memory; otherwise it is an ordinary copy.
        batch = batch.to(device, non_blocking=True)
        optimizer.zero_grad()
        output = model(batch)
        # Example loss: regress the first input feature. The target is sliced
        # with `:1` (not `0`) so it keeps the trailing dimension of `output`;
        # a squeezed target would be broadcast against `output` inside
        # mse_loss and the loss would average over all prediction/target pairs.
        # NOTE(review): assumes 2-D batches of shape (batch, 10) — confirm.
        loss = torch.nn.functional.mse_loss(output, batch[:, :1])
        loss.backward()
        optimizer.step()

    # Empty CUDA cache to free memory
    torch.cuda.empty_cache()


In [20]:
import scipy.io as si
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torchvision.ops import DeformConv2d
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F
import os
import pywt

# Ask cuDNN to auto-tune its kernels; this pays off when input shapes stay fixed.
torch.backends.cudnn.benchmark = True

# Prefer the GPU whenever CUDA is usable, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

fs = 100  # NOTE(review): presumably a sampling rate; unused in this cell — confirm

# Read the MATLAB file and keep the variable stored under key 'a'.
mat_Res = si.loadmat('u_lma.mat')
mat_results = mat_Res['a']

# Custom Dataset (if needed)
class CustomDataset(Dataset):
    """Map-style dataset serving entries of an indexable container as float32 tensors."""

    def __init__(self, data):
        # Only a reference is kept; conversion to a tensor happens per item.
        self.data = data

    def __len__(self):
        # Sample count is whatever len() reports for the wrapped container.
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        return torch.tensor(sample, dtype=torch.float32)

# Wrap the loaded array so it can be indexed sample by sample.
dataset = CustomDataset(mat_results)

# Shuffled mini-batches of 32, produced by 4 worker processes with pinned memory requested.
train_loader = DataLoader(
    dataset=dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
    pin_memory=True,
)

# Sample Model (Ensure it runs on multiple GPUs if available)
class SimpleModel(nn.Module):
    """Placeholder network: a single affine layer mapping 10 features to 1 output."""

    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(in_features=10, out_features=1)  # Example layer

    def forward(self, x):
        # The linear layer acts on the last dimension of x.
        prediction = self.fc(x)
        return prediction

# Instantiate the network and move its parameters to the selected device.
model = SimpleModel().to(device)

# Enable Multi-GPU Training: wrap the model so batches are split across GPUs.
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)

# Optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training Loop (with memory management)
for epoch in range(100):
    for batch in train_loader:
        # non_blocking lets the copy overlap with compute when the loader
        # yields pinned host memory; otherwise it is an ordinary copy.
        batch = batch.to(device, non_blocking=True)
        optimizer.zero_grad()
        output = model(batch)
        # Example loss: regress the first input feature. The target is sliced
        # with `:1` (not `0`) so it keeps the trailing dimension of `output`;
        # a squeezed target would be broadcast against `output` inside
        # mse_loss and the loss would average over all prediction/target pairs.
        # NOTE(review): assumes 2-D batches of shape (batch, 10) — confirm.
        loss = torch.nn.functional.mse_loss(output, batch[:, :1])
        loss.backward()
        optimizer.step()

    # Empty CUDA cache to free memory
    torch.cuda.empty_cache()


In [21]:
import scipy.io as si
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torchvision.ops import DeformConv2d
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F
import os
import pywt

# Ask cuDNN to auto-tune its kernels; this pays off when input shapes stay fixed.
torch.backends.cudnn.benchmark = True

# Prefer the GPU whenever CUDA is usable, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

fs = 100  # NOTE(review): presumably a sampling rate; unused in this cell — confirm

# Read the MATLAB file and keep the variable stored under key 'a'.
mat_Res = si.loadmat('u_lma.mat')
mat_results = mat_Res['a']

# Custom Dataset (if needed)
class CustomDataset(Dataset):
    """Map-style dataset serving entries of an indexable container as float32 tensors."""

    def __init__(self, data):
        # Only a reference is kept; conversion to a tensor happens per item.
        self.data = data

    def __len__(self):
        # Sample count is whatever len() reports for the wrapped container.
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        return torch.tensor(sample, dtype=torch.float32)

# Wrap the loaded array so it can be indexed sample by sample.
dataset = CustomDataset(mat_results)

# Shuffled mini-batches of 32, produced by 4 worker processes with pinned memory requested.
train_loader = DataLoader(
    dataset=dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
    pin_memory=True,
)

# Sample Model (Ensure it runs on multiple GPUs if available)
class SimpleModel(nn.Module):
    """Placeholder network: a single affine layer mapping 10 features to 1 output."""

    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(in_features=10, out_features=1)  # Example layer

    def forward(self, x):
        # The linear layer acts on the last dimension of x.
        prediction = self.fc(x)
        return prediction

# Instantiate the network and move its parameters to the selected device.
model = SimpleModel().to(device)

# Enable Multi-GPU Training: wrap the model so batches are split across GPUs.
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)

# Optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training Loop (with memory management)
for epoch in range(100):
    for batch in train_loader:
        # non_blocking lets the copy overlap with compute when the loader
        # yields pinned host memory; otherwise it is an ordinary copy.
        batch = batch.to(device, non_blocking=True)
        optimizer.zero_grad()
        output = model(batch)
        # Example loss: regress the first input feature. The target is sliced
        # with `:1` (not `0`) so it keeps the trailing dimension of `output`;
        # a squeezed target would be broadcast against `output` inside
        # mse_loss and the loss would average over all prediction/target pairs.
        # NOTE(review): assumes 2-D batches of shape (batch, 10) — confirm.
        loss = torch.nn.functional.mse_loss(output, batch[:, :1])
        loss.backward()
        optimizer.step()

    # Empty CUDA cache to free memory
    torch.cuda.empty_cache()


In [None]:
import scipy.io as si
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torchvision.ops import DeformConv2d
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F
import os
import pywt

# Ask cuDNN to auto-tune its kernels; this pays off when input shapes stay fixed.
torch.backends.cudnn.benchmark = True

# Prefer the GPU whenever CUDA is usable, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

fs = 100  # NOTE(review): presumably a sampling rate; unused in this cell — confirm

# Read the MATLAB file and keep the variable stored under key 'a'.
mat_Res = si.loadmat('u_lma.mat')
mat_results = mat_Res['a']

# Custom Dataset (if needed)
class CustomDataset(Dataset):
    """Map-style dataset serving entries of an indexable container as float32 tensors."""

    def __init__(self, data):
        # Only a reference is kept; conversion to a tensor happens per item.
        self.data = data

    def __len__(self):
        # Sample count is whatever len() reports for the wrapped container.
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        return torch.tensor(sample, dtype=torch.float32)

# Wrap the loaded array so it can be indexed sample by sample.
dataset = CustomDataset(mat_results)

# Shuffled mini-batches of 32, produced by 4 worker processes with pinned memory requested.
train_loader = DataLoader(
    dataset=dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
    pin_memory=True,
)

# Sample Model (Ensure it runs on multiple GPUs if available)
class SimpleModel(nn.Module):
    """Placeholder network: a single affine layer mapping 10 features to 1 output."""

    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(in_features=10, out_features=1)  # Example layer

    def forward(self, x):
        # The linear layer acts on the last dimension of x.
        prediction = self.fc(x)
        return prediction

# Instantiate the network and move its parameters to the selected device.
model = SimpleModel().to(device)

# Enable Multi-GPU Training: wrap the model so batches are split across GPUs.
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)

# Optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training Loop (with memory management)
for epoch in range(100):
    for batch in train_loader:
        # non_blocking lets the copy overlap with compute when the loader
        # yields pinned host memory; otherwise it is an ordinary copy.
        batch = batch.to(device, non_blocking=True)
        optimizer.zero_grad()
        output = model(batch)
        # Example loss: regress the first input feature. The target is sliced
        # with `:1` (not `0`) so it keeps the trailing dimension of `output`;
        # a squeezed target would be broadcast against `output` inside
        # mse_loss and the loss would average over all prediction/target pairs.
        # NOTE(review): assumes 2-D batches of shape (batch, 10) — confirm.
        loss = torch.nn.functional.mse_loss(output, batch[:, :1])
        loss.backward()
        optimizer.step()

    # Empty CUDA cache to free memory
    torch.cuda.empty_cache()


In [22]:
import scipy.io as si
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torchvision.ops import DeformConv2d
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F
import os
import pywt

# Ask cuDNN to auto-tune its kernels; this pays off when input shapes stay fixed.
torch.backends.cudnn.benchmark = True

# Prefer the GPU whenever CUDA is usable, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

fs = 100  # NOTE(review): presumably a sampling rate; unused in this cell — confirm

# Read the MATLAB file and keep the variable stored under key 'a'.
mat_Res = si.loadmat('u_lma.mat')
mat_results = mat_Res['a']

# Custom Dataset (if needed)
class CustomDataset(Dataset):
    """Map-style dataset serving entries of an indexable container as float32 tensors."""

    def __init__(self, data):
        # Only a reference is kept; conversion to a tensor happens per item.
        self.data = data

    def __len__(self):
        # Sample count is whatever len() reports for the wrapped container.
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        return torch.tensor(sample, dtype=torch.float32)

# Wrap the loaded array so it can be indexed sample by sample.
dataset = CustomDataset(mat_results)

# Shuffled mini-batches of 32, produced by 4 worker processes with pinned memory requested.
train_loader = DataLoader(
    dataset=dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
    pin_memory=True,
)

# Sample Model (Ensure it runs on multiple GPUs if available)
class SimpleModel(nn.Module):
    """Placeholder network: a single affine layer mapping 10 features to 1 output."""

    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(in_features=10, out_features=1)  # Example layer

    def forward(self, x):
        # The linear layer acts on the last dimension of x.
        prediction = self.fc(x)
        return prediction

# Instantiate the network and move its parameters to the selected device.
model = SimpleModel().to(device)

# Enable Multi-GPU Training: wrap the model so batches are split across GPUs.
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)

# Optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training Loop (with memory management)
for epoch in range(100):
    for batch in train_loader:
        # non_blocking lets the copy overlap with compute when the loader
        # yields pinned host memory; otherwise it is an ordinary copy.
        batch = batch.to(device, non_blocking=True)
        optimizer.zero_grad()
        output = model(batch)
        # Example loss: regress the first input feature. The target is sliced
        # with `:1` (not `0`) so it keeps the trailing dimension of `output`;
        # a squeezed target would be broadcast against `output` inside
        # mse_loss and the loss would average over all prediction/target pairs.
        # NOTE(review): assumes 2-D batches of shape (batch, 10) — confirm.
        loss = torch.nn.functional.mse_loss(output, batch[:, :1])
        loss.backward()
        optimizer.step()

    # Empty CUDA cache to free memory
    torch.cuda.empty_cache()


In [23]:
import scipy.io as si
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torchvision.ops import DeformConv2d
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F
import os
import pywt

# Ask cuDNN to auto-tune its kernels; this pays off when input shapes stay fixed.
torch.backends.cudnn.benchmark = True

# Prefer the GPU whenever CUDA is usable, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

fs = 100  # NOTE(review): presumably a sampling rate; unused in this cell — confirm

# Read the MATLAB file and keep the variable stored under key 'a'.
mat_Res = si.loadmat('u_lma.mat')
mat_results = mat_Res['a']

# Custom Dataset (if needed)
class CustomDataset(Dataset):
    """Map-style dataset serving entries of an indexable container as float32 tensors."""

    def __init__(self, data):
        # Only a reference is kept; conversion to a tensor happens per item.
        self.data = data

    def __len__(self):
        # Sample count is whatever len() reports for the wrapped container.
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        return torch.tensor(sample, dtype=torch.float32)

# Wrap the loaded array so it can be indexed sample by sample.
dataset = CustomDataset(mat_results)

# Shuffled mini-batches of 32, produced by 4 worker processes with pinned memory requested.
train_loader = DataLoader(
    dataset=dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
    pin_memory=True,
)

# Sample Model (Ensure it runs on multiple GPUs if available)
class SimpleModel(nn.Module):
    """Placeholder network: a single affine layer mapping 10 features to 1 output."""

    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(in_features=10, out_features=1)  # Example layer

    def forward(self, x):
        # The linear layer acts on the last dimension of x.
        prediction = self.fc(x)
        return prediction

# Instantiate the network and move its parameters to the selected device.
model = SimpleModel().to(device)

# Enable Multi-GPU Training: wrap the model so batches are split across GPUs.
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)

# Optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training Loop (with memory management)
for epoch in range(100):
    for batch in train_loader:
        # non_blocking lets the copy overlap with compute when the loader
        # yields pinned host memory; otherwise it is an ordinary copy.
        batch = batch.to(device, non_blocking=True)
        optimizer.zero_grad()
        output = model(batch)
        # Example loss: regress the first input feature. The target is sliced
        # with `:1` (not `0`) so it keeps the trailing dimension of `output`;
        # a squeezed target would be broadcast against `output` inside
        # mse_loss and the loss would average over all prediction/target pairs.
        # NOTE(review): assumes 2-D batches of shape (batch, 10) — confirm.
        loss = torch.nn.functional.mse_loss(output, batch[:, :1])
        loss.backward()
        optimizer.step()

    # Empty CUDA cache to free memory
    torch.cuda.empty_cache()


In [None]:
import scipy.io as si
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torchvision.ops import DeformConv2d
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F
import os
import pywt

# Ask cuDNN to auto-tune its kernels; this pays off when input shapes stay fixed.
torch.backends.cudnn.benchmark = True

# Prefer the GPU whenever CUDA is usable, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

fs = 100  # NOTE(review): presumably a sampling rate; unused in this cell — confirm

# Read the MATLAB file and keep the variable stored under key 'a'.
mat_Res = si.loadmat('u_lma.mat')
mat_results = mat_Res['a']

# Custom Dataset (if needed)
class CustomDataset(Dataset):
    """Map-style dataset serving entries of an indexable container as float32 tensors."""

    def __init__(self, data):
        # Only a reference is kept; conversion to a tensor happens per item.
        self.data = data

    def __len__(self):
        # Sample count is whatever len() reports for the wrapped container.
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        return torch.tensor(sample, dtype=torch.float32)

# Wrap the loaded array so it can be indexed sample by sample.
dataset = CustomDataset(mat_results)

# Shuffled mini-batches of 32, produced by 4 worker processes with pinned memory requested.
train_loader = DataLoader(
    dataset=dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
    pin_memory=True,
)

# Sample Model (Ensure it runs on multiple GPUs if available)
class SimpleModel(nn.Module):
    """Placeholder network: a single affine layer mapping 10 features to 1 output."""

    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(in_features=10, out_features=1)  # Example layer

    def forward(self, x):
        # The linear layer acts on the last dimension of x.
        prediction = self.fc(x)
        return prediction

# Instantiate the network and move its parameters to the selected device.
model = SimpleModel().to(device)

# Enable Multi-GPU Training: wrap the model so batches are split across GPUs.
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)

# Optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training Loop (with memory management)
for epoch in range(100):
    for batch in train_loader:
        # non_blocking lets the copy overlap with compute when the loader
        # yields pinned host memory; otherwise it is an ordinary copy.
        batch = batch.to(device, non_blocking=True)
        optimizer.zero_grad()
        output = model(batch)
        # Example loss: regress the first input feature. The target is sliced
        # with `:1` (not `0`) so it keeps the trailing dimension of `output`;
        # a squeezed target would be broadcast against `output` inside
        # mse_loss and the loss would average over all prediction/target pairs.
        # NOTE(review): assumes 2-D batches of shape (batch, 10) — confirm.
        loss = torch.nn.functional.mse_loss(output, batch[:, :1])
        loss.backward()
        optimizer.step()

    # Empty CUDA cache to free memory
    torch.cuda.empty_cache()




In [None]:
import scipy.io as si
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torchvision.ops import DeformConv2d
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F
import os
import pywt

# Ask cuDNN to auto-tune its kernels; this pays off when input shapes stay fixed.
torch.backends.cudnn.benchmark = True

# Prefer the GPU whenever CUDA is usable, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

fs = 100  # NOTE(review): presumably a sampling rate; unused in this cell — confirm

# Read the MATLAB file and keep the variable stored under key 'a'.
mat_Res = si.loadmat('u_lma.mat')
mat_results = mat_Res['a']

# Custom Dataset (if needed)
class CustomDataset(Dataset):
    """Map-style dataset serving entries of an indexable container as float32 tensors."""

    def __init__(self, data):
        # Only a reference is kept; conversion to a tensor happens per item.
        self.data = data

    def __len__(self):
        # Sample count is whatever len() reports for the wrapped container.
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        return torch.tensor(sample, dtype=torch.float32)

# Wrap the loaded array so it can be indexed sample by sample.
dataset = CustomDataset(mat_results)

# Shuffled mini-batches of 32, produced by 4 worker processes with pinned memory requested.
train_loader = DataLoader(
    dataset=dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
    pin_memory=True,
)

# Sample Model (Ensure it runs on multiple GPUs if available)
class SimpleModel(nn.Module):
    """Placeholder network: a single affine layer mapping 10 features to 1 output."""

    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(in_features=10, out_features=1)  # Example layer

    def forward(self, x):
        # The linear layer acts on the last dimension of x.
        prediction = self.fc(x)
        return prediction

# Instantiate the network and move its parameters to the selected device.
model = SimpleModel().to(device)

# Enable Multi-GPU Training: wrap the model so batches are split across GPUs.
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)

# Optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training Loop (with memory management)
for epoch in range(100):
    for batch in train_loader:
        # non_blocking lets the copy overlap with compute when the loader
        # yields pinned host memory; otherwise it is an ordinary copy.
        batch = batch.to(device, non_blocking=True)
        optimizer.zero_grad()
        output = model(batch)
        # Example loss: regress the first input feature. The target is sliced
        # with `:1` (not `0`) so it keeps the trailing dimension of `output`;
        # a squeezed target would be broadcast against `output` inside
        # mse_loss and the loss would average over all prediction/target pairs.
        # NOTE(review): assumes 2-D batches of shape (batch, 10) — confirm.
        loss = torch.nn.functional.mse_loss(output, batch[:, :1])
        loss.backward()
        optimizer.step()

    # Empty CUDA cache to free memory
    torch.cuda.empty_cache()


In [None]:
import scipy.io as si
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torchvision.ops import DeformConv2d
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F
import os
import pywt

# Ask cuDNN to auto-tune its kernels; this pays off when input shapes stay fixed.
torch.backends.cudnn.benchmark = True

# Prefer the GPU whenever CUDA is usable, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

fs = 100  # NOTE(review): presumably a sampling rate; unused in this cell — confirm

# Read the MATLAB file and keep the variable stored under key 'a'.
mat_Res = si.loadmat('u_lma.mat')
mat_results = mat_Res['a']

# Custom Dataset (if needed)
class CustomDataset(Dataset):
    """Map-style dataset serving entries of an indexable container as float32 tensors."""

    def __init__(self, data):
        # Only a reference is kept; conversion to a tensor happens per item.
        self.data = data

    def __len__(self):
        # Sample count is whatever len() reports for the wrapped container.
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        return torch.tensor(sample, dtype=torch.float32)

# Wrap the loaded array so it can be indexed sample by sample.
dataset = CustomDataset(mat_results)

# Shuffled mini-batches of 32, produced by 4 worker processes with pinned memory requested.
train_loader = DataLoader(
    dataset=dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
    pin_memory=True,
)

# Sample Model (Ensure it runs on multiple GPUs if available)
class SimpleModel(nn.Module):
    """Placeholder network: a single affine layer mapping 10 features to 1 output."""

    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(in_features=10, out_features=1)  # Example layer

    def forward(self, x):
        # The linear layer acts on the last dimension of x.
        prediction = self.fc(x)
        return prediction

# Instantiate the network and move its parameters to the selected device.
model = SimpleModel().to(device)

# Enable Multi-GPU Training: wrap the model so batches are split across GPUs.
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)

# Optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training Loop (with memory management)
for epoch in range(100):
    for batch in train_loader:
        # non_blocking lets the copy overlap with compute when the loader
        # yields pinned host memory; otherwise it is an ordinary copy.
        batch = batch.to(device, non_blocking=True)
        optimizer.zero_grad()
        output = model(batch)
        # Example loss: regress the first input feature. The target is sliced
        # with `:1` (not `0`) so it keeps the trailing dimension of `output`;
        # a squeezed target would be broadcast against `output` inside
        # mse_loss and the loss would average over all prediction/target pairs.
        # NOTE(review): assumes 2-D batches of shape (batch, 10) — confirm.
        loss = torch.nn.functional.mse_loss(output, batch[:, :1])
        loss.backward()
        optimizer.step()

    # Empty CUDA cache to free memory
    torch.cuda.empty_cache()


In [None]:
import scipy.io as si
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torchvision.ops import DeformConv2d
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F
import os
import pywt

# Ask cuDNN to auto-tune its kernels; this pays off when input shapes stay fixed.
torch.backends.cudnn.benchmark = True

# Prefer the GPU whenever CUDA is usable, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

fs = 100  # NOTE(review): presumably a sampling rate; unused in this cell — confirm

# Read the MATLAB file and keep the variable stored under key 'a'.
mat_Res = si.loadmat('u_lma.mat')
mat_results = mat_Res['a']

# Custom Dataset (if needed)
class CustomDataset(Dataset):
    """Map-style dataset serving entries of an indexable container as float32 tensors."""

    def __init__(self, data):
        # Only a reference is kept; conversion to a tensor happens per item.
        self.data = data

    def __len__(self):
        # Sample count is whatever len() reports for the wrapped container.
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        return torch.tensor(sample, dtype=torch.float32)

# Wrap the loaded array so it can be indexed sample by sample.
dataset = CustomDataset(mat_results)

# Shuffled mini-batches of 32, produced by 4 worker processes with pinned memory requested.
train_loader = DataLoader(
    dataset=dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
    pin_memory=True,
)

# Sample Model (Ensure it runs on multiple GPUs if available)
class SimpleModel(nn.Module):
    """Placeholder network: a single affine layer mapping 10 features to 1 output."""

    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(in_features=10, out_features=1)  # Example layer

    def forward(self, x):
        # The linear layer acts on the last dimension of x.
        prediction = self.fc(x)
        return prediction

# Instantiate the network and move its parameters to the selected device.
model = SimpleModel().to(device)

# Enable Multi-GPU Training: wrap the model so batches are split across GPUs.
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)

# Optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training Loop (with memory management)
for epoch in range(100):
    for batch in train_loader:
        # non_blocking lets the copy overlap with compute when the loader
        # yields pinned host memory; otherwise it is an ordinary copy.
        batch = batch.to(device, non_blocking=True)
        optimizer.zero_grad()
        output = model(batch)
        # Example loss: regress the first input feature. The target is sliced
        # with `:1` (not `0`) so it keeps the trailing dimension of `output`;
        # a squeezed target would be broadcast against `output` inside
        # mse_loss and the loss would average over all prediction/target pairs.
        # NOTE(review): assumes 2-D batches of shape (batch, 10) — confirm.
        loss = torch.nn.functional.mse_loss(output, batch[:, :1])
        loss.backward()
        optimizer.step()

    # Empty CUDA cache to free memory
    torch.cuda.empty_cache()


In [None]:
import scipy.io as si
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torchvision.ops import DeformConv2d
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F
import os
import pywt

# Ask cuDNN to auto-tune its kernels; this pays off when input shapes stay fixed.
torch.backends.cudnn.benchmark = True

# Prefer the GPU whenever CUDA is usable, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

fs = 100  # NOTE(review): presumably a sampling rate; unused in this cell — confirm

# Read the MATLAB file and keep the variable stored under key 'a'.
mat_Res = si.loadmat('u_lma.mat')
mat_results = mat_Res['a']

# Custom Dataset (if needed)
class CustomDataset(Dataset):
    """Map-style dataset serving entries of an indexable container as float32 tensors."""

    def __init__(self, data):
        # Only a reference is kept; conversion to a tensor happens per item.
        self.data = data

    def __len__(self):
        # Sample count is whatever len() reports for the wrapped container.
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        return torch.tensor(sample, dtype=torch.float32)

# Wrap the loaded array so it can be indexed sample by sample.
dataset = CustomDataset(mat_results)

# Shuffled mini-batches of 32, produced by 4 worker processes with pinned memory requested.
train_loader = DataLoader(
    dataset=dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
    pin_memory=True,
)

# Sample Model (Ensure it runs on multiple GPUs if available)
class SimpleModel(nn.Module):
    """Placeholder network: a single affine layer mapping 10 features to 1 output."""

    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(in_features=10, out_features=1)  # Example layer

    def forward(self, x):
        # The linear layer acts on the last dimension of x.
        prediction = self.fc(x)
        return prediction

# Instantiate the network and move its parameters to the selected device.
model = SimpleModel().to(device)

# Enable Multi-GPU Training: wrap the model so batches are split across GPUs.
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)

# Optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training Loop (with memory management)
for epoch in range(100):
    for batch in train_loader:
        # non_blocking lets the copy overlap with compute when the loader
        # yields pinned host memory; otherwise it is an ordinary copy.
        batch = batch.to(device, non_blocking=True)
        optimizer.zero_grad()
        output = model(batch)
        # Example loss: regress the first input feature. The target is sliced
        # with `:1` (not `0`) so it keeps the trailing dimension of `output`;
        # a squeezed target would be broadcast against `output` inside
        # mse_loss and the loss would average over all prediction/target pairs.
        # NOTE(review): assumes 2-D batches of shape (batch, 10) — confirm.
        loss = torch.nn.functional.mse_loss(output, batch[:, :1])
        loss.backward()
        optimizer.step()

    # Empty CUDA cache to free memory
    torch.cuda.empty_cache()


In [None]:
import scipy.io as si
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torchvision.ops import DeformConv2d
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F
import os
import pywt

# Ask cuDNN to auto-tune its kernels; this pays off when input shapes stay fixed.
torch.backends.cudnn.benchmark = True

# Prefer the GPU whenever CUDA is usable, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

fs = 100  # NOTE(review): presumably a sampling rate; unused in this cell — confirm

# Read the MATLAB file and keep the variable stored under key 'a'.
mat_Res = si.loadmat('u_lma.mat')
mat_results = mat_Res['a']

# Custom Dataset (if needed)
class CustomDataset(Dataset):
    """Map-style dataset serving entries of an indexable container as float32 tensors."""

    def __init__(self, data):
        # Only a reference is kept; conversion to a tensor happens per item.
        self.data = data

    def __len__(self):
        # Sample count is whatever len() reports for the wrapped container.
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        return torch.tensor(sample, dtype=torch.float32)

# Wrap the loaded array so it can be indexed sample by sample.
dataset = CustomDataset(mat_results)

# Shuffled mini-batches of 32, produced by 4 worker processes with pinned memory requested.
train_loader = DataLoader(
    dataset=dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
    pin_memory=True,
)

# Sample Model (Ensure it runs on multiple GPUs if available)
class SimpleModel(nn.Module):
    """Placeholder network: a single affine layer mapping 10 features to 1 output."""

    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(in_features=10, out_features=1)  # Example layer

    def forward(self, x):
        # The linear layer acts on the last dimension of x.
        prediction = self.fc(x)
        return prediction

# Instantiate the network and move its parameters to the selected device.
model = SimpleModel().to(device)

# Enable Multi-GPU Training: wrap the model so batches are split across GPUs.
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)

# Optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training Loop (with memory management)
for epoch in range(100):
    for batch in train_loader:
        # non_blocking lets the copy overlap with compute when the loader
        # yields pinned host memory; otherwise it is an ordinary copy.
        batch = batch.to(device, non_blocking=True)
        optimizer.zero_grad()
        output = model(batch)
        # Example loss: regress the first input feature. The target is sliced
        # with `:1` (not `0`) so it keeps the trailing dimension of `output`;
        # a squeezed target would be broadcast against `output` inside
        # mse_loss and the loss would average over all prediction/target pairs.
        # NOTE(review): assumes 2-D batches of shape (batch, 10) — confirm.
        loss = torch.nn.functional.mse_loss(output, batch[:, :1])
        loss.backward()
        optimizer.step()

    # Empty CUDA cache to free memory
    torch.cuda.empty_cache()


In [None]:
import scipy.io as si
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torchvision.ops import DeformConv2d
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F
import os
import pywt

# Ask cuDNN to auto-tune its kernels; this pays off when input shapes stay fixed.
torch.backends.cudnn.benchmark = True

# Prefer the GPU whenever CUDA is usable, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

fs = 100  # NOTE(review): presumably a sampling rate; unused in this cell — confirm

# Read the MATLAB file and keep the variable stored under key 'a'.
mat_Res = si.loadmat('u_lma.mat')
mat_results = mat_Res['a']

# Custom Dataset (if needed)
class CustomDataset(Dataset):
    """Map-style dataset serving entries of an indexable container as float32 tensors."""

    def __init__(self, data):
        # Only a reference is kept; conversion to a tensor happens per item.
        self.data = data

    def __len__(self):
        # Sample count is whatever len() reports for the wrapped container.
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        return torch.tensor(sample, dtype=torch.float32)

# Wrap the loaded array so it can be indexed sample by sample.
dataset = CustomDataset(mat_results)

# Shuffled mini-batches of 32, produced by 4 worker processes with pinned memory requested.
train_loader = DataLoader(
    dataset=dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
    pin_memory=True,
)

# Sample Model (Ensure it runs on multiple GPUs if available)
class SimpleModel(nn.Module):
    """Placeholder network: a single affine layer mapping 10 features to 1 output."""

    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(in_features=10, out_features=1)  # Example layer

    def forward(self, x):
        # The linear layer acts on the last dimension of x.
        prediction = self.fc(x)
        return prediction

# Instantiate the network and move its parameters to the selected device.
model = SimpleModel().to(device)

# Enable Multi-GPU Training: wrap the model so batches are split across GPUs.
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)

# Optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training Loop (with memory management)
for epoch in range(100):
    for batch in train_loader:
        # non_blocking lets the copy overlap with compute when the loader
        # yields pinned host memory; otherwise it is an ordinary copy.
        batch = batch.to(device, non_blocking=True)
        optimizer.zero_grad()
        output = model(batch)
        # Example loss: regress the first input feature. The target is sliced
        # with `:1` (not `0`) so it keeps the trailing dimension of `output`;
        # a squeezed target would be broadcast against `output` inside
        # mse_loss and the loss would average over all prediction/target pairs.
        # NOTE(review): assumes 2-D batches of shape (batch, 10) — confirm.
        loss = torch.nn.functional.mse_loss(output, batch[:, :1])
        loss.backward()
        optimizer.step()

    # Empty CUDA cache to free memory
    torch.cuda.empty_cache()


In [None]:
import scipy.io as si
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torchvision.ops import DeformConv2d
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F
import os
import pywt

# Turn on cuDNN benchmarking; it pays off when input sizes stay fixed.
torch.backends.cudnn.benchmark = True

# Run on the GPU when CUDA is usable, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Not read anywhere in this cell; presumably a sampling rate -- TODO confirm.
fs = 100

# Read the MATLAB file from the working directory and pull out variable 'a'.
mat_Res = si.loadmat('u_lma.mat')
mat_results = mat_Res['a']

# Custom Dataset (if needed)
class CustomDataset(Dataset):
    """Map-style dataset over an indexable container of samples.

    Every lookup converts the selected entry into a new ``float32`` tensor.
    """

    def __init__(self, data):
        # Stored as given; conversion to a tensor happens per item.
        self.data = data

    def __len__(self):
        """Return the number of samples, i.e. the length of the wrapped data."""
        sample_count = len(self.data)
        return sample_count

    def __getitem__(self, idx):
        """Return entry ``idx`` of the wrapped data as a ``float32`` tensor."""
        sample = self.data[idx]
        return torch.tensor(sample, dtype=torch.float32)

dataset = CustomDataset(mat_results)

# Batches of 32, reshuffled every epoch and loaded by 4 worker processes.
# Pinned (page-locked) host memory only speeds up host-to-GPU copies, so it is
# requested only when CUDA is available; on a CPU-only machine
# pin_memory=True does nothing useful and makes the DataLoader emit a warning.
train_loader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
    pin_memory=torch.cuda.is_available(),
)

# Sample Model (Ensure it runs on multiple GPUs if available)
class SimpleModel(nn.Module):
    """Minimal network: one fully connected layer mapping 10 features to 1 output."""

    def __init__(self):
        super().__init__()
        # Example layer
        self.fc = nn.Linear(in_features=10, out_features=1)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        prediction = self.fc(x)
        return prediction

model = SimpleModel().to(device)

# Wrap the model in DataParallel when more than one GPU is visible.
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)

# Adam over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop: 100 passes over train_loader.
for epoch in range(100):
    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        output = model(batch)
        # Example loss: regress the model output onto the first input column.
        # The target is sliced as [:, :1] so it keeps a trailing axis and
        # matches the model output element for element. With batch[:, 0] the
        # target is (N,) against an (N, 1) output; mse_loss then broadcasts
        # both to (N, N), averages over every prediction/target pair and warns
        # about the size mismatch.
        # Assumes each batch is 2-D (N, 10), as the Linear(10, 1) layer
        # suggests -- TODO confirm against the contents of the .mat file.
        target = batch[:, :1]
        loss = torch.nn.functional.mse_loss(output, target)
        loss.backward()
        optimizer.step()

    # Release cached CUDA memory after every epoch.
    torch.cuda.empty_cache()


In [None]:
import scipy.io as si
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torchvision.ops import DeformConv2d
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F
import os
import pywt

# Turn on cuDNN benchmarking; it pays off when input sizes stay fixed.
torch.backends.cudnn.benchmark = True

# Run on the GPU when CUDA is usable, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Not read anywhere in this cell; presumably a sampling rate -- TODO confirm.
fs = 100

# Read the MATLAB file from the working directory and pull out variable 'a'.
mat_Res = si.loadmat('u_lma.mat')
mat_results = mat_Res['a']

# Custom Dataset (if needed)
class CustomDataset(Dataset):
    """Map-style dataset over an indexable container of samples.

    Every lookup converts the selected entry into a new ``float32`` tensor.
    """

    def __init__(self, data):
        # Stored as given; conversion to a tensor happens per item.
        self.data = data

    def __len__(self):
        """Return the number of samples, i.e. the length of the wrapped data."""
        sample_count = len(self.data)
        return sample_count

    def __getitem__(self, idx):
        """Return entry ``idx`` of the wrapped data as a ``float32`` tensor."""
        sample = self.data[idx]
        return torch.tensor(sample, dtype=torch.float32)

dataset = CustomDataset(mat_results)

# Batches of 32, reshuffled every epoch and loaded by 4 worker processes.
# Pinned (page-locked) host memory only speeds up host-to-GPU copies, so it is
# requested only when CUDA is available; on a CPU-only machine
# pin_memory=True does nothing useful and makes the DataLoader emit a warning.
train_loader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
    pin_memory=torch.cuda.is_available(),
)

# Sample Model (Ensure it runs on multiple GPUs if available)
class SimpleModel(nn.Module):
    """Minimal network: one fully connected layer mapping 10 features to 1 output."""

    def __init__(self):
        super().__init__()
        # Example layer
        self.fc = nn.Linear(in_features=10, out_features=1)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        prediction = self.fc(x)
        return prediction

model = SimpleModel().to(device)

# Wrap the model in DataParallel when more than one GPU is visible.
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)

# Adam over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop: 100 passes over train_loader.
for epoch in range(100):
    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        output = model(batch)
        # Example loss: regress the model output onto the first input column.
        # The target is sliced as [:, :1] so it keeps a trailing axis and
        # matches the model output element for element. With batch[:, 0] the
        # target is (N,) against an (N, 1) output; mse_loss then broadcasts
        # both to (N, N), averages over every prediction/target pair and warns
        # about the size mismatch.
        # Assumes each batch is 2-D (N, 10), as the Linear(10, 1) layer
        # suggests -- TODO confirm against the contents of the .mat file.
        target = batch[:, :1]
        loss = torch.nn.functional.mse_loss(output, target)
        loss.backward()
        optimizer.step()

    # Release cached CUDA memory after every epoch.
    torch.cuda.empty_cache()


In [None]:
import scipy.io as si
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torchvision.ops import DeformConv2d
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F
import os
import pywt

# Turn on cuDNN benchmarking; it pays off when input sizes stay fixed.
torch.backends.cudnn.benchmark = True

# Run on the GPU when CUDA is usable, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Not read anywhere in this cell; presumably a sampling rate -- TODO confirm.
fs = 100

# Read the MATLAB file from the working directory and pull out variable 'a'.
mat_Res = si.loadmat('u_lma.mat')
mat_results = mat_Res['a']

# Custom Dataset (if needed)
class CustomDataset(Dataset):
    """Map-style dataset over an indexable container of samples.

    Every lookup converts the selected entry into a new ``float32`` tensor.
    """

    def __init__(self, data):
        # Stored as given; conversion to a tensor happens per item.
        self.data = data

    def __len__(self):
        """Return the number of samples, i.e. the length of the wrapped data."""
        sample_count = len(self.data)
        return sample_count

    def __getitem__(self, idx):
        """Return entry ``idx`` of the wrapped data as a ``float32`` tensor."""
        sample = self.data[idx]
        return torch.tensor(sample, dtype=torch.float32)

dataset = CustomDataset(mat_results)

# Batches of 32, reshuffled every epoch and loaded by 4 worker processes.
# Pinned (page-locked) host memory only speeds up host-to-GPU copies, so it is
# requested only when CUDA is available; on a CPU-only machine
# pin_memory=True does nothing useful and makes the DataLoader emit a warning.
train_loader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
    pin_memory=torch.cuda.is_available(),
)

# Sample Model (Ensure it runs on multiple GPUs if available)
class SimpleModel(nn.Module):
    """Minimal network: one fully connected layer mapping 10 features to 1 output."""

    def __init__(self):
        super().__init__()
        # Example layer
        self.fc = nn.Linear(in_features=10, out_features=1)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        prediction = self.fc(x)
        return prediction

model = SimpleModel().to(device)

# Wrap the model in DataParallel when more than one GPU is visible.
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)

# Adam over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop: 100 passes over train_loader.
for epoch in range(100):
    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        output = model(batch)
        # Example loss: regress the model output onto the first input column.
        # The target is sliced as [:, :1] so it keeps a trailing axis and
        # matches the model output element for element. With batch[:, 0] the
        # target is (N,) against an (N, 1) output; mse_loss then broadcasts
        # both to (N, N), averages over every prediction/target pair and warns
        # about the size mismatch.
        # Assumes each batch is 2-D (N, 10), as the Linear(10, 1) layer
        # suggests -- TODO confirm against the contents of the .mat file.
        target = batch[:, :1]
        loss = torch.nn.functional.mse_loss(output, target)
        loss.backward()
        optimizer.step()

    # Release cached CUDA memory after every epoch.
    torch.cuda.empty_cache()


In [None]:
import scipy.io as si
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torchvision.ops import DeformConv2d
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F
import os
import pywt

# Turn on cuDNN benchmarking; it pays off when input sizes stay fixed.
torch.backends.cudnn.benchmark = True

# Run on the GPU when CUDA is usable, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Not read anywhere in this cell; presumably a sampling rate -- TODO confirm.
fs = 100

# Read the MATLAB file from the working directory and pull out variable 'a'.
mat_Res = si.loadmat('u_lma.mat')
mat_results = mat_Res['a']

# Custom Dataset (if needed)
class CustomDataset(Dataset):
    """Map-style dataset over an indexable container of samples.

    Every lookup converts the selected entry into a new ``float32`` tensor.
    """

    def __init__(self, data):
        # Stored as given; conversion to a tensor happens per item.
        self.data = data

    def __len__(self):
        """Return the number of samples, i.e. the length of the wrapped data."""
        sample_count = len(self.data)
        return sample_count

    def __getitem__(self, idx):
        """Return entry ``idx`` of the wrapped data as a ``float32`` tensor."""
        sample = self.data[idx]
        return torch.tensor(sample, dtype=torch.float32)

dataset = CustomDataset(mat_results)

# Batches of 32, reshuffled every epoch and loaded by 4 worker processes.
# Pinned (page-locked) host memory only speeds up host-to-GPU copies, so it is
# requested only when CUDA is available; on a CPU-only machine
# pin_memory=True does nothing useful and makes the DataLoader emit a warning.
train_loader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
    pin_memory=torch.cuda.is_available(),
)

# Sample Model (Ensure it runs on multiple GPUs if available)
class SimpleModel(nn.Module):
    """Minimal network: one fully connected layer mapping 10 features to 1 output."""

    def __init__(self):
        super().__init__()
        # Example layer
        self.fc = nn.Linear(in_features=10, out_features=1)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        prediction = self.fc(x)
        return prediction

model = SimpleModel().to(device)

# Wrap the model in DataParallel when more than one GPU is visible.
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)

# Adam over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop: 100 passes over train_loader.
for epoch in range(100):
    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        output = model(batch)
        # Example loss: regress the model output onto the first input column.
        # The target is sliced as [:, :1] so it keeps a trailing axis and
        # matches the model output element for element. With batch[:, 0] the
        # target is (N,) against an (N, 1) output; mse_loss then broadcasts
        # both to (N, N), averages over every prediction/target pair and warns
        # about the size mismatch.
        # Assumes each batch is 2-D (N, 10), as the Linear(10, 1) layer
        # suggests -- TODO confirm against the contents of the .mat file.
        target = batch[:, :1]
        loss = torch.nn.functional.mse_loss(output, target)
        loss.backward()
        optimizer.step()

    # Release cached CUDA memory after every epoch.
    torch.cuda.empty_cache()


In [None]:
import scipy.io as si
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torchvision.ops import DeformConv2d
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F
import os
import pywt

# Turn on cuDNN benchmarking; it pays off when input sizes stay fixed.
torch.backends.cudnn.benchmark = True

# Run on the GPU when CUDA is usable, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Not read anywhere in this cell; presumably a sampling rate -- TODO confirm.
fs = 100

# Read the MATLAB file from the working directory and pull out variable 'a'.
mat_Res = si.loadmat('u_lma.mat')
mat_results = mat_Res['a']

# Custom Dataset (if needed)
class CustomDataset(Dataset):
    """Map-style dataset over an indexable container of samples.

    Every lookup converts the selected entry into a new ``float32`` tensor.
    """

    def __init__(self, data):
        # Stored as given; conversion to a tensor happens per item.
        self.data = data

    def __len__(self):
        """Return the number of samples, i.e. the length of the wrapped data."""
        sample_count = len(self.data)
        return sample_count

    def __getitem__(self, idx):
        """Return entry ``idx`` of the wrapped data as a ``float32`` tensor."""
        sample = self.data[idx]
        return torch.tensor(sample, dtype=torch.float32)

dataset = CustomDataset(mat_results)

# Batches of 32, reshuffled every epoch and loaded by 4 worker processes.
# Pinned (page-locked) host memory only speeds up host-to-GPU copies, so it is
# requested only when CUDA is available; on a CPU-only machine
# pin_memory=True does nothing useful and makes the DataLoader emit a warning.
train_loader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
    pin_memory=torch.cuda.is_available(),
)

# Sample Model (Ensure it runs on multiple GPUs if available)
class SimpleModel(nn.Module):
    """Minimal network: one fully connected layer mapping 10 features to 1 output."""

    def __init__(self):
        super().__init__()
        # Example layer
        self.fc = nn.Linear(in_features=10, out_features=1)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        prediction = self.fc(x)
        return prediction

model = SimpleModel().to(device)

# Wrap the model in DataParallel when more than one GPU is visible.
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)

# Adam over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop: 100 passes over train_loader.
for epoch in range(100):
    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        output = model(batch)
        # Example loss: regress the model output onto the first input column.
        # The target is sliced as [:, :1] so it keeps a trailing axis and
        # matches the model output element for element. With batch[:, 0] the
        # target is (N,) against an (N, 1) output; mse_loss then broadcasts
        # both to (N, N), averages over every prediction/target pair and warns
        # about the size mismatch.
        # Assumes each batch is 2-D (N, 10), as the Linear(10, 1) layer
        # suggests -- TODO confirm against the contents of the .mat file.
        target = batch[:, :1]
        loss = torch.nn.functional.mse_loss(output, target)
        loss.backward()
        optimizer.step()

    # Release cached CUDA memory after every epoch.
    torch.cuda.empty_cache()
