In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Thu Apr 13 22:00:12 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0    23W / 300W |      0MiB / 16384MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
import torch

# Select the compute device in one step, then report which one was chosen.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    # Index 0 is the first visible CUDA device.
    print(f"Using {torch.cuda.get_device_name(0)} for PyTorch")
else:
    print("No GPU found, using CPU instead")


Using Tesla V100-SXM2-16GB for PyTorch


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms, models

# Train a randomly initialised ResNet-18 on CIFAR-10 for 50 epochs and report
# train/test loss and accuracy after every epoch.

# Set device to use
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define data transformations
# Training: random 224x224 crop + random horizontal flip for augmentation.
# NOTE(review): the normalisation constants look like the usual ImageNet
# channel statistics rather than CIFAR-10 specific ones — confirm this is intended.
transform_train = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
])

# Evaluation: deterministic resize + centre crop, same normalisation as training.
transform_test = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
])

# Load CIFAR10 dataset
trainset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform_train)
testset = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform_test)

# Define data loaders
trainloader = torch.utils.data.DataLoader(trainset, batch_size=64, shuffle=True, num_workers=4)
testloader = torch.utils.data.DataLoader(testset, batch_size=64, shuffle=False, num_workers=4)

# Define ResNet-18 model
# Size the classification head to the dataset instead of relying on the
# default 1000-way ImageNet head. Weights are randomly initialised (no
# pretrained weights are requested), which also avoids the deprecated
# `pretrained=` keyword.
num_classes = len(trainset.classes)
model = models.resnet18(num_classes=num_classes).to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=1e-4)

# Train the model
num_epochs = 50
for epoch in range(num_epochs):
    # Train the model
    model.train()
    train_loss = 0
    train_correct = 0
    for images, labels in trainloader:
        images = images.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # criterion returns the batch mean; weight by batch size so the epoch
        # average below is per-sample even when the last batch is smaller.
        train_loss += loss.item() * images.size(0)
        train_correct += (outputs.argmax(dim=1) == labels).sum().item()

    # Test the model
    model.eval()
    test_loss = 0
    test_correct = 0
    with torch.no_grad():
        for images, labels in testloader:
            images = images.to(device)
            labels = labels.to(device)

            outputs = model(images)
            loss = criterion(outputs, labels)

            test_loss += loss.item() * images.size(0)
            test_correct += (outputs.argmax(dim=1) == labels).sum().item()

    # Print results for this epoch
    train_loss /= len(trainset)
    train_acc = train_correct / len(trainset)
    test_loss /= len(testset)
    test_acc = test_correct / len(testset)
    print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}')

# Save the trained model
# torch.save(model.state_dict(), 'resnet18_cifar10.pth')


Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:05<00:00, 29485766.46it/s]


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified




Epoch [1/50], Train Loss: 2.0561, Train Acc: 0.2768, Test Loss: 1.5767, Test Acc: 0.4235
Epoch [2/50], Train Loss: 1.6875, Train Acc: 0.3845, Test Loss: 1.4516, Test Acc: 0.4684
Epoch [3/50], Train Loss: 1.4708, Train Acc: 0.4725, Test Loss: 1.1510, Test Acc: 0.5848
Epoch [4/50], Train Loss: 1.3050, Train Acc: 0.5351, Test Loss: 1.0044, Test Acc: 0.6425
Epoch [5/50], Train Loss: 1.1578, Train Acc: 0.5903, Test Loss: 0.8405, Test Acc: 0.7069
Epoch [6/50], Train Loss: 1.0585, Train Acc: 0.6287, Test Loss: 0.7945, Test Acc: 0.7215
Epoch [7/50], Train Loss: 0.9693, Train Acc: 0.6621, Test Loss: 0.7385, Test Acc: 0.7507
Epoch [8/50], Train Loss: 0.9215, Train Acc: 0.6785, Test Loss: 0.6633, Test Acc: 0.7707
Epoch [9/50], Train Loss: 0.8802, Train Acc: 0.6943, Test Loss: 0.6083, Test Acc: 0.7893
Epoch [10/50], Train Loss: 0.8430, Train Acc: 0.7035, Test Loss: 0.5190, Test Acc: 0.8202
Epoch [11/50], Train Loss: 0.8137, Train Acc: 0.7158, Test Loss: 0.5464, Test Acc: 0.8119
Epoch [12/50], Trai

In [None]:
# Ask PyTorch to release cached GPU memory before the next training cell runs.
# NOTE(review): tensors that are still referenced (e.g. the previous cell's
# `model` / `optimizer`) presumably stay allocated — confirm whether those
# should be deleted first.
torch.cuda.empty_cache()

Second run: 100 epochs with a larger batch size (128 for training, 100 for testing).

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms, models

# Train a randomly initialised ResNet-18 on CIFAR-10 for 100 epochs with a
# training batch size of 128, reporting train/test loss and accuracy per epoch.

# Set device to use
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define data transformations
# Training: random 224x224 crop + random horizontal flip for augmentation.
# NOTE(review): the normalisation constants look like the usual ImageNet
# channel statistics rather than CIFAR-10 specific ones — confirm this is intended.
transform_train = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
])

# Evaluation: deterministic resize + centre crop, same normalisation as training.
transform_test = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
])

# Load CIFAR10 dataset
trainset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform_train)
testset = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform_test)

# Define data loaders
trainloader = torch.utils.data.DataLoader(trainset, batch_size=128, shuffle=True, num_workers=4)
testloader = torch.utils.data.DataLoader(testset, batch_size=100, shuffle=False, num_workers=4)

# Define ResNet-18 model
# Size the classification head to the dataset instead of relying on the
# default 1000-way ImageNet head. Weights are randomly initialised (no
# pretrained weights are requested), which also avoids the deprecated
# `pretrained=` keyword.
num_classes = len(trainset.classes)
model = models.resnet18(num_classes=num_classes).to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=1e-4)

# Train the model
num_epochs = 100
for epoch in range(num_epochs):
    # Train the model
    model.train()
    train_loss = 0
    train_correct = 0
    for images, labels in trainloader:
        images = images.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # criterion returns the batch mean; weight by batch size so the epoch
        # average below is per-sample even when the last batch is smaller.
        train_loss += loss.item() * images.size(0)
        train_correct += (outputs.argmax(dim=1) == labels).sum().item()

    # Test the model
    model.eval()
    test_loss = 0
    test_correct = 0
    with torch.no_grad():
        for images, labels in testloader:
            images = images.to(device)
            labels = labels.to(device)

            outputs = model(images)
            loss = criterion(outputs, labels)

            test_loss += loss.item() * images.size(0)
            test_correct += (outputs.argmax(dim=1) == labels).sum().item()

    # Print results for this epoch
    train_loss /= len(trainset)
    train_acc = train_correct / len(trainset)
    test_loss /= len(testset)
    test_acc = test_correct / len(testset)
    print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}')

# Save the trained model
# torch.save(model.state_dict(), 'resnet18_cifar10.pth')


Files already downloaded and verified
Files already downloaded and verified
Epoch [1/100], Train Loss: 2.3427, Train Acc: 0.2294, Test Loss: 1.7524, Test Acc: 0.3252
Epoch [2/100], Train Loss: 1.7828, Train Acc: 0.3384, Test Loss: 1.5765, Test Acc: 0.4192
Epoch [3/100], Train Loss: 1.6583, Train Acc: 0.3947, Test Loss: 1.4240, Test Acc: 0.4798
Epoch [4/100], Train Loss: 1.5200, Train Acc: 0.4510, Test Loss: 1.2906, Test Acc: 0.5320
Epoch [5/100], Train Loss: 1.3762, Train Acc: 0.5071, Test Loss: 1.0479, Test Acc: 0.6318
Epoch [6/100], Train Loss: 1.2633, Train Acc: 0.5494, Test Loss: 1.0286, Test Acc: 0.6356
Epoch [7/100], Train Loss: 1.1657, Train Acc: 0.5850, Test Loss: 0.9248, Test Acc: 0.6784
Epoch [8/100], Train Loss: 1.0743, Train Acc: 0.6212, Test Loss: 0.7451, Test Acc: 0.7428
Epoch [9/100], Train Loss: 1.0018, Train Acc: 0.6464, Test Loss: 0.7659, Test Acc: 0.7312
Epoch [10/100], Train Loss: 0.9447, Train Acc: 0.6689, Test Loss: 0.7496, Test Acc: 0.7465
Epoch [11/100], Train L