<a href="https://colab.research.google.com/github/dante77999/colab/blob/main/Transfer_Learning_Review.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [26]:
import torch
from torch import nn
device = "cuda" if torch.cuda.is_available() else "cpu"

In [27]:
import os
import zipfile

from pathlib import Path

import requests

# Setup path to data folder
data_path = Path("data/")
image_path = data_path / "pizza_steak_sushi"

# If the image folder doesn't exist, download it and prepare it...
if image_path.is_dir():
    print(f"{image_path} directory exists.")
else:
    print(f"Did not find {image_path} directory, creating one...")
    image_path.mkdir(parents=True, exist_ok=True)

    # Download pizza, steak, sushi data
    with open(data_path / "pizza_steak_sushi.zip", "wb") as f:
        request = requests.get("https://github.com/mrdbourke/pytorch-deep-learning/raw/main/data/pizza_steak_sushi.zip")
        print("Downloading pizza, steak, sushi data...")
        f.write(request.content)

    # Unzip pizza, steak, sushi data
    with zipfile.ZipFile(data_path / "pizza_steak_sushi.zip", "r") as zip_ref:
        print("Unzipping pizza, steak, sushi data...")
        zip_ref.extractall(image_path)

    # Remove .zip file
    os.remove(data_path / "pizza_steak_sushi.zip")

data/pizza_steak_sushi directory exists.


In [28]:
train_dir = "data/pizza_steak_sushi/train"
test_dir = "data/pizza_steak_sushi/test"

In [29]:
import torchvision

vit = torchvision.models.vit_b_16(weights=torchvision.models.ViT_B_16_Weights.DEFAULT)

In [30]:
for params in vit.parameters():
    params.requires_grad = False

vit.heads = torch.nn.Sequential(
    nn.Dropout(0.3),
    torch.nn.Linear(in_features=768,out_features=3)
)

vit.to(device)

VisionTransformer(
  (conv_proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
  (encoder): Encoder(
    (dropout): Dropout(p=0.0, inplace=False)
    (layers): Sequential(
      (encoder_layer_0): EncoderBlock(
        (ln_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (self_attention): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (dropout): Dropout(p=0.0, inplace=False)
        (ln_2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (mlp): MLPBlock(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU(approximate='none')
          (2): Dropout(p=0.0, inplace=False)
          (3): Linear(in_features=3072, out_features=768, bias=True)
          (4): Dropout(p=0.0, inplace=False)
        )
      )
      (encoder_layer_1): EncoderBlock(
        (ln_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (self_a

In [31]:
try:
    import torchinfo
except:
    !pip install torchinfo

In [32]:
from torchinfo import summary

summary(model=vit,
        input_size=(32,3,224,224),col_names=["input_size", "output_size", "num_params", "trainable"],
        col_width=20,
        row_settings=["var_names"])


Layer (type (var_name))                                      Input Shape          Output Shape         Param #              Trainable
VisionTransformer (VisionTransformer)                        [32, 3, 224, 224]    [32, 3]              768                  Partial
├─Conv2d (conv_proj)                                         [32, 3, 224, 224]    [32, 768, 14, 14]    (590,592)            False
├─Encoder (encoder)                                          [32, 197, 768]       [32, 197, 768]       151,296              False
│    └─Dropout (dropout)                                     [32, 197, 768]       [32, 197, 768]       --                   --
│    └─Sequential (layers)                                   [32, 197, 768]       [32, 197, 768]       --                   False
│    │    └─EncoderBlock (encoder_layer_0)                   [32, 197, 768]       [32, 197, 768]       (7,087,872)          False
│    │    └─EncoderBlock (encoder_layer_1)                   [32, 197, 768]       [32, 

In [33]:
#prepare data

from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision import transforms

# vit_transform = torchvision.models.ViT_B_16_Weights.IMAGENET1K_V1.transforms()
train_transform = transforms.Compose([
    transforms.Resize((256, 256)),  # 先放大一點
    transforms.RandomCrop(224, padding=4),  # 隨機裁剪
    transforms.RandomHorizontalFlip(p=0.5),  # 50% 機率水平翻轉
    transforms.RandomRotation(degrees=15),  # 隨機旋轉 ±15 度
    transforms.ColorJitter(
        brightness=0.2,  # 亮度變化 ±20%
        contrast=0.2,    # 對比度變化 ±20%
        saturation=0.2,  # 飽和度變化 ±20%
        hue=0.1         # 色調變化 ±10%
    ),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                        std=[0.229, 0.224, 0.225])  # ImageNet 標準化
])
test_transform = transforms.Compose([
    transforms.Resize((224, 224)),  # 直接縮放到目標大小
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                        std=[0.229, 0.224, 0.225])
])

batch_size=16

train_dataset = datasets.ImageFolder(root=train_dir,transform=train_transform)
test_dataset = datasets.ImageFolder(root=test_dir,transform=test_transform)

train_dataloader = DataLoader(dataset=train_dataset,
                              batch_size=batch_size,
                              shuffle=True,
                              pin_memory=True,
                              num_workers=1,
                              )

test_dataloader = DataLoader(dataset=test_dataset,
                             batch_size=32,
                             shuffle=False,
                             pin_memory=False,
                             num_workers=1)

class_names = train_dataset.classes

In [34]:
train_dataset.samples[123]

('data/pizza_steak_sushi/train/steak/2648423.jpg', 1)

In [35]:
dataiter = iter(train_dataloader)
images, labels = next(dataiter)

#Early stopping version


In [39]:
#train process
from tqdm.auto import tqdm
import copy

loss_fn = torch.nn.CrossEntropyLoss()
optim = torch.optim.AdamW(params=vit.parameters(),
                        lr=0.001,
                        weight_decay=1e-4,
                        betas=(0.9, 0.999))


schedular = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optim,
    mode="max",
    factor=0.5,
    patience=2,
    verbose=True,
    min_lr=1e-6
)

epochs = 20
n = 0

#early stopping 參數
patience = 5
best_test_acc = 0.0
patience_counter = 0
best_model_state = None

results={
        "train_loss":[],
        "train_acc":[],
        "test_loss":[],
        "test_acc":[]
    }

for epoch in tqdm(range(epochs)):
    n+=1
    current_lr = optim.param_groups[0]["lr"] # 獲取當前學習率

    vit.train()

    train_loss,train_acc = 0, 0
    test_loss,test_acc = 0, 0

    #0. let dataloader to device
    for batch,(X,y) in enumerate(train_dataloader):
        X = X.to(device)
        y = y.to(device)
        #1. load the data to model

        y_train_pred_logit =  vit(X)

        #2. calculate the loss

        loss = loss_fn(y_train_pred_logit,y)
        train_loss+=loss.item()

        #3.optim zero grad

        optim.zero_grad()

        #4.backwards

        loss.backward()

        #5.optim step

        optim.step()

        #turn logit into prob
        y_train_prob = torch.argmax((torch.softmax(y_train_pred_logit,dim=1)),dim=1)
        train_acc += (y_train_prob==y).sum().item() / len(y)

    train_loss = train_loss / len(train_dataloader)
    train_acc = train_acc / len(train_dataloader)

    results["train_loss"].append(train_loss)
    results["train_acc"].append(train_acc)

    vit.eval()
    with torch.inference_mode():
        for batch,(X,y) in enumerate(test_dataloader):
            X = X.to(device)
            y = y.to(device)
            y_test_logit = vit(X)
            t_loss = loss_fn(y_test_logit,y)
            test_loss+=t_loss.item()

            #turn into pred

            y_test_pred = torch.argmax(torch.softmax(y_test_logit,dim=1),dim=1)
            test_acc += (y_test_pred==y).sum().item() / len(y)

        test_loss = test_loss / len(test_dataloader)
        test_acc = test_acc / len(test_dataloader)

    results["test_loss"].append(test_loss)
    results["test_acc"].append(test_acc)

    print(f"Epoch {n}: train_loss={train_loss:.4f} | train_acc={train_acc:.4f} | test_loss={test_loss:.4f} | test_acc={test_acc:.4f} | lr={current_lr:.6f}")

    if test_acc > best_test_acc:
        patience_counter = 0
        best_test_acc = test_acc
        best_model_state = copy.deepcopy(vit.state_dict())
        print(f"✓ 新的最佳測試準確率: {best_test_acc:.4f}")
    else:
        patience_counter+=1
        print(f"⚠️  測試準確率未提升 ({patience_counter}/{patience})")

    if train_acc > 0.95 and (train_acc - test_acc) > 0.2:
        print(f"⚠️  警告：可能過擬合！訓練準確率 {train_acc:.4f} 遠高於測試準確率 {test_acc:.4f}")

    schedular.step(test_acc)

    if patience_counter > patience:
        print(f"\n🛑 Early Stopping! 測試準確率已經 {patience} 個 epochs 沒有提升")

        break
if best_model_state is not None:
    vit.load_state_dict(best_model_state)
    print(f"\n✅ 已載入最佳模型 (測試準確率: {best_test_acc:.4f})")
print("已經訓練完成!")

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch 1: train_loss=0.1465 | train_acc=0.9750 | test_loss=0.1870 | test_acc=0.9384 | lr=0.001000
✓ 新的最佳測試準確率: 0.9384
Epoch 2: train_loss=0.1349 | train_acc=0.9750 | test_loss=0.1764 | test_acc=0.9384 | lr=0.001000
⚠️  測試準確率未提升 (1/5)
Epoch 3: train_loss=0.1341 | train_acc=0.9708 | test_loss=0.2193 | test_acc=0.9384 | lr=0.001000
⚠️  測試準確率未提升 (2/5)
Epoch 4: train_loss=0.1060 | train_acc=0.9667 | test_loss=0.2204 | test_acc=0.9384 | lr=0.001000
⚠️  測試準確率未提升 (3/5)
Epoch 5: train_loss=0.0711 | train_acc=0.9917 | test_loss=0.2055 | test_acc=0.9384 | lr=0.000500
⚠️  測試準確率未提升 (4/5)
Epoch 6: train_loss=0.0624 | train_acc=0.9958 | test_loss=0.1990 | test_acc=0.9384 | lr=0.000500
⚠️  測試準確率未提升 (5/5)
Epoch 7: train_loss=0.0762 | train_acc=0.9792 | test_loss=0.1982 | test_acc=0.9384 | lr=0.000500
⚠️  測試準確率未提升 (6/5)

🛑 Early Stopping! 測試準確率已經 5 個 epochs 沒有提升

✅ 已載入最佳模型 (測試準確率: 0.9384)
已經訓練完成!
