<a href="https://colab.research.google.com/github/h40300965/deep-learnin/blob/main/cafarinfnet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# 📦 Step 1: Install Required Packages
!pip install torch torchvision torchaudio --quiet

# 🧠 Step 2: Define the NFNet Model (Paste from previous implementation or upload file)
# در اینجا ما کد NFNet را به صورت inline می‌نویسیم

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
import torchvision
import torchvision.transforms as transforms


# 🔧 Scaled Weight Standardization Utility
def scaled_weight_standardization(weight, gain=1.0, eps=1e-4):
    fan_in = weight.shape[1] * weight.shape[2] * weight.shape[3]
    mean = weight.mean(dim=[1, 2, 3], keepdim=True)
    var = weight.var(dim=[1, 2, 3], keepdim=True)
    scale = gain * (fan_in ** 0.5) / (var + eps).sqrt()
    return (weight - mean) * scale


# 🧱 Conv2d with Scaled WS
class ScaledConv2d(nn.Conv2d):
    def __init__(self, in_channels, out_channels, kernel_size,
                 stride=1, padding=0, dilation=1, groups=1, bias=False, gain=True):
        super().__init__(in_channels, out_channels, kernel_size,
                         stride, padding, dilation, groups, bias)
        self.gain = nn.Parameter(torch.ones(out_channels, 1, 1, 1)) if gain else 1.0

    def forward(self, x):
        weight = scaled_weight_standardization(self.weight, gain=self.gain)
        return F.conv2d(x, weight, self.bias, self.stride,
                         self.padding, self.dilation, self.groups)


# ⚙️ NFBlock with SkipInit
class NFBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1, expansion=2):
        super().__init__()
        mid_channels = in_channels * expansion
        self.conv1 = ScaledConv2d(in_channels, mid_channels, 3, stride, 1)
        self.act1 = nn.GELU()

        self.conv2 = ScaledConv2d(mid_channels, out_channels, 3, 1, 1)
        self.act2 = nn.GELU()

        # SkipInit Layer
        if stride != 1 or in_channels != out_channels:
            self.shortcut = nn.Sequential(
                ScaledConv2d(in_channels, out_channels, 1, stride, 0),
            )
            self.skip_gain = nn.Parameter(torch.zeros(1))
        else:
            self.shortcut = nn.Identity()
            self.skip_gain = nn.Parameter(torch.zeros(1))

    def forward(self, x):
        shortcut = self.shortcut(x)
        x = self.conv1(x)
        x = self.act1(x)
        x = self.conv2(x)
        x = x * self.skip_gain
        x += shortcut
        x = self.act2(x)
        return x


# 🧬 NFNet Backbone
class NFNet(nn.Module):
    def __init__(self, block, depths, widths, num_classes=10, stem_width=64):
        super().__init__()
        self.stem = nn.Sequential(
            ScaledConv2d(3, stem_width, 3, 2, 1),
            nn.GELU(),
            ScaledConv2d(stem_width, stem_width, 3, 1, 1),
            nn.GELU(),
            ScaledConv2d(stem_width, stem_width, 3, 1, 1),
            nn.GELU(),
            nn.MaxPool2d(3, 2, 1)
        )

        layers = []
        for stage_idx, (depth, width) in enumerate(zip(depths, widths)):
            in_ch = widths[stage_idx - 1] if stage_idx > 0 else stem_width
            for d in range(depth):
                layers.append(block(in_ch, width, stride=2 if d == 0 and stage_idx > 0 else 1))
                in_ch = width

        self.blocks = nn.Sequential(*layers)
        self.head = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, 1)),
            nn.Flatten(),
            nn.Linear(widths[-1], num_classes)
        )

    def forward(self, x):
        x = self.stem(x)
        x = self.blocks(x)
        x = self.head(x)
        return x


# 📐 NFNet Configurations
def nfnet_f0(num_classes=10):
    return NFNet(NFBlock, depths=[1, 2, 6, 3], widths=[256, 512, 1024, 2048], num_classes=num_classes)


# 🔄 Hyperparameters
BATCH_SIZE = 512
EPOCHS = 20
LR = 0.001
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# 📦 Dataset & Dataloader
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

train_dataset = torchvision.datasets.CIFAR10(root='./data', train=True, transform=transform_train, download=True)
test_dataset = torchvision.datasets.CIFAR10(root='./data', train=False, transform=transform_test, download=True)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)


# 🧠 Create Model
model = nfnet_f0().to(DEVICE)

# ⚙️ Optimizer and Loss
optimizer = optim.AdamW(model.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss()


# 📈 Train Loop
def train(model, device, loader, optimizer, epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % 50 == 0:
            print(f'Train Epoch: {epoch} [{batch_idx * len(data)}/{len(loader.dataset)}]\tLoss: {loss.item():.6f}')


# 📊 Test Loop
def test(model, device, loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for data, target in loader:
            data, target = data.to(device), target.to(device)
            outputs = model(data)
            _, predicted = torch.max(outputs.data, 1)
            total += target.size(0)
            correct += (predicted == target).sum().item()

    accuracy = 100 * correct / total
    print(f'\nTest Accuracy: {correct}/{total} ({accuracy:.2f}%)\n')


# 🏋️‍♂️ Training Loop
for epoch in range(1, EPOCHS + 1):
    train(model, DEVICE, train_loader, optimizer, epoch)
    test(model, DEVICE, test_loader)

print("✅ Training completed!")

100%|██████████| 170M/170M [00:03<00:00, 45.1MB/s]


Train Epoch: 1 [0/50000]	Loss: 966237718839296.000000
Train Epoch: 1 [25600/50000]	Loss: 6298208225460224.000000

Test Accuracy: 1164/10000 (11.64%)

Train Epoch: 2 [0/50000]	Loss: 1322471970045952.000000
Train Epoch: 2 [25600/50000]	Loss: 566837905457152.000000

Test Accuracy: 1199/10000 (11.99%)

Train Epoch: 3 [0/50000]	Loss: 501828340940800.000000
Train Epoch: 3 [25600/50000]	Loss: 300496246013952.000000

Test Accuracy: 1144/10000 (11.44%)

Train Epoch: 4 [0/50000]	Loss: 312400955834368.000000
Train Epoch: 4 [25600/50000]	Loss: 383688017707008.000000

Test Accuracy: 1148/10000 (11.48%)

Train Epoch: 5 [0/50000]	Loss: 148997935202304.000000
Train Epoch: 5 [25600/50000]	Loss: 195611466924032.000000

Test Accuracy: 1063/10000 (10.63%)

Train Epoch: 6 [0/50000]	Loss: 176034653667328.000000
Train Epoch: 6 [25600/50000]	Loss: 230843284979712.000000

Test Accuracy: 1135/10000 (11.35%)

Train Epoch: 7 [0/50000]	Loss: 221516075630592.000000
Train Epoch: 7 [25600/50000]	Loss: 404619742150656