# How to implement a MobileNet architecture in PyTorch for image classification

## Understanding MobileNet

**Key Concepts**:

**Depthwise Separable Convolutions**: MobileNets replace traditional convolutional layers with depthwise separable convolutions. This factorization splits the standard convolution process into two parts:

**Depthwise Convolution**: Applies a single spatial filter to each input channel independently, without mixing information across channels.

**Pointwise Convolution**: Applies 1x1 convolutions that combine the outputs of the depthwise convolution across channels.

**Efficiency**: This structure significantly reduces the computational cost and number of parameters, making MobileNets ideal for mobile and resource-constrained devices.

**Inverted Residuals (MobileNetV2)**: MobileNetV2 introduces inverted residuals with linear bottlenecks. The input and output of each residual block are thin (few channels), while the intermediate representation inside the block is wider (more channels).

In [None]:
#imports
import torch
import torch.nn as nn
import torchvision

In [None]:
# Defining the MobileNetV1 Architecture

class MobileNetV1(nn.Module):
    """MobileNetV1 classifier built from depthwise separable convolutions.

    The feature extractor is a standard 3x3 stem convolution followed by 13
    depthwise separable blocks that end with 1024 channels, then a global
    average pool. A single linear layer maps the pooled 1024-dim vector to
    class logits.

    Args:
        num_classes: Number of output logits produced by the classifier.
    """

    def __init__(self, num_classes=1000):  # Example for ImageNet
        super().__init__()

        def conv_bn(inp, oup, stride):
            """Standard 3x3 convolution -> BatchNorm -> ReLU6."""
            return nn.Sequential(
                nn.Conv2d(inp, oup, 3, stride, 1, bias=False),
                nn.BatchNorm2d(oup),
                nn.ReLU6(inplace=True),  # Activation clipped to [0, 6]
            )

        def conv_dw(inp, oup, stride):
            """Depthwise separable block: 3x3 depthwise then 1x1 pointwise."""
            return nn.Sequential(
                # Depthwise: groups=inp gives one 3x3 filter per input channel
                nn.Conv2d(inp, inp, 3, stride, 1, groups=inp, bias=False),
                nn.BatchNorm2d(inp),
                nn.ReLU6(inplace=True),

                # Pointwise: 1x1 convolution mixes channels and sets the width
                nn.Conv2d(inp, oup, 1, 1, 0, bias=False),
                nn.BatchNorm2d(oup),
                nn.ReLU6(inplace=True),
            )

        self.model = nn.Sequential(
            conv_bn(3, 32, 2),  # Initial convolution
            conv_dw(32, 64, 1),
            conv_dw(64, 128, 2),
            conv_dw(128, 128, 1),
            conv_dw(128, 256, 2),
            conv_dw(256, 256, 1),
            conv_dw(256, 512, 2),
            # Five identical 512-channel blocks
            conv_dw(512, 512, 1),
            conv_dw(512, 512, 1),
            conv_dw(512, 512, 1),
            conv_dw(512, 512, 1),
            conv_dw(512, 512, 1),
            conv_dw(512, 1024, 2),
            conv_dw(1024, 1024, 1),
            # Adaptive pooling always yields a 1x1 map, so the network is not
            # tied to 224x224 inputs (e.g. 32x32 CIFAR images also work).
            nn.AdaptiveAvgPool2d(1),
        )
        self.classifier = nn.Linear(1024, num_classes)

    def forward(self, x):
        """Return class logits of shape (batch, num_classes) for image batch x."""
        x = self.model(x)
        # Flatten per sample; keeping the batch dimension explicit prevents
        # samples from being silently merged if the feature size is off.
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return x


In [None]:
# Loading a Dataset
# Preprocessing: convert PIL images to tensors, then normalize each channel
# with the mean/std values below.
transform = torchvision.transforms.Compose(
    [
        # Data augmentation as needed
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

# CIFAR10 training split, downloaded into ./data when not already present.
dataset = torchvision.datasets.CIFAR10(
    root='./data',
    train=True,
    transform=transform,
    download=True,
)
# Shuffled mini-batches of 32 samples.
dataloader = torch.utils.data.DataLoader(dataset=dataset, shuffle=True, batch_size=32)


In [None]:
## training loop
# Model, loss and optimizer for the 10-class CIFAR10 problem.
model = MobileNetV1(num_classes=10)  # For CIFAR10
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

for epoch in range(10):  # Adjust epochs as needed
    for images, labels in dataloader:
        # Clear gradients left over from the previous step before the new
        # forward/backward pass.
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()


**Key Changes for MobileNetV2**

**Inverted Residual Blocks:** MobileNetV2 uses inverted residual blocks where the input and output are thin bottleneck layers. The intermediate layer is "expanded" with more channels.

**Linear Bottlenecks:** The final 1x1 convolution in the inverted residual block is not followed by a non-linearity (such as ReLU6). This prevents the non-linearity from destroying information in the low-dimensional bottleneck space.

In [None]:
# Defining the MobileNetV2 Architecture:
class InvertedResidual(nn.Module):
    """MobileNetV2 inverted residual block with a linear bottleneck.

    Layout: optional 1x1 expansion -> 3x3 depthwise -> 1x1 linear projection.
    The expansion stage is skipped when ``expand_ratio == 1`` because the
    hidden width then equals the input width.

    Args:
        inp: Number of input channels.
        oup: Number of output channels.
        stride: Stride of the depthwise convolution.
        expand_ratio: Multiplier applied to ``inp`` to get the hidden width.
    """

    def __init__(self, inp, oup, stride, expand_ratio):
        super().__init__()
        hidden_dim = round(inp * expand_ratio)
        # The skip connection is only valid when input and output tensors
        # have the same shape: no spatial downsampling and equal channels.
        self.use_res_connect = stride == 1 and inp == oup

        layers = []
        if expand_ratio != 1:
            # Expansion: widen the thin input to hidden_dim channels
            layers.extend([
                nn.Conv2d(inp, hidden_dim, 1, 1, 0, bias=False),
                nn.BatchNorm2d(hidden_dim),
                nn.ReLU6(inplace=True),
            ])
        layers.extend([
            # Depthwise: one 3x3 filter per hidden channel, carries the stride
            nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False),
            nn.BatchNorm2d(hidden_dim),
            nn.ReLU6(inplace=True),
            # Pointwise (Linear): project to oup channels with no activation
            nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
            nn.BatchNorm2d(oup),
        ])
        self.conv = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the block, adding the input back when shapes allow it."""
        if self.use_res_connect:
            return x + self.conv(x)
        else:
            return self.conv(x)

class MobileNetV2(nn.Module):
    """Skeleton of MobileNetV2 that only defines the classifier head.

    The feature-extraction layers are left as placeholders, so ``forward``
    simply flattens its input and feeds it to the classifier. The input must
    therefore flatten to 1280 values per sample.
    """

    def __init__(self, num_classes=1000):
        super().__init__()
        # ... (Refer to the MobileNetV2 paper for the full layer configuration)
        head_layers = [
            nn.Dropout(p=0.2),  # May add dropout
            nn.Linear(in_features=1280, out_features=num_classes),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, x):
        """Flatten x per sample and return the classifier logits."""
        # ... (Feature extraction layers)
        batch_size = x.size(0)
        flat = x.view(batch_size, -1)  # Flatten
        return self.classifier(flat)


In [None]:
# Define the MobileNetV2 Model
class MobileNetV2(nn.Module):
    """MobileNetV2 classifier assembled from inverted residual blocks.

    Structure: 3x3 stem convolution -> stack of inverted residual blocks ->
    1x1 convolution to ``last_channel`` features -> global average pooling ->
    dropout + linear classifier.

    Args:
        num_classes: Number of output logits.
        width_mult: Multiplier applied to the channel counts of the stem and
            of every stage. The final feature width is only scaled up
            (for ``width_mult > 1``), never down.
        inverted_residual_setting: Optional list of ``[t, c, n, s]`` rows
            (expansion ratio, output channels, repetitions, stride of the
            first repetition). Defaults to the standard seven-stage table.
        block: Optional callable ``block(inp, oup, stride, expand_ratio=t)``
            returning an ``nn.Module``. Defaults to ``InvertedResidual``.
    """

    def __init__(self, num_classes=1000, width_mult=1.0, inverted_residual_setting=None, block=None):
        super(MobileNetV2, self).__init__()
        if block is None:
            block = InvertedResidual

        # Standard MobileNetV2 stage configuration
        if inverted_residual_setting is None:
            inverted_residual_setting = [
                # t, c, n, s (expansion, output channels, repetitions, stride)
                [1, 16, 1, 1],
                [6, 24, 2, 2],
                [6, 32, 3, 2],
                [6, 64, 4, 2],
                [6, 96, 3, 1],
                [6, 160, 3, 2],
                [6, 320, 1, 1],
            ]

        def scaled(channels):
            # Never let a small width_mult produce a zero-channel layer.
            return max(1, int(channels * width_mult))

        input_channel = scaled(32)
        last_channel = scaled(1280 * max(1.0, width_mult))

        # Stem: maps the 3-channel image to the width the first block expects
        layers = [
            nn.Conv2d(3, input_channel, 3, 2, 1, bias=False),
            nn.BatchNorm2d(input_channel),
            nn.ReLU6(inplace=True),
        ]

        # Inverted residual stages; only the first block of a stage uses the
        # stage stride, the repetitions keep the resolution.
        for t, c, n, s in inverted_residual_setting:
            output_channel = scaled(c)
            for i in range(n):
                stride = s if i == 0 else 1
                layers.append(block(input_channel, output_channel, stride, expand_ratio=t))
                input_channel = output_channel

        # Final 1x1 convolution expands to the classifier's feature width
        layers.extend([
            nn.Conv2d(input_channel, last_channel, kernel_size=1, bias=False),
            nn.BatchNorm2d(last_channel),
            nn.ReLU6(inplace=True),
        ])
        self.features = nn.Sequential(*layers)

        # Classifier
        self.classifier = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(last_channel, num_classes),
        )

    def forward(self, x):
        """Return class logits of shape (batch, num_classes) for image batch x."""
        x = self.features(x)
        x = x.mean([2, 3])  # Global average pooling
        x = self.classifier(x)
        return x


In [None]:
# load CIFAR10
# Only `torchvision` itself is imported in this notebook, so the transforms
# are referenced through the package rather than a bare `transforms` name.
transform = torchvision.transforms.Compose([
    torchvision.transforms.ToTensor(),
    # Maps each channel from [0, 1] to [-1, 1]
    torchvision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=4, shuffle=True)


In [None]:
# train and test (as before)


# Pick the device first: GPU if available, otherwise CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Build the model (10 classes for CIFAR-10) and place it on the device
model = MobileNetV2(num_classes=10)
model.to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

# Training loop
for epoch in range(10):  # Adjust the number of epochs as needed
    running_loss = 0.0
    for i, data in enumerate(trainloader):
        # Each batch is an (images, targets) pair; move both to the device
        inputs = data[0].to(device)
        labels = data[1].to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Report the mean loss over every window of 200 mini-batches
        running_loss += loss.item()
        if (i + 1) % 200 == 0:
            print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / 200))
            running_loss = 0.0

# Testing Loop (Simplified)
# Build the held-out CIFAR-10 test split; it is not defined anywhere earlier.
# The same `transform` as for training keeps the preprocessing consistent.
testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=4, shuffle=False)

# Read the device from the model itself so the batches always land where the
# weights are.
eval_device = next(model.parameters()).device

# Switch Dropout/BatchNorm to inference behaviour before measuring accuracy
model.eval()

correct = 0
total = 0
with torch.no_grad():  # No gradients are needed for evaluation
    for data in testloader:
        images, labels = data[0].to(eval_device), data[1].to(eval_device)
        outputs = model(images)
        # Index of the largest logit is the predicted class
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print('Accuracy on test set: %d %%' % (100 * correct / total))
