In [1]:
!pip install guided-filter-pytorch

Collecting guided-filter-pytorch
  Downloading guided_filter_pytorch-3.7.5-py3-none-any.whl.metadata (1.6 kB)
Downloading guided_filter_pytorch-3.7.5-py3-none-any.whl (3.8 kB)
Installing collected packages: guided-filter-pytorch
Successfully installed guided-filter-pytorch-3.7.5


In [2]:
import torch
import numpy as np
import cv2
import torch.optim as optim
import torch.nn.functional as F
import torch.cuda as cuda
import torchvision
from torchvision import transforms
from torchvision.datasets import ImageFolder
from torchvision.models.alexnet import AlexNet_Weights
from guided_filter_pytorch.guided_filter import GuidedFilter

**Functions to create the low and high frequency components**

In [3]:
def createLowFrequencyComponent(img, guided_filter_Radius = 10):
    """Compute the low-frequency layer of an image with a guided filter.

    The colour image is filtered using its own grayscale version as the
    guidance image.

    Args:
        img: Path of the image file, read with ``cv2.imread``.
        guided_filter_Radius: Radius ``r`` passed to ``GuidedFilter``.

    Returns:
        numpy.ndarray: Filtered image in channel-last (H, W, C) layout, in
        OpenCV's BGR channel order. Pixel values are scaled by 1/255 before
        filtering.

    Raises:
        FileNotFoundError: If ``cv2.imread`` cannot read ``img``.
    """
    image = cv2.imread(img)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError(f"Could not read image file: {img!r}")

    # cv2.imread yields BGR data, so the BGR conversion code is required to
    # apply the luma weights to the correct channels.
    grayscale_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # (H, W, C) uint8 -> (1, C, H, W) float in [0, 1]
    img_tensor = torch.from_numpy(image).float().permute(2, 0, 1).unsqueeze(0) / 255.0
    # (H, W) uint8 -> (1, 1, H, W) float in [0, 1]
    gray_tensor = torch.from_numpy(grayscale_image).float().unsqueeze(0).unsqueeze(0) / 255.0

    GF = GuidedFilter(r=guided_filter_Radius, eps=0.01)

    # Pure preprocessing: no gradients are needed.
    with torch.no_grad():
        low_freq_image = GF(gray_tensor, img_tensor)

    # (1, C, H, W) tensor -> (H, W, C) numpy array
    return low_freq_image.squeeze(0).permute(1, 2, 0).numpy()

def createHighFrequencyComponent(img, epsilon=0.01):
    """Compute the normalised high-frequency (detail) layer of an image.

    The image is divided pixel-wise by its low-frequency layer (from
    ``createLowFrequencyComponent``); the luma (Y) channel of the quotient is
    then min-max normalised.

    Args:
        img: Path of the image file, read with ``cv2.imread``.
        epsilon: Small constant added to the denominator for numerical
            stability.

    Returns:
        numpy.ndarray: 2-D float32 array (H, W) scaled to [0, 1]. An all-zero
        array is returned when the luma channel is constant.

    Raises:
        FileNotFoundError: If ``cv2.imread`` cannot read ``img``.
    """
    image = cv2.imread(img)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError(f"Could not read image file: {img!r}")

    # create the low frequency image
    low_freq_image = np.asarray(createLowFrequencyComponent(img), dtype=np.float32)

    # create the high frequency image
    # A scalar epsilon broadcasts to any image size, so the function is no
    # longer tied to 1200x1600 inputs. float32 is kept because cv2.cvtColor
    # does not accept float64 input.
    high_frequency_image = np.ascontiguousarray(
        image.astype(np.float32) / (low_freq_image + np.float32(epsilon))
    )

    # The data is in BGR order (cv2.imread), so use the BGR conversion code to
    # get a correctly weighted luma channel.
    Ih_yuv = cv2.cvtColor(high_frequency_image, cv2.COLOR_BGR2YUV)
    Y = Ih_yuv[:, :, 0]

    y_min = Y.min()
    y_range = Y.max() - y_min
    if y_range == 0:
        # Constant luma: avoid 0/0 producing NaNs.
        return np.zeros_like(Y)

    return (Y - y_min) / y_range

In [4]:
# Record the library versions this notebook was executed with (one per line).
print(torch.__version__, torchvision.__version__, sep="\n")

2.6.0+cu124
0.21.0+cu124


**Changing the weights of the first convolutional layer of the AlexNet architecture (RGB filters are collapsed into single-channel luma filters)**

In [5]:
# Load the pretrained AlexNet and convert its first convolution from 3-channel
# (RGB) filters to 1-channel luma filters, so the network accepts the
# single-channel high-frequency images.
alex_mod = torchvision.models.alexnet(weights = AlexNet_Weights.DEFAULT)
conv_1 = alex_mod.features[0]

# get the weights of the 1st conv layer: (num_filters, num_color_channels, kH, kW)
weights = conv_1.weight
num_filters, num_color_channels, kernel_h, kernel_w = weights.shape

# luma components for RGB to grayscale conversion (ITU-R BT.601: 0.2989 R + 0.5870 G + 0.1140 B)
luma_components = torch.tensor([[0.2989, 0.5870, 0.1140]])

with torch.no_grad():
    # flatten every filter's spatial extent: (num_filters, 3, kH*kW)
    before_luma_weights = weights.reshape(num_filters, num_color_channels, -1)

    # compute the luma weights: weighted sum over the colour axis for all
    # filters at once, (num_filters, kH*kW, 3) @ (3, 1) -> (num_filters, kH*kW, 1)
    luma_weights = before_luma_weights.transpose(1, 2) @ luma_components.T

# set the new luma weights to the conv2d_1 layer
conv_1.weight = torch.nn.Parameter(luma_weights.reshape(num_filters, 1, kernel_h, kernel_w))
# keep the layer's metadata consistent with its new single-channel weight
conv_1.in_channels = 1

Downloading: "https://download.pytorch.org/models/alexnet-owt-7be5be79.pth" to /root/.cache/torch/hub/checkpoints/alexnet-owt-7be5be79.pth
100%|██████████| 233M/233M [00:01<00:00, 219MB/s] 


In [6]:
from torch import nn

# implement the second stream
class ModifiedSecondStream(nn.Module):
    """Second-stream classifier built on the luma-converted AlexNet.

    The convolutional trunk and the first six classifier layers are taken from
    an AlexNet-style model; three extra fully connected stages (``fc8``,
    ``fc9``, ``fc10``) and a 2-way head (``modfc``) are stacked on top.

    Args:
        base_model: Model exposing ``features`` and an indexable
            ``classifier`` (at least 6 entries). Defaults to the
            notebook-level ``alex_mod``.

    Note:
        ``forward`` returns raw logits in training mode and softmax
        probabilities in eval mode. ``nn.CrossEntropyLoss`` applies
        log-softmax itself, so feeding it probabilities would apply softmax
        twice and flatten the gradients.
    """

    def __init__(self, base_model=None):
        super().__init__()

        modified_alexnet = alex_mod if base_model is None else base_model

        self.features = modified_alexnet.features # conv layers

        # torchvision's AlexNet classifier is ordered
        #   [0] Dropout, [1] Linear(9216, 4096), [2] ReLU,
        #   [3] Dropout, [4] Linear(4096, 4096), [5] ReLU, [6] Linear(4096, 1000)
        # so the attribute names below do not match the layer type they hold
        # (e.g. `relu6` is the first Linear layer). forward() applies them in
        # classifier order, so the computation is the standard
        # Dropout -> Linear -> ReLU sequence. The names are kept unchanged
        # because they are the state_dict keys of saved checkpoints.
        self.fc6 = modified_alexnet.classifier[0]
        self.relu6 = modified_alexnet.classifier[1]
        self.dropout6 = modified_alexnet.classifier[2]

        self.fc7 = modified_alexnet.classifier[3]
        self.relu7 = modified_alexnet.classifier[4]
        self.dropout7 = modified_alexnet.classifier[5]

        self.fc8 = nn.Linear(4096, 2048)
        self.relu8 = nn.ReLU()
        self.dropout8 = nn.Dropout(p=0.5)

        self.fc9 = nn.Linear(2048, 2048)
        self.relu9 = nn.ReLU()
        self.dropout9 = nn.Dropout(p=0.5)

        self.fc10 = nn.Linear(2048, 531)
        self.modfc = nn.Linear(531, 2)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Run the stream on a batch.

        Args:
            x: Input batch accepted by ``self.features``.

        Returns:
            Tensor of shape (B, 2): logits when ``self.training`` is True,
            class probabilities otherwise.
        """
        x = self.features(x)
        x = torch.flatten(x, 1)  # (B, 9216) for 224x224 inputs through AlexNet features

        x = self.dropout6(self.relu6(self.fc6(x)))
        x = self.dropout7(self.relu7(self.fc7(x)))
        x = self.dropout8(self.relu8(self.fc8(x)))
        x = self.dropout9(self.relu9(self.fc9(x)))

        logits = self.modfc(self.fc10(x))

        if self.training:
            # The loss (CrossEntropyLoss) expects unnormalised scores.
            return logits
        return self.softmax(logits)

In [7]:
# Image side length in pixels and the mini-batch size handed to the DataLoader.
IMG_SIZE, BATCH_SIZE = 224, 24

# Root directory given to the ImageFolder-based dataset below.
data_root = (
    '/kaggle/input/11k-hands-training-dataset/content/drive/MyDrive/train_images/train'
)

# Override ImageFolder so that every sample is loaded through the custom high-frequency preprocessing function
class CustomImageFolder(ImageFolder):
    """ImageFolder variant that yields high-frequency images instead of RGB.

    Each sample is produced by ``createHighFrequencyComponent`` from the file
    path, resized to a square and returned as a single-channel float tensor.

    Args:
        root: Dataset root directory, forwarded to ``ImageFolder``.
        transform: Optional callable applied to the (1, img_size, img_size)
            tensor. It is kept out of ``ImageFolder`` itself because it
            operates on the preprocessed tensor rather than on a loaded image.
        img_size: Side length in pixels of the square output image.
    """

    def __init__(self, root, transform=None, img_size=IMG_SIZE):
        super().__init__(root=root, transform=None)  # the parent must not apply the transform
        self.base_transform = transform
        self.img_size = img_size

    def __getitem__(self, index):
        """Return ``(image_tensor, class_index)`` for the sample at ``index``."""
        path, target = self.samples[index]

        # The preprocessing function reads the file itself, so it gets the path.
        img = createHighFrequencyComponent(path)
        img = cv2.resize(img, (self.img_size, self.img_size))
        img = torch.from_numpy(img).float().unsqueeze(0)  # (H, W) -> (1, H, W)

        if self.base_transform is not None:
            img = self.base_transform(img)

        return img, target

# Build the dataset (no extra transform) and a shuffling loader over it.
dataset = CustomImageFolder(data_root)
dataloader_stream2 = torch.utils.data.DataLoader(
    dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)
# Class names as discovered by ImageFolder from the sub-directory names.
classes = dataset.classes

In [8]:
device = torch.device("cuda" if cuda.is_available() else "cpu")

model = ModifiedSecondStream()
# map_location lets a checkpoint that was saved from a GPU be restored on a
# CPU-only machine as well.
state_dict = torch.load('/kaggle/input/11k-hands-stream2-model/stream2_model.pth', map_location=device)
model.load_state_dict(state_dict)

modified_second_stream = model.to(device)

# Two parameter groups: the final 2-way head (`modfc`) trains with a larger
# learning rate, everything else is fine-tuned slowly. Selecting the second
# group by exclusion guarantees no parameter is left out of the optimizer.
new_fc_params = list(modified_second_stream.modfc.parameters())
new_fc_param_ids = {id(p) for p in new_fc_params}
pretrained_params = [
    p for p in modified_second_stream.parameters() if id(p) not in new_fc_param_ids
]

optimizer = optim.SGD([
    {'params': pretrained_params, 'lr': 1e-4},
    {'params': new_fc_params, 'lr': 0.002}
], momentum=0.9)

# NOTE: CrossEntropyLoss applies log-softmax internally, so it is meant to be
# fed unnormalised logits.
criterion = nn.CrossEntropyLoss()

**Creating the Training Loop**

In [None]:
import time
from tqdm import tqdm

num_epochs = 10
for epoch in range(num_epochs):
    modified_second_stream.train()

    # Running totals for this epoch (loss is weighted by batch size).
    total_loss, correct, total = 0.0, 0, 0

    start_time = time.time()

    # Progress bar over the batches of this epoch.
    loop = tqdm(
        dataloader_stream2,
        total=len(dataloader_stream2),
        desc=f"Epoch {epoch+1}/{num_epochs}",
    )

    for images, labels in loop:
        images = images.to(device)
        labels = labels.to(device)

        # Forward pass and loss.
        outputs = modified_second_stream(images)
        loss = criterion(outputs, labels)

        # Backward pass and parameter update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Accumulate epoch statistics.
        batch_loss = loss.item()
        total_loss += batch_loss * images.size(0)
        predicted = torch.max(outputs, 1)[1]
        correct += (predicted == labels).sum().item()
        total += labels.size(0)

        # Show the latest batch loss and the running accuracy on the bar.
        loop.set_postfix(loss=batch_loss, acc=correct / total)

    epoch_time = time.time() - start_time
    print(f"Epoch {epoch+1} completed in {epoch_time:.2f} sec — Accuracy: {correct/total:.4f}, Loss: {total_loss/total:.4f}")

    # Overwrite the checkpoint after every epoch.
    torch.save(modified_second_stream.state_dict(), "/kaggle/working/stream2_model.pth")

Epoch 1/10:   1%|          | 2/324 [00:27<1:13:43, 13.74s/it, acc=0.75, loss=0.584] 