# How to build custom Datasets for Images in Pytorch

[Original video](https://youtu.be/ZoZHd0Zm3RY)

See [how to get your Kaggle API key (kaggle.json)](https://github.com/Kaggle/kaggle-api#api-credentials).

[Using Kaggle datasets in Google CoLab](https://stackoverflow.com/a/56401296/7550928)

[Cats vs. dogs dataset with CSV](https://www.kaggle.com/dataset/c75fbba288ac0418f7786b16e713d2364a1a27936e63f4ec47502d73d6ef30ab)

[Dogs vs. cats dataset without CSV file](https://www.kaggle.com/c/dogs-vs-cats/data), so you have to create the CSV yourself.

In [1]:
# Info on how to get your api key (kaggle.json) here:
# https://github.com/Kaggle/kaggle-api#api-credentials

# Install kaggle packages if necessary. Not necessary for CoLab
# !pip install -q kaggle
# !pip install -q kaggle-cli

# Colab's file access feature
from google.colab import files

# Upload `kaggle.json` file
uploaded = files.upload()

# Retrieve uploaded file and print results
for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))


# Then copy kaggle.json into the folder where the API expects to find it.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!ls ~/.kaggle

# Download the dataset
!kaggle competitions download -c dogs-vs-cats
#!kaggle datasets download -d aladdinpersson/cats-dogs-example-with-csv
#!kaggle datasets list -s aladdinpersson
#!kaggle competitions list -s LANL-Earthquake-Prediction

Saving kaggle.json to kaggle.json
User uploaded file "kaggle.json" with length 65 bytes
kaggle.json
Downloading test1.zip to /content
 95% 257M/271M [00:01<00:00, 201MB/s]
100% 271M/271M [00:01<00:00, 161MB/s]
Downloading sampleSubmission.csv to /content
  0% 0.00/86.8k [00:00<?, ?B/s]
100% 86.8k/86.8k [00:00<00:00, 91.3MB/s]
Downloading train.zip to /content
 98% 532M/543M [00:02<00:00, 239MB/s]
100% 543M/543M [00:02<00:00, 242MB/s]


In [2]:
# Unzip
import zipfile

# Unpack the training archive into the current working directory.
# The context manager closes the archive even if extraction fails.
with zipfile.ZipFile('train.zip') as archive:
    archive.extractall(path='.')

In [3]:
import os

# Folder the training images are read from
source_dir = './train'

# Every directory entry is treated as one image file
train_files = os.listdir(source_dir)
num_images = len(train_files)
print(f'images number: {num_images}')

images number: 25000


In [4]:
# # Resize all images in the dataset
# #
# # NOTE: Use transforms or albumentations library
# #   import torchvision.transforms as transforms
# #   import albumentations as A
# #
# from PIL import Image

# resized_dir = 'cats_dogs_resized'

# if os.path.isdir(resized_dir):
#     !rm -rf '$resized_dir'

# !mkdir '$resized_dir'

# # Show image sizes. Sizes are different.
# # for i, f in enumerate(train_files):
# #     img = Image.open(os.path.join(source_dir, f))
# #     print(img.size)
# #     if i >= 3:
# #         continue

# # Resize all images to 224x224
# width = height = 224
# for f in train_files:
#     img = Image.open(os.path.join(source_dir, f))
#     img = img.resize((width, height), Image.LANCZOS)
#     img.save(os.path.join(resized_dir, f))

# resized_files = os.listdir(resized_dir)
# print(f'resized images: {len(resized_files)}')

In [5]:
# Create CSV file
import pandas as pd

csv_file = 'cats_dogs.csv'

# Class index assigned to each filename prefix (the text before the first '.')
label_by_prefix = {'cat': 0, 'dog': 1}

# One [filename, label] row per recognised image
records = []
for f in train_files:
    prefix = f.split('.')[0]
    if prefix in label_by_prefix:
        records.append([f, label_by_prefix[prefix]])
    else:
        # Files with any other prefix are reported and left out of the CSV
        print('Error: wrong file name')

cats_dogs = pd.DataFrame(records, columns=['Filename', 'Label'])
cats_dogs.to_csv(csv_file, index=False)

# Quick summary: overall shape, rows per label, and the first rows
print(cats_dogs.shape)
print(cats_dogs.groupby(by='Label').count())
cats_dogs.head(n=10)

(25000, 2)
       Filename
Label          
0         12500
1         12500


Unnamed: 0,Filename,Label
0,dog.1358.jpg,1
1,dog.2754.jpg,1
2,dog.9858.jpg,1
3,cat.274.jpg,0
4,cat.1928.jpg,0
5,dog.8971.jpg,1
6,cat.5556.jpg,0
7,dog.7859.jpg,1
8,dog.8085.jpg,1
9,dog.1257.jpg,1


In [6]:
import os
import sys
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision
import torchvision.datasets as datasets
import torchvision.transforms as transforms

from skimage import io
from torch.utils.data import DataLoader, Dataset

In [7]:
# Hyperparameters
in_channel = 3        # colour channels per input image
num_classes = 2       # cat (0) and dog (1); was 10, which did not match the task
learning_rate = 1e-3  # step size passed to the optimizer
batch_size = 32       # samples per mini-batch
num_epochs = 5        # passes over the training split

# Checkpointing: resume from `filename` when it exists and `load_model` is set
load_model = True
filename = 'my_checkpoint.pth.tar'

In [8]:
class Identity(nn.Module):
    """Pass-through layer: `forward` returns its input unchanged."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Return `x` itself, without copying or modifying it."""
        return x


class CatsAndDogsDataset(Dataset):
    """Image dataset described by a CSV of (filename, label) rows.

    Column 0 of the CSV is the image file name relative to `root_dir`;
    column 1 is the integer class label.
    """

    def __init__(self, csv_file, root_dir, transform=None):
        """Read the annotation CSV and remember where the images live.

        Args:
            csv_file: path of the CSV file, read with `pd.read_csv`.
            root_dir: directory that the file names in column 0 are joined to.
            transform: optional callable applied to each loaded image.
        """
        self.annotations = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        """Number of annotated images (one per CSV row)."""
        return len(self.annotations)

    def __getitem__(self, index):
        """Load one sample and return it as an `(image, label)` tuple.

        The image is read from disk with `skimage.io.imread` and passed through
        `self.transform` when one is set; the label is a 0-dim integer tensor.
        """
        file_name = self.annotations.iloc[index, 0]
        raw_label = self.annotations.iloc[index, 1]

        image = io.imread(os.path.join(self.root_dir, file_name))
        y_label = torch.tensor(int(raw_label))

        if self.transform:
            image = self.transform(image)

        return (image, y_label)

In [9]:
# Set device: use the GPU when CUDA is available, otherwise fall back to the CPU
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)

In [10]:
# Load pretrained model
model = torchvision.models.googlenet(pretrained=True)
model = model.to(device)
#print(model)
print('\n----------------\n')

# Freeze parameters of the model so the pretrained weights are not trained
model.requires_grad_(False)

# Swap in a fresh 2-output classifier head. It is created after the freeze,
# so its parameters keep the default requires_grad=True and stay trainable.
model.fc = nn.Linear(in_features=1024, out_features=2)

# The new head is created on the CPU; move the whole model to the device again
model.to(device)
print(model)

Downloading: "https://download.pytorch.org/models/googlenet-1378be20.pth" to /root/.cache/torch/hub/checkpoints/googlenet-1378be20.pth


HBox(children=(FloatProgress(value=0.0, max=52147035.0), HTML(value='')))



----------------

GoogLeNet(
  (conv1): BasicConv2d(
    (conv): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (maxpool1): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=True)
  (conv2): BasicConv2d(
    (conv): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (conv3): BasicConv2d(
    (conv): Conv2d(64, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn): BatchNorm2d(192, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (maxpool2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=True)
  (inception3a): Inception(
    (branch1): BasicConv2d(
      (conv): Conv2d(192, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn): BatchNorm2d(64, eps=0.001, momentum=0.1

In [11]:
# Check if it runs correctly
x = torch.randn(batch_size, in_channel, 224, 224).to(device)  # fake batch of random noise

# Run model on the input and print the shape.
# The check runs in eval mode with autograd disabled: a forward pass in training
# mode would fold this random noise into the BatchNorm running statistics of the
# pretrained network. The previous mode is restored afterwards.
was_training = model.training
model.eval()
with torch.no_grad():
    print(model(x).shape)
model.train(was_training)

torch.Size([32, 2])


In [12]:
# Convert to PIL image, resize and convert to tensor
# NOTE(review): no Normalize step is applied; torchvision's pretrained GoogLeNet
# presumably expects ImageNet-normalized input - confirm against the model docs.
my_transforms = transforms.Compose([
    transforms.ToPILImage(),  # transform to PIL format
    transforms.Resize((224, 224)),
    transforms.ToTensor(),  # at the end transform to tensor
])

# Load data
dataset = CatsAndDogsDataset(csv_file=csv_file, root_dir=source_dir,
                             transform=my_transforms)

# Hold out one fifth of the samples for evaluation. Deriving both sizes from
# len(dataset) keeps the split valid for any dataset size (25000 -> 20000 / 5000);
# random_split raises when the lengths do not sum to len(dataset).
test_size = len(dataset) // 5
train_size = len(dataset) - test_size
train_set, test_set = torch.utils.data.random_split(dataset, [train_size, test_size])

train_loader = DataLoader(dataset=train_set, batch_size=batch_size, shuffle=True)
# Accuracy does not depend on sample order, so the evaluation loader is not shuffled
test_loader = DataLoader(dataset=test_set, batch_size=batch_size, shuffle=False)

In [16]:
# Loss function and optimizer
# CrossEntropyLoss takes the raw class scores together with integer class targets
criterion = nn.CrossEntropyLoss()
# Every model parameter is handed to Adam, using the configured learning rate
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [17]:
def save_checkpoint(state, filename=filename):
    """Write `state` to `filename` with `torch.save`, announcing it first.

    Args:
        state: object to serialize (the training loop passes a dict holding
            the model state, the optimizer state and the accuracy).
        filename: destination path; defaults to the notebook-level `filename`
            as it was when this function was defined.
    """
    print('=> Saving checkpoint')
    torch.save(obj=state, f=filename)

def load_checkpoint(checkpoint):
    """Restore model, optimizer and best accuracy from a checkpoint dict.

    Args:
        checkpoint: dict with the keys 'state_dict' (model weights) and
            'optimizer' (optimizer state), and optionally 'acc' (best accuracy
            reached when the checkpoint was written).

    Returns:
        The accuracy stored in the checkpoint, or 0 when the key is absent.

    The accuracy is also published to the notebook-level `best_acc`. Previously
    it was assigned to a local variable and lost, which left `best_acc`
    undefined for the training loop after a successful load.
    """
    global best_acc

    print('=> Loading checkpoint')
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    best_acc = checkpoint.get('acc', 0)
    return best_acc

In [None]:
# Start from zero so `best_acc` is always defined for the training loop, then
# overwrite it with the stored value when a checkpoint is restored.
best_acc = 0
if load_model and os.path.exists(filename):
    # map_location lets a checkpoint saved on a GPU be restored on a CPU-only runtime
    saved_state = torch.load(filename, map_location=device)
    load_checkpoint(saved_state)
    best_acc = saved_state['acc']

In [None]:
# Check accuracy on training and test data to see how good our model is
def check_accuracy(loader, model, test=True):
    """Return the classification accuracy of `model` over `loader`, in percent.

    Args:
        loader: iterable yielding `(inputs, labels)` batches.
        model: module mapping a batch of inputs to per-class scores.
        test: selects which progress label is printed (test vs. training data).

    Returns:
        Accuracy as a float in [0, 100]; 0.0 when the loader yields no samples.

    The progress label is printed without a trailing newline so the caller can
    finish the line. The model is evaluated in eval mode and put back into
    whatever mode it was in before the call.
    """
    if test:
        print('Checking accuracy on test data', end='')
    else:
        print('Checking accuracy on training data', end='')

    # Send batches to the device the model lives on; a model without parameters
    # falls back to the notebook-level `device`.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else device

    num_correct = 0
    num_samples = 0
    was_training = model.training
    model.eval()  # set model to evaluation mode

    try:
        with torch.no_grad():
            for x, y in loader:
                x = x.to(device=target_device)
                y = y.to(device=target_device)

                scores = model(x)  # shape = batch_size x num_classes
                _, predictions = scores.max(dim=1)  # get index of max value
                # .item() keeps the running count a plain Python int
                num_correct += (predictions == y).sum().item()
                num_samples += predictions.size(0)
    finally:
        # Restore the previous mode even if evaluation raised
        model.train(was_training)

    if num_samples == 0:
        # An empty loader would otherwise divide by zero
        return 0.0

    acc = num_correct / num_samples * 100
    #print(f'Got {num_correct} / {num_samples} with accuracy {acc:.2f}')
    return acc

In [None]:
# Unfreeze parameters and train again
# Empirically it is not better for 5 epochs

# for param in model.parameters():
#     param.requires_grad = True

In [None]:
# Train network
for epoch in range(num_epochs):
    losses = []  # per-batch loss values of this epoch

    for batch_idx, (data, targets) in enumerate(train_loader):
        # Get data to Cuda if possible
        data, targets = data.to(device), targets.to(device)

        # Clear gradients left over from the previous step
        optimizer.zero_grad()

        # Forward
        scores = model(data)  # shape batch_size x num_classes
        loss = criterion(scores, targets)
        losses.append(loss.item())

        # Backward, then gradient descent or adam step
        loss.backward()
        optimizer.step()

    # Epoch summary: mean training loss and accuracy on the test loader
    mean_loss = sum(losses) / len(losses)
    acc = check_accuracy(test_loader, model)
    print(f'\rLoss at epoch {epoch} is {mean_loss:.5f}. Accuracy is {acc:.2f}')

    # Save a checkpoint only when the accuracy improves on the best so far
    if acc > best_acc:
        best_acc = acc
        checkpoint = {
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'acc': best_acc,
        }
        save_checkpoint(checkpoint)

Loss at epoch 0 is 0.10383. Accuracy is 96.66
=> Saving checkpoint
Loss at epoch 1 is 0.11090. Accuracy is 97.42
=> Saving checkpoint
Loss at epoch 2 is 0.09506. Accuracy is 97.52
=> Saving checkpoint
Loss at epoch 3 is 0.09763. Accuracy is 97.60
=> Saving checkpoint
Loss at epoch 4 is 0.08620. Accuracy is 97.90
=> Saving checkpoint


In [None]:
# 2021.03.02 - best accuracy was 97.98 %
# check_accuracy prints its own label without a newline; the print here
# completes that line with the value. Training data first, then test data.
for loader, is_test in ((train_loader, False), (test_loader, True)):
    print(f': {check_accuracy(loader, model, test=is_test):.2f}')

Checking accuracy on training data: 98.06
Checking accuracy on test data: 97.90
