# How to deal with imbalanced datasets

[Original video](https://youtu.be/4JFVhJyTZ44)

To see how the CSV file is created, review the [2021.03.02-3 Build custom dataset](https://colab.research.google.com/drive/1v4_SaGOXDprZNP7YcTmi21o5v56YSGDD) Colab notebook.

Methods for dealing with imbalanced datasets:
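  1. Oversampling (draw examples of the under-represented class more often, optionally with more augmentations, etc.)
  2. Class weighting (give the under-represented class a larger weight in the loss, so the network gives it higher priority)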

In [1]:
# Get Aladdin Persson GitHub repository
!git clone https://github.com/aladdinpersson/Machine-Learning-Collection.git

Cloning into 'Machine-Learning-Collection'...
remote: Enumerating objects: 247, done.[K
remote: Counting objects: 100% (247/247), done.[K
remote: Compressing objects: 100% (216/216), done.[K
remote: Total 508 (delta 52), reused 137 (delta 20), pack-reused 261[K
Receiving objects: 100% (508/508), 19.44 MiB | 23.06 MiB/s, done.
Resolving deltas: 100% (103/103), done.


In [2]:
import os
import torch
import pandas as pd
import torch.nn as nn
import torchvision.datasets as datasets
import torchvision.transforms as transforms

from PIL import Image
from torch.utils.data import WeightedRandomSampler, DataLoader, Dataset

In [3]:
# Image folder and CSV annotation file inside the cloned repository.
images_dir = 'Machine-Learning-Collection/ML/Pytorch/Basics/custom_dataset/cats_dogs_resized'
csv_file = 'Machine-Learning-Collection/ML/Pytorch/Basics/custom_dataset/cats_dogs.csv'

df = pd.read_csv(csv_file)
# value_counts() orders its result by frequency, not by label. Sort by label so
# that position i of `num_samples` (and therefore of `weights`) describes class
# i; `weights` is later handed to nn.CrossEntropyLoss, which applies weight[i]
# to class index i.
# NOTE: assumes the labels are the consecutive integers 0..num_classes-1.
num_samples = df['Label'].value_counts(normalize=True).sort_index().tolist()
# Inverse relative frequency: the rarer the class, the larger its weight.
weights = [1/x for x in num_samples]

num_classes = len(num_samples)

print(f'samples: {num_samples}')
print(f'weights: {weights}')
print(f'classes: {num_classes}')

samples: [0.8, 0.2]
weights: [1.25, 5.0]
classes: 2


## Class weighting

In [4]:
# Weighted cross-entropy: the `weight` tensor rescales the loss of each class,
# so mistakes on the rarer class (which has the larger weight) cost more.
# NOTE(review): CrossEntropyLoss applies weight[i] to class index i - confirm
# that `weights` is ordered by class label.
loss_fn = nn.CrossEntropyLoss(weight=torch.tensor(weights))

## Oversampling

In [5]:
class CatsAndDogsDataset(Dataset):
    """Image dataset described by a CSV annotation file.

    Column 0 of the CSV is used as the image file name (joined onto
    ``root_dir``) and column 1 as the integer class label.
    """

    def __init__(self, csv_file, root_dir, transform=None):
        """Load the annotation table and remember where the images live.

        Args:
            csv_file: Path of the CSV file, read with ``pd.read_csv``.
            root_dir: Directory the file names from column 0 are joined to.
            transform: Optional callable applied to every opened image.
        """
        table = pd.read_csv(csv_file)
        self.annotations, self.root_dir, self.transform = table, root_dir, transform

    def __len__(self):
        """Return the number of rows in the annotation table."""
        return self.annotations.shape[0]

    def __getitem__(self, index):
        """Return the ``(image, label)`` pair for the row at ``index``."""
        table = self.annotations
        picture = Image.open(os.path.join(self.root_dir, table.iloc[index, 0]))
        # The label is returned as a 0-dim integer tensor.
        target = torch.tensor(int(table.iloc[index, 1]))

        # Truthiness check on the transform, exactly as configured by the caller.
        picture = self.transform(picture) if self.transform else picture

        return (picture, target)


def get_loader(root_dir, batch_size):
    """Build a DataLoader that oversamples under-represented classes.

    Each example gets a sampling weight equal to the inverse relative
    frequency of its class, so the total weight of every class is the same.
    A ``WeightedRandomSampler`` then draws ``len(dataset)`` examples per epoch
    with replacement according to those weights.

    Args:
        root_dir: Directory containing the images named in the CSV file.
        batch_size: Number of examples per batch.

    Returns:
        A ``DataLoader`` over ``CatsAndDogsDataset`` driven by the weighted
        sampler.

    Note:
        The annotation file path is taken from the module-level ``csv_file``.
    """
    my_transforms = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
    ])

    # Use the caller's root_dir instead of silently falling back to a global.
    dataset = CatsAndDogsDataset(csv_file=csv_file,
                                 root_dir=root_dir,
                                 transform=my_transforms)

    # Read the labels straight from the dataset's annotation table (column 1,
    # the same column __getitem__ uses) rather than iterating the dataset,
    # which would open and transform every image just to obtain its label.
    labels = dataset.annotations.iloc[:, 1].astype(int)

    # Look the frequency up by label value: value_counts() is ordered by
    # frequency, so indexing it positionally with a label would pick the wrong
    # class whenever class 0 is not the most frequent one.
    class_frequency = labels.value_counts(normalize=True)
    sample_weights = (1.0 / labels.map(class_frequency)).tolist()

    # replacement=True lets an example be drawn several times per epoch, which
    # is what oversamples the rare class; with replacement=False every example
    # would be used at most once.
    sampler = WeightedRandomSampler(sample_weights,
                                    num_samples=len(sample_weights),
                                    replacement=True)
    loader = DataLoader(dataset=dataset, batch_size=batch_size, sampler=sampler)

    return loader


def main():
    """Iterate the weighted loader for 100 epochs and print per-class draw counts."""
    balanced_loader = get_loader(root_dir=images_dir, batch_size=8)

    # One counter per class, indexed by the integer label.
    class_frequency = [0 for _ in range(num_classes)]
    epochs_left = 100
    while epochs_left > 0:
        # Only the labels of each batch are needed for the tally.
        for _, batch_labels in balanced_loader:
            for label in batch_labels:
                class_frequency[int(label)] += 1
        epochs_left -= 1
    print(f'class frequency: {class_frequency}')


if __name__ == '__main__':
    main()

class frequency: [503, 497]
