<a href="https://colab.research.google.com/github/envomp/2020-Foundations-of-Artificial-Intelligence-and-Machine-Learning/blob/master/NeuralNetwork/classifying_unbalanced_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Unbalanced classification example

Loads CSV files from the python list so that each file represents a separate class. Then tries to find the best network configuration and hyperparameters for the model.

In [None]:
import torch
from torch import nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
!pip install optuna
import optuna
from torch.utils.data import TensorDataset, ConcatDataset, DataLoader
import math
import copy

In [None]:
# download and see what is inside
!wget http://linuxator.com/data/mlaine/data_class1_s.csv
!wget http://linuxator.com/data/mlaine/data_class2_d.csv

In [109]:
# this is to allow computation on GPU. To use this, enable under Runtime -> Change Runtime type
# GPU should be already enabled on most cases when using this sheet as a templete
# NB! Be sure to Restart runtime after this or you may get some very odd errors!
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using", device)

Using cuda


In [110]:
# this method accepts the list of CSV files
# first class (index 0) should be underrepresented class
# it will be oversampled 50 times
def build_dataset(csv_list):
  class_num = 0
  train_datasets = []
  val_datasets = []
  weights = []
  # each file contains a single class
  for csv_file in csv_list:
    df = pd.read_csv(csv_file)
    x = torch.from_numpy(df.iloc[:,1:].values.astype(np.float64)).float()

    # split train/val  80%/20% in each CSV file
    train_cnt = math.floor(x.shape[0] * 0.8)
    val_cnt = x.shape[0] - train_cnt

    # oversample class 0 which is undersampled
    if class_num == 0:
      # split 80/20
      train_x, val_x = torch.utils.data.random_split(x, [train_cnt, val_cnt])
      # repeat class 0 50 times
      #print('x', train_x.__len__(), 'val', val_x.__len__())
      # we oversample after splitting to avoid having same example both in
      # training and validation set
      train_x = x[train_x.indices].repeat([50,1])
      val_x = x[val_x.indices].repeat([50,1])
      #print('x', train_x.__len__(), 'val', val_x.__len__())
      # generate ground truth tensors
      train_y = torch.empty((train_x.shape[0]), dtype=torch.long).fill_(class_num)
      val_y = torch.empty((val_x.shape[0]), dtype=torch.long).fill_(class_num)
      # compose datasets
      train_dataset = TensorDataset(train_x, train_y)
      val_dataset = TensorDataset(val_x, val_y)
    # keep every other class as they are
    else:
      # ground truth generation, just use class number
      y = torch.empty((x.shape[0]), dtype=torch.long).fill_(class_num)
      dataset = TensorDataset(x, y)
      # this is how you normally split a dataset
      train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_cnt, val_cnt])
    
    train_datasets.append(train_dataset)
    val_datasets.append(val_dataset)

    # save num of rows to calculate weights for loss fn
    weights.append(train_dataset.__len__())
    class_num += 1
  
  num_classes = class_num
  celoss_weights = 1.0 - (torch.tensor(weights).float() / sum(weights))
  
  return ConcatDataset(train_datasets), ConcatDataset(val_datasets), num_classes, celoss_weights

In [111]:
train_dataset, val_dataset, num_classes, celoss_weights = build_dataset(['data_class1_s.csv', 'data_class2_d.csv'])
print(train_dataset.__len__(), val_dataset.__len__())

55739 13935


In [112]:
# celoss_weights[0] = celoss_weights[0] * 3
print(celoss_weights)

tensor([0.6089, 0.3911])


In [113]:
class ClassificationNetwork(nn.Module):

  def __init__(self, num_classes, trial):
    super().__init__()
    self.num_classes = num_classes
    num_layers = trial.suggest_int("num_layers", 1, 3)
    in_features = 8
    layers = []

    for layer_num in range(num_layers):
      out_features = trial.suggest_int("out_features_{}".format(layer_num), 2, 10)
      layers.append(nn.Linear(in_features=in_features, out_features=out_features))
      layers.append(nn.ReLU())
      in_features = out_features
      
    layers.append(nn.Linear(in_features=in_features, out_features=num_classes))
    self.model = nn.Sequential(*layers)

  # method that runs the model and returns the result
  # it is always called forward()
  def forward(self, data):
    return self.model(data)


In [119]:
def objective(trial): 
  model = ClassificationNetwork(num_classes, trial).to(device)
  
  batch_size = 128
  dl = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

  learning_rate = trial.suggest_discrete_uniform("learning_rate", 0.001, 0.5, 0.002)
  optimiser = torch.optim.SGD(model.parameters(), lr=learning_rate)

  loss = nn.CrossEntropyLoss(celoss_weights).to(device)
  
  epochs = 5
  for i in range (0, epochs):
    model.train()

    for x, y in dl:
      x, y = x.to(device), y.to(device)
      optimiser.zero_grad()
      prediction = model(x)
      iter_loss = loss(prediction, y)
      iter_loss.backward()
      optimiser.step()

    trial.set_user_attr("model", model)    
    val_loss = validate(model)
    trial.report(val_loss, i)
    if trial.should_prune():
      raise optuna.TrialPruned()

  return val_loss


# Let's write the validation code!

1. Put model into evaluation mode
2. Use model to predict on validation data
3. Evaluate results using loss function

Evaluation

1. Does not update model parameters
2. Hence no need to compute gradients

In [121]:
def validate(model, output=False):
  model.eval()
  total_loss = 0.0
  loss = nn.CrossEntropyLoss(celoss_weights.to(device))

  dl = DataLoader(val_dataset, batch_size=1)
  ok = [0,0]
  cls_cnt = [0,0]

  with torch.no_grad(): # no gradient calculation inside this block
    for data in dl:
      x, y = data[0].to(device), data[1].to(device)
      prediction = model(x)
      
      iter_loss = loss(prediction, y)

      if output is True:
        cls_cnt[y.item()] += 1
        norm_pred = torch.nn.functional.softmax(prediction)
        # print out to debug values
        if norm_pred[0][y.item()] > 0.5:
          ok[y.item()] += 1
      total_loss += iter_loss.item()

  total_loss = total_loss/val_dataset.__len__()
  if output is True:
    print("Mean loss:", total_loss, "total", cls_cnt, "correct", ok)

  return total_loss


In [None]:
study = optuna.create_study(direction="minimize", pruner=optuna.pruners.MedianPruner(n_startup_trials=10, n_warmup_steps=10))
study.optimize(objective, n_trials=100)

[32m[I 2021-04-04 13:49:27,044][0m A new study created in memory with name: no-name-ef5fa6cd-8f2e-40a8-a20c-1c908e8f1a88[0m

The distribution is specified by [0.001, 0.5] and q=0.002, but the range is not divisible by `q`. It will be replaced by [0.001, 0.499].

[32m[I 2021-04-04 13:50:11,051][0m Trial 0 finished with value: 0.6958707416831784 and parameters: {'num_layers': 2, 'out_features_0': 6, 'out_features_1': 2, 'learning_rate': 0.039}. Best is trial 0 with value: 0.6958707416831784.[0m
[32m[I 2021-04-04 13:50:48,074][0m Trial 1 finished with value: 0.6841793399923028 and parameters: {'num_layers': 1, 'out_features_0': 3, 'learning_rate': 0.329}. Best is trial 1 with value: 0.6841793399923028.[0m
[32m[I 2021-04-04 13:51:38,784][0m Trial 2 finished with value: 0.6833625862065463 and parameters: {'num_layers': 3, 'out_features_0': 4, 'out_features_1': 4, 'out_features_2': 6, 'learning_rate': 0.10300000000000001}. Best is trial 2 with value: 0.6833625862065463.[0m
[32m[

In [None]:
best_model = study.best_trial.user_attrs["model"]
validate(best_model, output=True)