<a href="https://colab.research.google.com/github/envomp/2020-Foundations-of-Artificial-Intelligence-and-Machine-Learning/blob/master/NeuralNetwork/classifying_unbalanced_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Unbalanced classification example

Loads CSV files from a Python list, where each file holds the examples of one class. It then searches for the best network configuration and hyperparameters for the model.

In [None]:
import torch
from torch import nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
!pip install optuna
import optuna
from torch.utils.data import TensorDataset, ConcatDataset, DataLoader
import math
import copy

In [None]:
# download the two class CSV files and see what is inside (uncomment the lines below on the first run)
# !wget http://linuxator.com/data/mlaine/data_class1_s.csv
# !wget http://linuxator.com/data/mlaine/data_class2_d.csv

In [None]:
# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
# To use the GPU, enable it under Runtime -> Change Runtime type; it should
# already be enabled in most cases when this sheet is used as a template.
# NB! Be sure to restart the runtime after changing it or you may get some very odd errors!
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')
print("Using", device)

Using cuda


In [None]:
def build_dataset(csv_list, is_big, oversample_factor=None, small_row_limit=4000):
  """Build class-labelled train/validation datasets from one CSV file per class.

  The position of a file in ``csv_list`` is its class label. The first column
  of every file is dropped and the remaining columns become float32 features.
  Every file is split 80%/20% at random into a training and a validation part.
  Class 0 (which should be the under-represented class) is oversampled by
  repeating its rows; this happens after the split so that the same example
  never ends up in both the training and the validation set. Note that the
  validation part of class 0 is repeated as well.

  Args:
    csv_list: sequence of CSV paths, one file per class; index 0 should be the
      under-represented class.
    is_big: when False, only the first ``small_row_limit`` rows of each file
      are used and class 0 is repeated 8 times by default; when True, all rows
      are used and class 0 is repeated 50 times by default.
    oversample_factor: number of times the rows of class 0 are repeated.
      ``None`` (default) selects 50 for ``is_big`` and 8 otherwise.
    small_row_limit: row cap per file applied when ``is_big`` is False;
      ``None`` disables the cap.

  Returns:
    Tuple ``(train_dataset, val_dataset, num_classes, celoss_weights)`` where
    the datasets are ``ConcatDataset`` objects yielding ``(features, label)``
    pairs and ``celoss_weights`` holds, per class, one minus that class's share
    of the (oversampled) training examples.

  Raises:
    ValueError: if ``csv_list`` is empty.
  """
  csv_list = list(csv_list)
  if not csv_list:
    raise ValueError("csv_list must contain at least one CSV file")
  if oversample_factor is None:
    oversample_factor = 50 if is_big else 8

  train_datasets = []
  val_datasets = []
  train_sizes = []
  # each file contains a single class
  for class_num, csv_file in enumerate(csv_list):
    df = pd.read_csv(csv_file)
    if not is_big:
      df = df.iloc[:small_row_limit]
    # drop the first column, keep the rest as float32 features
    x = torch.from_numpy(df.iloc[:, 1:].values.astype(np.float64)).float()

    # split train/val 80%/20% in each CSV file
    train_cnt = math.floor(x.shape[0] * 0.8)
    val_cnt = x.shape[0] - train_cnt

    if class_num == 0:
      # random_split is only used to draw disjoint row indices here
      train_part, val_part = torch.utils.data.random_split(x, [train_cnt, val_cnt])
      # oversample after splitting so no example leaks between the two sets
      train_x = x[train_part.indices].repeat([oversample_factor, 1])
      val_x = x[val_part.indices].repeat([oversample_factor, 1])
      # ground truth tensors: every row carries the class number
      train_y = torch.empty((train_x.shape[0]), dtype=torch.long).fill_(class_num)
      val_y = torch.empty((val_x.shape[0]), dtype=torch.long).fill_(class_num)
      train_dataset = TensorDataset(train_x, train_y)
      val_dataset = TensorDataset(val_x, val_y)
    else:
      # every other class is kept as it is and split the usual way
      y = torch.empty((x.shape[0]), dtype=torch.long).fill_(class_num)
      train_dataset, val_dataset = torch.utils.data.random_split(
          TensorDataset(x, y), [train_cnt, val_cnt])

    train_datasets.append(train_dataset)
    val_datasets.append(val_dataset)
    # remember the number of training rows to derive the loss weights
    train_sizes.append(len(train_dataset))

  num_classes = len(csv_list)
  celoss_weights = 1.0 - (torch.tensor(train_sizes).float() / sum(train_sizes))
  return ConcatDataset(train_datasets), ConcatDataset(val_datasets), num_classes, celoss_weights

In [None]:
# First pass on the reduced data: is_big=False caps every file at 4000 rows.
class_csv_files = ['data_class1_s.csv', 'data_class2_d.csv']
train_dataset, val_dataset, num_classes, celoss_weights = build_dataset(class_csv_files, False)
print(len(train_dataset), len(val_dataset))
print(celoss_weights)

6688 1672
tensor([0.4785, 0.5215])


In [None]:
class ClassificationNetwork(nn.Module):
  """Fully connected classifier whose architecture is sampled from a trial.

  The trial object chooses the number of hidden layers (1 to 3, parameter
  ``num_layers``) and the width of each one (2 to 10, parameters
  ``out_features_<n>``). Every hidden ``nn.Linear`` layer is initialised with
  Xavier-uniform weights and zero bias and is followed by a sigmoid. The last
  linear layer keeps PyTorch's default initialisation and returns one raw
  score (logit) per class; no softmax is applied.

  Args:
    num_classes: number of output classes.
    trial: object providing ``suggest_int(name, low, high)``, e.g. an Optuna
      trial.
    in_features: number of input features per example (default 8, the value
      that used to be hard-coded).
  """

  def __init__(self, num_classes, trial, in_features=8):
    super().__init__()
    self.num_classes = num_classes
    self.in_features = in_features
    num_layers = trial.suggest_int("num_layers", 1, 3)
    layers = []

    # width of the previous layer; starts at the input width
    prev_features = in_features
    for layer_num in range(num_layers):
      out_features = trial.suggest_int("out_features_{}".format(layer_num), 2, 10)
      layer = nn.Linear(in_features=prev_features, out_features=out_features, bias=True)
      nn.init.xavier_uniform_(layer.weight)
      nn.init.zeros_(layer.bias)
      layers.append(layer)
      layers.append(nn.Sigmoid())
      prev_features = out_features

    # output layer: one logit per class
    layers.append(nn.Linear(in_features=prev_features, out_features=num_classes))
    self.model = nn.Sequential(*layers)

  def forward(self, data):
    """Return the class logits for a batch of shape (batch, in_features)."""
    return self.model(data)


In [None]:
def objective(trial):
  """Optuna objective: train one sampled network and return its validation loss.

  Reads the module-level ``num_classes``, ``train_dataset``, ``celoss_weights``
  and ``device``. The network architecture and the SGD learning rate are
  sampled from ``trial``. After every epoch the model and the mean training
  loss are stored as trial user attributes (``"model"`` and ``"train_loss"``),
  the validation loss is reported to the trial, and the trial is pruned if
  the trial asks for it.

  Args:
    trial: the Optuna trial that supplies the hyperparameters.

  Returns:
    The validation loss measured after the last epoch.

  Raises:
    optuna.TrialPruned: when ``trial.should_prune()`` is true after an epoch.
  """
  model = ClassificationNetwork(num_classes, trial).to(device)

  batch_size = 1
  dl = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

  # suggest_float(step=...) replaces the deprecated suggest_discrete_uniform
  learning_rate = trial.suggest_float("learning_rate", 0.001, 0.499, step=0.002)
  optimiser = torch.optim.SGD(model.parameters(), lr=learning_rate)
  # With the default reduction='mean' the weighted loss of a batch is divided
  # by the sum of the weights of its samples, so with single-sample batches the
  # class weight cancels out. Summing and dividing by the batch size keeps the
  # class weights effective for any batch size.
  loss = nn.CrossEntropyLoss(weight=celoss_weights.to(device), reduction="sum")

  epochs = 5
  for epoch in range(epochs):
    model.train()

    epoch_loss = 0.0
    for x, y in dl:
      x, y = x.to(device), y.to(device)
      optimiser.zero_grad()
      prediction = model(x)
      iter_loss = loss(prediction, y) / y.shape[0]
      iter_loss.backward()
      optimiser.step()
      # .item() yields a plain float, so no autograd history is accumulated
      epoch_loss += iter_loss.item()

    trial.set_user_attr("train_loss", epoch_loss / len(dl))
    # keep a handle on the model so the best one can be fetched from the study
    trial.set_user_attr("model", model)
    val_loss = validate(model)
    trial.report(val_loss, epoch)
    if trial.should_prune():
      raise optuna.TrialPruned()

  return val_loss


# Let's write the validation code!

1. Put model into evaluation mode
2. Use model to predict on validation data
3. Evaluate results using loss function

Evaluation

1. Does not update model parameters
2. Hence no need to compute gradients

In [None]:
def validate(model, output=False, batch_size=256):
  """Evaluate ``model`` on the module-level ``val_dataset``.

  Reads the module-level ``val_dataset``, ``celoss_weights`` and ``device``.
  The model is put into evaluation mode and no gradients are computed.

  The returned value is the class-weighted cross-entropy summed over all
  validation examples and divided by the number of examples. The sum is taken
  with ``reduction='sum'`` because the default ``'mean'`` divides each batch by
  the sum of its samples' weights, which cancels the class weight completely
  for single-sample batches. Summing also makes the result independent of
  ``batch_size``.

  Args:
    model: the network to evaluate; called with a batch of features and
      expected to return one logit per class.
    output: when true, also print the mean loss, the number of examples per
      class and the number of examples per class whose true class received a
      softmax probability above 0.5.
    batch_size: evaluation batch size (affects speed only).

  Returns:
    The mean weighted loss as a float.

  Raises:
    ValueError: if the validation dataset is empty.
  """
  model.eval()
  class_weights = celoss_weights.to(device)
  loss = nn.CrossEntropyLoss(weight=class_weights, reduction="sum")

  num_samples = len(val_dataset)
  if num_samples == 0:
    raise ValueError("validation dataset is empty")
  dl = DataLoader(val_dataset, batch_size=batch_size)

  # one counter per class; the weight vector has one entry per class
  n_classes = class_weights.shape[0]
  ok = [0] * n_classes
  cls_cnt = [0] * n_classes
  total_loss = 0.0

  with torch.no_grad(): # no gradient calculation inside this block
    for data in dl:
      x, y = data[0].to(device), data[1].to(device)
      prediction = model(x)
      total_loss += loss(prediction, y).item()

      if output:
        norm_pred = torch.nn.functional.softmax(prediction, dim=1)
        for probs, target in zip(norm_pred.tolist(), y.tolist()):
          cls_cnt[target] += 1
          if probs[target] > 0.5:
            ok[target] += 1

  total_loss = total_loss / num_samples
  if output:
    print("Mean loss:", total_loss, "total", cls_cnt, "correct", ok)

  return total_loss



In [None]:
# Run ten trials of `objective`, looking for the lowest validation loss.
study = optuna.create_study(direction="minimize")
study.optimize(func=objective, n_trials=10)

[32m[I 2021-04-04 22:05:49,288][0m A new study created in memory with name: no-name-4e48b1fc-a430-4adf-8194-37e4a5c06a1e[0m
[32m[I 2021-04-04 22:10:17,610][0m Trial 0 finished with value: 0.7731490429206095 and parameters: {'num_layers': 1, 'out_features_0': 5, 'learning_rate': 0.447}. Best is trial 0 with value: 0.7731490429206095.[0m
[32m[I 2021-04-04 22:15:40,234][0m Trial 1 finished with value: 0.6748915778337348 and parameters: {'num_layers': 2, 'out_features_0': 8, 'out_features_1': 5, 'learning_rate': 0.027000000000000003}. Best is trial 1 with value: 0.6748915778337348.[0m
[32m[I 2021-04-04 22:20:07,782][0m Trial 2 finished with value: 0.6837587257760975 and parameters: {'num_layers': 1, 'out_features_0': 5, 'learning_rate': 0.069}. Best is trial 1 with value: 0.6748915778337348.[0m
[32m[I 2021-04-04 22:26:20,425][0m Trial 3 finished with value: 0.7016811350211475 and parameters: {'num_layers': 3, 'out_features_0': 8, 'out_features_1': 4, 'out_features_2': 5, 'lea

In [None]:
# Fetch the network stored by the best trial and print its per-class results.
best_trial = study.best_trial
best_model = best_trial.user_attrs["model"]
validate(best_model, output=True)

Mean loss: 0.6702573117621278 total [5450, 8485] correct [0, 8485]


0.6702573117621278

In [None]:
# Now try the real data: rebuild the datasets from the full CSV files.
# validate() reads the module-level val_dataset, so the names are rebound here.
full_csv_files = ['data_class1_s.csv', 'data_class2_d.csv']
train_dataset, val_dataset, num_classes, celoss_weights = build_dataset(full_csv_files, True)
print(len(train_dataset), len(val_dataset))
print(celoss_weights)

validate(best_model, output=True)

55739 13935
tensor([0.6089, 0.3911])
Mean loss: 0.6702573117621278 total [5450, 8485] correct [0, 8485]


0.6702573117621278

Since 98.737% of the real data belongs to a single class, always predicting that class would already give an average success rate of 98.737%.

There are only barely over 500 instances of one class, compared with over 42000 instances of the other class.

After spending 13+ hours on this problem, I must admit that it has beaten me. I didn't find a good way of solving it using machine learning.

I also tried the SimpleKMeans, RandomForest and RandomTree classifiers using Weka Explorer.

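Of these, RandomTree gave good results after the following modifications (note that the run shown below was evaluated on the training data, so its accuracy is an optimistic estimate):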
- both CSV files were merged into one.
- columns were named with numbers 1 to 9.
- new column "res" was added which indicated the result class. (0 or 1)
- first column was removed as it held no significant information.

Command run: `RandomTree -K 0 -M 1.0 -V 0.001 -S 1`

```

=== Run information ===

Scheme:       weka.classifiers.trees.RandomTree -K 0 -M 1.0 -V 0.001 -S 1
Relation:     data_class2_d-weka.filters.unsupervised.attribute.Remove-R1
Instances:    42971
Attributes:   9
              2
              3
              4
              5
              6
              7
              8
              9
              res
Test mode:    evaluate on training data

=== Classifier model (full training set) ===


RandomTree
==========

5 < 1359990659
|   3 < 3.5
|   |   9 < 0.45 : 0 (38376/0)
|   |   9 >= 0.45
|   |   |   7 < 5.11 : 1 (2/0)
|   |   |   7 >= 5.11
|   |   |   |   2 < 6392.32
|   |   |   |   |   3 < 1.5
|   |   |   |   |   |   6 < 1400333549.5 : 1 (4/0)
|   |   |   |   |   |   6 >= 1400333549.5 : 0 (1877/0)
|   |   |   |   |   3 >= 1.5 : 0 (462/0)
|   |   |   |   2 >= 6392.32 : 1 (1/0)
|   3 >= 3.5
|   |   7 < 1021.01
|   |   |   2 < 170.33
|   |   |   |   7 < 809.39
|   |   |   |   |   8 < 1396.91
|   |   |   |   |   |   8 < 1065.57
|   |   |   |   |   |   |   8 < 0.02
|   |   |   |   |   |   |   |   8 < 0.01
|   |   |   |   |   |   |   |   |   5 < 1305287015
|   |   |   |   |   |   |   |   |   |   6 < 1398804747 : 1 (1/0)
|   |   |   |   |   |   |   |   |   |   6 >= 1398804747 : 0 (38/0)
|   |   |   |   |   |   |   |   |   5 >= 1305287015 : 0 (325/0)
|   |   |   |   |   |   |   |   8 >= 0.01 : 1 (1/0)
|   |   |   |   |   |   |   8 >= 0.02 : 0 (1099/0)
|   |   |   |   |   |   8 >= 1065.57
|   |   |   |   |   |   |   3 < 36.5 : 0 (7/0)
|   |   |   |   |   |   |   3 >= 36.5 : 1 (1/0)
|   |   |   |   |   8 >= 1396.91
|   |   |   |   |   |   2 < 38.16 : 1 (1/0)
|   |   |   |   |   |   2 >= 38.16
|   |   |   |   |   |   |   4 < 0.5 : 0 (1/0)
|   |   |   |   |   |   |   4 >= 0.5 : 0.5 (6/0.25)
|   |   |   |   7 >= 809.39
|   |   |   |   |   9 < 0.29 : 0 (4/0)
|   |   |   |   |   9 >= 0.29
|   |   |   |   |   |   4 < 30 : 0 (1/0)
|   |   |   |   |   |   4 >= 30 : 1 (2/0)
|   |   |   2 >= 170.33
|   |   |   |   6 < 1428003360
|   |   |   |   |   5 < 1347153618
|   |   |   |   |   |   6 < 1411012367 : 0 (2/0)
|   |   |   |   |   |   6 >= 1411012367
|   |   |   |   |   |   |   2 < 206.49 : 0.5 (4/0.25)
|   |   |   |   |   |   |   2 >= 206.49
|   |   |   |   |   |   |   |   2 < 227.78 : 0 (1/0)
|   |   |   |   |   |   |   |   2 >= 227.78
|   |   |   |   |   |   |   |   |   3 < 8.5 : 0 (1/0)
|   |   |   |   |   |   |   |   |   3 >= 8.5 : 0.5 (4/0.25)
|   |   |   |   |   5 >= 1347153618 : 1 (1/0)
|   |   |   |   6 >= 1428003360
|   |   |   |   |   8 < 1500 : 0 (66/0)
|   |   |   |   |   8 >= 1500 : 0.5 (2/0.25)
|   |   7 >= 1021.01
|   |   |   8 < 1460.5 : 0 (62/0)
|   |   |   8 >= 1460.5
|   |   |   |   6 < 1427998514
|   |   |   |   |   5 < 1312184732
|   |   |   |   |   |   2 < 380.36
|   |   |   |   |   |   |   2 < 366.07
|   |   |   |   |   |   |   |   8 < 2582.97 : 0 (3/0)
|   |   |   |   |   |   |   |   8 >= 2582.97 : 0.5 (2/0.25)
|   |   |   |   |   |   |   2 >= 366.07 : 0.5 (2/0.25)
|   |   |   |   |   |   2 >= 380.36 : 0 (9/0)
|   |   |   |   |   5 >= 1312184732
|   |   |   |   |   |   5 < 1334063058 : 0.5 (6/0.25)
|   |   |   |   |   |   5 >= 1334063058
|   |   |   |   |   |   |   5 < 1338578613 : 0 (1/0)
|   |   |   |   |   |   |   5 >= 1338578613 : 0.5 (2/0.25)
|   |   |   |   6 >= 1427998514
|   |   |   |   |   4 < 52.5
|   |   |   |   |   |   2 < 1635.85
|   |   |   |   |   |   |   3 < 4.5
|   |   |   |   |   |   |   |   7 < 1884.78
|   |   |   |   |   |   |   |   |   5 < 1296852383
|   |   |   |   |   |   |   |   |   |   5 < 1287477484.5 : 0.5 (2/0.25)
|   |   |   |   |   |   |   |   |   |   5 >= 1287477484.5 : 0 (1/0)
|   |   |   |   |   |   |   |   |   5 >= 1296852383 : 0.5 (12/0.25)
|   |   |   |   |   |   |   |   7 >= 1884.78 : 0.5 (18/0.25)
|   |   |   |   |   |   |   3 >= 4.5
|   |   |   |   |   |   |   |   8 < 3205.35
|   |   |   |   |   |   |   |   |   5 < 1322418959.5
|   |   |   |   |   |   |   |   |   |   8 < 1669.21 : 0.5 (4/0.25)
|   |   |   |   |   |   |   |   |   |   8 >= 1669.21
|   |   |   |   |   |   |   |   |   |   |   3 < 5.5 : 0 (4/0)
|   |   |   |   |   |   |   |   |   |   |   3 >= 5.5
|   |   |   |   |   |   |   |   |   |   |   |   5 < 1293890987.5 : 0.5 (4/0.25)
|   |   |   |   |   |   |   |   |   |   |   |   5 >= 1293890987.5 : 0 (2/0)
|   |   |   |   |   |   |   |   |   5 >= 1322418959.5
|   |   |   |   |   |   |   |   |   |   8 < 2000 : 0.5 (8/0.25)
|   |   |   |   |   |   |   |   |   |   8 >= 2000
|   |   |   |   |   |   |   |   |   |   |   2 < 333.33 : 0 (1/0)
|   |   |   |   |   |   |   |   |   |   |   2 >= 333.33 : 0.5 (2/0.25)
|   |   |   |   |   |   |   |   8 >= 3205.35
|   |   |   |   |   |   |   |   |   3 < 22.5
|   |   |   |   |   |   |   |   |   |   3 < 19
|   |   |   |   |   |   |   |   |   |   |   2 < 428.06
|   |   |   |   |   |   |   |   |   |   |   |   2 < 408.89
|   |   |   |   |   |   |   |   |   |   |   |   |   8 < 3359 : 0.5 (6/0.25)
|   |   |   |   |   |   |   |   |   |   |   |   |   8 >= 3359
|   |   |   |   |   |   |   |   |   |   |   |   |   |   5 < 1301591468.5 : 0.5 (2/0.25)
|   |   |   |   |   |   |   |   |   |   |   |   |   |   5 >= 1301591468.5 : 0 (1/0)
|   |   |   |   |   |   |   |   |   |   |   |   2 >= 408.89 : 0 (1/0)
|   |   |   |   |   |   |   |   |   |   |   2 >= 428.06
|   |   |   |   |   |   |   |   |   |   |   |   5 < 1297253869.5
|   |   |   |   |   |   |   |   |   |   |   |   |   3 < 14.5 : 0.5 (6/0.25)
|   |   |   |   |   |   |   |   |   |   |   |   |   3 >= 14.5 : 0 (2/0)
|   |   |   |   |   |   |   |   |   |   |   |   5 >= 1297253869.5
|   |   |   |   |   |   |   |   |   |   |   |   |   2 < 650.11 : 0.5 (30/0.25)
|   |   |   |   |   |   |   |   |   |   |   |   |   2 >= 650.11
|   |   |   |   |   |   |   |   |   |   |   |   |   |   2 < 683.33 : 0 (1/0)
|   |   |   |   |   |   |   |   |   |   |   |   |   |   2 >= 683.33 : 0.5 (6/0.25)
|   |   |   |   |   |   |   |   |   |   3 >= 19 : 0 (1/0)
|   |   |   |   |   |   |   |   |   3 >= 22.5 : 0.5 (16/0.25)
|   |   |   |   |   |   2 >= 1635.85
|   |   |   |   |   |   |   2 < 6373.01 : 0 (2/0)
|   |   |   |   |   |   |   2 >= 6373.01 : 0.5 (2/0.25)
|   |   |   |   |   4 >= 52.5
|   |   |   |   |   |   2 < 365.26 : 0.5 (2/0.25)
|   |   |   |   |   |   2 >= 365.26 : 1 (1/0)
5 >= 1359990659 : 1 (457/0)

Size of the tree : 123

Time taken to build model: 0.05 seconds
```

In [None]:
import csv


def _count_correct(rows):
    """Count correct predictions in parsed rows of the prediction CSV.

    A data row holds the instance number, the actual value, the predicted
    label and a fourth field that is not used. Rows whose first field is not a
    plain number (header, blank lines) or that have fewer than three fields
    are skipped. The actual class is 1 when the actual value is >= 0.5 and 0
    otherwise. A prediction of '?' (no prediction) defaults to class 0, so it
    is only correct when the actual class is 0.

    Returns:
        Tuple ``(correct, total)``.
    """
    correct = 0
    total = 0
    for row in rows:
        if len(row) < 3 or not row[0].strip().isdigit():
            continue
        actual, predicted = row[1].strip(), row[2].strip()
        total += 1

        if predicted == '?':
            predicted = "0"  # on failure default to class 0
        expected = "1" if float(actual) >= 0.5 else "0"
        if predicted == expected:
            correct += 1
    return correct, total


# newline='' is what the csv module recommends for files passed to csv.reader
with open('res.csv', 'r', newline='') as file:
    correct, total = _count_correct(csv.reader(file, delimiter=','))

# avoid a ZeroDivisionError when the file holds no data rows
percentage = round(correct / total * 100, 2) if total else 0.0
print("Correct guesses out of all: " + str(correct) + " / " + str(total))
print("Percentually it is: " + str(percentage) + "%")


Correct guesses out of all: 42823 / 42971
Percentually it is: 99.66%


All in all, it was a good exercise, but I was unsuccessful in choosing and implementing a high-performing neural network.