<a href="https://colab.research.google.com/github/dinaldoap/jit-sdp-nn/blob/master/notebook/mlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data
import random
from scipy.stats import mstats

from jitsdp import metrics

In [3]:
df = pd.read_csv('https://raw.githubusercontent.com/dinaldoap/jit-sdp-data/master/jenkins.csv')
df.head()

Unnamed: 0,fix,ns,nd,nf,entropy,la,ld,lt,ndev,age,nuc,exp,rexp,sexp,author_date_unix_timestamp,classification,contains_bug
0,False,7.0,7.0,7.0,2.641604,9.0,9.0,426.428571,100.0,9.3e-05,1.0,5171.0,30.227271,1472.714286,1555326371,,False
1,False,7.0,7.0,7.0,2.75,8.0,8.0,426.428571,100.0,6.314775,2.0,5170.0,29.227271,1471.714286,1555326363,,False
2,False,1.0,1.0,2.0,0.90658,15.0,44.0,96.0,4.0,0.034722,2.0,629.0,14.828373,414.0,1554971763,,False
3,False,1.0,1.0,1.0,0.0,0.0,0.0,40.0,1.0,1.2e-05,1.0,4.0,3.058824,3.0,1554969774,,False
4,False,1.0,2.0,4.0,1.662506,14.0,10.0,67.0,6.0,21.280683,4.0,3.0,2.058824,2.0,1554967752,Feature Addition,False


In [4]:
#df = df.sample(frac=1)
label_col = 'contains_bug'
features_cols = ['fix', 'ns', 'nd', 'nf', 'entropy', 'la', 'ld', 'lt', 'ndev', 'age', 'nuc', 'exp', 'rexp', 'sexp', 'classification']
x = df[features_cols]
x['fix'] = x['fix'].astype('int')
df_classification = pd.get_dummies(x, columns=['classification'])
x = pd.concat([x, df_classification], axis='columns')
x = x.drop(['classification'], axis='columns')
x = x.values
y = df[label_col]
y = y.astype('category')
y = y.cat.codes
y = y.values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [5]:
print(x[:1])
print(y[:1])

[[0.00000000e+00 7.00000000e+00 7.00000000e+00 7.00000000e+00
  2.64160417e+00 9.00000000e+00 9.00000000e+00 4.26428571e+02
  1.00000000e+02 9.25925926e-05 1.00000000e+00 5.17100000e+03
  3.02272705e+01 1.47271429e+03 0.00000000e+00 7.00000000e+00
  7.00000000e+00 7.00000000e+00 2.64160417e+00 9.00000000e+00
  9.00000000e+00 4.26428571e+02 1.00000000e+02 9.25925926e-05
  1.00000000e+00 5.17100000e+03 3.02272705e+01 1.47271429e+03
  0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00
  0.00000000e+00 0.00000000e+00]]
[0]


In [6]:
val_index = int( len(x) * 0.8 )
test_index = int( len(x) * 0.9 )
x_train, y_train = x[:val_index], y[:val_index]
x_val, y_val = x[val_index:test_index], y[val_index:test_index]
x_test, y_test = x[test_index:], y[test_index:]

In [7]:
print(x_train[:1])
print(x_val[:1])
print(x_test[:1])

[[0.00000000e+00 7.00000000e+00 7.00000000e+00 7.00000000e+00
  2.64160417e+00 9.00000000e+00 9.00000000e+00 4.26428571e+02
  1.00000000e+02 9.25925926e-05 1.00000000e+00 5.17100000e+03
  3.02272705e+01 1.47271429e+03 0.00000000e+00 7.00000000e+00
  7.00000000e+00 7.00000000e+00 2.64160417e+00 9.00000000e+00
  9.00000000e+00 4.26428571e+02 1.00000000e+02 9.25925926e-05
  1.00000000e+00 5.17100000e+03 3.02272705e+01 1.47271429e+03
  0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00
  0.00000000e+00 0.00000000e+00]]
[[0.00000000e+00 8.00000000e+00 8.00000000e+00 8.00000000e+00
  2.73215889e+00 1.10000000e+01 1.10000000e+01 1.58625000e+02
  5.00000000e+00 1.96759259e-04 1.00000000e+00 4.17300000e+03
  2.45570735e+02 8.19125000e+02 0.00000000e+00 8.00000000e+00
  8.00000000e+00 8.00000000e+00 2.73215889e+00 1.10000000e+01
  1.10000000e+01 1.58625000e+02 5.00000000e+00 1.96759259e-04
  1.00000000e+00 4.17300000e+03 2.45570735e+02 8.19125000e+02
  0.00000000e+00 0.00000000e+00 0.00

In [8]:
mean_train = x_train.mean(axis=0)
std_train = x_train.std(axis=0)

In [9]:
print(mean_train)
print(std_train)

[2.90871477e-01 1.61432635e+00 2.60858547e+00 4.18908715e+00
 7.43390526e-01 5.00800621e+01 2.96331523e+01 8.11968929e+02
 2.32963538e+01 9.76994760e+01 2.02296354e+00 1.73095790e+03
 6.53773667e+01 8.28714959e+02 2.90871477e-01 1.61432635e+00
 2.60858547e+00 4.18908715e+00 7.43390526e-01 5.00800621e+01
 2.96331523e+01 8.11968929e+02 2.32963538e+01 9.76994760e+01
 2.02296354e+00 1.73095790e+03 6.53773667e+01 8.28714959e+02
 2.90871477e-01 1.47142488e-01 3.46521852e-02 4.71373157e-01
 1.51538660e-02 4.08068270e-02]
[4.54164354e-01 1.62736223e+00 6.93052365e+00 2.56616350e+01
 1.13236779e+00 5.86250473e+02 6.19852440e+02 1.15236538e+03
 2.84287684e+01 2.37802272e+02 7.67653474e+00 2.11471132e+03
 6.96820390e+01 1.34456679e+03 4.54164354e-01 1.62736223e+00
 6.93052365e+00 2.56616350e+01 1.13236779e+00 5.86250473e+02
 6.19852440e+02 1.15236538e+03 2.84287684e+01 2.37802272e+02
 7.67653474e+00 2.11471132e+03 6.96820390e+01 1.34456679e+03
 4.54164354e-01 3.54247902e-01 1.82897270e-01 4.99179

In [10]:
x_train = (x_train - mean_train) / std_train
x_val = (x_val - mean_train) / std_train
x_test = (x_test - mean_train) / std_train

In [11]:
print(x_train[:1])
print(x_val[:1])
print(x_test[:1])

[[-0.64045422  3.30944983  0.63363387  0.10953756  1.67632253 -0.07007254
  -0.0332872  -0.33456434  2.69809951 -0.41084293 -0.1332585   1.62671948
  -0.50443553  0.47896418 -0.64045422  3.30944983  0.63363387  0.10953756
   1.67632253 -0.07007254 -0.0332872  -0.33456434  2.69809951 -0.41084293
  -0.1332585   1.62671948 -0.50443553  0.47896418 -0.64045422 -0.41536587
  -0.18946256  1.05899079 -0.1240445  -0.20625922]]
[[-0.64045422  3.92394118  0.77792311  0.14850624  1.75629188 -0.06666103
  -0.03006063 -0.566959   -0.64358587 -0.4108425  -0.1332585   1.15478745
   2.58593708 -0.00713238 -0.64045422  3.92394118  0.77792311  0.14850624
   1.75629188 -0.06666103 -0.03006063 -0.566959   -0.64358587 -0.4108425
  -0.1332585   1.15478745  2.58593708 -0.00713238 -0.64045422 -0.41536587
  -0.18946256  1.05899079 -0.1240445  -0.20625922]]
[[-0.64045422 -0.37749822 -0.23210158 -0.12427451 -0.65649211 -0.08371859
  -0.0461935  -0.5041534  -0.74911278 -0.41080896 -0.1332585   0.26435859
   2.4362

In [12]:
def calc_fading_weights(size, fading_factor):  
  fading_weights = reversed(range(size))
  return [fading_factor**x for x in fading_weights]
  
def create_sampler(y):
  n_samples = len(y)
  fading_count = calc_fading_weights(n_samples, 0.9999) 
  total = np.sum(fading_count)
  bug = np.sum(fading_count * y)
  normal = total - bug
  class_weights = total / [normal, bug]
  class_weights = class_weights[y]

  instance_weights = calc_fading_weights(n_samples, 0.9999) 
  weights = instance_weights * class_weights
  return data.WeightedRandomSampler(weights=weights, num_samples=n_samples, replacement=True)

In [13]:
sampler = create_sampler(y_train)

In [14]:
x_train, y_train = torch.from_numpy(x_train), torch.from_numpy(y_train)
x_val, y_val = torch.from_numpy(x_val), torch.from_numpy(y_val)
x_test, y_test = torch.from_numpy(x_test), torch.from_numpy(y_test)

In [15]:
train_dataset = data.TensorDataset(x_train, y_train)
val_dataset = data.TensorDataset(x_val, y_val)
test_dataset = data.TensorDataset(x_test, y_test)

train_dataloader = data.DataLoader(train_dataset, batch_size=512, sampler=sampler)
val_dataloader = data.DataLoader(val_dataset, batch_size=32)
test_dataloader = data.DataLoader(test_dataset, batch_size=32)

In [16]:
inputs, targets = next(iter(train_dataloader))
print(inputs[:1])
print(targets[:1])

inputs, targets = next(iter(val_dataloader))
print(inputs[:1])
print(targets[:1])

inputs, targets = next(iter(test_dataloader))
print(inputs[:1])
print(targets[:1])

tensor([[-0.6405, -0.3775, -0.2321, -0.1243, -0.6565, -0.0786, -0.0381,  0.5771,
         -0.4677, -0.3812, -0.1333,  1.9355,  1.8576,  2.9856, -0.6405, -0.3775,
         -0.2321, -0.1243, -0.6565, -0.0786, -0.0381,  0.5771, -0.4677, -0.3812,
         -0.1333,  1.9355,  1.8576,  2.9856, -0.6405, -0.4154, -0.1895,  1.0590,
         -0.1240, -0.2063]], dtype=torch.float64)
tensor([1], dtype=torch.int8)
tensor([[-0.6405,  3.9239,  0.7779,  0.1485,  1.7563, -0.0667, -0.0301, -0.5670,
         -0.6436, -0.4108, -0.1333,  1.1548,  2.5859, -0.0071, -0.6405,  3.9239,
          0.7779,  0.1485,  1.7563, -0.0667, -0.0301, -0.5670, -0.6436, -0.4108,
         -0.1333,  1.1548,  2.5859, -0.0071, -0.6405, -0.4154, -0.1895,  1.0590,
         -0.1240, -0.2063]], dtype=torch.float64)
tensor([0], dtype=torch.int8)
tensor([[-0.6405, -0.3775, -0.2321, -0.1243, -0.6565, -0.0837, -0.0462, -0.5042,
         -0.7491, -0.4108, -0.1333,  0.2644,  2.4362,  0.8481, -0.6405, -0.3775,
         -0.2321, -0.1243, -0.

In [35]:
class Classifier(nn.Module):
  FILENAME = 'models/classifier.cpt'
  def __init__(self, input_size, hidden_size, drop_prob, epoch=None, val_gmean=None):
    super(Classifier, self).__init__()
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.drop_prob = drop_prob
    self.epoch = epoch
    self.val_gmean = val_gmean
    self.fc1 = nn.Linear(input_size, hidden_size)
    self.fcout = nn.Linear(hidden_size, 1)
    self.dropout = nn.Dropout(drop_prob)
  
  def forward(self, x):
    x = torch.relu(self.fc1(x))
    x = self.dropout(x)
    x = torch.sigmoid(self.fcout(x))
    return x

  def save(self):
    checkpoint = {
        'input_size': self.input_size,
        'hidden_size': self.hidden_size,
        'drop_prob': self.drop_prob,
        'val_gmean': self.val_gmean,
        'epoch': self.epoch,
        'state_dict': self.state_dict()
    }
    with open(Classifier.FILENAME, 'wb') as f:
      torch.save(checkpoint, f)

  def load(self):
    with open(Classifier.FILENAME, 'rb') as f:
      checkpoint = torch.load(f)
      self.input_size = checkpoint['input_size']
      self.hidden_size = checkpoint['hidden_size']
      self.drop_prob = checkpoint['drop_prob']
      self.epoch = checkpoint['epoch']
      self.val_gmean = checkpoint['val_gmean']
      self.load_state_dict(checkpoint['state_dict'])

In [32]:
classifier = Classifier(input_size=x.shape[1], hidden_size=x.shape[1], drop_prob=0.5)
classifier

In [33]:
classifier.save()
classifier.load()
classifier

In [34]:
if torch.cuda.is_available():
  classifier = classifier.cuda()
  inputs = inputs.cuda()

classifier(inputs[:3].float())

tensor([[0.5747],
        [0.4403],
        [0.6097]], grad_fn=<SigmoidBackward>)

In [26]:
criterion = nn.BCELoss()
optimizer = optim.Adam(params=classifier.parameters(), lr=0.003)

In [27]:
train_loss = 0
epochs = 200
for epoch in range(epochs):
  classifier.train()
  for inputs, targets in train_dataloader:
    if torch.cuda.is_available():
      inputs, targets = inputs.cuda(), targets.cuda()

    outputs = classifier(inputs.float())
    loss = criterion(outputs.squeeze(), targets.float())
    train_loss += loss.item()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  train_loss = train_loss / len(train_dataloader)
  val_loss = metrics.loss(classifier, val_dataloader, criterion)
  val_gmean, _ = metrics.gmean(classifier, val_dataloader)
  print('Epoch: {}, Train loss: {}, Val loss: {}, Val g-mean: {}'.format(epoch, train_loss, val_loss, val_gmean))

  if classifier.val_gmean is None or val_gmean > classifier.val_gmean:
    classifier.epoch = epoch
    classifier.val_gmean = val_gmean
    classifier.save()

classifier.epoch = epoch
classifier.val_gmean = val_gmean

n: 0.7146341935217179
Epoch: 9, Train loss: 0.5626559176440501, Val loss: 0.5275704515607733, Val g-mean: 0.714822330035359
Epoch: 10, Train loss: 0.5582143270090888, Val loss: 0.5511370650247523, Val g-mean: 0.7190957145724548
Epoch: 11, Train loss: 0.5641191575692298, Val loss: 0.5180769962699789, Val g-mean: 0.7134546727584911
Epoch: 12, Train loss: 0.5598072157977082, Val loss: 0.5099656593642736, Val g-mean: 0.7239005990901451
Epoch: 13, Train loss: 0.5540287942914536, Val loss: 0.5080413998741853, Val g-mean: 0.717797671898113
Epoch: 14, Train loss: 0.552680328883697, Val loss: 0.5151356046921328, Val g-mean: 0.7238348921244395
Epoch: 15, Train loss: 0.5450032028032491, Val loss: 0.5131815275863597, Val g-mean: 0.7229667687913341
Epoch: 16, Train loss: 0.5521451869357917, Val loss: 0.5028435399657801, Val g-mean: 0.7232398075301184
Epoch: 17, Train loss: 0.5452191626783769, Val loss: 0.5105633704285872, Val g-mean: 0.7248085496947541
Epoch: 18, Train loss: 0.5486727163346101, Val

In [28]:
def evaluate(classifier):
  if torch.cuda.is_available():
    classifier = classifier.cuda()
  train_gmean, _ = metrics.gmean(classifier, train_dataloader)
  test_gmean, test_recalls = metrics.gmean(classifier, test_dataloader)
  print('Epoch: {}'.format(classifier.epoch))
  print('Train g-mean: {}'.format(train_gmean.item()))
  print('Val g-mean: {}'.format(classifier.val_gmean))
  print('Test g-mean: {}, recalls: {}'.format(test_gmean.item(), test_recalls))

In [29]:
print('Best classifier')
evaluate(Classifier.load())

Best classifier
Epoch: 176
Train g-mean: 0.779200493299207
Val g-mean: 0.7671735531003301
Test g-mean: 0.7363740914430342, recalls: [0.75900849 0.71441468]


In [30]:
print('Last classifier')
evaluate(classifier)

Last classifier
Epoch: 199
Train g-mean: 0.7662040990564849
Val g-mean: 0.7507611440742215
Test g-mean: 0.7402974366118688, recalls: [0.7769758  0.70535053]
