<a href="https://colab.research.google.com/github/dinaldoap/jit-sdp-nn/blob/master/notebook/mlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data
import random
from scipy.stats import mstats

In [28]:
df = pd.read_csv('https://raw.githubusercontent.com/dinaldoap/jit-sdp-data/master/jenkins.csv')
df.head()

Unnamed: 0,fix,ns,nd,nf,entropy,la,ld,lt,ndev,age,nuc,exp,rexp,sexp,author_date_unix_timestamp,classification,contains_bug
0,False,7.0,7.0,7.0,2.641604,9.0,9.0,426.428571,100.0,9.3e-05,1.0,5171.0,30.227271,1472.714286,1555326371,,False
1,False,7.0,7.0,7.0,2.75,8.0,8.0,426.428571,100.0,6.314775,2.0,5170.0,29.227271,1471.714286,1555326363,,False
2,False,1.0,1.0,2.0,0.90658,15.0,44.0,96.0,4.0,0.034722,2.0,629.0,14.828373,414.0,1554971763,,False
3,False,1.0,1.0,1.0,0.0,0.0,0.0,40.0,1.0,1.2e-05,1.0,4.0,3.058824,3.0,1554969774,,False
4,False,1.0,2.0,4.0,1.662506,14.0,10.0,67.0,6.0,21.280683,4.0,3.0,2.058824,2.0,1554967752,Feature Addition,False


In [29]:
df = df.sample(frac=1)
label_col = 'contains_bug'
features_cols = ['fix', 'ns', 'nd', 'nf', 'entropy', 'la', 'ld', 'lt', 'ndev', 'age', 'nuc', 'exp', 'rexp', 'sexp', 'classification']
x = df[features_cols]
x['fix'] = x['fix'].astype('int')
df_classification = pd.get_dummies(x, columns=['classification'])
x = pd.concat([x, df_classification], axis='columns')
x = x.drop(['classification'], axis='columns')
x = x.values
y = df[label_col]
y = y.astype('category')
y = y.cat.codes
y = y.values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [30]:
print(x[:3])
print(y[:3])

[[0.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e+00
  0.00000000e+00 1.00000000e+00 1.00000000e+00 1.14300000e+03
  2.90000000e+01 2.23151620e+00 1.00000000e+00 1.35000000e+02
  1.53895529e+01 9.00000000e+01 0.00000000e+00 1.00000000e+00
  1.00000000e+00 1.00000000e+00 0.00000000e+00 1.00000000e+00
  1.00000000e+00 1.14300000e+03 2.90000000e+01 2.23151620e+00
  1.00000000e+00 1.35000000e+02 1.53895529e+01 9.00000000e+01
  0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 1.00000000e+00 3.00000000e+00 3.00000000e+00
  1.55665671e+00 4.00000000e+00 3.00000000e+00 3.90666667e+02
  6.00000000e+00 1.75112037e+01 2.00000000e+00 3.00000000e+00
  1.33333333e+00 2.00000000e+00 0.00000000e+00 1.00000000e+00
  3.00000000e+00 3.00000000e+00 1.55665671e+00 4.00000000e+00
  3.00000000e+00 3.90666667e+02 6.00000000e+00 1.75112037e+01
  2.00000000e+00 3.00000000e+00 1.33333333e+00 2.00000000e+00
  0.00000000e+00 0.00000000e+00 0.000

In [0]:
val_index = int( len(x) * 0.8 )
test_index = int( len(x) * 0.9 )
x_train, y_train = x[:val_index], y[:val_index]
x_val, y_val = x[val_index:test_index], y[val_index:test_index]
x_test, y_test = x[test_index:], y[test_index:]

In [32]:
print(x_train[:3])
print(x_val[:3])
print(x_test[:3])

[[0.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e+00
  0.00000000e+00 1.00000000e+00 1.00000000e+00 1.14300000e+03
  2.90000000e+01 2.23151620e+00 1.00000000e+00 1.35000000e+02
  1.53895529e+01 9.00000000e+01 0.00000000e+00 1.00000000e+00
  1.00000000e+00 1.00000000e+00 0.00000000e+00 1.00000000e+00
  1.00000000e+00 1.14300000e+03 2.90000000e+01 2.23151620e+00
  1.00000000e+00 1.35000000e+02 1.53895529e+01 9.00000000e+01
  0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 1.00000000e+00 3.00000000e+00 3.00000000e+00
  1.55665671e+00 4.00000000e+00 3.00000000e+00 3.90666667e+02
  6.00000000e+00 1.75112037e+01 2.00000000e+00 3.00000000e+00
  1.33333333e+00 2.00000000e+00 0.00000000e+00 1.00000000e+00
  3.00000000e+00 3.00000000e+00 1.55665671e+00 4.00000000e+00
  3.00000000e+00 3.90666667e+02 6.00000000e+00 1.75112037e+01
  2.00000000e+00 3.00000000e+00 1.33333333e+00 2.00000000e+00
  0.00000000e+00 0.00000000e+00 0.000

In [0]:
mean_train = x_train.mean(axis=0)
std_train = x_train.std(axis=0)

In [34]:
print(mean_train)
print(std_train)

[2.80165503e-01 1.60000000e+00 2.49252651e+00 3.93167830e+00
 7.29343523e-01 5.36284458e+01 2.70048616e+01 7.15818862e+02
 1.90462891e+01 8.30126323e+01 1.91642100e+00 1.73230111e+03
 8.50179057e+01 8.93732072e+02 2.80165503e-01 1.60000000e+00
 2.49252651e+00 3.93167830e+00 7.29343523e-01 5.36284458e+01
 2.70048616e+01 7.15818862e+02 1.90462891e+01 8.30126323e+01
 1.91642100e+00 1.73230111e+03 8.50179057e+01 8.93732072e+02
 2.80165503e-01 1.57124386e-01 3.53245410e-02 4.76803724e-01
 1.56710628e-02 3.49107836e-02]
[4.49079942e-01 1.63179553e+00 6.18039590e+00 2.46665298e+01
 1.11449687e+00 7.35022221e+02 6.10443860e+02 1.07988283e+03
 2.67969402e+01 2.14765419e+02 7.35582372e+00 1.97484705e+03
 8.20515477e+01 1.29884812e+03 4.49079942e-01 1.63179553e+00
 6.18039590e+00 2.46665298e+01 1.11449687e+00 7.35022221e+02
 6.10443860e+02 1.07988283e+03 2.67969402e+01 2.14765419e+02
 7.35582372e+00 1.97484705e+03 8.20515477e+01 1.29884812e+03
 4.49079942e-01 3.63918003e-01 1.84598802e-01 4.99461

In [0]:
x_train = (x_train - mean_train) / std_train
x_val = (x_val - mean_train) / std_train
x_test = (x_test - mean_train) / std_train

In [36]:
print(x_train[:3])
print(x_val[:3])
print(x_test[:3])

[[-0.62386555 -0.36769313 -0.24149367 -0.11885248 -0.65441505 -0.07160116
  -0.04259992  0.39558101  0.37144953 -0.37613651 -0.12458442 -0.8088227
  -0.8485928  -0.61880374 -0.62386555 -0.36769313 -0.24149367 -0.11885248
  -0.65441505 -0.07160116 -0.04259992  0.39558101  0.37144953 -0.37613651
  -0.12458442 -0.8088227  -0.8485928  -0.61880374 -0.62386555  2.31611409
  -0.19135845 -0.95463532 -0.12617668 -0.19019367]
 [-0.62386555 -0.36769313  0.08211019 -0.03777095  0.74231988 -0.06751965
  -0.03932362 -0.30109951 -0.48685742 -0.30499057  0.01136229 -0.87566331
  -1.01990242 -0.68655608 -0.62386555 -0.36769313  0.08211019 -0.03777095
   0.74231988 -0.06751965 -0.03932362 -0.30109951 -0.48685742 -0.30499057
   0.01136229 -0.87566331 -1.01990242 -0.68655608 -0.62386555 -0.43175766
  -0.19135845  1.04752043 -0.12617668 -0.19019367]
 [-0.62386555  0.24512875 -0.07969174 -0.07831172 -0.09866455 -0.06207764
  -0.03604731  0.98592284  0.59535569 -0.36442892  0.01136229 -0.86756142
  -0.816778

In [0]:
def create_sampler(y):
  _, counts = np.unique(y, return_counts=True)
  n_samples = len(y)
  class_weights = n_samples / counts
  weights = class_weights[y]
  return data.WeightedRandomSampler(weights=weights, num_samples=n_samples, replacement=True)

In [0]:
sampler = create_sampler(y_train)

In [0]:
x_train, y_train = torch.from_numpy(x_train), torch.from_numpy(y_train)
x_val, y_val = torch.from_numpy(x_val), torch.from_numpy(y_val)
x_test, y_test = torch.from_numpy(x_test), torch.from_numpy(y_test)

In [0]:
train_dataset = data.TensorDataset(x_train, y_train)
val_dataset = data.TensorDataset(x_val, y_val)
test_dataset = data.TensorDataset(x_test, y_test)

train_dataloader = data.DataLoader(train_dataset, batch_size=512, sampler=sampler)
val_dataloader = data.DataLoader(val_dataset, batch_size=32)
test_dataloader = data.DataLoader(test_dataset, batch_size=32)

In [41]:
inputs, targets = next(iter(train_dataloader))
print(inputs[:3])
print(targets[:3])

inputs, targets = next(iter(val_dataloader))
print(inputs[:3])
print(targets[:3])

inputs, targets = next(iter(test_dataloader))
print(inputs[:3])
print(targets[:3])

tensor([[ 1.6029,  0.2451,  0.0821, -0.0378, -0.1389, -0.0077, -0.0426,  0.9299,
          2.0507, -0.3655,  0.1473, -0.8645, -1.0287, -0.6819,  1.6029,  0.2451,
          0.0821, -0.0378, -0.1389, -0.0077, -0.0426,  0.9299,  2.0507, -0.3655,
          0.1473, -0.8645, -1.0287, -0.6819,  1.6029, -0.4318, -0.1914, -0.9546,
         -0.1262, -0.1902],
        [-0.6239, -0.3677, -0.2415, -0.1189, -0.6544, -0.0417, -0.0442, -0.6388,
         -0.6734, -0.3865, -0.1246, -0.3865,  0.5921, -0.2577, -0.6239, -0.3677,
         -0.2415, -0.1189, -0.6544, -0.0417, -0.0442, -0.6388, -0.6734, -0.3865,
         -0.1246, -0.3865,  0.5921, -0.2577, -0.6239,  2.3161, -0.1914, -0.9546,
         -0.1262, -0.1902],
        [-0.6239, -0.3677, -0.0797, -0.0783,  0.2335,  0.0250, -0.0393, -0.3304,
         -0.3003,  0.1343, -0.1246, -0.8012, -0.7033, -0.5972, -0.6239, -0.3677,
         -0.0797, -0.0783,  0.2335,  0.0250, -0.0393, -0.3304, -0.3003,  0.1343,
         -0.1246, -0.8012, -0.7033, -0.5972, -0.6239,

In [0]:
class Classifier(nn.Module):
  def __init__(self, input_size, hidden_size, drop_prob, epoch=None, val_loss=None):
    super(Classifier, self).__init__()
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.drop_prob = drop_prob
    self.epoch = epoch
    self.val_loss = val_loss
    self.fc1 = nn.Linear(input_size, hidden_size)
    self.fc2 = nn.Linear(hidden_size, hidden_size)
    self.fcout = nn.Linear(hidden_size, 1)
    self.dropout = nn.Dropout(drop_prob)
  
  def forward(self, x):
    x = torch.relu(self.fc1(x))
    x = self.dropout(x)
    x = torch.relu(self.fc2(x))
    x = self.dropout(x)
    x = torch.sigmoid(self.fcout(x))
    return x

In [43]:
classifier = Classifier(input_size=x.shape[1], hidden_size=128, drop_prob=0.2)
classifier

Classifier(
  (fc1): Linear(in_features=34, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=128, bias=True)
  (fcout): Linear(in_features=128, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [0]:
filename = 'classifier.cpt'
def save_classifier(classifier):
  checkpoint = {
      'input_size': classifier.input_size,
      'hidden_size': classifier.hidden_size,
      'drop_prob': classifier.drop_prob,
      'val_loss': classifier.val_loss,
      'epoch': classifier.epoch,
      'state_dict': classifier.state_dict()
  }
  with open(filename, 'wb') as f:
    torch.save(checkpoint, f)

def load_classifier():
  with open(filename, 'rb') as f:
    checkpoint = torch.load(f)
  classifier = Classifier(checkpoint['input_size'], checkpoint['hidden_size'], checkpoint['drop_prob'], checkpoint['epoch'], checkpoint['val_loss'])
  classifier.load_state_dict(checkpoint['state_dict'])
  return classifier

In [0]:
save_classifier(classifier)
classifier = load_classifier()

In [46]:
if torch.cuda.is_available():
  classifier = classifier.cuda()
  inputs = inputs.cuda()

classifier(inputs[:3].float())

tensor([[0.4493],
        [0.5016],
        [0.4605]], device='cuda:0', grad_fn=<SigmoidBackward>)

In [0]:
criterion = nn.BCELoss()
optimizer = optim.Adam(params=classifier.parameters(), lr=0.003)

In [48]:
train_loss = 0
epochs = 200
for epoch in range(epochs):
  classifier.train()
  for inputs, targets in train_dataloader:
    if torch.cuda.is_available():
      inputs, targets = inputs.cuda(), targets.cuda()

    outputs = classifier(inputs.float())
    loss = criterion(outputs.squeeze(), targets.float())
    train_loss += loss.item()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
  
  with torch.no_grad():
    classifier.eval()
    val_loss = 0
    for inputs, targets in val_dataloader:
      if torch.cuda.is_available():
        inputs, targets = inputs.cuda(), targets.cuda()

      outputs = classifier(inputs.float())
      loss = criterion(outputs.squeeze(), targets.float())
      val_loss += loss.item()

    train_loss = train_loss / len(train_dataloader)
    val_loss = val_loss / len(val_dataloader)
    print('Epoch: {}, Train loss: {}, Val loss: {}'.format(epoch, train_loss, val_loss))

    if classifier.val_loss is None or val_loss < classifier.val_loss:
      classifier.epoch = epoch
      classifier.val_loss = val_loss
      save_classifier(classifier)

classifier.epoch = epoch
classifier.val_loss = val_loss

Epoch: 0, Train loss: 0.610739645205046, Val loss: 0.5140360621245283
Epoch: 1, Train loss: 0.5691520774793757, Val loss: 0.5345292483505449
Epoch: 2, Train loss: 0.551976387188184, Val loss: 0.5141056219213888
Epoch: 3, Train loss: 0.537418377359848, Val loss: 0.516100591734836
Epoch: 4, Train loss: 0.528453981718477, Val loss: 0.48866499333005203
Epoch: 5, Train loss: 0.5226798066925861, Val loss: 0.5028612276441172
Epoch: 6, Train loss: 0.5191937779371986, Val loss: 0.5018313609455761
Epoch: 7, Train loss: 0.5120950806428075, Val loss: 0.49240878420440776
Epoch: 8, Train loss: 0.511282972255007, Val loss: 0.526810391953117
Epoch: 9, Train loss: 0.5063042575789819, Val loss: 0.5109318961438379
Epoch: 10, Train loss: 0.5044644230151212, Val loss: 0.4914055815652797
Epoch: 11, Train loss: 0.49918433175156, Val loss: 0.4932374773841155
Epoch: 12, Train loss: 0.4970249366722791, Val loss: 0.475483096351749
Epoch: 13, Train loss: 0.4917126152388942, Val loss: 0.48199496418237686
Epoch: 14

In [0]:
def calculate_recalls(targets, predictions):
  n_classes = 2
  confusion_matrix, _, _ = np.histogram2d(targets.detach().cpu().numpy(), predictions.squeeze().detach().cpu().numpy(), bins=n_classes)
  return np.diag(confusion_matrix) / (np.sum(confusion_matrix, axis=1))

def gmean(classifier, dataloader):
  with torch.no_grad():
    classifier.eval()
    recalls = np.zeros((2))
    for inputs, targets in dataloader:
      if torch.cuda.is_available():
        inputs, targets = inputs.cuda(), targets.cuda()
      
      outputs = classifier(inputs.float())
      predictions = torch.round(outputs).int()
      recalls += calculate_recalls(targets, predictions)

    recalls = recalls / len(dataloader)
    return mstats.gmean(recalls)    

In [0]:
def evaluate(classifier):
  if torch.cuda.is_available():
    classifier = classifier.cuda()
  train_accuracy = gmean(classifier, train_dataloader)
  test_accuracy = gmean(classifier, test_dataloader)
  print('Epoch: {}'.format(classifier.epoch))
  print('Train g-mean: {}'.format(train_accuracy.item()))
  print('Test g-mean: {}'.format(test_accuracy.item()))

In [51]:
print('Best classifier')
evaluate(load_classifier())

Best classifier
Epoch: 51
Train g-mean: 0.794108949293009
Test g-mean: 0.7376259936221754


In [52]:
print('Last classifier')
evaluate(classifier)

Last classifier
Epoch: 199
Train g-mean: 0.8423115451462121
Test g-mean: 0.7433104830683602
