<a href="https://colab.research.google.com/github/dinaldoap/jit-sdp-nn/blob/master/notebook/mlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data
import random

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/dinaldoap/jit-sdp-data/master/jenkins.csv')
df.head()

Unnamed: 0,fix,ns,nd,nf,entropy,la,ld,lt,ndev,age,nuc,exp,rexp,sexp,author_date_unix_timestamp,classification,contains_bug
0,False,7.0,7.0,7.0,2.641604,9.0,9.0,426.428571,100.0,9.3e-05,1.0,5171.0,30.227271,1472.714286,1555326371,,False
1,False,7.0,7.0,7.0,2.75,8.0,8.0,426.428571,100.0,6.314775,2.0,5170.0,29.227271,1471.714286,1555326363,,False
2,False,1.0,1.0,2.0,0.90658,15.0,44.0,96.0,4.0,0.034722,2.0,629.0,14.828373,414.0,1554971763,,False
3,False,1.0,1.0,1.0,0.0,0.0,0.0,40.0,1.0,1.2e-05,1.0,4.0,3.058824,3.0,1554969774,,False
4,False,1.0,2.0,4.0,1.662506,14.0,10.0,67.0,6.0,21.280683,4.0,3.0,2.058824,2.0,1554967752,Feature Addition,False


In [3]:
df = df.sample(frac=1)
label_col = 'contains_bug'
features_cols = ['fix', 'ns', 'nd', 'nf', 'entropy', 'la', 'ld', 'lt', 'ndev', 'age', 'nuc', 'exp', 'rexp', 'sexp', 'classification']
x = df[features_cols]
x['fix'] = x['fix'].astype('int')
df_classification = pd.get_dummies(x, columns=['classification'])
x = pd.concat([x, df_classification], axis='columns')
x = x.drop(['classification'], axis='columns')
x = x.values
y = df[label_col]
y = y.astype('category')
y = y.cat.codes
y = y.values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [4]:
print(x[:3])
print(y[:3])

[[1.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e+00
  0.00000000e+00 0.00000000e+00 3.00000000e+00 3.41800000e+03
  3.40000000e+01 1.15740741e-05 1.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 1.00000000e+00 1.00000000e+00
  1.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
  3.00000000e+00 3.41800000e+03 3.40000000e+01 1.15740741e-05
  1.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  1.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [1.00000000e+00 2.00000000e+00 5.00000000e+00 2.60000000e+01
  1.37290201e+00 6.08000000e+02 2.30000000e+01 2.43230769e+02
  9.50000000e+01 3.92893185e+01 5.00000000e+00 5.00000000e+00
  9.04040404e-01 4.00000000e+00 1.00000000e+00 2.00000000e+00
  5.00000000e+00 2.60000000e+01 1.37290201e+00 6.08000000e+02
  2.30000000e+01 2.43230769e+02 9.50000000e+01 3.92893185e+01
  5.00000000e+00 5.00000000e+00 9.04040404e-01 4.00000000e+00
  1.00000000e+00 0.00000000e+00 0.000

In [0]:
val_index = int( len(x) * 0.8 )
test_index = int( len(x) * 0.9 )
x_train, y_train = x[:val_index], y[:val_index]
x_val, y_val = x[val_index:test_index], y[val_index:test_index]
x_test, y_test = x[test_index:], y[test_index:]

In [6]:
print(x_train[:3])
print(x_val[:3])
print(x_test[:3])

[[1.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e+00
  0.00000000e+00 0.00000000e+00 3.00000000e+00 3.41800000e+03
  3.40000000e+01 1.15740741e-05 1.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 1.00000000e+00 1.00000000e+00
  1.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
  3.00000000e+00 3.41800000e+03 3.40000000e+01 1.15740741e-05
  1.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  1.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [1.00000000e+00 2.00000000e+00 5.00000000e+00 2.60000000e+01
  1.37290201e+00 6.08000000e+02 2.30000000e+01 2.43230769e+02
  9.50000000e+01 3.92893185e+01 5.00000000e+00 5.00000000e+00
  9.04040404e-01 4.00000000e+00 1.00000000e+00 2.00000000e+00
  5.00000000e+00 2.60000000e+01 1.37290201e+00 6.08000000e+02
  2.30000000e+01 2.43230769e+02 9.50000000e+01 3.92893185e+01
  5.00000000e+00 5.00000000e+00 9.04040404e-01 4.00000000e+00
  1.00000000e+00 0.00000000e+00 0.000

In [0]:
mean_train = x_train.mean(axis=0)
std_train = x_train.std(axis=0)

In [8]:
print(mean_train)
print(std_train)

[2.79906905e-01 1.60062064e+00 2.50969744e+00 3.94946987e+00
 7.32263415e-01 4.98735454e+01 2.67649858e+01 7.10852265e+02
 1.91607965e+01 8.25198811e+01 1.91383501e+00 1.75702715e+03
 8.56499295e+01 9.02132140e+02 2.79906905e-01 1.60062064e+00
 2.50969744e+00 3.94946987e+00 7.32263415e-01 4.98735454e+01
 2.67649858e+01 7.10852265e+02 1.91607965e+01 8.25198811e+01
 1.91383501e+00 1.75702715e+03 8.56499295e+01 9.02132140e+02
 2.79906905e-01 1.55986553e-01 3.60486165e-02 4.77217481e-01
 1.60848203e-02 3.47556245e-02]
[4.48953260e-01 1.63004491e+00 6.20794694e+00 2.46206593e+01
 1.11373816e+00 6.38695803e+02 5.33382734e+02 1.06711354e+03
 2.70511866e+01 2.13864775e+02 7.39000523e+00 1.98723452e+03
 8.19200817e+01 1.30608455e+03 4.48953260e-01 1.63004491e+00
 6.20794694e+00 2.46206593e+01 1.11373816e+00 6.38695803e+02
 5.33382734e+02 1.06711354e+03 2.70511866e+01 2.13864775e+02
 7.39000523e+00 1.98723452e+03 8.19200817e+01 1.30608455e+03
 4.48953260e-01 3.62842594e-01 1.86411142e-01 4.99480

In [0]:
x_train = (x_train - mean_train) / std_train
x_val = (x_val - mean_train) / std_train
x_test = (x_test - mean_train) / std_train

In [10]:
print(x_train[:3])
print(x_val[:3])
print(x_test[:3])

[[ 1.60393778 -0.36846876 -0.24318788 -0.11979654 -0.65748256 -0.07808654
  -0.04455522  2.53688818  0.54856017 -0.38585068 -0.12365824 -0.88415692
  -1.04553032 -0.69071496  1.60393778 -0.36846876 -0.24318788 -0.11979654
  -0.65748256 -0.07808654 -0.04455522  2.53688818  0.54856017 -0.38585068
  -0.12365824 -0.88415692 -1.04553032 -0.69071496  1.60393778 -0.42990144
  -0.19338231 -0.95542729 -0.1278584  -0.1897553 ]
 [ 1.60393778  0.24501126  0.40114753  0.89561087  0.57521473  0.87385333
  -0.00705869 -0.43821156  2.80354443 -0.20213971  0.41761337 -0.88164086
  -1.03449469 -0.68765237  1.60393778  0.24501126  0.40114753  0.89561087
   0.57521473  0.87385333 -0.00705869 -0.43821156  2.80354443 -0.20213971
   0.41761337 -0.88164086 -1.03449469 -0.68765237  1.60393778 -0.42990144
  -0.19338231 -0.95542729 -0.1278584  -0.1897553 ]
 [ 1.60393778 -0.36846876 -0.24318788 -0.11979654 -0.65748256 -0.07182378
  -0.0426804  -0.63522037 -0.67134935 -0.31420175 -0.12365824 -0.8111912
  -1.006533

In [0]:
x_train, y_train = torch.from_numpy(x_train), torch.from_numpy(y_train)
x_val, y_val = torch.from_numpy(x_val), torch.from_numpy(y_val)
x_test, y_test = torch.from_numpy(x_test), torch.from_numpy(y_test)

In [0]:
train_dataset = data.TensorDataset(x_train, y_train)
val_dataset = data.TensorDataset(x_val, y_val)
test_dataset = data.TensorDataset(x_test, y_test)

train_dataloader = data.DataLoader(train_dataset, batch_size=512, shuffle=True)
val_dataloader = data.DataLoader(val_dataset, batch_size=32)
test_dataloader = data.DataLoader(test_dataset, batch_size=32)

In [13]:
inputs, targets = next(iter(train_dataloader))
print(inputs[:3])
print(targets[:3])

inputs, targets = next(iter(val_dataloader))
print(inputs[:3])
print(targets[:3])

inputs, targets = next(iter(test_dataloader))
print(inputs[:3])
print(targets[:3])

tensor([[ 1.6039, -0.3685, -0.2432, -0.1198, -0.6575, -0.0640, -0.0502, -0.5584,
         -0.6344,  0.2449, -0.1237,  1.2731,  0.8030,  2.0978,  1.6039, -0.3685,
         -0.2432, -0.1198, -0.6575, -0.0640, -0.0502, -0.5584, -0.6344,  0.2449,
         -0.1237,  1.2731,  0.8030,  2.0978,  1.6039, -0.4299, -0.1934, -0.9554,
         -0.1279, -0.1898],
        [-0.6235, -0.3685, -0.2432, -0.1198, -0.6575, -0.0671, -0.0483,  1.3936,
         -0.4126, -0.3799, -0.1237,  0.9717, -0.0191, -0.0767, -0.6235, -0.3685,
         -0.2432, -0.1198, -0.6575, -0.0671, -0.0483,  1.3936, -0.4126, -0.3799,
         -0.1237,  0.9717, -0.0191, -0.0767, -0.6235, -0.4299, -0.1934,  1.0467,
         -0.1279, -0.1898],
        [-0.6235, -0.3685, -0.2432, -0.1198, -0.6575, -0.0765, -0.0502, -0.4347,
         -0.0799, -0.3258, -0.1237, -0.8837, -1.0440, -0.6907, -0.6235, -0.3685,
         -0.2432, -0.1198, -0.6575, -0.0765, -0.0502, -0.4347, -0.0799, -0.3258,
         -0.1237, -0.8837, -1.0440, -0.6907, -0.6235,

In [0]:
class Classifier(nn.Module):
  def __init__(self, epochs, input_size, hidden_size, val_loss=None):
    super(Classifier, self).__init__()
    self.epochs = epochs
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.val_loss = val_loss
    self.fc1 = nn.Linear(input_size, hidden_size)
    self.fc2 = nn.Linear(hidden_size, hidden_size)
    self.fcout = nn.Linear(hidden_size, 1)
  
  def forward(self, x):
    x = torch.sigmoid(self.fc1(x))
    x = torch.sigmoid(self.fc2(x))
    x = torch.sigmoid(self.fcout(x))
    return x

In [15]:
classifier = Classifier(epochs=100, input_size=x.shape[1], hidden_size=256)
classifier

Classifier(
  (fc1): Linear(in_features=34, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=256, bias=True)
  (fcout): Linear(in_features=256, out_features=1, bias=True)
)

In [0]:
filename = 'classifier.cpt'
def save_classifier(classifier):
  checkpoint = {
      'epochs': classifier.epochs,
      'input_size': classifier.input_size,
      'hidden_size': classifier.hidden_size,
      'val_loss': classifier.val_loss,
      'state_dict': classifier.state_dict()
  }
  with open(filename, 'wb') as f:
    torch.save(checkpoint, f)

def load_classifier():
  with open(filename, 'rb') as f:
    checkpoint = torch.load(f)
  classifier = Classifier(checkpoint['epochs'], checkpoint['input_size'], checkpoint['hidden_size'], checkpoint['val_loss'])
  classifier.load_state_dict(checkpoint['state_dict'])
  return classifier

In [0]:
save_classifier(classifier)
classifier = load_classifier()

In [18]:
if torch.cuda.is_available():
  classifier = classifier.cuda()
  inputs = inputs.cuda()

classifier(inputs[:3].float())

tensor([[0.4918],
        [0.4897],
        [0.4877]], device='cuda:0', grad_fn=<SigmoidBackward>)

In [0]:
criterion = nn.BCELoss()
optimizer = optim.Adam(params=classifier.parameters(), lr=0.03)

In [20]:
train_loss = 0
epochs = classifier.epochs
for epoch in range(epochs):
  classifier.train()
  for inputs, targets in train_dataloader:
    if torch.cuda.is_available():
      inputs, targets = inputs.cuda(), targets.cuda()

    outputs = classifier(inputs.float())
    loss = criterion(outputs.squeeze(), targets.float())
    train_loss += loss.item()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
  
  with torch.no_grad():
    classifier.eval()
    val_loss = 0
    for inputs, targets in val_dataloader:
      if torch.cuda.is_available():
        inputs, targets = inputs.cuda(), targets.cuda()

      outputs = classifier(inputs.float())
      loss = criterion(outputs.squeeze(), targets.float())
      val_loss += loss.item()

    train_loss = train_loss / len(train_dataloader)
    val_loss = val_loss / len(val_dataloader)
    print('Epoch: {}, Train loss: {}, Val loss: {}'.format(epoch, train_loss, val_loss))

    if classifier.val_loss is None or val_loss < classifier.val_loss:
      classifier.val_loss = val_loss
      classifier.epochs = epoch
      save_classifier(classifier)


Epoch: 0, Train loss: 0.6293988745463522, Val loss: 0.5178971757230005
Epoch: 1, Train loss: 0.5174578928138411, Val loss: 0.5150406968436743
Epoch: 2, Train loss: 0.4922014182157307, Val loss: 0.48938576524194916
Epoch: 3, Train loss: 0.4802163870850858, Val loss: 0.47704749240687017
Epoch: 4, Train loss: 0.462813416676876, Val loss: 0.49967874037592036
Epoch: 5, Train loss: 0.45928301177414776, Val loss: 0.45506799711208595
Epoch: 6, Train loss: 0.4329116129462114, Val loss: 0.4413805445166011
Epoch: 7, Train loss: 0.4257573683627452, Val loss: 0.4302190730446263
Epoch: 8, Train loss: 0.41540277762258265, Val loss: 0.4196354268412841
Epoch: 9, Train loss: 0.4058128236581997, Val loss: 0.42543389746233035
Epoch: 10, Train loss: 0.4049193794760006, Val loss: 0.4063084784306978
Epoch: 11, Train loss: 0.39976176132035074, Val loss: 0.41631731645841347
Epoch: 12, Train loss: 0.40285212494544603, Val loss: 0.4144957171458947
Epoch: 13, Train loss: 0.39521982813551043, Val loss: 0.409748352

In [0]:
def accuracy(classifier, dataloader):
  with torch.no_grad():
    classifier.eval()
    accuracy = 0
    for inputs, targets in dataloader:
      if torch.cuda.is_available():
        inputs, targets = inputs.cuda(), targets.cuda()
      
      outputs = classifier(inputs.float())
      predictions = torch.round(outputs).int()
      equals = (targets == predictions.squeeze())
      accuracy += torch.mean(equals.float())

    accuracy = accuracy / len(dataloader)
    return accuracy

In [22]:
classifier = load_classifier()
if torch.cuda.is_available():
  classifier = classifier.cuda()
train_accuracy = accuracy(classifier, train_dataloader)
test_accuracy = accuracy(classifier, test_dataloader)
print('Epochs: {}'.format(classifier.epochs))
print('Train accuracy: {}'.format(train_accuracy.item()))
print('Test accuracy: {}'.format(test_accuracy.item()))

Epochs: 26
Train accuracy: 0.8419219255447388
Test accuracy: 0.8356714248657227
