<a href="https://colab.research.google.com/github/dinaldoap/jit-sdp-nn/blob/master/notebook/mlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [96]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [97]:
import os
import random

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data
from scipy.stats import mstats

from jitsdp import metrics
from jitsdp.pipeline import Pipeline

In [98]:
# Download the neutron dataset from the jit-sdp-data repository and preview it.
DATA_URL = 'https://raw.githubusercontent.com/dinaldoap/jit-sdp-data/master/neutron.csv'
df = pd.read_csv(DATA_URL)
df.head()

Unnamed: 0,fix,ns,nd,nf,entrophy,la,ld,lt,ndev,age,nuc,exp,rexp,sexp,contains_bug,author_date_unix_timestamp
0,False,1,1,1,0.0,202,0,0.0,1,0.0,0,0.0,0.0,0,True,1293840523
1,False,1,2,2,0.784992,28,19,103.0,1,-244.640741,3,3.5,0.996934,4,True,1293853015
2,False,1,1,1,0.0,84,22,189.0,1,-244.640613,4,7.0,0.995912,7,True,1293857524
3,True,1,1,1,0.0,8,7,251.0,1,2.775,5,8.0,1.36036,8,True,1294097284
4,False,1,1,1,0.0,205,79,252.0,1,0.174444,6,9.0,6.732484,9,True,1294112356


In [99]:
# Rows are kept in file order; shuffling is deliberately not applied here.
# NOTE(review): presumably this preserves the chronological order of the
# commits for a later positional train/test split - confirm.
label_col = 'contains_bug'
features_cols = ['fix', 'ns', 'nd', 'nf', 'entrophy', 'la', 'ld', 'lt', 'ndev', 'age', 'nuc', 'exp', 'rexp', 'sexp']
# astype() returns a new frame, so the boolean 'fix' flag is cast to int
# without assigning into a slice of df (the assignment form triggers pandas'
# SettingWithCopyWarning). .values then yields a plain NumPy matrix.
X = df[features_cols].astype({'fix': 'int'}).values
# Labels: boolean 'contains_bug' column as a 0/1 integer vector.
y = df[label_col].astype('int').values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [100]:
# Sanity check: show the first feature row and its label.
print(X[:1])
print(y[:1])

[[  0.   1.   1.   1.   0. 202.   0.   0.   1.   0.   0.   0.   0.   0.]]
[1]


In [101]:
# Positional hold-out split: the first 80% of the rows are used for training,
# the remaining rows for testing. No shuffling happens here.
test_index = int(0.8 * len(X))
X_train, X_test = X[:test_index], X[test_index:]
y_train, y_test = y[:test_index], y[test_index:]

In [102]:
# First row of each split, still in raw (unscaled) units.
print(X_train[:1])
print(X_test[:1])

[[  0.   1.   1.   1.   0. 202.   0.   0.   1.   0.   0.   0.   0.   0.]]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


In [103]:
# Per-feature statistics, computed on the training split only.
mean_train = X_train.mean(axis=0)
std_train = X_train.std(axis=0)
# A feature that is constant over the training split has a standard deviation
# of zero; dividing by it would turn that column into NaN/inf. Use 1 instead
# so such a column is only centred, not scaled.
std_train = np.where(std_train == 0, 1.0, std_train)

In [104]:
# Per-feature mean and standard deviation of the training split.
print(mean_train)
print(std_train)

[2.24691042e-01 6.37126209e-01 2.21239675e+00 3.14484216e+00
 5.99416261e-01 2.69237754e+02 2.51954601e+02 4.93221171e+02
 7.02866748e+01 1.66493700e+01 8.44467567e+01 2.35551162e+02
 1.98825516e+00 1.46978805e+02]
[4.17378698e-01 6.99005149e-01 7.14876580e+00 1.59028200e+01
 1.07509921e+00 3.76885765e+03 9.01027281e+03 1.59901257e+03
 8.14342402e+01 5.24819955e+01 4.52271552e+02 6.40880725e+02
 2.86627928e+01 4.81337683e+02]


In [105]:
def _standardize(features):
  """Centre and scale `features` with the training-split mean and std."""
  return (features - mean_train) / std_train

# Both splits are scaled with the *training* statistics.
# Note: this rebinds X_train / X_test from themselves, so running the cell a
# second time would scale already-scaled data.
X_train, X_test = _standardize(X_train), _standardize(X_test)

In [106]:
# First row of each split after standardization.
print(X_train[:1])
print(X_test[:1])

[[-0.53833855  0.51912893 -0.16959525 -0.13487181 -0.55754507 -0.01784035
  -0.02796304 -0.30845359 -0.85082976 -0.31723965 -0.18671693 -0.3675429
  -0.06936711 -0.30535487]]
[[-0.53833855 -0.9114757  -0.30947954 -0.19775374 -0.55754507 -0.0714375
  -0.02796304 -0.30845359 -0.8631096  -0.31723965 -0.18671693 -0.3675429
  -0.06936711 -0.30535487]]


In [107]:
class Classifier(nn.Module):
  """MLP with one hidden layer that outputs a single sigmoid value per sample.

  The hyper-parameters and the optional training metadata (`epoch`,
  `val_gmean`) are stored as attributes so that `save` / `load` can persist
  them together with the weights.
  """
  # Checkpoint path used by save() and load(); shared by all instances.
  FILENAME = 'models/classifier.cpt'

  def __init__(self, input_size, hidden_size, drop_prob, epoch=None, val_gmean=None):
    """Build the layers and remember the configuration.

    Args:
      input_size: number of input features of the first linear layer.
      hidden_size: number of units of the hidden layer.
      drop_prob: dropout probability applied to the hidden activations.
      epoch: optional metadata stored in the checkpoint (not used by forward).
      val_gmean: optional metadata stored in the checkpoint (not used by forward).
    """
    super(Classifier, self).__init__()
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.drop_prob = drop_prob
    self.epoch = epoch
    self.val_gmean = val_gmean
    self.fc1 = nn.Linear(input_size, hidden_size)
    self.fcout = nn.Linear(hidden_size, 1)
    self.dropout = nn.Dropout(drop_prob)

  def forward(self, x):
    """Return sigmoid(fcout(dropout(sigmoid(fc1(x)))))."""
    x = torch.sigmoid(self.fc1(x))
    x = self.dropout(x)
    # Final sigmoid: the output is already a value in (0, 1), not a logit.
    x = torch.sigmoid(self.fcout(x))
    return x

  def save(self):
    """Write the configuration, metadata and weights to `Classifier.FILENAME`.

    Missing parent directories of the checkpoint path are created first.
    """
    checkpoint = {
        'input_size': self.input_size,
        'hidden_size': self.hidden_size,
        'drop_prob': self.drop_prob,
        'val_gmean': self.val_gmean,
        'epoch': self.epoch,
        'state_dict': self.state_dict()
    }
    # open() does not create directories, so a fresh checkout without the
    # target folder would otherwise fail with FileNotFoundError.
    directory = os.path.dirname(Classifier.FILENAME)
    if directory:
      os.makedirs(directory, exist_ok=True)
    with open(Classifier.FILENAME, 'wb') as f:
      torch.save(checkpoint, f)

  def load(self):
    """Restore weights, configuration and metadata from `Classifier.FILENAME`.

    Raises:
      RuntimeError: from `load_state_dict` when the stored weights do not fit
        the layers of this instance. In that case no attribute is modified.
    """
    with open(Classifier.FILENAME, 'rb') as f:
      checkpoint = torch.load(f)
    # Restore the weights first: if the checkpoint does not match the layers
    # this raises before any attribute has been overwritten, so the instance
    # never ends up describing a network it does not contain.
    self.load_state_dict(checkpoint['state_dict'])
    self.input_size = checkpoint['input_size']
    self.hidden_size = checkpoint['hidden_size']
    self.drop_prob = checkpoint['drop_prob']
    self.epoch = checkpoint['epoch']
    self.val_gmean = checkpoint['val_gmean']
    # The dropout probability is not part of the state dict; keep the layer
    # in sync with the restored attribute.
    self.dropout.p = self.drop_prob

In [108]:
# The hidden layer is as wide as the input: one unit per feature column of X.
n_features = X.shape[1]
classifier = Classifier(input_size=n_features, hidden_size=n_features, drop_prob=0.5)
classifier

Classifier(
  (fc1): Linear(in_features=14, out_features=14, bias=True)
  (fcout): Linear(in_features=14, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [109]:
# Round-trip the freshly initialised classifier through save() / load();
# this writes and then re-reads the file at Classifier.FILENAME.
classifier.save()
classifier.load()
classifier

Classifier(
  (fc1): Linear(in_features=14, out_features=14, bias=True)
  (fcout): Linear(in_features=14, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [110]:
# Binary cross-entropy on probabilities: Classifier.forward already ends with
# a sigmoid, so the loss receives values in (0, 1) rather than logits.
criterion = nn.BCELoss()
# Adam over all parameters of the classifier.
learning_rate = 0.003
optimizer = optim.Adam(classifier.parameters(), lr=learning_rate)

In [111]:
# Wire classifier, optimizer and loss into the project's Pipeline and fit it
# on the training split only.
# NOTE(review): the meaning of `steps` and `fading_factor` is defined in
# jitsdp.pipeline and is not visible from this notebook - confirm there.
pipeline = Pipeline(
    steps=[],
    classifier=classifier,
    optimizer=optimizer,
    criterion=criterion,
    max_epochs=200,
    fading_factor=0.9999,
)
pipeline.train(X_train, y_train)

Epoch: 0, Train loss: 0.6779607976636579, Train g-mean: 0.7947875166708793, Val g-mean: None
Epoch: 1, Train loss: 0.6433510313917273, Train g-mean: 0.8006652655553956, Val g-mean: None
Epoch: 2, Train loss: 0.5864819253980763, Train g-mean: 0.7984759125788846, Val g-mean: None
Epoch: 3, Train loss: 0.540752310796129, Train g-mean: 0.8051761963339, Val g-mean: None
Epoch: 4, Train loss: 0.5130252805846849, Train g-mean: 0.8114405440767896, Val g-mean: None
Epoch: 5, Train loss: 0.49540114488492065, Train g-mean: 0.8137587267175546, Val g-mean: None
Epoch: 6, Train loss: 0.48267534377955934, Train g-mean: 0.8175515545055543, Val g-mean: None
Epoch: 7, Train loss: 0.477035810871401, Train g-mean: 0.8167988784043516, Val g-mean: None
Epoch: 8, Train loss: 0.4717389024817648, Train g-mean: 0.819638936646194, Val g-mean: None
Epoch: 9, Train loss: 0.46684741582128214, Train g-mean: 0.8200116394325693, Val g-mean: None
Epoch: 10, Train loss: 0.446046743535756, Train g-mean: 0.821244258479250

In [112]:
  def evaluate(pipeline):
    """Print the pipeline's epoch and its g-mean / recalls on both splits.

    Uses the notebook-level X_train, y_train, X_test and y_test arrays.
    """
    gmean_tr, recalls_tr = pipeline.evaluate(X_train, y_train)
    gmean_te, recalls_te = pipeline.evaluate(X_test, y_test)
    report = [
        'Epoch: {}'.format(pipeline.epoch),
        'Train g-mean: {}, recalls: {}'.format(gmean_tr.item(), recalls_tr),
        'Test g-mean: {}, recalls: {}'.format(gmean_te.item(), recalls_te),
    ]
    for line in report:
        print(line)

In [113]:
# Report train/test metrics for the pipeline in the state training left it in.
print('Last classifier')
evaluate(pipeline)

Last classifier
Epoch: 199
Train g-mean: 0.850500520372923, recalls: [0.74331318 0.9731445 ]
Test g-mean: 0.647925287847743, recalls: [0.49340125 0.85084336]


In [114]:
# Only when the pipeline reports having validation: reload the stored
# checkpoint and report its metrics as well.
# NOTE(review): presumably pipeline.load() restores the checkpoint with the
# best validation g-mean - confirm in jitsdp.pipeline.
if pipeline.has_validation():
    print('Best classifier')
    pipeline.load()
    evaluate(pipeline)