<a href="https://colab.research.google.com/github/dinaldoap/jit-sdp-nn/blob/master/notebook/mlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data
import random
from scipy.stats import mstats

from jitsdp import metrics
from jitsdp.pipeline import Pipeline

In [16]:
# Read the dataset straight from the jit-sdp-data repository.
# NOTE(review): the displayed frame carries an 'Unnamed: 0' column, presumably
# a saved row index; it is never selected as a feature below.
df = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/dinaldoap/jit-sdp-data/master/brackets.csv'
)
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,fix,ns,nd,nf,entrophy,la,ld,lt,ndev,age,nuc,exp,rexp,sexp,contains_bug,author_date_unix_timestamp,commit_type
0,False,2,5,23,3.630787,3754,0,0.0,1,0.0,0,11.0,0.0,21,False,1323292816,0
1,False,2,2,2,0.811278,4,0,0.0,1,0.0,0,23.5,0.0,22,False,1323292845,0
2,True,1,1,1,0.0,1,1,3.0,1,0.019688,1,25.0,51.793651,2,False,1323294546,0
3,False,1,1,1,0.0,2,3,31.0,1,0.035972,1,26.0,28.799228,3,False,1323295924,0
4,False,1,1,2,0.0,5,0,0.0,1,0.0,0,0.5,0.0,1,False,1323301755,0


In [17]:
# Keep every row whose commit_type is not 3.
df = df[df['commit_type'] != 3]
# NOTE(review): shuffling is left disabled; presumably the row order has to be
# preserved for the positional train/test split further down - confirm.
#df = df.sample(frac=1)
label_col = 'contains_bug'
features_cols = ['fix', 'ns', 'nd', 'nf', 'entrophy', 'la', 'ld', 'lt', 'ndev', 'age', 'nuc', 'exp', 'rexp', 'sexp']
# Cast the boolean 'fix' flag with astype() on the selected frame, which
# returns a new DataFrame. Assigning into `X['fix']` wrote into a slice of
# `df` and raised pandas' SettingWithCopyWarning.
X = df[features_cols].astype({'fix': 'int'})
X = X.values
# Labels as a 0/1 integer array.
y = df[label_col].astype('int').values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [18]:
# Sanity check: first feature row and first label, one per line.
print(X[:1], y[:1], sep='\n')

[[0.000000e+00 2.000000e+00 5.000000e+00 2.300000e+01 3.630787e+00
  3.754000e+03 0.000000e+00 0.000000e+00 1.000000e+00 0.000000e+00
  0.000000e+00 1.100000e+01 0.000000e+00 2.100000e+01]]
[0]


In [19]:
# Positional hold-out: the first 90% of the rows train the model, the
# remaining rows are kept for testing.
test_index = int(0.9 * len(X))
X_train, X_test = X[:test_index], X[test_index:]
y_train, y_test = y[:test_index], y[test_index:]

In [20]:
# First row of each split, one per line.
print(X_train[:1], X_test[:1], sep='\n')

[[0.000000e+00 2.000000e+00 5.000000e+00 2.300000e+01 3.630787e+00
  3.754000e+03 0.000000e+00 0.000000e+00 1.000000e+00 0.000000e+00
  0.000000e+00 1.100000e+01 0.000000e+00 2.100000e+01]]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


In [21]:
# Per-feature statistics, computed on the training split only.
mean_train = np.mean(X_train, axis=0)
std_train = np.std(X_train, axis=0)

In [22]:
# Show the per-feature mean and standard deviation, one array per line.
print(mean_train, std_train, sep='\n')

[1.58931896e-01 7.46068425e-01 1.16252648e+00 1.78008858e+00
 3.20738417e-01 8.42387830e+01 3.79735541e+01 4.22212467e+02
 2.67096091e+01 7.55832129e+00 8.86575518e+01 4.79215900e+02
 3.01487924e+01 2.58808011e+02]
[3.65612566e-01 5.92085003e-01 2.99453343e+00 1.42250140e+01
 6.57629456e-01 1.84675239e+03 9.80224735e+02 1.59479852e+03
 2.69620082e+01 3.21379111e+01 1.94257009e+02 9.22457304e+02
 2.24081690e+02 4.61634212e+02]


In [23]:
# Standardise both splits with the training-split statistics.
# Gotcha: the arrays are rebound to their standardised versions, so running
# this cell a second time standardises already-standardised data.
X_train = np.divide(np.subtract(X_train, mean_train), std_train)
X_test = np.divide(np.subtract(X_test, mean_train), std_train)

In [24]:
print(X_train[:1])
print(X_test[:1])

[[-0.43470031  2.11782357  1.28149297  1.4917322   5.03330341  1.98714307
  -0.03873964 -0.26474345 -0.95354949 -0.23518396 -0.45639306 -0.5075746
  -0.13454376 -0.51514382]]
[[-0.43470031 -1.26006979 -0.38821623 -0.12513791 -0.48771906 -0.04561455
  -0.03873964 -0.26474345 -0.99063871 -0.23518396 -0.45639306 -0.51949927
  -0.13454376 -0.56063438]]


In [25]:
class Classifier(nn.Module):
  """Single-hidden-layer MLP for binary classification with checkpointing.

  The network is ``Linear -> ReLU -> Dropout -> Linear -> Sigmoid`` and emits
  one value in ``[0, 1]`` per input row.
  """
  # Checkpoint path shared by every instance (relative to the working directory).
  FILENAME = 'models/classifier.cpt'

  def __init__(self, input_size, hidden_size, drop_prob, epoch=None, val_gmean=None):
    """Build the layers and keep the hyper-parameters and bookkeeping values.

    Args:
      input_size: Number of input features.
      hidden_size: Number of units in the hidden layer.
      drop_prob: Dropout probability applied after the hidden layer.
      epoch: Optional bookkeeping value that is stored in checkpoints.
      val_gmean: Optional bookkeeping value that is stored in checkpoints.
    """
    super(Classifier, self).__init__()
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.drop_prob = drop_prob
    self.epoch = epoch
    self.val_gmean = val_gmean
    self.fc1 = nn.Linear(input_size, hidden_size)
    self.fcout = nn.Linear(hidden_size, 1)
    self.dropout = nn.Dropout(drop_prob)

  def forward(self, x):
    """Return the sigmoid output of the network for the batch ``x``."""
    x = torch.relu(self.fc1(x))
    x = self.dropout(x)
    x = torch.sigmoid(self.fcout(x))
    return x

  def save(self):
    """Write hyper-parameters, bookkeeping values and weights to ``FILENAME``.

    The parent directory is created when it does not exist yet, so saving
    also works on a fresh checkout.
    """
    import os  # local import keeps the class self-contained

    checkpoint = {
        'input_size': self.input_size,
        'hidden_size': self.hidden_size,
        'drop_prob': self.drop_prob,
        'val_gmean': self.val_gmean,
        'epoch': self.epoch,
        'state_dict': self.state_dict()
    }
    directory = os.path.dirname(Classifier.FILENAME)
    if directory:
      os.makedirs(directory, exist_ok=True)
    with open(Classifier.FILENAME, 'wb') as f:
      torch.save(checkpoint, f)

  def load(self):
    """Restore weights and bookkeeping values from ``FILENAME`` in place.

    Raises:
      FileNotFoundError: If no checkpoint file exists.
      RuntimeError: If the stored weights do not fit this instance's layers
        (raised by ``load_state_dict``); the instance's attributes are left
        untouched in that case.
    """
    with open(Classifier.FILENAME, 'rb') as f:
      # NOTE: torch.load unpickles the file - only load trusted checkpoints.
      # map_location='cpu' lets a checkpoint written on a GPU machine be read
      # on a CPU-only one; load_state_dict then copies the values into the
      # existing parameters.
      checkpoint = torch.load(f, map_location='cpu')
    # Load the weights first so a mismatching checkpoint fails before any
    # attribute has been overwritten.
    self.load_state_dict(checkpoint['state_dict'])
    self.input_size = checkpoint['input_size']
    self.hidden_size = checkpoint['hidden_size']
    self.drop_prob = checkpoint['drop_prob']
    self.epoch = checkpoint['epoch']
    self.val_gmean = checkpoint['val_gmean']
    # Keep the dropout layer in sync with the restored probability.
    self.dropout.p = self.drop_prob

In [26]:
# Seed every random generator imported by this notebook before the weights
# are initialised, so a "Restart & Run All" starts from the same state.
# NOTE(review): Pipeline internals are not visible here; confirm that it
# draws only from these generators.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# One hidden layer with as many units as there are input features.
classifier = Classifier(input_size=X.shape[1], hidden_size=X.shape[1], drop_prob=0.5)
classifier

Classifier(
  (fc1): Linear(in_features=14, out_features=14, bias=True)
  (fcout): Linear(in_features=14, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [27]:
# Round-trip the freshly initialised model through the checkpoint file
# (Classifier.FILENAME) to check that save() and load() work together.
classifier.save()
classifier.load()
# Echo the module so the architecture can be compared with the cell above.
classifier

Classifier(
  (fc1): Linear(in_features=14, out_features=14, bias=True)
  (fcout): Linear(in_features=14, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [28]:
# Adam over all classifier parameters with a learning rate of 0.003.
optimizer = optim.Adam(classifier.parameters(), lr=3e-3)
# Binary cross-entropy on the sigmoid output of the classifier.
criterion = nn.BCELoss()

In [29]:
# Wire the classifier, loss and optimizer into the project Pipeline (no
# extra steps) and fit it on the training split.
pipeline = Pipeline(
    classifier=classifier,
    criterion=criterion,
    optimizer=optimizer,
    steps=[],
    max_epochs=200,
    fading_factor=0.9999,
)
pipeline.train(X_train, y_train)

4943927092668, Val g-mean: 0.7063259078411576
Epoch: 16, Train loss: 0.533887132342696, Train g-mean: 0.7148166078730956, Val g-mean: 0.6853484394299167
Epoch: 17, Train loss: 0.5388492127378796, Train g-mean: 0.7153038982132565, Val g-mean: 0.7008888779396586
Epoch: 18, Train loss: 0.5437692832383372, Train g-mean: 0.7189026585160458, Val g-mean: 0.7050902375901523
Epoch: 19, Train loss: 0.5305109337377862, Train g-mean: 0.7194146687022798, Val g-mean: 0.7007549640853871
Epoch: 20, Train loss: 0.525141663904021, Train g-mean: 0.7199964590435036, Val g-mean: 0.7041071601712149
Epoch: 21, Train loss: 0.5179915877876903, Train g-mean: 0.7223226349934385, Val g-mean: 0.6991460142921941
Epoch: 22, Train loss: 0.5366711228117284, Train g-mean: 0.7279367746845405, Val g-mean: 0.7040823682746312
Epoch: 23, Train loss: 0.5320198905220519, Train g-mean: 0.7266820529672722, Val g-mean: 0.7117198241746766
Epoch: 24, Train loss: 0.5221068382680093, Train g-mean: 0.7373415904234798, Val g-mean: 0.7

In [30]:
  def evaluate(pipeline):
    """Print the pipeline's epoch and its g-mean / recalls on both splits.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test`` arrays; both splits are scored before anything is printed.
    """
    train_scores = pipeline.evaluate(X_train, y_train)
    test_scores = pipeline.evaluate(X_test, y_test)
    print('Epoch: {}'.format(pipeline.epoch))
    gmean, recalls = train_scores
    print('Train g-mean: {}, recalls: {}'.format(gmean.item(), recalls))
    gmean, recalls = test_scores
    print('Test g-mean: {}, recalls: {}'.format(gmean.item(), recalls))

In [31]:
# Report the pipeline exactly as training left it.
print('Last classifier')
evaluate(pipeline)

Last classifier
Epoch: 199
Train g-mean: 0.778112418933652, recalls: [0.70586217 0.85775801]
Test g-mean: 0.7378359396166241, recalls: [0.75457581 0.72146743]


In [32]:
# Reload the pipeline's saved state and report it again.
# NOTE(review): presumably load() restores the checkpoint with the best
# validation g-mean - confirm in jitsdp.pipeline.
print('Best classifier')
pipeline.load()
evaluate(pipeline)

Best classifier
Epoch: 190
Train g-mean: 0.7710122005465565, recalls: [0.68699519 0.86530419]
Test g-mean: 0.7315674565176691, recalls: [0.76198011 0.70236865]
