<a href="https://colab.research.google.com/github/dinaldoap/jit-sdp-nn/blob/master/notebook/mlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data
import random
from scipy.stats import mstats

from jitsdp import metrics
from jitsdp.pipeline import Pipeline

In [3]:
df = pd.read_csv('https://raw.githubusercontent.com/dinaldoap/jit-sdp-data/master/neutron.csv')
df.head()

Unnamed: 0,fix,ns,nd,nf,entrophy,la,ld,lt,ndev,age,nuc,exp,rexp,sexp,contains_bug,author_date_unix_timestamp,commit_type
0,False,1,1,1,0.0,202,0,0.0,1,0.0,0,0.0,0.0,0,True,1293840523,2
1,False,1,2,2,0.784992,28,19,103.0,1,-244.640741,3,3.5,0.996934,4,True,1293853015,2
2,False,1,1,1,0.0,84,22,189.0,1,-244.640613,4,7.0,0.995912,7,True,1293857524,2
3,True,1,1,1,0.0,8,7,251.0,1,2.775,5,8.0,1.36036,8,True,1294097284,2
4,False,1,1,1,0.0,205,79,252.0,1,0.174444,6,9.0,6.732484,9,True,1294112356,2


In [4]:
df = df[df['commit_type'] != 3]
#df = df.sample(frac=1)
label_col = 'contains_bug'
features_cols = ['fix', 'ns', 'nd', 'nf', 'entrophy', 'la', 'ld', 'lt', 'ndev', 'age', 'nuc', 'exp', 'rexp', 'sexp']
X = df[features_cols]
X['fix'] = X['fix'].astype('int')
X = X.values
y = df[label_col]
y = y.astype('int')
y = y.values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [5]:
print(X[:1])
print(y[:1])

[[  0.   1.   1.   1.   0. 202.   0.   0.   1.   0.   0.   0.   0.   0.]]
[1]


In [6]:
test_index = int( len(X) * 0.9 )
X_train, y_train = X[:test_index], y[:test_index]
X_test, y_test = X[test_index:], y[test_index:]

In [7]:
print(X_train[:1])
print(X_test[:1])

[[  0.   1.   1.   1.   0. 202.   0.   0.   1.   0.   0.   0.   0.   0.]]
[[  1.         1.         2.         2.         0.995253  31.
    6.        -3.5       32.       141.783623   4.         1.5
    1.00529    1.      ]]


In [8]:
mean_train = X_train.mean(axis=0)
std_train = X_train.std(axis=0)

In [9]:
print(mean_train)
print(std_train)

[2.17309340e-01 6.27420737e-01 2.12002285e+00 3.00736932e+00
 5.83037563e-01 2.42894316e+02 2.26149329e+02 4.52423502e+02
 6.93547558e+01 1.65930550e+01 7.83848615e+01 2.32328620e+02
 1.94493223e+00 1.45917624e+02]
[4.12414829e-01 6.91897781e-01 6.83150469e+00 1.51382471e+01
 1.05340297e+00 3.56071902e+03 8.51085459e+03 1.51858424e+03
 8.35065665e+01 5.33612416e+01 4.28584853e+02 6.31941494e+02
 2.85733763e+01 4.74041454e+02]


In [10]:
X_train = (X_train - mean_train) / std_train
X_test = (X_test - mean_train) / std_train

In [11]:
print(X_train[:1])
print(X_test[:1])

[[-0.52691932  0.53848888 -0.16394966 -0.13260249 -0.55348008 -0.01148485
  -0.02657187 -0.29792453 -0.81855546 -0.31095706 -0.18289228 -0.36764261
  -0.06806799 -0.30781617]]
[[ 1.89782376  0.53848888 -0.01756902 -0.06654465  0.3913179  -0.05950886
  -0.02586689 -0.30022931 -0.44732717  2.34609549 -0.17355924 -0.36526897
  -0.03288524 -0.30570665]]


In [12]:
class Classifier(nn.Module):
  FILENAME = 'models/classifier.cpt'
  def __init__(self, input_size, hidden_size, drop_prob, epoch=None, val_gmean=None):
    super(Classifier, self).__init__()
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.drop_prob = drop_prob
    self.epoch = epoch
    self.val_gmean = val_gmean
    self.fc1 = nn.Linear(input_size, hidden_size)
    self.fcout = nn.Linear(hidden_size, 1)
    self.dropout = nn.Dropout(drop_prob)
  
  def forward(self, x):
    x = torch.relu(self.fc1(x))
    x = self.dropout(x)
    x = torch.sigmoid(self.fcout(x))
    return x

  def save(self):
    checkpoint = {
        'input_size': self.input_size,
        'hidden_size': self.hidden_size,
        'drop_prob': self.drop_prob,
        'val_gmean': self.val_gmean,
        'epoch': self.epoch,
        'state_dict': self.state_dict()
    }
    with open(Classifier.FILENAME, 'wb') as f:
      torch.save(checkpoint, f)

  def load(self):
    with open(Classifier.FILENAME, 'rb') as f:
      checkpoint = torch.load(f)
      self.input_size = checkpoint['input_size']
      self.hidden_size = checkpoint['hidden_size']
      self.drop_prob = checkpoint['drop_prob']
      self.epoch = checkpoint['epoch']
      self.val_gmean = checkpoint['val_gmean']
      self.load_state_dict(checkpoint['state_dict'])

In [13]:
classifier = Classifier(input_size=X.shape[1], hidden_size=X.shape[1], drop_prob=0.5)
classifier

Classifier(
  (fc1): Linear(in_features=14, out_features=14, bias=True)
  (fcout): Linear(in_features=14, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [14]:
classifier.save()
classifier.load()
classifier

Classifier(
  (fc1): Linear(in_features=14, out_features=14, bias=True)
  (fcout): Linear(in_features=14, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [15]:
criterion = nn.BCELoss()
optimizer = optim.Adam(params=classifier.parameters(), lr=0.003)

In [16]:
pipeline = Pipeline(steps=[], classifier=classifier, optimizer=optimizer, criterion=criterion, max_epochs=200, fading_factor=0.9999)
pipeline.train(X_train, y_train)

Epoch: 0, Train loss: 0.6450117792401995, Train g-mean: 0.7963975383109918, Val g-mean: None
Epoch: 1, Train loss: 0.5314305687680536, Train g-mean: 0.7955219140813922, Val g-mean: None
Epoch: 2, Train loss: 0.4578474614300463, Train g-mean: 0.8036099847303619, Val g-mean: None
Epoch: 3, Train loss: 0.4372924766463272, Train g-mean: 0.813510239661603, Val g-mean: None
Epoch: 4, Train loss: 0.4246317715915107, Train g-mean: 0.8154368901866548, Val g-mean: None
Epoch: 5, Train loss: 0.41697042235724147, Train g-mean: 0.8167487085491466, Val g-mean: None
Epoch: 6, Train loss: 0.4135374040969758, Train g-mean: 0.8216606898397752, Val g-mean: None
Epoch: 7, Train loss: 0.40417653893166655, Train g-mean: 0.8220552674403502, Val g-mean: None
Epoch: 8, Train loss: 0.40199231351407766, Train g-mean: 0.8258103613053495, Val g-mean: None
Epoch: 9, Train loss: 0.40487877534826383, Train g-mean: 0.8294730428034831, Val g-mean: None
Epoch: 10, Train loss: 0.39921319683591516, Train g-mean: 0.8323006

In [17]:
  def evaluate(pipeline):
    train_gmean, train_recalls = pipeline.evaluate(X_train, y_train)
    test_gmean, test_recalls = pipeline.evaluate(X_test, y_test)
    print('Epoch: {}'.format(pipeline.epoch))
    print('Train g-mean: {}, recalls: {}'.format(train_gmean.item(), train_recalls))
    print('Test g-mean: {}, recalls: {}'.format(test_gmean.item(), test_recalls))

In [18]:
print('Last classifier')
evaluate(pipeline)

Last classifier
Epoch: 199
Train g-mean: 0.8546925289044587, recalls: [0.75760684 0.96421955]
Test g-mean: 0.5702050157433335, recalls: [0.47840081 0.67962627]


In [19]:
if pipeline.has_validation():
    print('Best classifier')
    pipeline.load()
    evaluate(pipeline)