<a href="https://colab.research.google.com/github/dinaldoap/jit-sdp-nn/blob/master/notebook/mlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# modules are reloaded before each cell runs without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [0]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import random
from scipy.stats import mstats
import math
import re

from jitsdp import metrics
from jitsdp.classifier import Classifier
from jitsdp.pipeline import Pipeline

In [0]:
# Raw commit-level dataset, read straight from the companion data repository.
DATA_URL = 'https://raw.githubusercontent.com/dinaldoap/jit-sdp-data/master/brackets.csv'
df = pd.read_csv(DATA_URL)
# Preview the first rows as the cell output.
df.head()

In [0]:
# Column roles used throughout the notebook.
label_col = 'contains_bug'
features_cols = ['fix', 'ns', 'nd', 'nf', 'entropy', 'la', 'ld', 'lt', 'ndev', 'age', 'nuc', 'exp', 'rexp', 'sexp']
preprocess_cols = ['commit_hash', 'author_date_unix_timestamp', 'fixes'] + features_cols + [label_col]
seconds_by_day = 24 * 60 * 60


def _first_word(text):
    """Return the first run of word characters in ``text``, or None if there is none."""
    match = re.search('\\b\\w+\\b', text)
    return match.group(0) if match else None


# Select the working columns, drop rows with a missing 'fix' value and reverse
# the row order to get the timeline order.
# NOTE(review): the reversal assumes the CSV is stored newest-first -- confirm.
# The explicit copy makes df_preprocess an independent frame, so the column
# assignments below never write into (or warn about) a slice of df.
df_preprocess = (
    df[preprocess_cols]
    .dropna(subset=['fix'])
    .iloc[::-1]
    .copy()
)
# contains_bug
df_preprocess[label_col] = df_preprocess[label_col].astype('int')
# day: whole days elapsed since the earliest commit
first_timestamp = df_preprocess['author_date_unix_timestamp'].min()
elapsed_seconds = df_preprocess['author_date_unix_timestamp'] - first_timestamp
df_preprocess['day'] = (elapsed_seconds // seconds_by_day).astype('int')
# fixes: keep the first word token of each non-null 'fixes' value as the hash of
# the fixing commit. Values without any word token yield None instead of raising
# IndexError; rows with a null 'fixes' stay NaN through index alignment.
df_preprocess['commit_hash_fix'] = df_preprocess['fixes'].dropna().apply(_first_word)
# Look up the day of the fixing commit; a left join keeps every commit and
# leaves day_fix missing when no fixing commit is found.
df_fix = df_preprocess[['commit_hash', 'day']].set_index('commit_hash')
df_preprocess = df_preprocess.join(df_fix, on='commit_hash_fix', how='left', rsuffix='_fix')
df_preprocess.head()



In [0]:
# Keep only what the prequential evaluation needs: the commit day, the day of
# its fixing commit, the input features and the label.
prequential_cols = ['day', 'day_fix', *features_cols, label_col]
df_prequential = df_preprocess[prequential_cols]

In [0]:
def create_pipeline():
    """Build a fresh, untrained Pipeline.

    The classifier's input and hidden sizes both equal the number of feature
    columns. It is paired with an Adam optimizer, a binary cross-entropy
    criterion and a StandardScaler as the only preprocessing step.
    """
    n_features = len(features_cols)
    net = Classifier(input_size=n_features, hidden_size=n_features, drop_prob=0.5)
    return Pipeline(
        steps=[StandardScaler()],
        classifier=net,
        optimizer=optim.Adam(params=net.parameters(), lr=0.003),
        criterion=nn.BCELoss(),
        max_epochs=200,
        fading_factor=0.9999,
    )

In [0]:
  def evaluate(pipeline, X_train, y_train, X_test, y_test):
    """Print the pipeline's epoch, then g-mean and recalls for the train and test splits.

    Both splits are scored with ``pipeline.evaluate`` (train first) before
    anything is printed. Nothing is returned.
    """
    report = (
        ('Train g-mean: {}, recalls: {}', pipeline.evaluate(X_train, y_train)),
        ('Test g-mean: {}, recalls: {}', pipeline.evaluate(X_test, y_test)),
    )
    print('Epoch: {}'.format(pipeline.epoch))
    for template, (gmean, recalls) in report:
        print(template.format(gmean.item(), recalls))

In [8]:
# Split the dataset into chunks of `interval` days for testing and iterate over them
# (each test chunk covers days [current, current + interval)).
# All previous days (from the start up to, but excluding, current) are used for training.
start = 30
interval = 30
last_day = df_prequential['day'].max()  # last day (inclusive)
# Exclusive upper bound, rounded up to a whole number of chunks after `start`.
# The "+ 1" turns the inclusive last day into an exclusive bound; without it the
# commits of the last day are never tested when (last_day - start) is an exact
# multiple of `interval`.
end = math.ceil((last_day + 1 - start) / interval) * interval + start

day = df_prequential['day']
for current in range(start, end, interval):
    df_train = df_prequential[day < current]
    # Build the whole test mask from the frame being indexed. Filtering an already
    # filtered frame with a mask from the unfiltered one forces pandas to reindex
    # the mask and emits a UserWarning on every iteration.
    # current + interval never exceeds `end`, so no extra clamping is needed.
    df_test = df_prequential[(day >= current) & (day < current + interval)]
    # TODO: check if fix has been done (bug) or verification latency has passed (normal), otherwise exclude commit
    # convert to numpy array
    X_train = df_train[features_cols].values
    y_train = df_train[label_col].values
    X_test = df_test[features_cols].values
    y_test = df_test[label_col].values
    # train a fresh pipeline on all past data, then report train/test metrics
    pipeline = create_pipeline()
    pipeline.train(X_train, y_train)
    evaluate(pipeline, X_train, y_train, X_test, y_test)
    


  # This is added back by InteractiveShellApp.init_path()
Epoch: 199
Train g-mean: 0.7148376387082239, recalls: [0.74257148 0.68813961]
Test g-mean: 0.4763998795528962, recalls: [0.7375    0.3077381]
  # This is added back by InteractiveShellApp.init_path()
Epoch: 199
Train g-mean: 0.6665091675824271, recalls: [0.68296942 0.65044563]
Test g-mean: 0.5032053968211945, recalls: [0.65590112 0.38605769]
  # This is added back by InteractiveShellApp.init_path()
Epoch: 199
Train g-mean: 0.6635201499718992, recalls: [0.63223675 0.69635147]
Test g-mean: 0.643120363852963, recalls: [0.61147186 0.67640693]
  # This is added back by InteractiveShellApp.init_path()
Epoch: 199
Train g-mean: 0.692849873756045, recalls: [0.65675841 0.73092471]
Test g-mean: 0.618017677369251, recalls: [0.62955487 0.60669192]
  # This is added back by InteractiveShellApp.init_path()
Epoch: 199
Train g-mean: 0.6982547144428805, recalls: [0.7224835  0.67483846]
Test g-mean: 0.7828585131463638, recalls: [0.82843137 0.73979