<a href="https://colab.research.google.com/github/dinaldoap/jit-sdp-nn/blob/master/notebook/mlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported modules
# (e.g. the local `jitsdp` package) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Change the working directory so that relative paths used later
# (e.g. 'notebook/mlp.log') resolve; presumably this is the repository root.
# NOTE(review): absolute, environment-specific path - confirm it exists wherever
# this notebook is run (e.g. the Colab badge above implies a different filesystem).
%cd /workspace/

/workspace


In [3]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import random
from scipy.stats import mstats
import math
import re

from jitsdp import metrics
from jitsdp.classifier import Classifier
from jitsdp.pipeline import Pipeline

import logging

In [4]:
# Reset the root logger before configuring it: logging.basicConfig() does nothing when
# the root logger already has handlers, so re-running this cell would otherwise be a no-op.
# Each stale handler is removed AND closed so that a previously opened log file handle is
# released instead of being leaked when the cell is executed more than once.
root_logger = logging.getLogger('')
for stale_handler in list(root_logger.handlers):
    root_logger.removeHandler(stale_handler)
    stale_handler.close()
# Send DEBUG-and-above records to a file that is truncated on every (re)configuration.
logging.basicConfig(filename='notebook/mlp.log', filemode='w', level=logging.DEBUG)

In [5]:
# Download the raw commit-level dataset; the preview is rendered by the last expression.
dataset_url = 'https://raw.githubusercontent.com/dinaldoap/jit-sdp-data/master/brackets.csv'
df = pd.read_csv(dataset_url)
df.head()

Unnamed: 0,commit_hash,author_name,author_date_unix_timestamp,author_email,author_date,commit_message,fix,classification,linked,contains_bug,...,nuc,exp,rexp,sexp,glm_probability,rf_probability,repository_id,issue_id,issue_date,issue_type
0,5f406086936d2abf1392b0f77db246b308715d6d,Narayani,1574927837,narayani@adobe.com,Thu Nov 28 13:27:17 2019 +0530,Merge pull request #14985 from adobe/alf_local...,,Merge,False,False,...,0.0,0.0,0.0,0.0,0.173803,0.0,3dc71fdd-a705-47f1-9685-0dbc873af923,,,
1,95626cc3650dcd00e886670d80307b8f710d6168,walf,1574926230,walf@adobe.com,Wed Nov 27 23:30:30 2019 -0800,Updated by ALF automation.,False,,False,False,...,1.0,9.0,1.075033,8.0,0.464372,0.286,3dc71fdd-a705-47f1-9685-0dbc873af923,,,
2,8f26cd850e648d6c4dd04cdfa69119a7feda0867,walf,1574915324,walf@adobe.com,Wed Nov 27 20:28:44 2019 -0800,Updated by ALF automation.,False,,False,False,...,1.0,8.0,0.075033,7.0,0.459778,0.302,3dc71fdd-a705-47f1-9685-0dbc873af923,,,
3,a7dda4aeab550d36bc2c0ca4ecfc29efda21f9ea,Narayani,1574757872,narayani@adobe.com,Tue Nov 26 14:14:32 2019 +0530,Merge pull request #14729 from adobe/alf_local...,,Merge,False,False,...,0.0,0.0,0.0,0.0,0.173803,0.0,3dc71fdd-a705-47f1-9685-0dbc873af923,,,
4,8a806ec41b613d70b26005e2c4907b021a41e744,Gautam Jha,1574315174,gjha@adobe.com,Thu Nov 21 11:16:14 2019 +0530,Moving command line port validation errors to ...,False,,False,False,...,2.0,4.0,2.5,2.0,0.640227,0.496,3dc71fdd-a705-47f1-9685-0dbc873af923,,,


In [6]:
# Preprocessing: select the columns used downstream, order commits chronologically and
# attach, to each commit, the day/sequence position of the first commit that fixes it.
label_col = 'contains_bug'
features_cols = ['fix', 'ns', 'nd', 'nf', 'entropy', 'la', 'ld', 'lt', 'ndev', 'age', 'nuc', 'exp', 'rexp', 'sexp']
preprocess_cols = ['commit_hash', 'author_date_unix_timestamp', 'fixes'] + features_cols + [label_col]
seconds_by_day = 24 * 60 * 60


def _first_fix_hash(fixes):
    """Return the first word-like token (commit hash) of a `fixes` cell, or None if it has none.

    `fixes` is the raw string stored in the dataset, e.g. '["c94e...", "..."]'. Returning
    None instead of indexing blindly avoids an IndexError on cells such as '[]'.
    """
    hashes = re.findall('\\b\\w+\\b', fixes)
    return hashes[0] if hashes else None


df_preprocess = df[preprocess_cols].copy()
# drop rows whose `fix` flag is missing
df_preprocess = df_preprocess.dropna(subset=['fix'])
# timeline order: reverse the rows (the preview of the raw data shows newest commits first;
# this assumes the whole file is ordered that way)
df_preprocess = df_preprocess[::-1]
# add sequential position of each commit in the timeline
df_preprocess['seq'] = range(len(df_preprocess))
# contains_bug as integer
df_preprocess[label_col] = df_preprocess[label_col].astype('int')
# day: whole days elapsed since the earliest commit (non-negative, so floor == truncation)
first_timestamp = df_preprocess['author_date_unix_timestamp'].min()
df_preprocess['day'] = (df_preprocess['author_date_unix_timestamp'] - first_timestamp) // seconds_by_day
df_preprocess['day'] = df_preprocess['day'].astype('int')
# fixes: keep only the first fixing commit; rows without `fixes` stay NaN after index alignment
df_preprocess['commit_hash_fix'] = df_preprocess['fixes'].dropna().apply(_first_fix_hash)
# look up day/seq of the fixing commit; commits without a known fix get NaN in day_fix/seq_fix
df_fix = df_preprocess[['commit_hash', 'day', 'seq']].set_index('commit_hash')
df_preprocess = df_preprocess.join(df_fix, on='commit_hash_fix', how='left', rsuffix='_fix')
df_preprocess.head()



Unnamed: 0,commit_hash,author_date_unix_timestamp,fixes,fix,ns,nd,nf,entropy,la,ld,...,nuc,exp,rexp,sexp,contains_bug,seq,day,commit_hash_fix,day_fix,seq_fix
17747,637d7f4ffa0f2396c2fb61a5e51b9b980f47a2c2,1323292816,"[""c94ebc139b7ca8ae681c71d28e051f4270d493c4"", ""...",False,2.0,5.0,23.0,3.630787,3754.0,0.0,...,0.0,0.0,0.0,0.0,1,0,0,c94ebc139b7ca8ae681c71d28e051f4270d493c4,30.0,206.0
17746,c8142d2dc17fc1d3777689d67d32d010f5d8dfa7,1323292845,,False,2.0,2.0,2.0,0.811278,4.0,0.0,...,0.0,1.0,1.0,0.0,0,1,0,,,
17745,af90ea5adf06c935ce1d5db4ce996054c94aeed7,1323294546,,True,1.0,1.0,1.0,0.0,1.0,1.0,...,1.0,2.0,2.0,1.0,0,2,0,,,
17744,82f3f98077bde9ac7c4ff285c593a2f351da5bf2,1323295924,,False,1.0,1.0,1.0,0.0,2.0,3.0,...,1.0,3.0,3.0,2.0,0,3,0,,,
17743,a454a7bce095b18da240cddf844bfea5a334b7cb,1323301755,,False,1.0,1.0,1.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0,4,0,,,


In [7]:
# Keep only what the prequential evaluation needs: timeline positions of each commit and
# of its fix, the model features and the label.
prequential_cols = ['day', 'day_fix', 'seq', 'seq_fix', *features_cols, label_col]
# TODO: exclude last commits which aren't labeled
df_prequential = df_preprocess.loc[:, prequential_cols].copy()

In [8]:
def create_pipeline(hidden_size=None, drop_prob=0.2, lr=0.003, max_epochs=200, fading_factor=0.9999):
    """Build a fresh, untrained `Pipeline` (standard scaler + `Classifier` + Adam + BCE loss).

    All parameters are optional; calling `create_pipeline()` reproduces the original
    hard-coded configuration.

    Args:
        hidden_size: forwarded to `Classifier`; defaults to half the number of features.
        drop_prob: forwarded to `Classifier`.
        lr: learning rate of the Adam optimizer.
        max_epochs: forwarded to `Pipeline`.
        fading_factor: forwarded to `Pipeline`.

    Returns:
        A new `Pipeline` instance wrapping newly created scaler, classifier, optimizer
        and criterion objects.
    """
    input_size = len(features_cols)
    if hidden_size is None:
        hidden_size = input_size // 2
    scaler = StandardScaler()
    criterion = nn.BCELoss()
    classifier = Classifier(input_size=input_size, hidden_size=hidden_size, drop_prob=drop_prob)
    # the optimizer must be built after the classifier because it captures its parameters
    optimizer = optim.Adam(params=classifier.parameters(), lr=lr)
    return Pipeline(steps=[scaler], classifier=classifier, optimizer=optimizer, criterion=criterion,
                    max_epochs=max_epochs, fading_factor=fading_factor)

In [9]:
  def evaluate(label, targets, predictions):
    """Print the g-mean and recalls that `metrics.gmean_recalls` reports, prefixed by `label`."""
    scores = metrics.gmean_recalls(targets, predictions)
    gmean, recalls = scores
    report = '{} g-mean: {}, recalls: {}'.format(label, gmean, recalls)
    print(report)
  
  def evaluate_train_test(seq, targets_train, predictions_train, targets_test, predictions_test):
    """Print the chunk position `seq`, then the train metrics followed by the test metrics."""
    print('Sequential: {}'.format(seq))
    splits = (
      ('Train', targets_train, predictions_train),
      ('Test', targets_test, predictions_test),
    )
    for split_label, split_targets, split_predictions in splits:
      evaluate(split_label, split_targets, split_predictions)

In [10]:
# Prequential evaluation: split the dataset in chunks of `interval` commits and iterate over
# them. Each chunk (from current to current + interval, or to the end) is used for testing,
# while all previous commits (from start of the data to current) are used for training.
verification_latency = 0  # days
interval = 500  # commits
seed = 42  # fixed so that re-running the cell uses the same random streams

end = len(df_prequential)  # last commit
n_chunks = math.ceil(end / interval)
end = n_chunks * interval  # last chunk end, rounded up to a multiple of `interval`
start = end - (n_chunks - 1) * interval  # second chunk start (the first chunk is train-only)

# Seed the Python, NumPy and PyTorch generators before any model is created.
# NOTE(review): assumes jitsdp draws its randomness from these generators - confirm.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

pipeline = create_pipeline()
pipeline.save()
predictions = []
for current in range(start, end, interval):
    df_train = df_prequential.iloc[:current]
    # positional slicing clips at the end of the frame, so the last chunk may be shorter
    df_test = df_prequential.iloc[current:current + interval]
    # Training labels use only what is known at training time:
    #   1 (bug)    if the fixing commit already happened (NaN seq_fix compares as False),
    #   0 (normal) if the verification latency has passed without a fix,
    #   otherwise the commit is excluded from training.
    train_day = df_train['day'].max()
    train_seq = df_train['seq'].max()
    is_fixed = df_train['seq_fix'] <= train_seq
    is_verified = df_train['day'] <= train_day - verification_latency
    is_labeled = is_fixed | is_verified
    df_train = df_train[is_labeled].copy()
    # `.values` avoids index alignment; both sides were filtered with the same mask
    df_train[label_col] = is_fixed[is_labeled].astype('int').values
    # convert to numpy array
    X_train = df_train[features_cols].values
    y_train = df_train[label_col].values
    X_test = df_test[features_cols].values
    y_test = df_test[label_col].values
    # train and evaluate; a new pipeline object is created and load() is called before
    # training (presumably restoring the state persisted by the previous save() - confirm
    # against jitsdp.pipeline)
    pipeline = create_pipeline()
    pipeline.load()
    pipeline.train(X_train, y_train)
    pipeline.save()
    predictions_test = pipeline.predict(X_test)
    evaluate_train_test(current, y_train, pipeline.predict(X_train), y_test, predictions_test)
    predictions.append(predictions_test)

# overall score over every tested commit (everything from the second chunk onwards)
predictions = np.concatenate(predictions)
targets = df_prequential[label_col].iloc[start:].values
evaluate('Full test', targets, predictions)

  log_a = np.log(a)
Sequential: 500
Train g-mean: 0.7647433124908976, recalls: [0.66959064 0.87341772]
Test g-mean: 0.5896431189332371, recalls: [0.87859425 0.39572193]
Sequential: 1000
Train g-mean: 0.7974602923010693, recalls: [0.72374101 0.87868852]
Test g-mean: 0.4340104415468487, recalls: [0.88073394 0.21387283]
Sequential: 1500
Train g-mean: 0.7793655496628967, recalls: [0.70038168 0.86725664]
Test g-mean: 0.6789209971474433, recalls: [0.81446541 0.56593407]
Sequential: 2000
Train g-mean: 0.7738833968508804, recalls: [0.69927536 0.85645161]
Test g-mean: 0.7042936143247844, recalls: [0.80487805 0.61627907]
Sequential: 2500
Train g-mean: 0.7702149126033707, recalls: [0.72819358 0.81466113]
Test g-mean: 0.7335104362592145, recalls: [0.85478548 0.62944162]
Sequential: 3000
Train g-mean: 0.7677478577692306, recalls: [0.74964706 0.78628571]
Test g-mean: 0.684918206837917, recalls: [0.81845238 0.57317073]
Sequential: 3500
Train g-mean: 0.7669667043067473, recalls: [0.76530199 0.76863504