<a href="https://colab.research.google.com/github/dinaldoap/jit-sdp-nn/blob/master/notebook/mlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import random
from scipy.stats import mstats
import math
import re

from jitsdp import metrics
from jitsdp.classifier import Classifier
from jitsdp.pipeline import Pipeline

import logging

In [3]:
logging.getLogger('').handlers = []
logging.basicConfig(filename='notebook/mlp.log', filemode='w', level=logging.DEBUG)

In [4]:
df = pd.read_csv('https://raw.githubusercontent.com/dinaldoap/jit-sdp-data/master/brackets.csv')
df.head()

Unnamed: 0,commit_hash,author_name,author_date_unix_timestamp,author_email,author_date,commit_message,fix,classification,linked,contains_bug,...,nuc,exp,rexp,sexp,glm_probability,rf_probability,repository_id,issue_id,issue_date,issue_type
0,5f406086936d2abf1392b0f77db246b308715d6d,Narayani,1574927837,narayani@adobe.com,Thu Nov 28 13:27:17 2019 +0530,Merge pull request #14985 from adobe/alf_local...,,Merge,False,False,...,0.0,0.0,0.0,0.0,0.173803,0.0,3dc71fdd-a705-47f1-9685-0dbc873af923,,,
1,95626cc3650dcd00e886670d80307b8f710d6168,walf,1574926230,walf@adobe.com,Wed Nov 27 23:30:30 2019 -0800,Updated by ALF automation.,False,,False,False,...,1.0,9.0,1.075033,8.0,0.464372,0.286,3dc71fdd-a705-47f1-9685-0dbc873af923,,,
2,8f26cd850e648d6c4dd04cdfa69119a7feda0867,walf,1574915324,walf@adobe.com,Wed Nov 27 20:28:44 2019 -0800,Updated by ALF automation.,False,,False,False,...,1.0,8.0,0.075033,7.0,0.459778,0.302,3dc71fdd-a705-47f1-9685-0dbc873af923,,,
3,a7dda4aeab550d36bc2c0ca4ecfc29efda21f9ea,Narayani,1574757872,narayani@adobe.com,Tue Nov 26 14:14:32 2019 +0530,Merge pull request #14729 from adobe/alf_local...,,Merge,False,False,...,0.0,0.0,0.0,0.0,0.173803,0.0,3dc71fdd-a705-47f1-9685-0dbc873af923,,,
4,8a806ec41b613d70b26005e2c4907b021a41e744,Gautam Jha,1574315174,gjha@adobe.com,Thu Nov 21 11:16:14 2019 +0530,Moving command line port validation errors to ...,False,,False,False,...,2.0,4.0,2.5,2.0,0.640227,0.496,3dc71fdd-a705-47f1-9685-0dbc873af923,,,


In [5]:
label_col = 'contains_bug'
features_cols = ['fix', 'ns', 'nd', 'nf', 'entropy', 'la', 'ld', 'lt', 'ndev', 'age', 'nuc', 'exp', 'rexp', 'sexp']
preprocess_cols = ['commit_hash', 'author_date_unix_timestamp', 'fixes'] + features_cols + [label_col]
seconds_by_day = 24 * 60 * 60
df_preprocess = df[preprocess_cols].copy()
# filter rows with missing data 
df_preprocess = df_preprocess.dropna(subset=['fix'])
# timeline order
df_preprocess = df_preprocess[::-1]
# contains_bug
df_preprocess[label_col] = df_preprocess[label_col].astype('int')
# day
first_timestamp = df_preprocess['author_date_unix_timestamp'].min()
df_preprocess['day'] = df_preprocess['author_date_unix_timestamp'] - first_timestamp
df_preprocess['day'] = df_preprocess['day'] / seconds_by_day
df_preprocess['day'] = df_preprocess['day'].astype('int')
# fixes
df_preprocess['commit_hash_fix'] = df_preprocess['fixes'].dropna().apply(lambda x: re.findall('\\b\\w+\\b', x)[0])
df_fix = df_preprocess[['commit_hash', 'day']].set_index('commit_hash')
df_preprocess = df_preprocess.join(df_fix, on='commit_hash_fix', how='left', rsuffix='_fix')
df_preprocess.head()



Unnamed: 0,commit_hash,author_date_unix_timestamp,fixes,fix,ns,nd,nf,entropy,la,ld,...,ndev,age,nuc,exp,rexp,sexp,contains_bug,day,commit_hash_fix,day_fix
17747,637d7f4ffa0f2396c2fb61a5e51b9b980f47a2c2,1323292816,"[""c94ebc139b7ca8ae681c71d28e051f4270d493c4"", ""...",False,2.0,5.0,23.0,3.630787,3754.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1,0,c94ebc139b7ca8ae681c71d28e051f4270d493c4,30.0
17746,c8142d2dc17fc1d3777689d67d32d010f5d8dfa7,1323292845,,False,2.0,2.0,2.0,0.811278,4.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0,0,,
17745,af90ea5adf06c935ce1d5db4ce996054c94aeed7,1323294546,,True,1.0,1.0,1.0,0.0,1.0,1.0,...,1.0,0.019688,1.0,2.0,2.0,1.0,0,0,,
17744,82f3f98077bde9ac7c4ff285c593a2f351da5bf2,1323295924,,False,1.0,1.0,1.0,0.0,2.0,3.0,...,1.0,0.035972,1.0,3.0,3.0,2.0,0,0,,
17743,a454a7bce095b18da240cddf844bfea5a334b7cb,1323301755,,False,1.0,1.0,1.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,,


In [6]:
prequential_cols = ['day', 'day_fix'] + features_cols + [label_col]
df_prequential = df_preprocess[prequential_cols].copy()

In [7]:
def create_pipeline():
    scaler = StandardScaler()
    criterion = nn.BCELoss()
    classifier = Classifier(input_size=len(features_cols), hidden_size=len(features_cols), drop_prob=0.5)
    optimizer = optim.Adam(params=classifier.parameters(), lr=0.003)
    return Pipeline(steps=[scaler], classifier=classifier, optimizer=optimizer, criterion=criterion, max_epochs=200, fading_factor=0.9999)

In [8]:
  def evaluate(label, targets, predictions):
    gmean, recalls = metrics.gmean_recalls(targets, predictions)
    print('{} g-mean: {}, recalls: {}'.format(label, gmean, recalls))
  
  def evaluate_train_test(timestep, targets_train, predictions_train, targets_test, predictions_test):
    print('Timestep: {}'.format(timestep))
    evaluate('Train', targets_train, predictions_train)
    evaluate('Test', targets_test, predictions_test)

In [9]:
# split dataset in chunks for testing and iterate over them (chunk from current to current + interval or end)
# the previous chunks are used for training (chunks from start to current)
verification_latency = 90 # days
interval = 1000 # commits
end = len(df_prequential) # last commit
n_chunks = math.ceil(end / interval)
end = n_chunks * interval # last chunk end
start = end - (n_chunks - 1) * interval # second chunk start

predictions = []
for current in range(start, end, interval):
#for current in range(start, start+1, interval):
    df_train = df_prequential[:current].copy()
    df_test = df_prequential[current:min(current + interval, end)].copy()
    # check if fix has been done (bug) or verification latency has passed (normal), otherwise exclude commit
    train_day = df_train['day'].max()
    df_train[label_col] = df_train.apply(lambda row: 1 if row.day_fix < train_day else (0 if row.day < train_day - verification_latency else None), axis='columns')
    df_train = df_train.dropna(subset=[label_col])
    df_train[label_col] = df_train[label_col].astype('int')
    # convert to numpy array
    X_train = df_train[features_cols].values
    y_train = df_train[label_col].values
    X_test = df_test[features_cols].values
    y_test = df_test[label_col].values
    # train and evaluate
    pipeline = create_pipeline()
    pipeline.train(X_train, y_train)
    predictions_test = pipeline.predict(X_test)    
    evaluate_train_test(current, pipeline.predict(X_train), y_train, predictions_test, y_test)
    predictions.append(predictions_test)

predictions = np.concatenate(predictions)
targets = df_prequential[label_col][start:].values
evaluate('Full test', targets, predictions)
    


Timestep: 1000
Train g-mean: 0.8003679132191823, recalls: [0.64827586 0.98814229]
Test g-mean: 0.5464261330535815, recalls: [0.81081081 0.36825054]
Timestep: 2000
Train g-mean: 0.7392040165352215, recalls: [0.77596996 0.70418006]
Test g-mean: 0.607690668214771, recalls: [0.82154882 0.44950213]
Timestep: 3000
Train g-mean: 0.7178561331874959, recalls: [0.82563338 0.624148  ]
Test g-mean: 0.648233253800854, recalls: [0.82352941 0.51025057]
Timestep: 4000
Train g-mean: 0.6950963747101455, recalls: [0.84258211 0.57342657]
Test g-mean: 0.6062079079436514, recalls: [0.76673428 0.47928994]
Timestep: 5000
Train g-mean: 0.6942835610383136, recalls: [0.83271375 0.57886598]
Test g-mean: 0.6279943952790398, recalls: [0.80511811 0.4898374 ]
Timestep: 6000
Train g-mean: 0.6698972203222472, recalls: [0.8155071  0.55028618]
Test g-mean: 0.5765531617054733, recalls: [0.88105727 0.37728938]
  log_a = np.log(a)
Timestep: 7000
Train g-mean: 0.6738452575597667, recalls: [0.82865082 0.54795991]
Test g-mean: