<a href="https://colab.research.google.com/github/dinaldoap/jit-sdp-nn/blob/master/notebook/mlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import random
from scipy.stats import mstats
import math
import re

from jitsdp import metrics
from jitsdp.classifier import Classifier
from jitsdp.pipeline import Pipeline

In [11]:
# Load the commit-level dataset for the scikit-learn repository straight from the
# jit-sdp-data GitHub repository, then preview its first rows.
df = pd.read_csv(
    'https://raw.githubusercontent.com/dinaldoap/jit-sdp-data/master/scikit-learn.csv'
)
df.head()

Unnamed: 0,commit_hash,author_name,author_date_unix_timestamp,author_email,author_date,commit_message,fix,classification,linked,contains_bug,...,nuc,exp,rexp,sexp,glm_probability,rf_probability,repository_id,issue_id,issue_date,issue_type
0,4f97facc3a992c6e2459c3da86c9d69b0688d5ab,Adrin Jalali,1574678405,adrin.jalali@gmail.com,Mon Nov 25 11:40:05 2019 +0100,DOC Further 0.22 whats_new cleanup (#15675) * ...,True,Corrective,True,False,...,2.0,166.0,9.115754,55.0,0.439104,0.576,9498e9cb-538a-4aa0-87f1-ffd09b852bb5,,,
1,fc46a13d57be800da2a8a6b2f8e2621d132ac508,Hanmin Qin,1574674713,qinhanmin2005@sina.com,Mon Nov 25 03:38:33 2019 -0600,DOC Wrong statement in release highlight (#157...,True,Corrective,True,False,...,1.0,460.0,9.833898,57.0,0.42169,0.364,9498e9cb-538a-4aa0-87f1-ffd09b852bb5,,,
2,9f5b97119b08789740876e384d04b4a2e314bc8b,Thomas J Fan,1574667294,thomasjpfan@gmail.com,Mon Nov 25 02:34:54 2019 -0500,CLN Removes ccp_alpha from RandomTreesEmbeddin...,False,,False,False,...,2.0,163.0,24.037563,79.5,0.396111,0.154,9498e9cb-538a-4aa0-87f1-ffd09b852bb5,,,
3,c1ba7bfe43c2157a3fbb0acc1a49c9f14363f49c,J-A16,1574617549,Jeremy.Alexandr@gmail.com,Sun Nov 24 12:45:49 2019 -0500,"DOC versionadded labels for NearestNeighbors, ...",False,Feature Addition,False,False,...,2.0,0.0,0.0,0.0,0.465131,0.15,9498e9cb-538a-4aa0-87f1-ffd09b852bb5,,,
4,308a54e3aed87ec54348a976f8b07f346200b033,Jérémie du Boisberranger,1574458060,34657725+jeremiedbb@users.noreply.github.com,Fri Nov 22 22:27:40 2019 +0100,CI Use new conda syntax to select blas (#15705),False,Feature Addition,False,False,...,2.0,3.0,1.5,1.0,0.511461,0.32,9498e9cb-538a-4aa0-87f1-ffd09b852bb5,,,


In [12]:
# Column roles: the label, the model inputs, and the raw columns needed to build the timeline.
label_col = 'contains_bug'
features_cols = ['fix', 'ns', 'nd', 'nf', 'entropy', 'la', 'ld', 'lt', 'ndev', 'age', 'nuc', 'exp', 'rexp', 'sexp']
preprocess_cols = ['commit_hash', 'author_date_unix_timestamp', 'fixes'] + features_cols + [label_col]
seconds_by_day = 24 * 60 * 60
# timeline order: the CSV lists the newest commit first, so reverse the rows.
# The explicit copy makes df_preprocess an independent frame, so the column assignments
# below do not write into a slice of `df` (which raises SettingWithCopyWarning).
df_preprocess = df[preprocess_cols].iloc[::-1].copy()
# contains_bug: boolean flag -> 0/1
df_preprocess[label_col] = df_preprocess[label_col].astype('int')
# day: number of whole days elapsed since the earliest commit
first_timestamp = df_preprocess['author_date_unix_timestamp'].min()
seconds_since_first = df_preprocess['author_date_unix_timestamp'] - first_timestamp
df_preprocess['day'] = (seconds_since_first // seconds_by_day).astype('int')
# fixes: keep the first word token (commit hash) listed in `fixes`.
# str.extract yields NaN both for missing values and for entries without any token
# (e.g. an empty list), where indexing re.findall(...)[0] would raise IndexError.
df_preprocess['commit_hash_fix'] = df_preprocess['fixes'].str.extract(r'\b(\w+)\b', expand=False)
# day_fix: look up the `day` of the commit referenced by commit_hash_fix
# (NaN when there is no reference or the referenced hash is not in the dataset).
df_fix = df_preprocess[['commit_hash', 'day']].set_index('commit_hash')
df_preprocess = df_preprocess.join(df_fix, on='commit_hash_fix', how='left', rsuffix='_fix')
df_preprocess.head()



Unnamed: 0,commit_hash,author_date_unix_timestamp,fixes,fix,ns,nd,nf,entropy,la,ld,...,ndev,age,nuc,exp,rexp,sexp,contains_bug,day,commit_hash_fix,day_fix
24875,92fbeab6ad94b6683377e26e8af7dd1cb86d615e,1262697992,,False,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,,
24874,619da3419bf6e07c8216f8c952c41a0ea37fcabf,1262698358,,False,1.0,1.0,1.0,0.0,1.0,0.0,...,2.0,1.2e-05,1.0,1548.0,0.0,1005.0,0,0,,
24873,fbb3c512ecbffc97825eaaaf925c9af580608fa8,1262698396,"[""61bc9aae6a501219c853c7023fecb880a8055f9c"", ""...",False,2.0,4.0,15.0,3.287953,920.0,0.0,...,33.0,1.2e-05,1.0,1549.0,2.0,578.0,1,0,61bc9aae6a501219c853c7023fecb880a8055f9c,0.0
24872,def073bedb0cd71fbdc1e06d5fed1acf07960388,1262698420,"[""9afe33919a67a9131d8e3fc85e5d513c0df1d322""]",False,1.0,3.0,7.0,2.217487,421.0,0.0,...,3.0,1.2e-05,1.0,1550.0,4.0,1007.0,1,0,9afe33919a67a9131d8e3fc85e5d513c0df1d322,65.0
24871,2cc9f2be7bf7fbd36484bec76b4446704971bbbc,1262698458,,False,1.0,2.0,11.0,1.622267,22489.0,0.0,...,1.0,1.1e-05,1.0,1551.0,6.0,1008.0,0,0,,


In [13]:
# Narrow the frame to what the prequential evaluation reads:
# the two day columns, the model inputs and the label.
prequential_cols = ['day', 'day_fix', *features_cols, label_col]
df_prequential = df_preprocess.loc[:, prequential_cols]
df_prequential.head()

Unnamed: 0,day,day_fix,fix,ns,nd,nf,entropy,la,ld,lt,ndev,age,nuc,exp,rexp,sexp,contains_bug
24875,0,,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
24874,0,,False,1.0,1.0,1.0,0.0,1.0,0.0,0.0,2.0,1.2e-05,1.0,1548.0,0.0,1005.0,0
24873,0,0.0,False,2.0,4.0,15.0,3.287953,920.0,0.0,0.0,33.0,1.2e-05,1.0,1549.0,2.0,578.0,1
24872,0,65.0,False,1.0,3.0,7.0,2.217487,421.0,0.0,0.0,3.0,1.2e-05,1.0,1550.0,4.0,1007.0,1
24871,0,,False,1.0,2.0,11.0,1.622267,22489.0,0.0,0.0,1.0,1.1e-05,1.0,1551.0,6.0,1008.0,0


In [14]:
def create_pipeline(hidden_size=None, drop_prob=0.5, lr=0.003, max_epochs=200, fading_factor=0.9999):
    """Build a newly constructed Pipeline (scaler + classifier + optimizer + loss).

    Calling it without arguments reproduces the original fixed configuration;
    the parameters only expose the previously hard-coded hyper-parameters.

    Args:
        hidden_size: forwarded to Classifier as ``hidden_size``; defaults to the
            number of feature columns (``len(features_cols)``).
        drop_prob: forwarded to Classifier as ``drop_prob``.
        lr: learning rate passed to the Adam optimizer.
        max_epochs: forwarded to Pipeline as ``max_epochs``.
        fading_factor: forwarded to Pipeline as ``fading_factor``.

    Returns:
        A Pipeline with a StandardScaler step, a Classifier, an Adam optimizer
        over the classifier's parameters and a BCELoss criterion.
    """
    n_features = len(features_cols)
    if hidden_size is None:
        hidden_size = n_features
    scaler = StandardScaler()
    criterion = nn.BCELoss()
    classifier = Classifier(input_size=n_features, hidden_size=hidden_size, drop_prob=drop_prob)
    optimizer = optim.Adam(params=classifier.parameters(), lr=lr)
    return Pipeline(steps=[scaler], classifier=classifier, optimizer=optimizer, criterion=criterion,
                    max_epochs=max_epochs, fading_factor=fading_factor)

In [15]:
  def evaluate(pipeline, X_train, y_train, X_test, y_test):
    """Score the pipeline on the train and test splits and print a short report.

    Calls ``pipeline.evaluate`` once per split (train first, then test), each call
    returning a ``(gmean, recalls)`` pair, and prints the pipeline's current epoch
    followed by one line per split. Returns nothing.
    """
    gmean_tr, recalls_tr = pipeline.evaluate(X_train, y_train)
    gmean_te, recalls_te = pipeline.evaluate(X_test, y_test)
    print('Epoch: {}'.format(pipeline.epoch))
    # One report line per split; `.item()` unwraps the g-mean into a plain number.
    for template, gmean, recalls in (
      ('Train g-mean: {}, recalls: {}', gmean_tr, recalls_tr),
      ('Test g-mean: {}, recalls: {}', gmean_te, recalls_te),
    ):
      print(template.format(gmean.item(), recalls))

In [16]:
# Prequential evaluation: split the timeline into chunks of `interval` days and iterate over them.
# Each chunk [current, current + interval) is the test set;
# all earlier commits (day < current) form the training set.
start = 30
interval = 30
days = df_prequential['day']
last_day = int(days.max())
# Exclusive upper bound, rounded up to a whole number of chunks after `start`.
# Using `last_day + 1` keeps commits made on the last day inside the final chunk even when
# (last_day - start) is an exact multiple of `interval` (they were silently dropped before).
end = math.ceil((last_day + 1 - start) / interval) * interval + start

for current in range(start, end, interval):
    df_train = df_prequential[days < current]
    # One combined mask built on df_prequential itself; masking an already filtered frame
    # with a df_prequential-sized mask makes pandas reindex the mask and emit a UserWarning.
    df_test = df_prequential[(days >= current) & (days < current + interval)]
    if df_train.empty or df_test.empty:
        # Nothing to fit or nothing to score for this chunk.
        continue
    # TODO: check if fix has been done (bug) or verification latency has passed (normal), otherwise exclude commit
    # convert to numpy array
    X_train = df_train[features_cols].values
    y_train = df_train[label_col].values
    X_test = df_test[features_cols].values
    y_test = df_test[label_col].values
    # train and evaluate: a new pipeline is created for every chunk
    pipeline = create_pipeline()
    pipeline.train(X_train, y_train)
    evaluate(pipeline, X_train, y_train, X_test, y_test)
    


  # This is added back by InteractiveShellApp.init_path()


RuntimeError: reduce failed to synchronize: cudaErrorAssert: device-side assert triggered