# Load Necessary Dependencies

In [1]:
import text_normalizer as tn
import pandas as pd
import numpy as np
import tensorflow as tf
import keras
import dill
import gc

SEED = 42
np.random.seed(SEED)
tf.set_random_seed(SEED)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# Load Events Data from GitHub

Focus is on
- Issues
- Pull Requests
- Commits (future scope)

Data is based on feeds from popular golang dependencies\repos which had vulnerabilities in the past to get both positive and negative data points in our dataset

In [2]:
df = pd.read_csv('./data/gokube_phase1_jun19/GH_complete_labeled_issues_prs.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 152151 entries, 0 to 152150
Data columns (total 2 columns):
description    152151 non-null object
label          152151 non-null int64
dtypes: int64(1), object(1)
memory usage: 2.3+ MB


# Basic Data Pre-processing

In [3]:
docs = df['description'].values
labels = df['label'].values

In [4]:
#%%time

norm_docs = tn.pre_process_documents_parallel(documents=docs)
print(len(norm_docs))

preprocessing: starting
ThreadPoolExecutor-0_0: working on doc num: 0
ThreadPoolExecutor-0_2: working on doc num: 5000
ThreadPoolExecutor-0_13: working on doc num: 10000
ThreadPoolExecutor-0_14: working on doc num: 15000
ThreadPoolExecutor-0_11: working on doc num: 20000
ThreadPoolExecutor-0_0: working on doc num: 25000
ThreadPoolExecutor-0_3: working on doc num: 30000
ThreadPoolExecutor-0_3: working on doc num: 35000
ThreadPoolExecutor-0_20: working on doc num: 40000
ThreadPoolExecutor-0_28: working on doc num: 45000
ThreadPoolExecutor-0_29: working on doc num: 50000
ThreadPoolExecutor-0_23: working on doc num: 55000
ThreadPoolExecutor-0_24: working on doc num: 60000
ThreadPoolExecutor-0_5: working on doc num: 65000
ThreadPoolExecutor-0_27: working on doc num: 70000
ThreadPoolExecutor-0_26: working on doc num: 75000
ThreadPoolExecutor-0_28: working on doc num: 80000
ThreadPoolExecutor-0_5: working on doc num: 85000
ThreadPoolExecutor-0_7: working on doc num: 90000
ThreadPoolExecutor-0

In [5]:
#with open('./data/gokube_phase1_jun19/gh_preprocessed_descriptions.pkl', 'wb') as f:
#    dill.dump(norm_docs, f)
#    
#with open('./data/gokube_phase1_jun19/gh_labels.pkl', 'wb') as f:
#    dill.dump(labels, f)

In [4]:
with open('./data/gokube_phase1_jun19/gh_preprocessed_descriptions.pkl', 'rb') as f:
    norm_docs = dill.load(f)
    
#with open('./data/gokube_phase1_jun19/gh_labels.pkl', 'rb') as f:
#    labels = dill.load(f)

In [5]:
positive_data = []
positive_labels = []
for doc, label in zip(norm_docs, labels):
    if label != 0:
        positive_data.append(doc)
        positive_labels.append(label)
        
norm_docs = positive_data
labels = np.array(positive_labels) - 1
len(norm_docs), labels.shape

(23243, (23243,))

In [6]:
from collections import Counter

Counter(labels)

Counter({1: 671, 0: 22572})

# Train on 75:25 Split

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(norm_docs, labels, 
                                                    test_size=0.25, random_state=SEED)
len(X_train), len(X_test)

(17432, 5811)

In [8]:
## some config values 
EMBED_SIZE = 300 # how big is each word vector
MAX_FEATURES = 800000 # how many unique words to use (i.e num rows in embedding vector)
MAX_LEN = 1000 # max number of words in a doc to use

# Create Tokenizer for word tokens

In [9]:
import os.path

CVE_WORD2IDX_MAP_FILE = 'models/v3-jun19/embeddings/cve_tokenizer_word2idx.pkl'

if not os.path.isfile(CVE_WORD2IDX_MAP_FILE):
    tokenizer = keras.preprocessing.text.Tokenizer(oov_token='<UNK>', num_words=MAX_FEATURES+1)
    tokenizer.fit_on_texts(list(X_train))
    tokenizer.word_index['<PAD>'] = 0
    with open(CVE_WORD2IDX_MAP_FILE, 'wb') as f:
        dill.dump(tokenizer.word_index, f)
else:
    tokenizer = keras.preprocessing.text.Tokenizer(oov_token='<UNK>', num_words=MAX_FEATURES+1)
    with open(CVE_WORD2IDX_MAP_FILE, 'rb') as f:
        word2idx = dill.load(f)
    tokenizer.word_index = word2idx

In [10]:
MAX_FEATURES = len(tokenizer.word_index)
MAX_FEATURES

291899

# Pad sentences to sequence length of 1000 tokens

In [11]:
## Tokenize the sentences
train_X = tokenizer.texts_to_sequences(X_train)
test_X = tokenizer.texts_to_sequences(X_test)

In [12]:
## Pad the sentences 
train_X = keras.preprocessing.sequence.pad_sequences(train_X, maxlen=MAX_LEN)
test_X = keras.preprocessing.sequence.pad_sequences(test_X, maxlen=MAX_LEN)

In [13]:
train_X_lengths = np.array([len(np.nonzero(item)[0]) for item in train_X])
train_X_idx = np.argwhere(train_X_lengths >= 5).ravel()
train_X = train_X[train_X_idx]
train_y = y_train[train_X_idx]

test_X_lengths = np.array([len(np.nonzero(item)[0]) for item in test_X])
test_X_idx = np.argwhere(test_X_lengths >= 5).ravel()
test_X = test_X[test_X_idx]
test_y = y_test[test_X_idx]
train_X.shape, train_y.shape, test_X.shape, test_y.shape

((17389, 1000), (17389,), (5798, 1000), (5798,))

In [14]:
word2idx = tokenizer.word_index

# Load Pre-trained Embeddings

We have experimented with the following embeddings (pre-trained models)

- FastText
- ParaGram
- GloVe

In [15]:
def load_pretrained_embeddings(word_to_index, max_features, embedding_size, embedding_file_path):    
    
    def get_coefs(word,*arr): 
        return word, np.asarray(arr, dtype='float32')
    
    embeddings_index = dict(get_coefs(*row.split(" ")) 
                                for row in open(embedding_file_path, encoding="utf8", errors='ignore') 
                                    if len(row)>100)

    all_embs = np.stack(embeddings_index.values())
    emb_mean, emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]

    nb_words = min(max_features, len(word_to_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embedding_size))
    
    for word, idx in word_to_index.items():
        if idx >= max_features: 
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: 
            embedding_matrix[idx] = embedding_vector

    return embedding_matrix

In [16]:
FASTTEXT_INIT_EMBEDDINGS_FILE = './models/v3-jun19/embeddings/cve_model_fasttext_init_embeddings.pkl'

if not os.path.isfile(FASTTEXT_INIT_EMBEDDINGS_FILE):
    FASTTEXT_EMBEDDINGS_PATH = './embeddings/fasttext/crawl-300d-2M.vec'
    ft_embeddings = load_pretrained_embeddings(word_to_index=word2idx, max_features=MAX_FEATURES, 
                                               embedding_size=EMBED_SIZE, 
                                               embedding_file_path=FASTTEXT_EMBEDDINGS_PATH)
    with open(FASTTEXT_INIT_EMBEDDINGS_FILE, 'wb') as f:
        dill.dump(ft_embeddings, f)
else:
    with open(FASTTEXT_INIT_EMBEDDINGS_FILE, 'rb') as f:
        ft_embeddings = dill.load(f)
        
ft_embeddings.shape

(291899, 300)

In [17]:
PARAGRAM_INIT_EMBEDDINGS_FILE = './models/v3-jun19/embeddings/cve_model_paragram_init_embeddings.pkl'

if not os.path.isfile(PARAGRAM_INIT_EMBEDDINGS_FILE):
    PARAGRAM_EMBEDDINGS_PATH = './embeddings/paragram_300_sl999/paragram_300_sl999.txt'
    pg_embeddings = load_pretrained_embeddings(word_to_index=word2idx, max_features=MAX_FEATURES, 
                                               embedding_size=EMBED_SIZE, 
                                               embedding_file_path=PARAGRAM_EMBEDDINGS_PATH)
    with open(PARAGRAM_INIT_EMBEDDINGS_FILE, 'wb') as f:
        dill.dump(pg_embeddings, f)
else:
    with open(PARAGRAM_INIT_EMBEDDINGS_FILE, 'rb') as f:
        pg_embeddings = dill.load(f)
        
pg_embeddings.shape 

(291899, 300)

# Average pre-trained embeddings for vocabulary

In [18]:
avg_pretrained_embeddings = np.mean([ft_embeddings, pg_embeddings], axis = 0)
avg_pretrained_embeddings.shape

(291899, 300)

## Build Model Architecture

### Attention Layer

Attention Layer focuses on attending to the most important words. We sent all the states from our GRU model into the attention model.

![](https://i.imgur.com/vbGl6Vl.png)

The attention layer produces a context vector

![](https://i.imgur.com/nZ71MVd.png)

![](https://i.imgur.com/00KyS2e.png)

In [19]:
from keras.engine.topology import Layer
from keras import backend as K


class AttentionLayer(Layer):
    
    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        
        """
        Keras Layer that implements an Attention mechanism for temporal data.
        Supports Masking.
        Follows the work of Raffel et al. [https://arxiv.org/abs/1512.08756]
        # Input shape
            3D tensor with shape: `(samples, steps, features)`.
        # Output shape
            2D tensor with shape: `(samples, features)`.
        :param kwargs:
        Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
        The dimensions are inferred based on the output shape of the RNN.
        """
        
        self.supports_masking = True
        self.init = keras.initializers.get('glorot_uniform')

        self.W_regularizer = keras.regularizers.get(W_regularizer)
        self.b_regularizer = keras.regularizers.get(b_regularizer)

        self.W_constraint = keras.constraints.get(W_constraint)
        self.b_constraint = keras.constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0
        super(AttentionLayer, self).__init__(**kwargs)
        

    def build(self, input_shape):
        assert len(input_shape) == 3

        self.W = self.add_weight((input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight((input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True
        

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    
    def call(self, x, mask=None):

        features_dim = self.features_dim
        step_dim = self.step_dim

        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)), 
                              K.reshape(self.W, (features_dim, 1))),
                        (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        a = K.expand_dims(a)
        weighted_input = x * a
        
        return K.sum(weighted_input, axis=1)

    
    def compute_output_shape(self, input_shape):
        return input_shape[0],  self.features_dim
    
    
    def get_config(self):
        config = {'step_dim': self.step_dim}
        base_config = super(AttentionLayer, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

### Bi-directional GRUs

![](https://i.imgur.com/PuTHi2C.png)

![](https://i.imgur.com/ewTg3gB.png)

![](https://i.imgur.com/oaBYGeu.png)

In [20]:
import keras
from keras.utils import multi_gpu_model

def build_gru_model(embedding_matrix, embedding_size, max_len, max_features, gru_units=32):
    
    inp = keras.layers.Input(shape=(max_len,))
    x = keras.layers.Embedding(max_features, embedding_size, 
                                  weights=[embedding_matrix], trainable=True)(inp)
    x = keras.layers.Bidirectional(keras.layers.CuDNNGRU(gru_units*2, return_sequences=True))(x)
    x = keras.layers.Bidirectional(keras.layers.CuDNNGRU(gru_units, return_sequences=True))(x)
    x = AttentionLayer(max_len)(x)
    x = keras.layers.Dense(gru_units*2, activation='relu')(x)
    x = keras.layers.Dropout(rate=0.2)(x)
    x = keras.layers.Dense(gru_units, activation='relu')(x)
    x = keras.layers.Dropout(rate=0.2)(x)

    outp = keras.layers.Dense(1, activation='sigmoid')(x)
    # initialize the model
    model = keras.models.Model(inputs=inp, outputs=outp)

    # make the model parallel
    #model = multi_gpu_model(model, gpus=2)
       
    model.compile(loss='binary_crossentropy', optimizer=keras.optimizers.Adam(), metrics=['accuracy'])
    
    return model

In [21]:
gru_model = build_gru_model(embedding_matrix=avg_pretrained_embeddings, embedding_size=EMBED_SIZE, 
                            max_len=MAX_LEN, max_features=MAX_FEATURES, gru_units=32)
gru_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 1000)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 1000, 300)         87569700  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 1000, 128)         140544    
_________________________________________________________________
bidirectional_2 (Bidirection (None, 1000, 64)          31104     
_________________________________________________________________
attention_layer_1 (Attention (None, 64)                1064      
_________________________________________________________________
dense_1 (Dense)              (None, 64)                4160      
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
__________

# Train the Model

In [22]:
from sklearn.utils import class_weight

class_weights = class_weight.compute_class_weight('balanced',
                                                 np.unique(train_y),
                                                 train_y)
class_weights = dict(enumerate(class_weights))
class_weights[1] *= 2
class_weights

{0: 0.5141024124881741, 1: 36.45492662473794}

In [23]:
from keras.callbacks import ReduceLROnPlateau, EarlyStopping

reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.4,
                              patience=2, min_lr=0.00001)

early_stop = EarlyStopping(monitor='val_loss', min_delta=0, patience=8, 
                           mode='auto', baseline=None, restore_best_weights=False)

callbacks = [reduce_lr, early_stop]

In [24]:
history = gru_model.fit(train_X, train_y, batch_size=256, epochs=20, callbacks=callbacks,
                        class_weight=class_weights, validation_split=0.1)

Train on 15650 samples, validate on 1739 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20


# Make Model Predictions on Test Data

In [25]:
gru_model.save('./models/v3-jun19/model_files/cve_model_train75-jun19.h5')
gru_model.save_weights('./models/v3-jun19/model_files/cve_model_train75-jun19_weights.h5')

In [26]:
gru_model = keras.models.load_model('./models/v3-jun19/model_files/cve_model_train75-jun19.h5',
                                        custom_objects={'AttentionLayer': AttentionLayer})
pred_y = gru_model.predict([test_X], batch_size=128, verbose=1)



In [39]:
pred_yr = pred_y.ravel()
pred_yl = [1 if prob > 0.3 else 0 for prob in pred_yr]

In [40]:
from sklearn.metrics import confusion_matrix, classification_report

confusion_matrix(y_true=test_y, y_pred=pred_yl)

array([[5579,   61],
       [  49,  109]])

In [41]:
print(classification_report(y_true=test_y, y_pred=pred_yl))

             precision    recall  f1-score   support

          0       0.99      0.99      0.99      5640
          1       0.64      0.69      0.66       158

avg / total       0.98      0.98      0.98      5798

