In [1]:
import pandas as pd
import numpy as np
import re
import dill
import tqdm
import gc
import contractions
from bs4 import BeautifulSoup
import unicodedata

In [2]:
def strip_html_tags(text):
    """Return the text content of an HTML fragment with line breaks normalised.

    ``<iframe>`` and ``<script>`` elements are detached from the parsed tree
    before the text is extracted, and every run of carriage returns and/or
    line feeds is collapsed into a single newline.

    :param text: HTML (or plain text) to clean.
    :return: the extracted text.
    """
    soup = BeautifulSoup(text, "lxml")
    # extract() is called purely for its side effect of removing the tag, so a
    # plain loop is used rather than building a throw-away list.
    for tag in soup(['iframe', 'script']):
        tag.extract()
    stripped_text = soup.get_text()
    # Two fixes compared with the previous call:
    #  * re.I used to be passed positionally, which is re.sub's ``count``
    #    argument (re.I == 2), so only the first two runs were collapsed;
    #  * the character class contained '|', which inside [...] is a literal
    #    pipe, so pipes were being turned into newlines.
    stripped_text = re.sub(r'[\r\n]+', '\n', stripped_text)
    return stripped_text


def remove_urls(text):
    """Replace every URL-like substring of ``text`` with a single space.

    A candidate starts with ``http://`` / ``https://`` (optionally followed by
    ``www.``) or with a bare ``www.``; matching is case-insensitive.

    :param text: input string.
    :return: ``text`` with each match replaced by ``' '``.
    """
    # Raw string: the pattern is full of regex escapes (\/, \s, \w, ...) that
    # are invalid *string* escapes and trigger warnings in a normal literal.
    url_pattern = r'((https?:\/\/)(\s)*(www\.)?|(www\.))(\s)*((\w|\s)+\.)*([\w\-\s]+\/)*([\w\-]+)((\?)?[\w\s]*=\s*[\w\%&]*)*'
    # flags must be passed by keyword: the 4th positional argument of re.sub is
    # ``count``, so the former ``re.I`` silently limited the call to 2
    # replacements and never enabled case-insensitive matching.
    text = re.sub(url_pattern, ' ', text, flags=re.I)
    return text

def remove_checklists(text):
    """Replace markdown-style check-list markers with a single space.

    A marker is a pair of square brackets around exactly one character that is
    ``x``, ``X``, ``.`` or whitespace, e.g. ``[x]``, ``[ ]`` or ``[.]``.

    :param text: input string.
    :return: ``text`` with every marker replaced by ``' '``.
    """
    checklist_pattern = r'\[[xX\.\s]\]'
    # Flags are passed by keyword. Previously ``re.I|re.DOTALL`` (== 18) was
    # given positionally and therefore interpreted as ``count``, so only the
    # first 18 markers of a document were removed.
    text = re.sub(checklist_pattern, ' ', text, flags=re.I | re.DOTALL)
    return text

def remove_accented_chars(text):
    """Reduce ``text`` to its ASCII characters.

    The string is NFKD-normalised so that accented letters split into a base
    letter plus combining marks; encoding to ASCII with ``'ignore'`` then
    drops the marks together with any other non-ASCII character.

    :param text: input string.
    :return: ASCII-only string.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')


def expand_contractions(text):
    """Return ``text`` after passing it through ``contractions.fix``.

    :param text: input string.
    :return: whatever ``contractions.fix`` returns for ``text``.
    """
    expanded = contractions.fix(text)
    return expanded



def remove_special_characters(text, remove_digits=False):
    """Replace every character outside the allowed set with a space.

    ASCII letters and whitespace are always kept; digits are kept unless
    ``remove_digits`` is true.

    :param text: input string.
    :param remove_digits: when true, digits are replaced as well.
    :return: the filtered string (same length as the input).
    """
    if remove_digits:
        pattern = r'[^a-zA-Z\s]'
    else:
        pattern = r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, ' ', text)


def pre_process_document(document):
    """Normalise one raw document into lower-case, space-separated tokens.

    Steps, in order: strip HTML, remove URLs, remove check-list markers,
    expand contractions, lower-case, turn newlines/tabs/carriage returns into
    spaces, drop non-ASCII characters, replace punctuation with spaces, remove
    stand-alone numbers, and collapse/trim whitespace.

    :param document: raw text of a single document.
    :return: the cleaned string.
    """
    # Markup-level clean-up runs on the original casing.
    for step in (strip_html_tags, remove_urls, remove_checklists, expand_contractions):
        document = step(document)

    cleaned = document.lower()

    # "\n", "\t" and "\r" each map to a single space.
    cleaned = cleaned.translate(str.maketrans("\n\t\r", "   "))

    cleaned = remove_accented_chars(cleaned)

    # Pad a few punctuation marks with spaces. Inside the class, "(-)" is a
    # range from "(" to ")", so the hyphen itself is not padded here; every
    # non-alphanumeric character is replaced by a space in the next step anyway.
    cleaned = re.compile(r'([{.(-)!}])').sub(" \\1 ", cleaned)
    cleaned = remove_special_characters(cleaned, remove_digits=False)

    # Drop tokens made only of digits (digits inside words such as "v2" stay).
    cleaned = re.sub(r'\b\d+\b', ' ', cleaned)

    # Collapse runs of spaces and trim both ends.
    cleaned = re.sub(' +', ' ', cleaned)
    return cleaned.strip()


# Element-wise version of pre_process_document for a list/array of documents.
# otypes is given explicitly so that an empty corpus yields an empty string
# array; without it np.vectorize must call the function on the first element
# to infer the output type and raises on size-0 input.
pre_process_corpus = np.vectorize(pre_process_document, otypes=[str])

In [3]:
import numpy as np
import tensorflow as tf
import keras


# Seed NumPy's and TensorFlow's global random number generators with the same value.
SEED = 42
np.random.seed(SEED)
# NOTE(review): tf.set_random_seed is the TensorFlow 1.x spelling; newer
# releases expose the equivalent as tf.random.set_seed — confirm the pinned version.
tf.set_random_seed(SEED)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [4]:
## Model / preprocessing configuration
EMBED_SIZE = 300 # dimensionality of each word vector
MAX_FEATURES = 500000 # number of rows in the embedding matrix; placeholder only — reassigned later from the loaded tokenizer vocabulary
MAX_LEN = 1000 # sequence length documents are padded/truncated to before being fed to the model

In [5]:
import os.path

# Path (relative to the working directory) of the serialised word -> index mapping.
CVE_WORD2IDX_MAP_FILE = 'cve_tokenizer_word2idx.pkl'

print('preloading')
# A fresh Tokenizer is created and its vocabulary is injected from disk rather
# than being fitted on text in this notebook.
tokenizer = keras.preprocessing.text.Tokenizer()
# SECURITY: dill.load performs pickle-style deserialisation, which can execute
# arbitrary code — only load this file if it comes from a trusted source.
with open(CVE_WORD2IDX_MAP_FILE, 'rb') as f:
    word2idx = dill.load(f)
tokenizer.word_index = word2idx

preloading


In [6]:
# Size the embedding matrix from the loaded vocabulary (overrides the config default).
# NOTE(review): if word2idx uses 1-based indices (as a fitted Keras Tokenizer
# does), the largest index equals len(word_index) and would fall outside an
# embedding with this many rows — confirm against the training code. It cannot
# be changed here without also matching the shape of the saved weights.
MAX_FEATURES = len(tokenizer.word_index)
MAX_FEATURES

557002

In [9]:
from keras.engine.topology import Layer
from keras import backend as K


class AttentionLayer(Layer):
    """
    Keras Layer that implements an Attention mechanism for temporal data.
    Supports Masking.
    Follows the work of Raffel et al. [https://arxiv.org/abs/1512.08756]

    # Input shape
        3D tensor with shape: `(samples, steps, features)`.
    # Output shape
        2D tensor with shape: `(samples, features)`.

    # Arguments
        step_dim: number of time steps of the input; used to reshape the
            attention scores in `call`.
        W_regularizer, b_regularizer: regularizers (or their identifiers)
            for the attention weight vector and the bias.
        W_constraint, b_constraint: constraints (or their identifiers)
            for the attention weight vector and the bias.
        bias: whether to add a per-step bias to the attention scores.
        **kwargs: forwarded to the base `Layer` constructor.

    Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
    The feature dimension is inferred from the input shape in `build`.
    """

    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        """Store the configuration; weights are created later in `build`."""
        super(AttentionLayer, self).__init__(**kwargs)

        # Must be set AFTER the base constructor: Keras 2.x `Layer.__init__`
        # assigns `supports_masking = False`, which would otherwise overwrite it.
        self.supports_masking = True
        self.init = keras.initializers.get('glorot_uniform')

        self.W_regularizer = keras.regularizers.get(W_regularizer)
        self.b_regularizer = keras.regularizers.get(b_regularizer)

        self.W_constraint = keras.constraints.get(W_constraint)
        self.b_constraint = keras.constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0  # real value is filled in by build()

    def build(self, input_shape):
        """Create the attention vector `W` (features,) and optional bias `b` (steps,)."""
        assert len(input_shape) == 3

        # name/shape are passed by keyword instead of relying on the legacy
        # "shape as first positional argument" form of add_weight.
        self.W = self.add_weight(name='{}_W'.format(self.name),
                                 shape=(input_shape[-1],),
                                 initializer=self.init,
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight(name='{}_b'.format(self.name),
                                     shape=(input_shape[1],),
                                     initializer='zero',
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of `x` over the time axis."""
        features_dim = self.features_dim
        step_dim = self.step_dim

        # K.dot(x, self.W) on the 3D input is not supported by the TF backend,
        # so flatten to 2D, multiply by W as a column vector and reshape the
        # scores back to (samples, steps).
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                              K.reshape(self.W, (features_dim, 1))),
                        (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        a = K.expand_dims(a)
        weighted_input = x * a

        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        """The time axis is summed out: `(samples, features)`."""
        return input_shape[0],  self.features_dim

    def get_config(self):
        """Return every constructor argument so the layer can be rebuilt from its config."""
        config = {
            'step_dim': self.step_dim,
            'W_regularizer': keras.regularizers.serialize(self.W_regularizer),
            'b_regularizer': keras.regularizers.serialize(self.b_regularizer),
            'W_constraint': keras.constraints.serialize(self.W_constraint),
            'b_constraint': keras.constraints.serialize(self.b_constraint),
            'bias': self.bias,
        }
        base_config = super(AttentionLayer, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [10]:
def build_gru_cpumodel(embedding_size, max_len, max_features, gru_units=32):
    """Build and compile the bidirectional-GRU + attention binary classifier.

    Architecture: Embedding -> BiGRU(2 * gru_units) -> BiGRU(gru_units) ->
    AttentionLayer -> [Dense(relu) + Dropout(0.1)] twice -> Dense(1, sigmoid).

    :param embedding_size: width of each embedding vector.
    :param max_len: length of the (padded) input sequences.
    :param max_features: number of rows in the embedding matrix.
    :param gru_units: base width; the first GRU uses twice this value and the
        second dense layer half of it.
    :return: a compiled ``keras.models.Model`` (binary cross-entropy, Adam, accuracy).
    """
    layers = keras.layers

    def bidirectional_gru(units):
        # Sequence-returning GRU wrapped in Bidirectional, identical settings for both stacks.
        return layers.Bidirectional(
            layers.GRU(units, return_sequences=True,
                       reset_after=True, recurrent_activation='sigmoid'))

    inp = layers.Input(shape=(max_len,))
    hidden = layers.Embedding(max_features, embedding_size, trainable=True)(inp)
    hidden = bidirectional_gru(gru_units * 2)(hidden)
    hidden = bidirectional_gru(gru_units)(hidden)
    hidden = AttentionLayer(max_len)(hidden)

    # Two dense blocks, each followed by light dropout.
    for width in (gru_units, gru_units // 2):
        hidden = layers.Dense(width, activation='relu')(hidden)
        hidden = layers.Dropout(rate=0.1)(hidden)

    outp = layers.Dense(1, activation='sigmoid')(hidden)

    classifier = keras.models.Model(inputs=inp, outputs=outp)
    classifier.compile(loss='binary_crossentropy',
                       optimizer=keras.optimizers.Adam(),
                       metrics=['accuracy'])
    return classifier

In [11]:
# Build the network under the first CPU device and restore its trained weights.
# The architecture arguments must match the ones used when the weights file
# was saved, otherwise load_weights fails on a shape mismatch.
with tf.device('cpu:0'):
    gru_cpu_model = build_gru_cpumodel(embedding_size=EMBED_SIZE, 
                                       max_len=MAX_LEN, max_features=MAX_FEATURES, 
                                       gru_units=32)
    gru_cpu_model.load_weights('./models/model2_cve_noncve_demo_weights.h5')


In [12]:
# Single sample document used for the demo prediction below.
d = ["Bug 1622372 - Require CSRF token on all proxied requests * Require a CSRF token on all proxied requests. This prevents loading\r\n  content hosted from a pod under the console domain by clicking on a\r\n  link that uses the console proxy. Previously, it was not required for\r\n  GET requests.\r\n* Do not forward the X-CSRFToken header through the proxy.\r\n* Set `Content-Security-Policy: default-src 'none'` in the proxied\r\n  response to prevent scripts from running in proxied content.\r\n\r\nIn order to support the CSRF token for WebSockets, this adds an\r\n`x-csrf-token` query parameter when headers can't be set. It also updates\r\nthe console to check the `Origin` header when present since `Referer` is\r\nnot set for WebSockets.\r\n\r\nFixes https://bugzilla.redhat.com/show_bug.cgi?id=1622372\r\n\r\n/assign @liggitt "
    ]

In [13]:
# Sanity check: number of raw documents.
len(d)

1

In [14]:
# Apply the text-cleaning pipeline to every document; the result is a NumPy array of strings.
nd = pre_process_corpus(d)
nd

array(['bug require csrf token on all proxied requests require a csrf token on all proxied requests this prevents loading content hosted from a pod under the console domain by clicking on a link that uses the console proxy previously it was not required for get requests do not forward the x csrftoken header through the proxy set content security policy default src none in the proxied response to prevent scripts from running in proxied content in order to support the csrf token for websockets this adds an x csrf token query parameter when headers cannot be set it also updates the console to check the origin header when present since referer is not set for websockets fixes cgi id assign liggitt'],
      dtype='<U693')

In [15]:
# Sanity check: one cleaned document per raw document.
len(nd)

1

In [16]:
# Map each cleaned document to a sequence of word indices using the preloaded vocabulary...
test_nd = tokenizer.texts_to_sequences(nd)
# ...then pad/truncate every sequence to MAX_LEN, the input length the model was built with.
test_nd = keras.preprocessing.sequence.pad_sequences(test_nd, maxlen=MAX_LEN)
test_nd.shape

(1, 1000)

In [17]:
# Run the classifier; the final layer is a single sigmoid unit, so each row is one score in [0, 1].
pred_y = gru_cpu_model.predict(test_nd)

In [18]:
# Flatten the predictions to a 1-D array of scores.
pred_y = pred_y.ravel()
# Convert scores to 0/1 labels.
# NOTE(review): the decision threshold is 1e-3, far below the usual 0.5 for a
# sigmoid output — presumably chosen deliberately to favour recall; confirm.
pred_yl = [1 if prob > 1e-3 else 0 for prob in pred_y]
pred_yl

[1]

In [19]:
# Show the raw score(s) behind the label(s) above.
pred_y

array([0.999154], dtype=float32)