In [1]:
import keras
from keras.models import Model
from keras.layers import Dense, Embedding, Input
from keras.layers import LSTM, Bidirectional, Dropout
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.preprocessing import text, sequence
import numpy as np
from keras.layers.normalization import BatchNormalization

Using TensorFlow backend.


In [2]:
from keras_self_attention import SeqSelfAttention

In [3]:
# Fallback: a hand-written attention layer to use if keras_self_attention does not work
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints

class Attention(Layer):
    """Collapse a sequence to one vector with learned per-step attention weights.

    For an input ``x`` of shape ``(batch, step_dim, features)``, ``call``
    computes one score per time step as ``tanh(x . W + b)``, exponentiates
    the scores, zeroes masked steps when a mask is supplied, normalises the
    result over the step axis and returns the weighted sum of the steps,
    a tensor of shape ``(batch, features)``.

    # Arguments
        step_dim: number of time steps in the input. ``call`` reshapes the
            scores to ``(-1, step_dim)``, so it has to match ``input_shape[1]``.
        W_regularizer: optional regularizer for the scoring vector ``W``.
        b_regularizer: optional regularizer for the per-step bias ``b``.
        W_constraint: optional constraint for ``W``.
        b_constraint: optional constraint for ``b``.
        bias: whether to add the per-step bias before the ``tanh``.
    """

    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        # Initialise the base layer first so that nothing assigned below can
        # be overwritten by the defaults the base initializer puts in place.
        super(Attention, self).__init__(**kwargs)
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        # Filled in by build() once the input shape is known.
        self.features_dim = 0

    def build(self, input_shape):
        """Create the scoring vector ``W`` and, if enabled, the bias ``b``."""
        assert len(input_shape) == 3

        self.features_dim = input_shape[-1]

        # `shape` is passed by keyword so the call does not depend on the
        # positional order of add_weight's parameters.
        self.W = self.add_weight(shape=(self.features_dim,),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)

        if self.bias:
            # One bias value per time step.
            self.b = self.add_weight(shape=(input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        """Return no mask: the output no longer has a time axis."""
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over axis 1."""
        features_dim = self.features_dim
        step_dim = self.step_dim

        # Flatten to (batch * steps, features), project every step onto W,
        # then fold the scores back to (batch, steps).
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                        K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # Masked steps get zero weight before normalisation.
        if mask is not None:
            a *= K.cast(mask, K.floatx())

        # Epsilon keeps the division finite when every step is masked out.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        """Output shape is ``(batch, features)``."""
        # Read the feature size from the shape itself so the answer is also
        # right before build() has run (features_dim is 0 until then).
        return input_shape[0], input_shape[-1]

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = {
            'step_dim': self.step_dim,
            'W_regularizer': regularizers.serialize(self.W_regularizer),
            'b_regularizer': regularizers.serialize(self.b_regularizer),
            'W_constraint': constraints.serialize(self.W_constraint),
            'b_constraint': constraints.serialize(self.b_constraint),
            'bias': self.bias,
        }
        base_config = super(Attention, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [4]:
def make_glovevec(glovepath, max_features, embed_size, word_index, veclen=300):
    """Build an embedding matrix from a text file of pre-trained word vectors.

    Every usable line of ``glovepath`` holds a token (which may itself
    contain spaces) followed by ``veclen`` whitespace-separated floats.
    Lines with fewer than ``veclen + 1`` fields, such as a
    "<count> <dim>" header line, are skipped.

    Args:
        glovepath: path of the UTF-8 text file with the vectors.
        max_features: upper bound on the number of rows of the result;
            words whose index is not below the row count are ignored.
        embed_size: number of columns of the result; it has to equal
            ``veclen`` for the row assignment to succeed.
        word_index: mapping from word to integer row index.
        veclen: number of float fields at the end of each line.

    Returns:
        A ``(nb_words, embed_size)`` float array where
        ``nb_words = min(max_features, len(word_index) + 1)``. The extra
        row leaves room for index 0 when the indices start at 1, so the
        highest index still fits. Rows of words without a pre-trained
        vector stay zero.
    """
    embeddings_index = {}
    # `with` closes the file even if a line fails to parse.
    with open(glovepath, encoding="utf-8") as f:
        for line in f:
            values = line.split()
            if len(values) <= veclen:
                # Header or truncated line: no token plus full vector here.
                continue
            word = ' '.join(values[:-veclen])
            embeddings_index[word] = np.asarray(values[-veclen:], dtype='float32')

    nb_words = min(max_features, len(word_index) + 1)
    embedding_matrix = np.zeros((nb_words, embed_size))
    for word, i in word_index.items():
        if i >= nb_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

In [23]:
# def make_df(train, test=None, y, max_features, maxlen, list_classes):

#     list_sentences_train = train["Comments"].values
#     y = train["Label"].values
# #     list_sentences_test = test["comment_text"].fillna("unknown").values
    
#     tokenizer = text.Tokenizer(num_words=max_features)
#     tokenizer.fit_on_texts(list(list_sentences_train))
#     list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
#     list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
#     X_t = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)
#     X_te = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)

#     word_index = tokenizer.word_index

#     return X_t, X_te, y, word_index

In [5]:
def BidLstm(maxlen, max_features, embed_size, embedding_matrix,
            lstm_units=300, dropout_rate=0.25, dense_units=256):
    """Build the binary classifier: frozen embeddings, two BiLSTMs, attention.

    Layer order: Input -> Embedding (weights taken from ``embedding_matrix``,
    not trainable) -> two stacked bidirectional LSTMs that return full
    sequences -> ``Attention`` over the ``maxlen`` steps -> Dense/ReLU ->
    Dropout -> BatchNormalization -> single sigmoid unit.

    Args:
        maxlen: length of the input token-id sequences.
        max_features: ``input_dim`` passed to the embedding layer.
        embed_size: ``output_dim`` passed to the embedding layer.
        embedding_matrix: initial weights for the embedding layer.
        lstm_units: units per LSTM direction.
        dropout_rate: rate used for the LSTM input dropout, the LSTM
            recurrent dropout and the Dropout layer after the dense layer.
        dense_units: width of the hidden dense layer.

    Returns:
        The model, not compiled. Its summary is printed as a side effect.
    """
    inp = Input(shape=(maxlen, ), name="Input")
    x = Embedding(max_features, embed_size, weights=[embedding_matrix],
                  trainable=False, name="embedding")(inp)
    for layer_name in ("lstm_1", "lstm_2"):
        x = Bidirectional(LSTM(lstm_units, return_sequences=True,
                               dropout=dropout_rate,
                               recurrent_dropout=dropout_rate),
                          name=layer_name)(x)
    x = Attention(maxlen)(x)
#     x = SeqSelfAttention(attention_type=SeqSelfAttention.ATTENTION_TYPE_MUL,
#                          kernel_regularizer=keras.regularizers.l2(1e-4),
#                          bias_regularizer=keras.regularizers.l1(1e-4),
#                          attention_regularizer_weight=1e-4,
#                          name="Attention")(x)

    x = Dense(dense_units, activation="relu")(x)
    x = Dropout(dropout_rate)(x)
    x = BatchNormalization()(x)
    x = Dense(1, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x, name="Model")
    # summary() prints the table itself and returns None, so wrapping it in
    # print() only appended a stray "None" line.
    model.summary()
    return model

In [5]:
# import pandas as pd
# max_features = 100000
# maxlen = 150
# embed_size = 300
# list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult","identity_hate"]

In [6]:
# xtr, xte, y, word_index = make_df("all/train.csv",
#                                       "all/test.csv",
#                                       max_features, maxlen, list_classes)

In [53]:
# y_ = np.reshape(y,(y.shape[0],1,y.shape[1]))

In [59]:
# y_ = np.expand_dims(y[:,0],axis=1)

In [51]:
# embedding_vector = make_glovevec("crawl-300d-2M-subword.vec",max_features, embed_size, word_index)
# import pickle
# pickle.dump(embedding_vector,open("embedding.pkl","wb"))
# embedding_vector = pickle.load(open("embedding.pkl","rb"))

In [62]:
# model = BidLstm(maxlen, max_features, embed_size, embedding_vector)
# model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Input (InputLayer)           (None, 150)               0         
_________________________________________________________________
embedding (Embedding)        (None, 150, 300)          30000000  
_________________________________________________________________
lstm_1 (Bidirectional)       (None, 150, 600)          1442400   
_________________________________________________________________
lstm_2 (Bidirectional)       (None, 150, 600)          2162400   
_________________________________________________________________
attention_4 (Attention)      (None, 600)               750       
_________________________________________________________________
dense_11 (Dense)             (None, 256)               153856    
_________________________________________________________________
dropout_5 (Dropout)          (None, 256)               0         
__________

In [None]:
# file_path = ".model.hdf5"
# ckpt = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
# early = EarlyStopping(monitor="val_loss", mode="min", patience=1)

In [2]:
# model.fit(xtr, y_, batch_size=256, epochs=15, validation_split=0.1) #, callbacks=[ckpt, early])

# NVD FEED DATA

In [6]:
import pickle
import pandas as pd
# NOTE(review): unpickling can execute code stored in the file, so only load artifacts you trust.
# The context manager closes the file handle as soon as the object is loaded.
with open("attention_data.pkl", "rb") as pkl_file:
    data = pickle.load(pkl_file)

In [16]:
# train = data["Comments"].values
# y = data["Label"].values

In [7]:
max_features = 100000
maxlen = 300
embed_size = 300

In [19]:
# test = pickle.load(open("attention_data.pkl","rb"))
# NOTE(review): unpickling can execute code stored in the file, so only load artifacts you trust.
# The context manager closes the file handle as soon as the object is loaded.
with open("test_issue.pkl", "rb") as pkl_file:
    test_issue = pickle.load(pkl_file)
test = pd.DataFrame(test_issue)
# X_te, idx = make_df_test(data1, data, max_features, maxlen)

In [117]:
def make_df(data, test, max_features, maxlen):
    """Tokenise and pad the training and test texts with one shared vocabulary.

    The tokenizer is fitted on the training comments only and then applied
    to both sets, so test words never seen in training are dropped.

    Args:
        data: table with a "Comments" column (texts) and a "Label" column.
        test: the texts to encode, either an iterable of strings or a
            one-column pandas DataFrame holding the strings.
        max_features: ``num_words`` passed to the tokenizer.
        maxlen: length every sequence is padded or truncated to.

    Returns:
        ``(X_t, X_te, y, word_index)``: padded training sequences, padded
        test sequences, the training labels and the tokenizer's vocabulary.

    Raises:
        ValueError: if ``test`` is a DataFrame with more than one column.
    """
    list_sentences_train = data["Comments"].values
    y = data["Label"].values

    # Iterating a DataFrame yields its column labels, not its rows, so a
    # one-column frame is unwrapped to the array of texts it holds.
    if hasattr(test, "iloc") and getattr(test, "ndim", 1) == 2:
        if test.shape[1] != 1:
            raise ValueError(
                "test must be an iterable of strings or a one-column DataFrame")
        list_sentences_test = test.iloc[:, 0].values
    else:
        list_sentences_test = test

    tokenizer = text.Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(list(list_sentences_train))
    list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
    list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
    X_t = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)
    X_te = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)

    word_index = tokenizer.word_index

    return X_t, X_te, y, word_index

In [60]:
demo = ["Hi how are you Hi how are you Hi how are you" ,"like a boss", "Hey Bitch get out the way", "I don't give a fk","Karma is a bitch"]

In [85]:
demo_tok = text.Tokenizer(num_words=10)

In [62]:
demo_tok.fit_on_texts(demo)

In [63]:
demo_tok.word_index

{'hi': 1,
 'how': 2,
 'are': 3,
 'you': 4,
 'a': 5,
 'bitch': 6,
 'like': 7,
 'boss': 8,
 'hey': 9,
 'get': 10,
 'out': 11,
 'the': 12,
 'way': 13,
 'i': 14,
 "don't": 15,
 'give': 16,
 'fk': 17,
 'karma': 18,
 'is': 19}

In [64]:
demo_tok.texts_to_sequences(demo)

[[1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4], [7, 5, 8], [9, 6], [5], [5, 6]]

In [138]:
xtr, xte, y, word_index = make_df(data, test, max_features, maxlen)

In [119]:
# embedding_vector = make_glovevec("crawl-300d-2M-subword.vec", max_features, embed_size, word_index)
# pickle.dump(embedding_vector,open("embedding.pkl","wb"))
# NOTE(review): unpickling can execute code stored in the file, so only load artifacts you trust.
# The context manager closes the file handle as soon as the matrix is loaded.
with open("embedding.pkl", "rb") as pkl_file:
    embedding_vector = pickle.load(pkl_file)

In [120]:
model = BidLstm(maxlen, max_features, embed_size, embedding_vector)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Input (InputLayer)           (None, 300)               0         
_________________________________________________________________
embedding (Embedding)        (None, 300, 300)          30000000  
_________________________________________________________________
lstm_1 (Bidirectional)       (None, 300, 600)          1442400   
_________________________________________________________________
lstm_2 (Bidirectional)       (None, 300, 600)          2162400   
_________________________________________________________________
attention_2 (Attention)      (None, 600)               900       
_________________________________________________________________
dense_3 (Dense)              (None, 256)               153856    
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
__________

In [15]:
# Training callbacks, both driven by the validation loss (lower is better).
# NOTE(review): the checkpoint is written to ".model.hdf5", but the cell that restores
# weights loads "Attention_bid_model.hdf5" — confirm which file is the intended one.
file_path = ".model.hdf5"
# save_best_only=True: presumably the file is only rewritten when val_loss improves.
ckpt = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
# patience=1: presumably training stops after one epoch without a val_loss improvement.
early = EarlyStopping(monitor="val_loss", mode="min", patience=1)

In [11]:
model.fit(xtr, y, batch_size=256, epochs=15, validation_split=0.1, callbacks=[ckpt, early])

In [121]:
model.load_weights("Attention_bid_model.hdf5")

In [24]:
import pickle

In [30]:
# NOTE(review): unpickling can execute code stored in the file, so only load artifacts you trust.
# The context manager closes the file handle as soon as the object is loaded.
with open("./repo/kuberneteskompose", "rb") as pkl_file:
    demo = pickle.load(pkl_file)

In [33]:
import csv

# Dump the mapping as two-column rows (key, value).
# The file is opened in a `with` block so it is flushed and closed when the
# loop ends, and with newline="" as the csv module requires, so no extra
# blank lines appear on platforms that translate line endings.
with open("output.csv", "w", newline="") as out_file:
    w = csv.writer(out_file)
    for key, val in demo.items():
        w.writerow([key, val])

In [3]:
import pickle
# NOTE(review): unpickling can execute code stored in the file, so only load artifacts you trust.
# The context manager closes the file handle as soon as the object is loaded.
with open("test_issue.pkl", "rb") as pkl_file:
    test_issue = pickle.load(pkl_file)

# import pandas as pd
# data = pd.DataFrame(test_issue)

In [24]:
len(test_issue)

41

In [26]:
# def make_df_test(data, test, max_features, maxlen):

#     list_sentences_train = data["Comments"].values
# #     y = train["Label"].values
#     list_sentences_test = test[0].values
    
#     tokenizer = text.Tokenizer(num_words=max_features)
#     tokenizer.fit_on_texts(list(list_sentences_train))
#     list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
# #     list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
#     X_t = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)
# #     X_te = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)

#     word_index = tokenizer.word_index

#     return X_t, word_index

In [139]:
results = model.predict(xte)

In [40]:
# List the positions of all predictions whose score falls below 0.7.
for position, score in enumerate(results):
    if score < 0.7:
        print(position)

15
23
34


In [113]:
# NOTE(review): the subscript is empty, so this cell is a SyntaxError as written. The index
# that produced the text shown below is not recorded — restore it before re-running.
test_issue[]

'OpenSSL vulnerability need upgrade openssl to openssl 1 1 0f 3 deb9u2 version OpenSSL Security Advisory Details External Disclosures CVE 2017 3736 CVE 2017 3738 External Disclosure URL specified below Current hyperkube image includes OpenSSL 1 1 0f version it need update to openssl 1 1 0f 3 deb9u2 version which has included CVE 2017 3736 CVE 2017 3738 fix sig node Issues go stale after 90d of inactivity Mark the issue as fresh with Stale issues rot after an additional 30d of inactivity and eventually close If this issue is safe to close now please do so with Send feedback to sig testing kubernetes test infra and or lifecycle stale Stale issues rot after 30d of inactivity Mark the issue as fresh with Rotten issues close after an additional 30d of inactivity If this issue is safe to close now please do so with Send feedback to sig testing kubernetes test infra and or lifecycle rotten Rotten issues close after 30d of inactivity Reopen the issue with Mark the issue as fresh with Send feed

In [140]:
results

array([[0.0291853]], dtype=float32)

In [78]:
xte[0]

array([  324,    20,   173,    31,     3,   169,  3639,   139,     2,
        2438,  3242,     3,    15,  2674,   325,   183,    27,    77,
         131,  1220,     5,     7,     2,    16,  5900,    36,    20,
          16,   234,    56,  1545,    26,    24,  9645,     1,   169,
        1195,     1,   314,   331,     9,  3870,    10,   206,     2,
          16,  2584,   135,   913,     1,  9750,   193,     6,   135,
         442,   206,   379,    23,   402,   766,  1823,  5058,    11,
         442,    95,  1433,    23,    11,     7,  2196,   324,   135,
          11,   200,   127,  1120,     2,   151,    10, 22061,  2196,
         142,   322,    63,   150,   426,   467,   477,   437,     7,
           3,   416,   145,   198,   350,   494,     8,    94,     5,
        1490,   863,   398,  2395,  3502,  5967,  4254,  3492,    24,
        4927,  2670, 11675,    35,  2395,   172,  2283,     1,   473,
          38,   285,     1,   410,     9,     3,     6,   285,   166,
           3,   572,

In [79]:
word_index["free"]

598

In [81]:
demo_tok.fit_on_texts(data["Comments"].values)

In [101]:
# texts = ['a a a', 'b b', 'c']
dat = data["Comments"].values
tokenizer = text.Tokenizer(num_words=100000)
tokenizer.fit_on_texts(dat)
seq_data = tokenizer.texts_to_sequences(dat)

In [102]:
demo_test = tokenizer.texts_to_sequences(test[0].values)

In [103]:
print(demo_test)

[[16584, 172, 607, 7, 70, 3, 210, 2, 2372, 3, 150, 2, 3, 5120, 10, 264, 13, 29, 77, 183, 12, 7, 1447, 19, 1, 307, 1263, 12, 7, 1776, 561, 7085, 25, 510, 8, 73, 2, 1060, 1, 2220, 169, 12, 5, 214, 40, 16, 4537, 10, 264, 2, 52854, 23, 241, 961, 2, 52854, 135, 50, 150, 20, 33, 115, 85, 1935, 2, 4927, 1475, 1079, 19, 502, 6956, 7, 11, 5695, 10, 14, 2, 49, 79, 9, 1, 199, 455, 1, 183, 19, 57, 1328, 127, 3, 143, 110, 48, 1139, 5, 183, 47, 12, 11, 20, 16, 948, 19, 100, 5120, 2245, 11, 292, 2253, 27, 3, 771, 48, 7, 21, 3, 5120, 6, 135, 202, 11, 25, 3, 143, 44, 572, 8, 106, 2829, 21, 26, 14, 1438, 33, 29, 1706, 36, 11, 67, 16, 1709, 2, 408, 50, 51, 221, 123, 19, 1996, 1500, 860, 1072, 56, 249, 860, 247, 360, 435, 350, 7, 70, 3, 210, 2, 2372, 3, 150, 2, 3, 5120, 10, 264, 13, 29, 77, 183, 12, 7, 1447, 19, 1, 307, 1263, 12, 7, 1776, 561, 7085, 25, 510, 8, 73, 2, 1060, 1, 2220, 169, 12, 5, 214, 40, 16, 4537, 10, 264, 2, 52854, 299, 2, 5, 271, 313, 26, 336, 11, 19, 154, 82, 9215, 1345, 13, 200, 16, 42

In [110]:
idx = tokenizer.word_index

In [111]:
# Reverse lookup in the vocabulary: print each word that is mapped to id 16584.
for word, word_id in idx.items():
    if word_id == 16584:
        print(word)

durable


In [112]:
test[0].values[1]

'Consider not handling YAML in APIserver The APIserver processes JSON and YAML A vulnerability in the apiserver would be critical Compared to JSON YAML has a crazy complicated SPEC YAML has a go implementation that is more complex 14KLOC JSON vs 30KLOC YAML YAML has fewer tests in the go implementation 8KLOC JSON vs 2 5KLOC YAML Has a track record of vulnerabilities YAML has fewer vulnerabilities but this may be due to its less widespread use in critical software 24 CVEs match YAML vs 66 JSON using Therefore we should consider one of the following changes 1 Do not process YAML in the apiserver itself 1 Convert YAML to JSON in clients 2 Recommend that users needing to use YAML via etc can pipe YAML through simple yaml to json converter tool 2 Convert YAML to JSON in a less privileged proxy process Also this would eliminate the minor annoyance of having to repeat in go struct tags I could be persuaded Although go doesn t have buffer overruns so it s a little hard to see how you could get

In [135]:
test = ["let us add a feature to identify vulnerability for golang packages. The feature should be capable of identifying a known NVD CVE from NVD data and also unknown vulnerabilities along with their CVE. Should tne feature be implemented in this sprint?, No. It's a train goal. Let's plan this for summit. Assigning to vedant and adding labels area/security and type/feature.".lower()]

In [137]:
test

["let us add a feature to identify vulnerability for golang packages. the feature should be capable of identifying a known nvd cve from nvd data and also unknown vulnerabilities along with their cve. should tne feature be implemented in this sprint?, no. it's a train goal. let's plan this for summit. assigning to vedant and adding labels area/security and type/feature."]

In [141]:
word_index["cve"]

111

In [143]:
xte

array([[    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
      

In [148]:
test

["let us add a feature to identify vulnerability for golang packages. the feature should be capable of identifying a known nvd cve from nvd data and also unknown vulnerabilities along with their cve. should tne feature be implemented in this sprint?, no. it's a train goal. let's plan this for summit. assigning to vedant and adding labels area/security and type/feature."]

In [150]:
word_index["let"]

345

In [151]:
from keras.models import Sequential
from keras.layers import Dense, Activation

# Fully connected stack assembled layer by layer: three dense layers, each
# followed by an ELU activation, then a dense output layer with softmax.
model = Sequential()
model.add(Dense(5184, input_shape=(9*18*64,)))
model.add(Activation('elu'))
model.add(Dense(5184))
model.add(Activation('elu'))
model.add(Dense(519))
model.add(Activation('elu'))
model.add(Dense(64*2))
model.add(Activation('softmax'))

In [None]:
model.fit