In [1]:
import os
import re
import csv
import sys
import copy
import subprocess
import bisect
import shutil
import pickle
import gensim
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from os.path import expanduser
from filecmp import dircmp
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Seed every RNG this notebook draws from. TensorFlow's seed alone does not
# cover NumPy, and the train/validation shuffling further down uses NumPy's
# global RNG, so both are seeded here to make a fresh "Run All" repeatable.
SEED = 59
np.random.seed(SEED)
tf.random.set_seed(SEED)

2024-02-07 15:45:40.776970: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-02-07 15:45:40.794915: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-07 15:45:40.794930: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-07 15:45:40.795508: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-07 15:45:40.798636: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-02-07 15:45:40.798982: I tensorflow/core/platform/cpu_feature_guard.cc:1

In [2]:
# Load the two pickled commit dictionaries (labelled positive / negative
# further down in the notebook).
# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only open .pkl files that come from a trusted source.
with open('sec_patch_commit_dict.pkl', 'rb') as adv_file:
    adv_dict = pickle.load(adv_file)

with open('non_patch_commit_dict.pkl', 'rb') as ben_file:
    ben_dict = pickle.load(ben_file)

# Pre-trained Word2Vec models; the printed shapes below are
# (number of vectors, vector size) for each model.
w2v_model_50 = gensim.models.Word2Vec.load("patch_commit_voc50_mc5.w2v")
w2v_model_100 = gensim.models.Word2Vec.load("patch_commit_voc100_mc5.w2v")

for loaded_model in (w2v_model_50, w2v_model_100):
    print(loaded_model.wv.vectors.shape)

# Vocabulary size is taken from the 50-dimensional model.
vocab_size = len(w2v_model_50.wv.key_to_index)
print(vocab_size)

# Fixed sequence length (in tokens) used when sizing the model input.
maxlen = 200

(4621, 50)
(4621, 100)
4621


In [3]:
def generate_txt_matrix(my_dict, word2vecs, max_len):
    """Encode every entry of ``my_dict`` as a fixed-width row of vocabulary indices.

    Each dictionary value is indexed as ``value[0]`` and ``value[1]``; both are
    iterated as sequences of words (presumably two parts of a commit text --
    TODO confirm against the code that builds the pickles). Words missing from
    ``word2vecs.key_to_index`` are skipped. When ``value[1]`` is non-empty and
    more than one column is still free, the index of ``"."`` is written as a
    separator before the second sequence, but only if the column just before
    the write position holds a non-zero index.

    Args:
        my_dict: mapping whose values expose ``[0]`` and ``[1]`` word sequences.
        word2vecs: object with a ``key_to_index`` mapping (word -> int index).
        max_len: number of columns per row; longer inputs are truncated.

    Returns:
        ``np.int32`` array with ``max_len`` columns, zero-padded on the right.
        Rows that received fewer than 6 indices are dropped.

    Raises:
        KeyError: if the separator is needed and ``"."`` is not in the vocabulary.
    """

    def _fill(tokens, line, start):
        # Write the index of every known token into `line`, beginning at
        # column `start`; return the next free column.
        cursor = start
        for token in tokens:
            if cursor >= max_len:
                break
            if token in word2vecs.key_to_index:
                line[cursor] = word2vecs.key_to_index[token]
                cursor += 1
        return cursor

    matrix = np.zeros((len(my_dict), max_len), dtype=np.int32)
    short_rows = []

    for row_idx, entry in enumerate(my_dict.values()):
        line = matrix[row_idx]  # view: writes land directly in `matrix`
        used = _fill(entry[0], line, 0)

        if entry[1] and used < max_len - 1:
            # With used == 0 this reads the (still zero) last column, so no
            # separator is emitted in front of an empty first sequence.
            if line[used - 1] != 0:
                line[used] = word2vecs.key_to_index["."]
                used += 1
            used = _fill(entry[1], line, used)

        # Rows with fewer than 6 encoded tokens are discarded at the end.
        if used < 6:
            short_rows.append(row_idx)

    return np.delete(matrix, short_rows, 0)
    

# Encode both corpora with the 50-dimensional vocabulary. The row width is
# taken from `maxlen` (defined with the loaded models) instead of a repeated
# literal, so it cannot drift from the model's input length.
adv_matrix = generate_txt_matrix(adv_dict, w2v_model_50.wv, maxlen)
ben_matrix = generate_txt_matrix(ben_dict, w2v_model_50.wv, maxlen)

In [4]:
# Number of leading rows of each class that go into the training set; the
# remaining rows of each matrix form the validation set.
N_ADV_TRAIN = 2239
N_BEN_TRAIN = 3364

adv_train, adv_val = adv_matrix[:N_ADV_TRAIN], adv_matrix[N_ADV_TRAIN:]
ben_train, ben_val = ben_matrix[:N_BEN_TRAIN], ben_matrix[N_BEN_TRAIN:]

x_train = np.vstack((adv_train, ben_train))
x_val = np.vstack((adv_val, ben_val))
print(x_train.shape)
print(x_val.shape)

# Labels: 1 for rows from adv_matrix, 0 for rows from ben_matrix. Lengths are
# derived from the actual slices so features and labels stay aligned even if
# a matrix has fewer rows than the requested training count.
y_train = np.hstack((np.ones(len(adv_train)), np.zeros(len(ben_train))))
y_val = np.hstack((np.ones(len(adv_val)), np.zeros(len(ben_val))))
print(y_train.shape)
print(y_val.shape)

(5603, 200)
(623, 200)
(5603,)
(623,)


In [5]:
# Shuffle features and labels together with one shared permutation per split.
# Indexing with a permutation (instead of gluing the label column onto the
# feature matrix and shuffling that) keeps the token ids in their integer
# dtype rather than promoting them to float alongside the float labels.
train_order = np.random.permutation(x_train.shape[0])
x_train = x_train[train_order]
y_train = y_train[train_order]

val_order = np.random.permutation(x_val.shape[0])
x_val = x_val[val_order]
y_val = y_val[val_order]

print(x_val.shape)
print(y_val.shape)

(623, 200)
(623,)


In [6]:
class TransformerBlock(layers.Layer):
    """Self-attention block followed by a two-layer feed-forward network.

    Each of the two sub-layers is wrapped as: dropout on the sub-layer output,
    residual addition with the sub-layer input, then layer normalization.

    Args:
        embed_dim: size of the last axis of the input; also passed as
            ``key_dim`` to the attention layer and used as the output width of
            the feed-forward network so the residual additions line up.
        num_heads: number of attention heads.
        ff_dim: width of the hidden (ReLU) layer of the feed-forward network.
        rate: dropout rate applied after attention and after the
            feed-forward network.
        **kwargs: standard ``keras.layers.Layer`` arguments (e.g. ``name``).
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can report them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to ``inputs``.

        ``training`` defaults to ``None`` so the layer can also be invoked
        without an explicit flag; the value is forwarded to both dropout layers.
        """
        attn_output = self.att(inputs, inputs)  # self-attention layer
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)  # residual + layer norm
        ffn_output = self.ffn(out1)  # feed-forward layer
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)  # residual + layer norm

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "num_heads": self.num_heads,
                "ff_dim": self.ff_dim,
                "rate": self.rate,
            }
        )
        return config

In [7]:
class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a frozen pre-trained token embedding and a learned position embedding.

    Args:
        maxlen: number of distinct positions the position embedding covers.
        vocab_size: number of rows in the token embedding table.
        embed_dim: embedding width shared by both tables.
        pretrained_vectors: optional ``(vocab_size, embed_dim)`` array used to
            initialise the token embedding. When omitted, the vectors of the
            notebook-level ``w2v_model_50`` are used, as before.
        **kwargs: standard ``keras.layers.Layer`` arguments (e.g. ``name``).

    Raises:
        ValueError: if the pre-trained matrix is not ``(vocab_size, embed_dim)``.
    """

    def __init__(self, maxlen, vocab_size, embed_dim, pretrained_vectors=None, **kwargs):
        super().__init__(**kwargs)
        if pretrained_vectors is None:
            # Backward-compatible default: the globally loaded Word2Vec model.
            pretrained_vectors = w2v_model_50.wv.vectors
        pretrained_vectors = np.asarray(pretrained_vectors)
        # Fail early with a readable message instead of a shape error raised
        # deep inside Keras when the weights are assigned.
        if pretrained_vectors.shape != (vocab_size, embed_dim):
            raise ValueError(
                "pretrained_vectors has shape %s, expected %s"
                % (pretrained_vectors.shape, (vocab_size, embed_dim))
            )
        # Token table is initialised from the pre-trained vectors and frozen.
        self.token_emb = layers.Embedding(input_dim=vocab_size, weights=[pretrained_vectors], output_dim=embed_dim, trainable=False)
        # Position table is trainable and starts from Keras' default initialiser.
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed token ids ``x`` and add the embedding of each position index."""
        # Length of the last axis of the incoming batch (named seq_len so it
        # does not shadow the notebook-level `maxlen`).
        seq_len = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

In [8]:
# Assemble the classifier with the Keras functional API:
# token+position embedding -> one transformer block -> average pooling over
# the sequence axis -> dropout -> Dense(20, relu) -> dropout -> 2-way softmax.
# `maxlen` and `vocab_size` come from the cell that loads the Word2Vec models.
# NOTE(review): statement order is intentionally left untouched; layer
# creation order can influence seeded weight initialisation and order-based
# weight loading from .h5 files -- confirm before reordering.
embed_dim = 50  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer

inputs = layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
# Collapse the sequence axis by averaging so the dense head sees one vector per sample.
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(20, activation="relu")(x)
x = layers.Dropout(0.1)(x)
# Two softmax units: one probability per class (labels are 0 / 1).
outputs = layers.Dense(2, activation="softmax")(x)

model = keras.Model(inputs=inputs, outputs=outputs)

2024-02-07 15:45:57.958567: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:274] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


In [None]:
# Checkpoint file name. The number after "accuracy" is the *validation*
# accuracy, i.e. the same quantity the checkpoint is selected on; previously
# the name interpolated the training accuracy while `monitor` used
# val_accuracy, so file names did not reflect why a checkpoint was kept.
filepath = "tf_model_{epoch:02d}-accuracy{val_accuracy:.2f}.h5"

# Keep only the weights of epochs that improve validation accuracy.
checkpoint = ModelCheckpoint(
    filepath=filepath,
    monitor='val_accuracy',
    mode='max',  # higher accuracy is better; stated explicitly rather than inferred
    save_best_only=True,
    save_weights_only=True,
    save_freq='epoch',
)

opt = keras.optimizers.Adam(learning_rate=0.0001)
# Integer class labels + 2-way softmax output -> sparse categorical cross-entropy.
model.compile(optimizer=opt, loss="sparse_categorical_crossentropy", metrics=["accuracy"])
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=60,
    batch_size=64,
    callbacks=[checkpoint],
)

In [10]:
## Load a pre-trained model
# Restore previously saved weights into the model built above, then measure
# loss and accuracy on the validation split.
model.load_weights("model_49-accuracy0.92.h5")

score = model.evaluate(x_val, y_val, verbose=0)

# evaluate() returns the loss first, then the compiled metric (accuracy).
test_loss, test_accuracy = score[0], score[1]
print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)

Test loss: 0.2868306636810303
Test accuracy: 0.9165329337120056


In [11]:
# Turn the per-class probabilities into hard class ids (index of the largest
# probability in each row).
class_probabilities = model.predict(x_val)
predictions = class_probabilities.argmax(axis=1)
print("predictions shape:", predictions.shape)

# Report the classification metrics on the validation split, in fixed order.
metric_reports = (
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1_score: %.3f', f1_score),
    ('Accuracy: %.3f', accuracy_score),
)
for report_format, metric_fn in metric_reports:
    print(report_format % metric_fn(y_val, predictions))

predictions shape: (623,)
Precision: 0.930
Recall: 0.855
F1_score: 0.891
Accuracy: 0.917
