In [4]:
import pandas as pd 
import numpy as np 
import re, unicodedata
from keras.utils import to_categorical
from gensim.models.keyedvectors import KeyedVectors

In [8]:
# Load the combined training CSV into the `data` DataFrame.
data = pd.read_csv("./train_data/combined_data.csv")
# Bare last expression: the first 10 rows are rendered as this cell's output.
data.head(10)

Unnamed: 0,id,report,memc,bypass,csrf,dirtra,dos,execution,fileinc,gainpre,httprs,infor,overflow,sqli,xss
0,CVE-1999-0001,ip_input.c in BSD-derived TCP/IP implementatio...,0,0,0,0,1,0,0,0,0,0,0,0,0
1,CVE-1999-0002,Buffer overflow in NFS mountd gives root acces...,0,0,0,0,0,0,0,0,0,0,1,0,0
2,CVE-1999-0003,Execute commands as root via buffer overflow i...,0,0,0,0,0,0,0,0,0,0,1,0,0
3,CVE-1999-0004,"MIME buffer overflow in email clients, e.g. So...",0,0,0,0,0,0,0,0,0,0,1,0,0
4,CVE-1999-0005,Arbitrary command execution via IMAP buffer ov...,0,0,0,0,0,0,0,0,0,0,1,0,0
5,CVE-1999-0006,Buffer overflow in POP servers based on BSD/Qu...,0,0,0,0,0,0,0,0,0,0,1,0,0
6,CVE-1999-0008,"Buffer overflow in NIS+, in Sun's rpc.nisd pro...",0,0,0,0,0,0,0,0,0,0,1,0,0
7,CVE-1999-0009,Inverse query buffer overflow in BIND 4.9 and ...,0,0,0,0,0,0,0,0,0,0,1,0,0
8,CVE-1999-0010,Denial of Service vulnerability in BIND 8 Rele...,0,0,0,0,1,0,0,0,0,0,0,0,0
9,CVE-1999-0011,Denial of Service vulnerabilities in BIND 4.9 ...,0,0,0,0,1,0,0,0,0,0,0,0,0


In [6]:
# Word2vec embeddings stored in binary format; the text-processing helpers in
# this notebook look words up in this model.
word_vector = KeyedVectors.load_word2vec_format("vulner_embedding.bin", binary=True)

# Level lists that are not used by the code in this notebook (kept commented out).
# Access_Complexity_Level = ['LOW', 'MEDIUM', 'HIGH']
# Authentication_Level = ['NONE', 'SINGLE', 'MULTIPLE']
# Access_Vector_Level = ['LOCAL', 'NETWORK', 'ADJACENT_NETWORK']
# Confidentiality_Impact_Level = ['NONE', 'PARTIAL', 'COMPLETE']
# Availability_Impact_Level = ['NONE', 'PARTIAL', 'COMPLETE']

# Spelled-out label names; each one is split into words and compared against
# report tokens. Presumably aligned one-to-one with `csv_labels` - TODO confirm.
name_labels = [
    'memcached',
    'bypass',
    'cross site request forgery',
    'directory traversal',
    'denial of service',
    'execution',
    'file inclusion',
    'gain privilege',
    'http response splitting',
    'information disclosure',
    'overflow',
    'sql injection',
    'cross site scripting',
]

# Label column names as they appear in the CSV header.
csv_labels = [
    'memc',
    'bypass',
    'csrf',
    'dirtra',
    'dos',
    'execution',
    'fileinc',
    'gainpre',
    'httprs',
    'infor',
    'overflow',
    'sqli',
    'xss',
]

# Total number of rows in the loaded data.
num_row = len(data)

# Missing label cells become 0.0 (this overwrites the label columns of `data`).
data[csv_labels] = data[csv_labels].fillna(0.0)

# All `num_row` rows of `data`.
combined_data = data.head(num_row)

# Extract Features

In [None]:
def process_bar(idx, num):
    """Print a progress line each time the loop enters a new 5% step.

    Args:
        idx: Zero-based index of the item about to be processed.
        num: Total number of items.

    Returns:
        None. Prints ``Processing <p>%`` where ``<p>`` is the 5% step just
        reached. Nothing is printed when ``num`` is not positive.
    """
    if num <= 0:
        # Nothing to process; also avoids dividing by zero.
        return
    # Integer arithmetic on purpose: the float form `idx / num * 100` is inexact
    # (55 / 100 * 100 evaluates to 55.00000000000001), so testing it with
    # `% 5 == 0` skips milestones, and for most totals never fires after 0%.
    step = (idx * 100 // num) // 5
    prev_step = ((idx - 1) * 100 // num) // 5 if idx > 0 else -1
    if step != prev_step:
        print(f'Processing {step * 5}%')

In [None]:
def process_report(report, vocab=None):
    """Normalize a report string and return its in-vocabulary tokens.

    Steps: lowercase, strip ``<!-- ... -->`` comments, drop non-ASCII
    characters after NFKD decomposition, collapse runs of digits and non-word
    characters into one space, split on whitespace, and keep only tokens that
    are members of the vocabulary.

    Args:
        report: Report text. Any non-string value (for example the NaN pandas
            uses for a missing CSV cell, or None) yields an empty list.
        vocab: Optional container supporting ``in``. Defaults to the
            notebook-level ``word_vector`` model.

    Returns:
        List of tokens, in their original order, that are present in the
        vocabulary.
    """
    if not isinstance(report, str):
        # Missing cells arrive as float NaN, which has no `.lower()`.
        return []
    if vocab is None:
        vocab = word_vector

    report = report.lower()
    report = re.sub(r"<!--?.*?-->", "", report)
    report = unicodedata.normalize('NFKD', report).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    report = re.sub(r"(\d|\W)+", " ", report)
    # NOTE(review): `A-z` also covers `[ \ ] ^ _` and the backtick, so
    # underscores survive this step. `A-Z` was presumably intended; confirm
    # against how the embedding vocabulary was tokenized before changing it.
    report = re.sub(r'[^a-zA-z0-9\s]', "", report)
    return [token for token in report.split() if token in vocab]

In [None]:
def feautures_extractor(report):
    """Return the element-wise mean of the word vectors of a report's tokens.

    Args:
        report: Report text, passed to ``process_report`` for tokenization.

    Returns:
        A 1-D array: the mean of ``word_vector[token]`` over the report's
        in-vocabulary tokens. If the report has no such tokens, a zero vector
        of length ``word_vector.vector_size`` is returned instead.
    """
    sentences = process_report(report)
    if not sentences:
        # np.mean over an empty list gives a scalar NaN (with a RuntimeWarning),
        # which does not line up with the fixed-width vectors of other reports.
        return np.zeros(word_vector.vector_size)
    vectors = [word_vector[sentence] for sentence in sentences]
    return np.mean(vectors, axis=0)


def similar_counter(label, sentences, threshold=0.7):
    """Count the tokens whose similarity to ``label`` exceeds ``threshold``.

    Args:
        label: A single word to compare against.
        sentences: Iterable of tokens; each is passed to
            ``word_vector.similarity`` together with ``label``.
        threshold: Similarity a token must exceed to be counted
            (default 0.7).

    Returns:
        Number of tokens with ``word_vector.similarity(label, token)`` greater
        than ``threshold``. Returns 0 when there are no tokens or when
        ``label`` is not in the embedding vocabulary.
    """
    # An out-of-vocabulary label would make `similarity` raise KeyError; such a
    # label cannot match anything, so it contributes zero.
    if not sentences or label not in word_vector:
        return 0
    return sum(
        1 for sentence in sentences
        if word_vector.similarity(label, sentence) > threshold
    )


def similarities_extractor(report):
    """Return one similarity count per entry of ``name_labels`` for a report.

    The report is tokenized with ``process_report``. Each label name is split
    on whitespace, and the ``similar_counter`` results for its words are added
    together.

    Returns:
        List of integer counts, in ``name_labels`` order.
    """
    tokens = process_report(report)
    return [
        sum(similar_counter(word, tokens) for word in name.split())
        for name in name_labels
    ]

In [None]:
# Per-row feature extraction: one averaged embedding vector and one list of
# label-similarity counts for every report.
avg_vectors, similar_vectors = [], []
last_idx = num_row - 1
for idx in range(num_row):
    process_bar(idx, num_row)
    if idx == last_idx:
        # Announce completion explicitly on the final row.
        print('Processing 100%')
    report_text = data['report'][idx]
    avg_vector = feautures_extractor(report_text)
    similar_vector = similarities_extractor(report_text)
    avg_vectors.append(avg_vector)
    similar_vectors.append(similar_vector)

In [None]:
# Feature matrix: similarity-count columns first, then the averaged embedding
# columns, all cast to float.
counters_df = pd.DataFrame(similar_vectors)
features_df = pd.DataFrame(avg_vectors)
X = pd.concat([counters_df, features_df], axis=1).astype(float)

# Label matrix as a NumPy array, with columns in `csv_labels` order.
y = combined_data[csv_labels].values

In [None]:
# Write the feature matrix and the label matrix to CSV.
X.to_csv("./train_data/X.csv")
# `y` is a NumPy array (taken via `.values`), and arrays have no `to_csv`.
# Wrap it in a DataFrame, using the label column names as the header.
pd.DataFrame(y, columns=csv_labels).to_csv("./train_data/y.csv")