In [1]:
import multiprocessing
import os
import sys
import typing

from nvdlib.nvd import NVD

sys.path.append('../src')
import toolkit

In [2]:
import numpy as np
import pandas as pd

import seaborn as sns

from sklearn.model_selection import train_test_split

In [3]:
# Preprocessor applied to the raw NVD feed; it is configured with the two
# attributes this notebook works with: the CVE identifier and its description.
nvd_prep = toolkit.preprocessing.NVDFeedPreprocessor(attributes=['cve_id', 'description'])

In [4]:
# Hook handed to the LabelPreprocessor below. It is registered under the key
# 'label_hook' and wraps `toolkit.utils.find_`.
# NOTE(review): presumably `find_` locates the label inside a description and
# `reuse=True` lets the hook be registered repeatedly -- confirm in `toolkit`.
label_hook = toolkit.transformers.Hook(key='label_hook', func=toolkit.utils.find_, reuse=True)

# LabelPreprocessor producing the target labels: it is given the 'project' and
# 'description' attributes as input and emits the listed output attributes.
label_prep = toolkit.preprocessing.LabelPreprocessor(
    feed_attributes=['project', 'description'],
    output_attributes=['cve_id', 'user', 'project', 'repository', 'description'],
    hook=label_hook,
)

In [5]:
def get_labeled_data_from_nvd(feed_name: typing.Union[int, str]) -> list:
    """Produce the labeled records for a single NVD feed.

    The feed is created with ``NVD.from_feeds``, ``update()`` is called on it,
    and its CVEs are run through the module-level ``nvd_prep`` and then
    ``label_prep`` preprocessors. A progress line is printed once the feed
    has been processed.

    :param feed_name: name of the feed to process (this notebook passes years,
        e.g. ``2002``)
    :returns: the output of ``label_prep.fit_transform``, or an empty list
        when that call raises ``AssertionError``
    """
    nvd_feed = NVD.from_feeds(feed_names=[feed_name])
    nvd_feed.update()

    # reduce the CVEs of the feed to the attributes configured on `nvd_prep`
    extracted = nvd_prep.fit_transform(nvd_feed.cves())

    try:
        labeled = label_prep.fit_transform(extracted)
    except AssertionError:
        # Raised when no labels are found (can happen if there are no CVEs
        # to label); such a feed simply contributes nothing.
        labeled = []

    print(f"Finished feed '{feed_name}'.")
    return labeled

In [6]:
# Yearly NVD feeds to process: 2002 through 2018 inclusive.
FEED_NAMES = list(range(2002, 2019))

# Flat list of labeled records collected from all feeds.
data = list()

if os.name != 'nt':  # multiprocessing does not work correctly in the interpreter on Windows
    PROCESSES = os.cpu_count()

    # `Pool.map` blocks until every feed has been processed, re-raises any
    # exception raised in a worker (instead of silently dropping it), and
    # returns one result list per feed, in the order of FEED_NAMES.
    with multiprocessing.Pool(processes=PROCESSES) as pool:
        per_feed_results = pool.map(get_labeled_data_from_nvd, FEED_NAMES)

    # Flatten feed by feed, exactly like the sequential branch below.
    # (A `map_async(..., callback=data.extend)` would receive the whole list of
    # per-feed lists at once and leave `data` nested one level too deep.)
    for feed_data in per_feed_results:
        data.extend(feed_data)

else:
    for feed_name in FEED_NAMES:
        data.extend(get_labeled_data_from_nvd(feed_name))

Finished feed '2002'.
Finished feed '2003'.
Finished feed '2004'.
Finished feed '2005'.
Finished feed '2006'.
Finished feed '2007'.
Finished feed '2008'.
Finished feed '2009'.
Finished feed '2010'.
Finished feed '2011'.
Finished feed '2012'.
Finished feed '2013'.
Finished feed '2014'.
Finished feed '2015'.
Finished feed '2016'.
Finished feed '2017'.
Finished feed '2018'.


In [7]:
# Hold out 20% of the records as test data; the fixed random_state keeps the
# split identical between runs.
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)  # type: typing.List["Series"]

In [8]:
def _frame_sorted_by_cve(records):
    """Build a DataFrame from `records`, sorted ascending by its 'cve_id' column.

    NOTE: the sort is the default one on the ID values; for string IDs such as
    'CVE-2012-3371' that is lexicographic, i.e. grouped by year but only
    approximately "oldest first" within a year.
    """
    frame = pd.DataFrame(data=records)
    return frame.sort_values(by='cve_id', ascending=True)


# whole dataset
df = _frame_sorted_by_cve(data)

# train / test partitions
train_df = _frame_sorted_by_cve(train_data)
test_df = _frame_sorted_by_cve(test_data)

In [9]:
# Summary statistics for the whole dataset; include='all' makes `describe`
# report on every column rather than only the numeric ones.
df.describe(include='all')

Unnamed: 0,cve_id,user,project,repository,description,label
count,4374,4374,4374,4374,4374,4374
unique,4374,1036,1113,1150,4345,1203
top,CVE-2012-3371,torvalds,linux,https://github.com/torvalds/linux,An issue was discovered in the Tatsuya Kinoshi...,Linux
freq,1,808,811,808,6,806


In [11]:
# Export the frames as CSV files into the local 'data' directory
# (created on demand, left alone if it already exists).
os.makedirs('data', exist_ok=True)

for frame, path in (
    (df, 'data/data.csv'),              # whole dataset
    (train_df, 'data/train_data.csv'),  # train partition
    (test_df, 'data/test_data.csv'),    # test partition
):
    frame.to_csv(path)