In [1]:
import re
import sys

# nvdlib has to be installed (I suggest using a venv for that)
from nvdlib.nvd import NVD

# add the experimental project to the path
# NOTE: "../src" is a relative entry, so it is resolved against the current
# working directory -- this assumes the kernel runs from the notebook's folder
sys.path.append("../src")
import toolkit  # import the toolkit

### Get the NVD Feed

In [2]:
# build the NVD handle from the 2017 feed and update it
feed = NVD.from_feeds(
    feed_names=[2017],
)
feed.update()

# `feed.cves()` gives back a generator over the CVE records
cves = feed.cves()

# Materialize the generator into DATA. The pipeline itself does not care
# whether it receives an iterator, but a list is useful here because DATA
# is reused by several cells of this notebook.
DATA = [*cves]

## Preprocessing Example

In [3]:
# NVDFeedPreprocessor extracts the relevant attributes from the feed records;
# only `description` is requested here
nvd_prep = toolkit.preprocessing.NVDFeedPreprocessor(attributes=['description'])

# run `transform` on the initial DATA
data = nvd_prep.transform(DATA)

In [4]:
# drop the currently registered Hook instances so that the hook below can be
# (re)defined when this cell is run again
toolkit.pipeline.Hook.clear_current_instances()

# Labeling hook: `find_` searches the description for the project name and
# returns it as a label, if applicable.
label_hook = toolkit.pipeline.Hook(key='label_hook', func=toolkit.utils.find_)

In [5]:
# LabelPreprocessor assigns the labels for future training, using the
# labeling hook defined in the previous cell
label_prep = toolkit.preprocessing.LabelPreprocessor(
    attributes=['project', 'description'],
    hook=label_hook,
)

# `fit_transform` is applied to the data output by NVDFeedPreprocessor
labeled_data = label_prep.fit_transform(data)

# display the shape of the labeled result
labeled_data.shape

(2373, 2)

## Preprocessing Example with sklearn Pipeline

In [6]:
from sklearn.pipeline import Pipeline

In [7]:
# clear all the hook instances -- the pipeline defined below creates a new
# Hook with the same 'label_hook' key that the earlier `label_hook` used
toolkit.pipeline.Hook.clear_current_instances()

In [8]:
# Step 1: extract the `description` attribute from the raw feed records.
nvd_feed_step = (
    'nvd_feed_preprocessor',
    toolkit.preprocessing.NVDFeedPreprocessor(attributes=['description']),
)

# Step 2: label the extracted records; the hook is created inline with the
# same key and function as in the standalone example above.
label_step = (
    'label_preprocessor',
    toolkit.preprocessing.LabelPreprocessor(
        attributes=['project', 'description'],
        hook=toolkit.pipeline.Hook(key='label_hook', func=toolkit.utils.find_),
    ),
)

# Chain both preprocessors in a single sklearn Pipeline.
pipeline = Pipeline(steps=[nvd_feed_step, label_step])

In [9]:
# run every preprocessing step of the pipeline (fit + transform) on the raw DATA
pipeline_output = pipeline.fit_transform(DATA)

# display the shape of the pipeline result
pipeline_output.shape

(2373, 2)

In [10]:
# compare the pipeline result with the step-by-step result element by element,
# then collapse the comparison into a single flag
outputs_match = pipeline_output == labeled_data
outputs_match.all()

True

In [11]:
# example of the labeled data: the first five rows, each holding an
# Attributes(project, description) record followed by its label
# (the label is presumably None when the hook finds no match -- TODO confirm)
pipeline_output[:5]

array([[Attributes(project='ChakraCore', description='A remote code execution vulnerability exists when Microsoft scripting engine improperly accesses objects in memory. The vulnerability could corrupt memory in a way that enables an attacker to execute arbitrary code in the context of the current user. An attacker who successfully exploited the vulnerability could gain the same user rights as the current user, aka "Scripting Engine Memory Corruption Vulnerability."'),
        None],
       [Attributes(project='CVE-2017-0038-EXP-C-JS', description='gdi32.dll in Graphics Device Interface (GDI) in Microsoft Windows Vista SP2, Windows Server 2008 SP2 and R2 SP1, Windows 7 SP1, Windows 8.1, Windows Server 2012 Gold and R2, Windows RT 8.1, and Windows 10 Gold, 1511, and 1607 allows remote attackers to obtain sensitive information from process heap memory via a crafted EMF file, as demonstrated by an EMR_SETDIBITSTODEVICE record with modified Device Independent Bitmap (DIB) dimensions. NOTE: