In [1]:
import re
import sys

# nvdlib has to be installed (I suggest using a venv for that)
from nvdlib.nvd import NVD

# add the experimental project to the path
sys.path.append("../src")
import toolkit  # import the toolkit

### Get the NVD Feed

In [2]:
# fetch the 2017 NVD feed and bring the local copy up to date
feed = NVD.from_feeds(feed_names=[2017])
feed.update()

# `feed.cves()` yields the CVE entries lazily
cves = feed.cves()

# The pipeline itself is happy with an iterator, but DATA is reused by several
# cells in this notebook, so it is materialized into a list once here.
DATA = [cve for cve in cves]

## Preprocessing Example

In [3]:
# NVDFeedPreprocessor extracts the relevant attributes (here only the description)
nvd_prep = toolkit.preprocessing.NVDFeedPreprocessor(attributes=['description'])

# run the transformation over the raw feed entries stored in DATA
data = nvd_prep.transform(DATA)

In [4]:
# Reset the registered Hook instances so that the hook below can be redefined
# (e.g. when this cell is executed more than once).
toolkit.transformers.Hook.clear_current_instances()

# Labeling hook: `find_` searches the description for the project name and
# returns it as the label, if applicable.
label_hook = toolkit.transformers.Hook(func=toolkit.utils.find_, key='label_hook')

In [5]:
# LabelPreprocessor uses the hook to assign the labels needed for future training
label_prep = toolkit.preprocessing.LabelPreprocessor(
    hook=label_hook,
    feed_attributes=['project', 'description'],
    output_attributes=['description'],
)

# fit and transform the data output by NVDFeedPreprocessor
labeled_data = label_prep.fit_transform(data)

labeled_data.shape

(1468, 2)

## Preprocessing Example with sklearn Pipeline

In [6]:
from sklearn.pipeline import Pipeline

In [7]:
# Clear all the hook instances registered so far.
# NOTE(review): presumably needed because the pipeline cell below creates a new
# Hook under the same 'label_hook' key used earlier — confirm against Hook's docs.
toolkit.transformers.Hook.clear_current_instances()

In [8]:
# The same two preprocessing stages as in the step-by-step example, expressed as
# named (name, transformer) steps of a single sklearn Pipeline.
nvd_feed_step = (
    'nvd_feed_preprocessor',
    toolkit.preprocessing.NVDFeedPreprocessor(attributes=['description']),
)
label_step = (
    'label_preprocessor',
    toolkit.preprocessing.LabelPreprocessor(
        hook=toolkit.transformers.Hook(func=toolkit.utils.find_, key='label_hook'),
        feed_attributes=['project', 'description'],
        output_attributes=['description'],
    ),
)

pipeline = Pipeline(steps=[nvd_feed_step, label_step])

In [9]:
# fit the preprocessors and transform the raw feed entries (DATA) in one call;
# the pipeline chains NVDFeedPreprocessor and LabelPreprocessor
pipeline_output = pipeline.fit_transform(X=DATA)

# display the shape; it is expected to match `labeled_data.shape` from the
# step-by-step example above
pipeline_output.shape

(1468, 2)

In [10]:
# check that the pipeline output equals the result of the step-by-step example:
# `==` compares the two arrays element-wise and `.all()` reduces the comparison
# to a single boolean
(pipeline_output == labeled_data).all()

True

In [11]:
# example of the labeled data: show only the first five rows
# (each row pairs a CVE description with the label assigned by the hook)
pipeline_output[:5]

array([['The hidden-service feature in Tor before 0.3.0.8 allows a denial of service (assertion failure and daemon exit) in the relay_send_end_cell_from_edge_ function via a malformed BEGIN cell.',
        'Tor'],
       ['The hidden-service feature in Tor before 0.3.0.8 allows a denial of service (assertion failure and daemon exit) in the connection_edge_process_relay_cell function via a BEGIN_DIR cell on a rendezvous circuit.',
        'Tor'],
       ["Tor 0.3.x before 0.3.0.9 has a guard-selection algorithm that only considers the exit relay (not the exit relay's family), which might allow remote attackers to defeat intended anonymity properties by leveraging the existence of large families.",
        'Tor'],
       ['XSS exists in the login_form function in views/helpers.php in Phamm before 0.6.7, exploitable via the PATH_INFO to main.php.',
        'Phamm'],
       ['The rend_service_intro_established function in or/rendservice.c in Tor before 0.2.8.15, 0.2.9.x before 0.2.9.12, 0.