In [1]:
import re
import sys

# nvdlib has to be installed (I suggest using a venv for that)
from nvdlib.nvd import NVD

# add the experimental project to the path
sys.path.append("../src")
import toolkit  # import the toolkit

### Get the NVD Feed

In [2]:
# create the feed object for the 2017 NVD feed and update it
feed = NVD.from_feeds(feed_names=[2017])
feed.update()

# `feed.cves()` gives a generator over the CVEs of the feed
cves = feed.cves()

# The pipeline itself does not care whether it is fed an iterator, but the
# DATA is reused several times in this notebook, so it is useful to unpack
# the generator into a list once.
DATA = [cve for cve in cves]

## Preprocessing Example

In [3]:
# NVDFeedPreprocessor extracts the relevant attributes from the feed
# (only `description` is requested here)
nvd_prep = toolkit.preprocessing.NVDFeedPreprocessor(attributes=['description'])

# run the preprocessor's `transform` over the initial DATA
data = nvd_prep.transform(DATA)

In [4]:
# clear the existing Hook instances so that `label_hook` can be (re)defined
# when this cell is run again
toolkit.transformers.Hook.clear_current_instances()

# Labeling hook: `find_` searches the description for the project name and
# returns it as a label, if applicable.
label_hook = toolkit.transformers.Hook(key='label_hook', func=toolkit.utils.find_)

In [5]:
# LabelPreprocessor assigns labels (via `label_hook`) for future training
label_prep = toolkit.preprocessing.LabelPreprocessor(
    hook=label_hook,
    feed_attributes=['project', 'description'],
    output_attributes=['description'],
)

# `fit_transform` the data that was output by the NVDFeedPreprocessor
labeled_data = label_prep.fit_transform(data)

In [6]:
# NLTKPreprocessor processes the `description` of every labeled record
nltk_prep = toolkit.preprocessing.NLTKPreprocessor()

# `fit_transform` the data that was output by the LabelPreprocessor;
# the `label` attribute is requested in the output alongside the result
processed_data = nltk_prep.fit_transform(
    feed_attributes=['description'],
    output_attributes=['label'],
    X=labeled_data,
)

In [7]:
# number of processed records and the first record as a sample
(len(processed_data), processed_data[0])

(1469,
 Series(values=[('The', 'DET'), ('hidden-service', 'ADJ'), ('feature', 'NOUN'), ('in', 'ADP'), ('Tor', 'NOUN'), ('before', 'ADP'), ('0.3.0.8', 'NUM'), ('allows', 'VERB'), ('a', 'DET'), ('denial', 'NOUN'), ('of', 'ADP'), ('service', 'NOUN'), ('assertion', 'ADJ'), ('failure', 'NOUN'), ('and', 'CONJ'), ('daemon', 'ADJ'), ('exit', 'NOUN'), ('in', 'ADP'), ('the', 'DET'), ('relay_send_end_cell_from_edge_', 'NOUN'), ('function', 'NOUN'), ('via', 'ADP'), ('a', 'DET'), ('malformed', 'ADJ'), ('BEGIN', 'NOUN'), ('cell', 'NOUN')], label='Tor'))

## Preprocessing Example with sklearn Pipeline

In [8]:
from sklearn.pipeline import Pipeline

In [9]:
# Clear all the hook instances: the pipeline defined below creates a new Hook
# with the key 'label_hook', which was already used in the step-by-step example.
# NOTE(review): presumably Hook keys have to be unique among live instances - confirm.
toolkit.transformers.Hook.clear_current_instances()

In [10]:
# chain the three preprocessors from the example above into one sklearn Pipeline
pipeline = Pipeline(steps=[
    # 1) extract the `description` attribute from the feed
    ('nvd_feed_preprocessor',
     toolkit.preprocessing.NVDFeedPreprocessor(attributes=['description'])),
    # 2) assign labels using the `find_` labeling hook
    ('label_preprocessor',
     toolkit.preprocessing.LabelPreprocessor(
         hook=toolkit.transformers.Hook(key='label_hook', func=toolkit.utils.find_),
         feed_attributes=['project', 'description'],
         output_attributes=['description'],
     )),
    # 3) NLTK preprocessing (its attributes are supplied later as fit parameters)
    ('nltk_preprocessor', toolkit.preprocessing.NLTKPreprocessor()),
])

In [11]:
# split the pipeline's (name, transformer) pairs into step names and transformers
steps, preps = zip(*pipeline.steps)

# Fit parameters are routed to a single step with sklearn's
# `<step name>__<parameter>` notation (see the sklearn Pipeline documentation
# for the **fit_params notation and per-step definitions).
# In order to apply tokenization along with the labeling, the third step
# (the NLTK preprocessor) is given its `feed_attributes` and `output_attributes`.
fit_params = dict([
    ("%s__feed_attributes" % steps[2], ['description']),
    ("%s__output_attributes" % steps[2], ['label']),
])

pipeline_output = pipeline.fit_transform(X=DATA, **fit_params)

In [12]:
# number of records produced by the pipeline and the first record as a sample
(len(pipeline_output), pipeline_output[0])

(1469,
 Series(values=[('The', 'DET'), ('hidden-service', 'ADJ'), ('feature', 'NOUN'), ('in', 'ADP'), ('Tor', 'NOUN'), ('before', 'ADP'), ('0.3.0.8', 'NUM'), ('allows', 'VERB'), ('a', 'DET'), ('denial', 'NOUN'), ('of', 'ADP'), ('service', 'NOUN'), ('assertion', 'ADJ'), ('failure', 'NOUN'), ('and', 'CONJ'), ('daemon', 'ADJ'), ('exit', 'NOUN'), ('in', 'ADP'), ('the', 'DET'), ('relay_send_end_cell_from_edge_', 'NOUN'), ('function', 'NOUN'), ('via', 'ADP'), ('a', 'DET'), ('malformed', 'ADJ'), ('BEGIN', 'NOUN'), ('cell', 'NOUN')], label='Tor'))

In [13]:
import numpy as np

In [14]:
# Check that the pipeline output equals the output of the step-by-step example.
# `dtype=object` is given explicitly because the records hold token lists of
# different lengths (ragged data), which recent NumPy versions refuse to convert
# implicitly. `np.array_equal` also returns False on a shape mismatch instead of
# failing on the element-wise comparison.
np.array_equal(
    np.array(pipeline_output, dtype=object),
    np.array(processed_data, dtype=object)
)

True

In [15]:
# example of the pre-processed data: the first two records, each displayed as a
# Series of (token, tag) pairs together with its label
pipeline_output[:2]

[Series(values=[('The', 'DET'), ('hidden-service', 'ADJ'), ('feature', 'NOUN'), ('in', 'ADP'), ('Tor', 'NOUN'), ('before', 'ADP'), ('0.3.0.8', 'NUM'), ('allows', 'VERB'), ('a', 'DET'), ('denial', 'NOUN'), ('of', 'ADP'), ('service', 'NOUN'), ('assertion', 'ADJ'), ('failure', 'NOUN'), ('and', 'CONJ'), ('daemon', 'ADJ'), ('exit', 'NOUN'), ('in', 'ADP'), ('the', 'DET'), ('relay_send_end_cell_from_edge_', 'NOUN'), ('function', 'NOUN'), ('via', 'ADP'), ('a', 'DET'), ('malformed', 'ADJ'), ('BEGIN', 'NOUN'), ('cell', 'NOUN')], label='Tor'),
 Series(values=[('The', 'DET'), ('hidden-service', 'ADJ'), ('feature', 'NOUN'), ('in', 'ADP'), ('Tor', 'NOUN'), ('before', 'ADP'), ('0.3.0.8', 'NUM'), ('allows', 'VERB'), ('a', 'DET'), ('denial', 'NOUN'), ('of', 'ADP'), ('service', 'NOUN'), ('assertion', 'ADJ'), ('failure', 'NOUN'), ('and', 'CONJ'), ('daemon', 'ADJ'), ('exit', 'NOUN'), ('in', 'ADP'), ('the', 'DET'), ('connection_edge_process_relay_cell', 'NOUN'), ('function', 'NOUN'), ('via', 'ADP'), ('a', 

In [16]:
# or as an ndarray; `dtype=object` is explicit because the token lists differ in
# length (ragged data), which recent NumPy versions do not convert implicitly
np.array(pipeline_output[:2], dtype=object)

array([[list([('The', 'DET'), ('hidden-service', 'ADJ'), ('feature', 'NOUN'), ('in', 'ADP'), ('Tor', 'NOUN'), ('before', 'ADP'), ('0.3.0.8', 'NUM'), ('allows', 'VERB'), ('a', 'DET'), ('denial', 'NOUN'), ('of', 'ADP'), ('service', 'NOUN'), ('assertion', 'ADJ'), ('failure', 'NOUN'), ('and', 'CONJ'), ('daemon', 'ADJ'), ('exit', 'NOUN'), ('in', 'ADP'), ('the', 'DET'), ('relay_send_end_cell_from_edge_', 'NOUN'), ('function', 'NOUN'), ('via', 'ADP'), ('a', 'DET'), ('malformed', 'ADJ'), ('BEGIN', 'NOUN'), ('cell', 'NOUN')]),
        'Tor'],
       [list([('The', 'DET'), ('hidden-service', 'ADJ'), ('feature', 'NOUN'), ('in', 'ADP'), ('Tor', 'NOUN'), ('before', 'ADP'), ('0.3.0.8', 'NUM'), ('allows', 'VERB'), ('a', 'DET'), ('denial', 'NOUN'), ('of', 'ADP'), ('service', 'NOUN'), ('assertion', 'ADJ'), ('failure', 'NOUN'), ('and', 'CONJ'), ('daemon', 'ADJ'), ('exit', 'NOUN'), ('in', 'ADP'), ('the', 'DET'), ('connection_edge_process_relay_cell', 'NOUN'), ('function', 'NOUN'), ('via', 'ADP'), ('a', '