In [1]:
import re
import sys

import nltk
import numpy as np

from sklearn.pipeline import Pipeline

# nvdlib has to be installed (I suggest using a venv for that)
from nvdlib.nvd import NVD

# add the experimental project to the path
sys.path.append("../src")
import toolkit  # import the toolkit

### Get the NVD Feed

In [2]:
# load the NVD feed for the year 2017
feed = NVD.from_feeds(feed_names=[2017])
# update (disabled)
# feed.update()

# `feed.cves()` hands back a generator; the pipeline would accept it as is,
# but it is materialized into a list here because it is useful to be able to
# reuse DATA in several cells of this notebook
cves = feed.cves()
DATA = [*cves]

## Training Example

**NOTE:**
Part of the pipeline consists of preprocessors, these were presented in [example_preprocessing_pipeline.ipynb](https://github.com/CermakM/fabric8-analytics-POCs/blob/codebase/cves/toolkit/examples/example_preprocessing_pipeline.ipynb)

In [3]:
# we can define custom features to be used in feature extraction process
from toolkit.transformers import feature_hooks

# Passed as `feature_hooks=` to every FeatureExtractor created in this notebook.
# NOTE(review): no visible cell reassigns this name, so it stays None for the
# whole run; presumably None makes the extractor use its default hooks -- confirm.
FEATURE_HOOKS = None  # will be defined later

In [4]:
# NOTE(review): presumably discards Hook instances left over from earlier runs so
# that the 'label_hook' created in the next cell does not clash with an old one -- confirm
toolkit.transformers.Hook.clear_current_instances()

If there is a need to split the training process into preprocessing and training, you can easily define two separate pipelines for that.

In [5]:
# Preprocessing-only pipeline; the steps run in the order listed and their
# names are used later on to address per-step fit parameters.
preprocessing_pipeline = Pipeline(steps=[
    (
        'nvd_feed_preprocessor',
        toolkit.preprocessing.NVDFeedPreprocessor(attributes=['cve_id', 'description']),
    ),
    (
        'label_preprocessor',
        toolkit.preprocessing.LabelPreprocessor(
            feed_attributes=['project', 'description'],
            output_attributes=['cve_id', 'description'],
            hook=toolkit.transformers.Hook(key='label_hook', func=toolkit.utils.find_),
        ),
    ),
    (
        'nltk_preprocessor',
        toolkit.preprocessing.NLTKPreprocessor(),
    ),
])

In [6]:
# separate the (name, estimator) pairs of the pipeline into two parallel tuples
steps, preps = zip(*preprocessing_pipeline.steps)

# fit parameters are addressed as '<step name>__<parameter>';
# steps[2] is the name of the third step ('nltk_preprocessor')
fit_params = {
    "{}__feed_attributes".format(steps[2]): ['description'],
    "{}__output_attributes".format(steps[2]): ['cve_id', 'label'],
}
processed_data = preprocessing_pipeline.fit_transform(DATA, **fit_params)

In [7]:
# the resulting data is a list of `Series` objects -- each one carries the
# extracted `features` together with the requested output attributes
# (`cve_id` and `label`)
series = processed_data[0]
series.features, series.cve_id, series.label

([('The', 'DET'),
  ('hidden-service', 'ADJ'),
  ('feature', 'NOUN'),
  ('in', 'ADP'),
  ('Tor', 'NOUN'),
  ('before', 'ADP'),
  ('<VERSION>', 'NUM'),
  ('allows', 'VERB'),
  ('a', 'DET'),
  ('denial', 'NOUN'),
  ('of', 'ADP'),
  ('service', 'NOUN'),
  ('assertion', 'ADJ'),
  ('failure', 'NOUN'),
  ('and', 'CONJ'),
  ('daemon', 'ADJ'),
  ('exit', 'NOUN'),
  ('in', 'ADP'),
  ('the', 'DET'),
  ('relay_send_end_cell_from_edge_', 'NOUN'),
  ('function', 'NOUN'),
  ('via', 'ADP'),
  ('a', 'DET'),
  ('malformed', 'ADJ'),
  ('BEGIN', 'NOUN'),
  ('cell', 'NOUN')],
 'CVE-2017-0375',
 'Tor')

In [8]:
# Training-only pipeline: feature extraction followed by the classifier.
train_pipeline = Pipeline(steps=[
    (
        'feature_extractor',
        toolkit.transformers.FeatureExtractor(feature_hooks=FEATURE_HOOKS, share_hooks=True),
    ),
    (
        'classifier',
        toolkit.transformers.NBClassifier(),
    ),
])

In [9]:
# alternative call with the data and the labels passed separately:
# classifier = train_pipeline.fit_transform(X=data, y=labels)
# here only `processed_data` is passed, i.e. the Series objects that already carry a `label`
# NOTE(review): the result is used as a classifier object later on, so the final
# step's `fit_transform` presumably returns the fitted classifier -- confirm
classifier = train_pipeline.fit_transform(processed_data)

We can even build the whole training pipeline at once.

In [10]:
# Complete pipeline: the three preprocessing steps followed by feature
# extraction and the classifier, so it can be fed the raw feed data.
training_pipeline = Pipeline(steps=[
    (
        'nvd_feed_preprocessor',
        toolkit.preprocessing.NVDFeedPreprocessor(attributes=['cve_id', 'description']),
    ),
    (
        'label_preprocessor',
        toolkit.preprocessing.LabelPreprocessor(
            feed_attributes=['project', 'description'],
            output_attributes=['cve_id', 'description'],
            # NOTE(review): `reuse=True` presumably picks up the already existing
            # 'label_hook' instead of creating a second one -- confirm
            hook=toolkit.transformers.Hook(
                key='label_hook',
                reuse=True,
                func=toolkit.utils.find_,
            ),
        ),
    ),
    (
        'nltk_preprocessor',
        toolkit.preprocessing.NLTKPreprocessor(),
    ),
    (
        'feature_extractor',
        toolkit.transformers.FeatureExtractor(feature_hooks=FEATURE_HOOKS, share_hooks=True),
    ),
    (
        'classifier',
        toolkit.transformers.NBClassifier(),
    ),
])

In [11]:
# Take the step names from the pipeline that is actually fitted in this cell,
# so the fit parameters cannot silently point at a step of another pipeline.
steps, preps = list(zip(*training_pipeline.steps))

# fit parameters are addressed as '<step name>__<parameter>';
# steps[2] is the name of the third step ('nltk_preprocessor')
fit_params = {
    "%s__feed_attributes" % steps[2]: ['description'],
    "%s__output_attributes" % steps[2]: ['cve_id', 'label']
}

# feed it initial raw unprocessed data and let the toolkit handle the rest
classifier = training_pipeline.fit_transform(X=DATA, **fit_params)

## Evaluation example

In [12]:
from sklearn.model_selection import train_test_split, KFold

In [13]:
# NOTE(review): presumably discards the Hook instances created by the training
# example above before a fresh FeatureExtractor is instantiated -- confirm
toolkit.transformers.Hook.clear_current_instances()

Here we'll use only FeatureExtractor along with our processed data.

*NOTE:* The feature extractor executes hooks which extract features from the data. Each of these hooks is fed either a feed dict (if the input data consists of `namedtuple`s) or the **whole** `x` (an element of `X`). This means that if `x` is a list containing more than just the features to be extracted, the extra items will be fed to the hooks as well!

TL;DR: The default hooks look for the `features` attribute of `x` if `x` is a `namedtuple`; otherwise they take the whole `x`.

In [14]:
# stand-alone feature extractor, configured exactly like the
# 'feature_extractor' step of the pipelines above
feature_extractor = toolkit.transformers.FeatureExtractor(feature_hooks=FEATURE_HOOKS, share_hooks=True)

##### To demonstrate the above mentioned: This will throw an error

In [15]:
# split the processed data into data and labels (target values)
# dtype=object is given explicitly: every row mixes a variable-length token
# list with plain strings, and NumPy >= 1.24 raises ValueError for such ragged
# input instead of falling back to an object array on its own
processed_data_array = np.array(processed_data, dtype=object)

# `data` deliberately keeps TWO columns here (features and cve_id)
data, labels = processed_data_array[:, :2], processed_data_array[:, 2]
data[:1]

array([[list([('The', 'DET'), ('hidden-service', 'ADJ'), ('feature', 'NOUN'), ('in', 'ADP'), ('Tor', 'NOUN'), ('before', 'ADP'), ('<VERSION>', 'NUM'), ('allows', 'VERB'), ('a', 'DET'), ('denial', 'NOUN'), ('of', 'ADP'), ('service', 'NOUN'), ('assertion', 'ADJ'), ('failure', 'NOUN'), ('and', 'CONJ'), ('daemon', 'ADJ'), ('exit', 'NOUN'), ('in', 'ADP'), ('the', 'DET'), ('relay_send_end_cell_from_edge_', 'NOUN'), ('function', 'NOUN'), ('via', 'ADP'), ('a', 'DET'), ('malformed', 'ADJ'), ('BEGIN', 'NOUN'), ('cell', 'NOUN')]),
        'CVE-2017-0375']], dtype=object)

In [16]:
# try to create the featureset from rows that hold more than just the features
# (`data` has two columns at this point: features and cve_id)
# the featureset is an ndarray of feature dicts and labels
# the failure is the point of this cell, so the exception is caught on purpose
# and only reported, which keeps the notebook runnable from top to bottom
try:
    featuresets = feature_extractor.fit_transform(data, labels)
except Exception:
    print("Exception raised!")

Exception raised!


##### To demonstrate the above mentioned: This will work

In [17]:
# split the processed data into data and labels (target values)
# dtype=object is given explicitly: every row mixes a variable-length token
# list with plain strings, and NumPy >= 1.24 raises ValueError for such ragged
# input instead of falling back to an object array on its own
processed_data_array = np.array(processed_data, dtype=object)

# this time `data` holds the features column only; labels are the last column
data, labels = processed_data_array[:, 0], processed_data_array[:, -1]
data[:1]

array([list([('The', 'DET'), ('hidden-service', 'ADJ'), ('feature', 'NOUN'), ('in', 'ADP'), ('Tor', 'NOUN'), ('before', 'ADP'), ('<VERSION>', 'NUM'), ('allows', 'VERB'), ('a', 'DET'), ('denial', 'NOUN'), ('of', 'ADP'), ('service', 'NOUN'), ('assertion', 'ADJ'), ('failure', 'NOUN'), ('and', 'CONJ'), ('daemon', 'ADJ'), ('exit', 'NOUN'), ('in', 'ADP'), ('the', 'DET'), ('relay_send_end_cell_from_edge_', 'NOUN'), ('function', 'NOUN'), ('via', 'ADP'), ('a', 'DET'), ('malformed', 'ADJ'), ('BEGIN', 'NOUN'), ('cell', 'NOUN')])],
      dtype=object)

In [18]:
# create the featureset from the features column and the separate labels
# the featureset is an ndarray of feature dicts and labels
featuresets = feature_extractor.fit_transform(data, labels)

##### To demonstrate the above mentioned:  And so will this

In [19]:
# peek at the first element only: a `Series` carrying `features`, `cve_id` and `label`
processed_data[:1]

[Series(features=[('The', 'DET'), ('hidden-service', 'ADJ'), ('feature', 'NOUN'), ('in', 'ADP'), ('Tor', 'NOUN'), ('before', 'ADP'), ('<VERSION>', 'NUM'), ('allows', 'VERB'), ('a', 'DET'), ('denial', 'NOUN'), ('of', 'ADP'), ('service', 'NOUN'), ('assertion', 'ADJ'), ('failure', 'NOUN'), ('and', 'CONJ'), ('daemon', 'ADJ'), ('exit', 'NOUN'), ('in', 'ADP'), ('the', 'DET'), ('relay_send_end_cell_from_edge_', 'NOUN'), ('function', 'NOUN'), ('via', 'ADP'), ('a', 'DET'), ('malformed', 'ADJ'), ('BEGIN', 'NOUN'), ('cell', 'NOUN')], cve_id='CVE-2017-0375', label='Tor')]

In [20]:
# create the featureset directly from the list of `Series` objects
# (no separate labels are passed; each Series carries its own `label`)
# the featureset is an ndarray of feature dicts and labels
featuresets = feature_extractor.fit_transform(processed_data)

---

In [21]:
# split the data into train / test set
X_train, X_test, y_train, y_test = train_test_split(
    featuresets, labels,
    test_size = 0.8,  # train on 20 %, test on 80 % (ratio 1:4) to show how well the model generalizes
    random_state=0  # fixed seed, so the split is the same on every run
)

X_train.shape, X_test.shape

((296,), (1188,))

In [22]:
# retrain on the training data only
# (`classifier` was obtained from the full data set above; only the featuresets
# are passed here -- per the comments above they already contain the labels)
classifier.fit(X_train)

<toolkit.transformers.classifiers.NBClassifier at 0x7fd8d29916d8>

In [23]:
# predict for the first five test samples; the displayed result holds three
# scored candidates per sample
# NOTE(review): `fit_predict` is called on TEST data -- confirm that it does not
# refit the classifier on these samples
classifier.fit_predict(X_test[:5], sample=True)

array([[[('RubyGems', 'NOUN'), 0.9846977182986814],
        [('a', 'DET'), 0.8573424028641525],
        [('crafted', 'VERB'), 0.8271826757837611]],

       [[('ImageMagick', 'NOUN'), 0.4429434989650181],
        [('ReadOneJNGImage', 'NOUN'), 0.28698101790223535],
        [('in', 'ADP'), 0.004394492887864462]],

       [[('Kanboard', 'NOUN'), 0.999999854524354],
        [('In', 'ADP'), 0.2946550964779827],
        [('before', 'ADP'), 0.012014610236885015]],

       [[('ImageMagick', 'NOUN'), 0.706884492899105],
        [('The', 'DET'), 0.4348035978245417],
        [('function', 'NOUN'), 0.190282865936357]],

       [[('Linux', 'NOUN'), 0.8592898685105529],
        [('of', 'ADP'), 0.6263347642777072],
        [('The', 'DET'), 0.20409550620737257]]], dtype=object)

In [24]:
# evaluate the classifier on the test set
# NOTE(review): `sample=True, n=3` presumably counts a prediction as correct when
# the label is among the 3 best candidates -- confirm against the toolkit
accuracy = classifier.evaluate(X_test, y_test, sample=True, n=3)

# `accuracy` is a fraction; scale it to percent for display
print("Evaluation accuracy: {:5.3f} %".format(accuracy * 100))

Evaluation accuracy: 86.953 %


In [25]:
# cross validate the classifier on the training part of the data
score = toolkit.transformers.cross_validate(
    classifier,
    X_train,
    y_train,
    shuffle=True,
    n=3,
    sample=True,
)

In [26]:
# show the raw result: per-fold values plus their mean and standard deviation
score

Score(values=array([0.8       , 0.9       , 0.83333333, 0.73333333, 0.8       ,
       0.83333333, 0.82758621, 0.89655172, 0.82758621, 0.96551724]), mean=0.8417241379310344, std=0.0612354636264718)

In [27]:
# report the mean in percent; `std * 200` is two standard deviations in percent
print("Cross validation accuracy: {:5.2f} (+/- {:5.3f}) %".format(score.mean * 100, score.std * 200))

Cross validation accuracy: 84.17 (+/- 12.247) %


## BONUS

Cross validation accuracy visualization

In [28]:
# visualization tools - NOTE: These do not come in the requirements! (Hence the BONUS)
from plotly import graph_objs as go
from plotly.offline import init_notebook_mode, iplot

In [29]:
# initialize plotly for offline use inside the notebook
# NOTE(review): `connected=True` presumably loads plotly.js from the web instead
# of embedding it into the notebook -- confirm against the plotly docs
init_notebook_mode(connected=True)

In [30]:
# repeat the cross validation ten times to collect more data points
# NOTE: this might take a while to compute
x_val_scores = [
    toolkit.transformers.cross_validate(
        classifier,
        X_train,
        y_train,
        shuffle=True,
        n=3,
        sample=True,
    )
    for _repeat in range(10)
]

In [31]:
# Turn the list of Score tuples into a 2-D object array with one row per run
# and the columns (values, mean, std).
# Each tuple mixes an array with two scalars, i.e. the input is ragged; NumPy >= 1.24
# raises ValueError for that unless dtype=object is requested explicitly, which
# is why np.vstack cannot be used here.
x_val_scores = np.array(x_val_scores, dtype=object)

In [32]:
# flatten the per-fold accuracies of all runs into one float array
scores = np.hstack(x_val_scores[:, 0])
# average of the per-run mean accuracies
mean = x_val_scores[:, 1].mean()
# spread of the accuracies themselves; taking the std of the per-run std values
# (column 2) would measure how much the *spread* varies between runs and would
# make the rings drawn below far too narrow
std = scores.std()

# subtract from 1 to get the distance from the middle (perfect accuracy sits at the origin)
scores = 1.0 - scores
mean = 1.0 - mean

# show 2x std, as this is stated above as well
std *= 2

# one evenly spaced angle (in radians) per score, covering the full circle once
rad = np.linspace(0, 2 * np.pi, num=len(scores), endpoint=False)

# polar -> cartesian: the radius is the error, the angle only spreads the points out
x = np.cos(rad) * scores
y = np.sin(rad) * scores

# hover labels show the accuracy again (1 - error), in percent
data_labels = ["{:5.3f} %".format((1 - score) * 100) for score in scores]
score_trace = go.Scatter(x=x,
                         y=y,
                         mode='markers',
                         hovertext=data_labels,
                         hoverinfo='text')

In [33]:
def _circle(radius, opacity, line, fillcolor=None):
    """Build a plotly circle shape of the given radius centred on the origin."""
    shape = {
        'type': 'circle',
        'xref': 'x',
        'yref': 'y',
        'x0': -radius,
        'y0': -radius,
        'x1': radius,
        'y1': radius,
        'opacity': opacity,
    }
    # only filled circles carry a 'fillcolor' entry
    if fillcolor is not None:
        shape['fillcolor'] = fillcolor
    shape['line'] = line
    return shape


cross_validation_layout = {
    'xaxis': {
        'zeroline': False,
    },
    # anchor the y scale to x with ratio 1 so that the circles stay round
    'yaxis': {
        'zeroline': False,
        'scaleanchor': 'x',
        'scaleratio': 1,
    },
    'shapes': [
        # filled red disc with radius `mean`
        _circle(mean, 0.1, {'color': 'red'}, fillcolor='red'),
        # dash-dotted orange rings at `mean + std` and `mean - std`
        _circle(mean + std, 0.8, {'color': 'orange', 'dash': 'dashdot'}),
        _circle(mean - std, 0.8, {'color': 'orange', 'dash': 'dashdot'}),
        # tiny red dot marking the origin
        _circle(0.001, 0.8, {'color': 'red'}, fillcolor='red'),
    ],
}

In [34]:
# assemble the figure from the score markers and the circle layout, then render it inline
fig = dict(data=[score_trace], layout=cross_validation_layout)

iplot(fig, show_link=False)