In [1]:
import re
import sys

import nltk
import numpy as np

from sklearn.pipeline import Pipeline

# nvdlib has to be installed (using a virtual environment is recommended)
from nvdlib.nvd import NVD

# add the experimental project to the path
sys.path.append("../src")
import toolkit  # import the toolkit

### Get the NVD Feed

In [2]:
# get the NVD feed for the year 2017
feed = NVD.from_feeds(feed_names=[2017])
# optional update step, left disabled here
# feed.update()

# generator of CVE entries provided by the feed
cves = feed.cves()

# Assign the CVEs to the DATA variable and unpack the iterator. The iterator
# itself does not matter to the pipeline, but unpacking it into a list is useful
# because DATA can then be reused throughout the notebook.
DATA = list(cves)

## Training Example

**NOTE:**
Part of the pipeline consists of preprocessors. These were presented in [example_preprocessing_pipeline.ipynb](https://github.com/CermakM/fabric8-analytics-POCs/blob/codebase/cves/toolkit/examples/example_preprocessing_pipeline.ipynb).

In [3]:
# Clear the Hook instances before the pipeline below creates its 'label_hook'.
# NOTE(review): presumably Hook keeps a registry keyed by `key`, so this makes the
# next cell safe to re-run — confirm against toolkit.transformers.Hook
toolkit.transformers.Hook.clear_current_instances()

In [4]:
# Preprocessing pipeline made of three named steps, executed in the order listed.
preprocessing_pipeline = Pipeline([
    # 1) NVD feed preprocessor, configured with the 'description' attribute
    ('nvd_feed_preprocessor',
     toolkit.preprocessing.NVDFeedPreprocessor(attributes=['description'])),
    # 2) label preprocessor: fed with 'project' and 'description', outputs
    #    'description'; its hook is registered under the 'label_hook' key
    ('label_preprocessor',
     toolkit.preprocessing.LabelPreprocessor(
         feed_attributes=['project', 'description'],
         output_attributes=['description'],
         hook=toolkit.transformers.Hook(key='label_hook', func=toolkit.utils.find_),
     )),
    # 3) NLTK preprocessor with its default construction arguments
    ('nltk_preprocessor',
     toolkit.preprocessing.NLTKPreprocessor()),
])

In [5]:
# Split the (name, transformer) pairs of the pipeline into two parallel tuples.
steps, preps = list(zip(*preprocessing_pipeline.steps))

# Fit parameters are routed to a step through the `<step name>__<parameter>` key
# convention; index 2 is the third step declared above ('nltk_preprocessor').
fit_params = {
    "{}__feed_attributes".format(steps[2]): ['description'],
    "{}__output_attributes".format(steps[2]): ['label'],
}
processed_data = preprocessing_pipeline.fit_transform(
    X=DATA,
    **fit_params
)

In [6]:
# Custom features to be used in the feature extraction process can be defined
# here; the value is passed as `feature_hooks` to every FeatureExtractor below.
# NOTE(review): None presumably selects the extractor's default features — confirm
FEATURES = None  # TODO

In [7]:
# Training pipeline: feature extraction followed by the NB classifier.
train_pipeline = Pipeline([
    # turns the preprocessed samples into features, using the FEATURES hooks
    ('feature_extractor',
     toolkit.transformers.FeatureExtractor(feature_hooks=FEATURES)),
    # final estimator of the pipeline
    ('classifier',
     toolkit.transformers.NBClassifier()),
])

In [8]:
# Convert the processed data to an ndarray so that it can be sliced by column:
# column 0 holds the data, column 1 the labels (target values).
processed_data = np.array(processed_data)

data = processed_data[:, 0]
labels = processed_data[:, 1]

In [9]:
# Fit the whole training pipeline on all of the data and labels. The returned
# object is used as the classifier in the cells below (`fit_predict`, `evaluate`).
# NOTE(review): this relies on the final step's fit_transform returning the
# fitted classifier rather than transformed data — confirm in NBClassifier
classifier = train_pipeline.fit_transform(X=data, y=labels)

## Evaluation example

In [10]:
from sklearn.model_selection import train_test_split, KFold

In [11]:
# Clear the Hook instances again before a new FeatureExtractor is instantiated.
# NOTE(review): presumably required because the extractor registers its feature
# hooks on construction — confirm against toolkit.transformers.Hook
toolkit.transformers.Hook.clear_current_instances()

In [12]:
# Stand-alone feature extractor (outside of any pipeline), configured with the
# same FEATURES hooks as the one inside the training pipeline.
feature_extractor = toolkit.transformers.FeatureExtractor(feature_hooks=FEATURES)

In [13]:
# Create the featuresets from the same `data` / `labels` used for training.
# The featureset is an ndarray of feature dicts and labels.
featuresets = feature_extractor.fit_transform(X=data, y=labels)
# display the number of featuresets as the cell output
featuresets.shape

(1469,)

In [14]:
# split the featuresets and labels into train / test sets (seeded for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    featuresets, labels,
    test_size = 0.8,  # 80 % goes to the test set, i.e. train:test = 1:4, to show how well the model generalizes
    random_state=0
)

# display the sizes of both sets as the cell output
X_train.shape, X_test.shape

((293,), (1176,))

In [15]:
# Show the predictions for the first five test samples; for each sample the output
# lists three (token, POS tag) candidates together with a score.
# NOTE(review): `sample=True` semantics are defined by the toolkit classifier — confirm
classifier.fit_predict(X_test[:5], sample=True)

array([[[('ImageMagick', 'NOUN'), 0.9976306868224344],
        [('function', 'NOUN'), 0.28877504889157685],
        [('The', 'DET'), 0.09576235066399115]],

       [[('Converse.js', 'NOUN'), 0.9939529235680334],
        [('0.8.0', 'NUM'), 0.8453332544257908],
        [('1.0.6', 'NUM'), 0.6630256611156655]],

       [[('ImageMagick', 'NOUN'), 0.5071571635360217],
        [('in', 'ADP'), 0.3863607672720366],
        [('Double', 'NOUN'), 0.06519147161392566]],

       [[('tcpdump', 'NOUN'), 0.9999999982583394],
        [('The', 'DET'), 0.028072095530987634],
        [('parser', 'NOUN'), 9.088232431843608e-05]],

       [[('SWFTools', 'NOUN'), 0.9999999374621514],
        [('This', 'DET'), 0.5312469864523239],
        [('function', 'NOUN'), 0.19388337432525607]]], dtype=object)

In [16]:
# evaluate the classifier on the test set
# NOTE(review): `classifier` was fitted on the full `data` / `labels` above and
# X_test is a subset of featuresets extracted from that same data, so unless
# evaluate() refits internally this is not a held-out score — confirm. The much
# lower cross validation accuracy further below is consistent with that.
# NOTE(review): `n=3` presumably is the number of top candidates considered — confirm
accuracy = classifier.evaluate(X_test, y_test, sample=True, n=3)

# report the accuracy as a percentage
print("Evaluation accuracy: {:5.3f} %".format(accuracy * 100))

Evaluation accuracy: 93.452 %


In [17]:
# Cross validate the classifier on the training split only.
score = toolkit.transformers.cross_validate(
    classifier, X_train, y_train,
    shuffle=True, n=3, sample=True,
)

In [18]:
# display the Score result: per-fold values, their mean and standard deviation
score

Score(values=array([0.6       , 0.66666667, 0.63333333, 0.55172414, 0.65517241,
       0.48275862, 0.5862069 , 0.51724138, 0.62068966, 0.75862069]), mean=0.6072413793103448, std=0.07535190704704757)

In [19]:
# Report the mean accuracy in percent; `score.std * 200` is two standard
# deviations expressed in percent (2 * 100).
print("Cross validation accuracy: {:5.2f} (+/- {:5.3f}) %".format(score.mean * 100, score.std * 200))

Cross validation accuracy: 60.72 (+/- 15.070) %


## BONUS

Cross validation accuracy visualization

In [20]:
# visualization tools - NOTE: These do not come in the requirements! (Hence the BONUS)
from plotly import graph_objs as go
from plotly.offline import init_notebook_mode, iplot

In [21]:
# Initialize plotly for rendering figures inside the notebook.
# Per the plotly documentation, connected=True loads plotly.js from an online
# CDN, so an internet connection is needed to display the figure.
init_notebook_mode(connected=True)

In [22]:
# Repeat the cross validation ten times to collect more data points.
# NOTE: this might take a while to compute
x_val_scores = [
    toolkit.transformers.cross_validate(
        classifier, X_train, y_train,
        shuffle=True, n=3, sample=True,
    )
    for _ in range(10)
]

In [23]:
# Stack the cross validation results into a (runs, 3) object array whose columns
# are (values, mean, std).
# Each result mixes an ndarray with two scalars, i.e. it is a ragged sequence.
# Letting np.vstack infer an object dtype for such input is deprecated since
# NumPy 1.19 and raises a ValueError from NumPy 1.24 on, so the object dtype is
# requested explicitly. Re-running the cell on the already stacked array yields
# the same array again.
x_val_scores = np.array([tuple(run) for run in x_val_scores], dtype=object)

In [24]:
# Unpack the stacked results: column 0 holds the per-fold accuracy arrays,
# column 1 the per-run means and column 2 the per-run standard deviations.
scores = np.hstack(x_val_scores[:, 0])
mean = x_val_scores[:, 1].mean()
# Average the per-run standard deviations. Calling `.std()` on this column would
# measure how much the standard deviation itself varies between the runs, which
# is not the accuracy spread that the dashed band of the plot is meant to show.
std = x_val_scores[:, 2].mean()

# subtract from 1 to get the distance from the middle (a perfect score sits in the centre)
scores = 1.0 - scores
mean = 1.0 - mean

# show 2x std, as this is stated above as well
std *= 2

# spread the points evenly around the circle: one angle (in radians) per score,
# dropping the last angle because 360 degrees coincides with 0 degrees
rad = np.linspace(0, 360, num=(len(scores) + 1)) * (np.pi / 180.)
rad = rad[:-1]

# polar -> cartesian coordinates, using the distance from the middle as radius
x = np.cos(rad) * scores
y = np.sin(rad) * scores

# hover labels show the accuracy again, hence the distance is inverted back
data_labels = ["{:5.3f} %".format((1 - distance) * 100) for distance in scores]
score_trace = go.Scatter(x=x,
                         y=y,
                         mode='markers',
                         hovertext=data_labels,
                         hoverinfo='text')

In [25]:
def _circle(radius, opacity, line, fillcolor=None):
    """Describe a circle shape of the given radius centred on the x/y origin."""
    shape = {
        'type': 'circle',
        'xref': 'x',
        'yref': 'y',
        'x0': -radius,
        'y0': -radius,
        'x1': radius,
        'y1': radius,
        'opacity': opacity,
    }
    # 'fillcolor' is only present on the filled circles
    if fillcolor is not None:
        shape['fillcolor'] = fillcolor
    shape['line'] = line
    return shape


# Layout of the cross validation plot: equally scaled axes without zero lines
# and four concentric circles drawn around the origin.
cross_validation_layout = {
    'xaxis': {
        'zeroline': False,
    },
    'yaxis': {
        'zeroline': False,
        'scaleanchor': 'x',
        'scaleratio': 1
    },
    'shapes': [
        # filled disc with radius `mean`
        _circle(mean, 0.1, {'color': 'red'}, fillcolor='red'),
        # dashed outer band border at `mean + std`
        _circle(mean + std, 0.8, {'color': 'orange', 'dash': 'dashdot'}),
        # dashed inner band border at `mean - std`
        _circle(mean - std, 0.8, {'color': 'orange', 'dash': 'dashdot'}),
        # small marker for the centre of the plot
        _circle(0.001, 0.8, {'color': 'red'}, fillcolor='red'),
    ]
}

In [26]:
# Bundle the score trace with the layout and render the cross validation
# accuracy figure inline (without the plotly export link).
fig = dict(data=[score_trace], layout=cross_validation_layout)

iplot(fig, show_link=False)