In [1]:
import re
import sys

import nltk
import numpy as np

from sklearn.pipeline import Pipeline

# nvdlib has to be installed (I suggest using a virtual environment for that)
from nvdlib.nvd import NVD

# add the experimental project to the path
sys.path.append("../src")
import toolkit  # import the toolkit

### Get the NVD Feed

In [2]:
# get the NVD feed restricted to the 2017 feed name
feed = NVD.from_feeds(feed_names=[2017])
# optional update step, left disabled:
# feed.update()

# feed.cves() hands back a generator of CVE entries
cves = feed.cves()

# Materialise the generator into the DATA list. The pipeline itself does not
# care whether it receives an iterator, but a list can be consumed more than
# once, which is useful when DATA is reused further down in the notebook.
DATA = list(cves)

## Training Example

**NOTE:**
Part of the pipeline consists of preprocessors; these were presented in [example_preprocessing_pipeline.ipynb](https://github.com/CermakM/fabric8-analytics-POCs/blob/codebase/cves/toolkit/examples/example_preprocessing_pipeline.ipynb).

In [3]:
# NOTE(review): presumably forgets Hook instances registered by earlier runs so
# that the 'label_hook' key created in the next cell can be registered again
# when the notebook is re-executed -- confirm against toolkit.transformers.Hook
toolkit.transformers.Hook.clear_current_instances()

In [4]:
# Three preprocessing stages chained in an sklearn Pipeline, in this order:
#   'nvd_feed_preprocessor' -> 'label_preprocessor' -> 'nltk_preprocessor'
# The step names matter: they are the prefixes used to route fit parameters.
preprocessing_pipeline = Pipeline(steps=[
    ('nvd_feed_preprocessor',
     toolkit.preprocessing.NVDFeedPreprocessor(attributes=['description'])),
    ('label_preprocessor',
     toolkit.preprocessing.LabelPreprocessor(
         feed_attributes=['project', 'description'],
         output_attributes=['description'],
         # the hook is registered under the key 'label_hook'
         hook=toolkit.transformers.Hook(key='label_hook', func=toolkit.utils.find_),
     )),
    ('nltk_preprocessor',
     toolkit.preprocessing.NLTKPreprocessor()),
])

In [5]:
# split the (name, transformer) pairs into a tuple of names and a tuple of transformers
steps, preps = zip(*preprocessing_pipeline.steps)

# Pipeline routes keyword arguments of the form '<step name>__<parameter>' to
# the fit method of the named step; steps[2] is the third step of the pipeline.
fit_params = dict([
    ("%s__feed_attributes" % steps[2], ['description']),
    ("%s__output_attributes" % steps[2], ['label']),
])

processed_data = preprocessing_pipeline.fit_transform(
    X=DATA,
    **fit_params
)

In [6]:
# we can define custom features to be used in the feature extraction process
from toolkit.transformers import feature_hooks

# No custom hooks are defined yet; None is what gets handed to FeatureExtractor
# in the cells below.
# NOTE(review): presumably None makes the extractor fall back to its built-in
# features -- confirm against toolkit.transformers.FeatureExtractor
FEATURE_HOOKS = None  # TODO

In [7]:
# Training pipeline: feature extraction followed by the NBClassifier.
train_pipeline = Pipeline(steps=[
    ('feature_extractor',
     toolkit.transformers.FeatureExtractor(feature_hooks=FEATURE_HOOKS)),
    ('classifier',
     toolkit.transformers.NBClassifier()),
])

In [8]:
# turn the preprocessing output into an ndarray so it can be sliced by column
processed_data = np.array(processed_data)

# first column -> samples, second column -> labels (target values)
data = processed_data[:, 0]
labels = processed_data[:, 1]

In [9]:
# Fit the whole training pipeline. The result is kept as `classifier` because
# the evaluation cells below call .fit / .fit_predict / .evaluate on it.
# NOTE(review): this relies on the final step handing back the fitted
# NBClassifier from fit_transform rather than transformed data -- confirm
classifier = train_pipeline.fit_transform(X=data, y=labels)

## Evaluation example

In [10]:
from sklearn.model_selection import train_test_split, KFold

In [11]:
# NOTE(review): presumably clears the Hook instances left over from the
# training example so the stand-alone FeatureExtractor created next starts
# from a clean registry -- confirm against toolkit.transformers.Hook
toolkit.transformers.Hook.clear_current_instances()

In [12]:
# stand-alone feature extractor, configured with the same hooks as the training pipeline
feature_extractor = toolkit.transformers.FeatureExtractor(feature_hooks=FEATURE_HOOKS)

In [13]:
# build the feature sets with the stand-alone extractor created above
# (the result is an ndarray of feature dicts and labels)
featuresets = feature_extractor.fit_transform(X=data, y=labels)
# display the shape as the cell output
featuresets.shape

(1475,)

In [14]:
# split the data into train / test set
X_train, X_test, y_train, y_test = train_test_split(
    featuresets, labels,
    test_size = 0.8,  # hold out 80 % for testing (train:test = 1:4) to show how well the model generalizes
    random_state=0  # fixed seed so the split is reproducible
)

# display both shapes as the cell output
X_train.shape, X_test.shape

((295,), (1180,))

In [15]:
# retrain on the training data only
# No separate `y` is passed: X_train was split from `featuresets`, which is
# described above as holding feature dicts together with their labels.
# NOTE(review): presumably fit() reads the labels from those entries -- confirm
classifier.fit(X_train)

<toolkit.transformers.classifiers.NBClassifier at 0x7fdf44672908>

In [16]:
# Predict for the first five test samples; the output lists, per sample,
# three ((token, POS tag), score) candidates.
# NOTE(review): the name fit_predict suggests a refit -- confirm that this call
# does not retrain the classifier on test data before the evaluation below
classifier.fit_predict(X_test[:5], sample=True)

array([[[('before', 'ADP'), 0.9909833492556477],
        [('FenixHosting/fenix-open-source', 'NOUN'), 0.9064562022700459],
        [('forums/search.php', 'NOUN'), 0.21372401803568966]],

       [[('paintballrefjosh/MaNGOSWebV4', 'NOUN'), 0.788403851302702],
        [('id', 'ADJ'), 0.043442325778618854],
        [('inc/admin/template_files/admin.shop.php', 'NOUN'),
         0.03737965734222141]],

       [[('tcpdump', 'NOUN'), 0.9989017515296756],
        [('The', 'DET'), 0.47570496458877365],
        [('parser', 'NOUN'), 0.06491697579798889]],

       [[('an', 'DET'), 0.9992753320700092],
        [('BlogoText', 'NOUN'), 0.9832626562781991],
        [('through', 'ADP'), 0.9189169424083068]],

       [[('tcpdump', 'NOUN'), 0.9989017515296756],
        [('The', 'DET'), 0.47570496458877365],
        [('parser', 'NOUN'), 0.06491697579798889]]], dtype=object)

In [17]:
# evaluate the classifier on the test set
# NOTE(review): n=3 presumably counts a prediction as correct when the label is
# among the three best candidates -- confirm against NBClassifier.evaluate
accuracy = classifier.evaluate(X_test, y_test, sample=True, n=3)

# accuracy is a fraction; scale it to percent for display
print("Evaluation accuracy: {:5.3f} %".format(accuracy * 100))

Evaluation accuracy: 87.203 %


In [18]:
# cross-validate the classifier on the training split only
score = toolkit.transformers.cross_validate(
    classifier,
    X_train,
    y_train,
    shuffle=True,
    n=3,
    sample=True,
)

In [19]:
# display the Score tuple: per-fold values plus their mean and std
score

Score(values=array([0.83333333, 0.93333333, 0.8       , 0.83333333, 0.93333333,
       0.75862069, 0.86206897, 0.86206897, 0.96551724, 0.79310345]), mean=0.8574712643678162, std=0.06450519002553116)

In [20]:
# mean is scaled to percent (* 100); the band is two standard deviations wide
# on each side, hence std * 2 * 100 == std * 200
print("Cross validation accuracy: {:5.2f} (+/- {:5.3f}) %".format(score.mean * 100, score.std * 200))

Cross validation accuracy: 85.75 (+/- 12.901) %


## BONUS

Cross validation accuracy visualization

In [21]:
# visualization tools - NOTE: These do not come in the requirements! (Hence the BONUS)
from plotly import graph_objs as go
from plotly.offline import init_notebook_mode, iplot

In [22]:
# initialize plotly for inline (offline) rendering in the notebook;
# connected=True loads plotly.js from an online CDN instead of embedding
# the library in the notebook, so rendering requires network access
init_notebook_mode(connected=True)

In [23]:
# repeat the cross validation ten times to collect more data points
# NOTE: this might take a while to compute
x_val_scores = [
    toolkit.transformers.cross_validate(
        classifier,
        X_train,
        y_train,
        shuffle=True,
        n=3,
        sample=True,
    )
    for _ in range(10)
]

In [24]:
# Pack the cross-validation results into a (runs, 3) object array whose columns
# are [per-fold values, mean, std].
#
# np.vstack has to convert every ragged (array, float, float) tuple into an
# ndarray first, which NumPy >= 1.24 rejects with a ValueError, so the object
# array is allocated explicitly and filled cell by cell instead.
# Rows are unpacked positionally (not read by attribute), which keeps the cell
# safe to re-run on the already packed array.
_packed_scores = np.empty((len(x_val_scores), 3), dtype=object)
for _run, (_fold_values, _run_mean, _run_std) in enumerate(x_val_scores):
    _packed_scores[_run, 0] = _fold_values
    _packed_scores[_run, 1] = _run_mean
    _packed_scores[_run, 2] = _run_std

x_val_scores = _packed_scores

In [25]:
# pool the per-fold accuracies of all cross-validation runs into one flat array
scores = np.hstack(x_val_scores[:, 0])
# average of the per-run means
mean = x_val_scores[:, 1].mean()
# Spread of the accuracies themselves. Taking .std() of the per-run std column
# (as was done before) measures how much the std *estimates* vary between runs,
# not how much the scores vary, so the dashed bands were drawn at the wrong radius.
std = scores.std()

# subtract from 1 to get the distance from the middle (0 == perfect accuracy)
scores = 1.0 - scores
mean = 1.0 - mean

# show 2x std, as this is stated above as well
std *= 2

# one angle per score, evenly spread over the full circle (end point excluded
# so the first and the last score do not overlap)
rad = np.linspace(0, 2 * np.pi, num=len(scores), endpoint=False)

# polar -> cartesian: the radius is the error, the angle only spreads the points
x = np.cos(rad) * scores
y = np.sin(rad) * scores

# hover labels show the accuracy again, hence the conversion back from the error
data_labels = ["{:5.3f} %".format((1 - error) * 100) for error in scores]
score_trace = go.Scatter(x=x,
                         y=y,
                         mode='markers',
                         hovertext=data_labels,
                         hoverinfo='text')

In [26]:
def _centered_circle(radius, **style):
    """Build a plotly circle shape of the given radius centred on the origin.

    The circle is described by its bounding box in data coordinates
    (x0, y0) - (x1, y1). Keyword arguments are added to the shape dict
    unchanged (opacity, fillcolor, line, ...).
    """
    shape = {
        'type': 'circle',
        'xref': 'x',
        'yref': 'y',
        'x0': -radius,
        'y0': -radius,
        'x1': radius,
        'y1': radius,
    }
    shape.update(style)
    return shape


cross_validation_layout = {
    'xaxis': {'zeroline': False},
    # anchor y to x with ratio 1 so that the circles are not drawn as ellipses
    'yaxis': {'zeroline': False, 'scaleanchor': 'x', 'scaleratio': 1},
    'shapes': [
        # filled disc with radius `mean`
        _centered_circle(
            mean,
            opacity=0.1,
            fillcolor='red',
            line={'color': 'red'},
        ),
        # dash-dotted rings at `mean + std` and `mean - std`
        _centered_circle(
            mean + std,
            opacity=0.8,
            line={'color': 'orange', 'dash': 'dashdot'},
        ),
        _centered_circle(
            mean - std,
            opacity=0.8,
            line={'color': 'orange', 'dash': 'dashdot'},
        ),
        # small dot marking the origin
        _centered_circle(
            0.001,
            opacity=0.8,
            fillcolor='red',
            line={'color': 'red'},
        ),
    ],
}

In [27]:
# assemble the figure from the score trace and the circle layout, then render it inline
fig = dict(data=[score_trace], layout=cross_validation_layout)

iplot(fig, show_link=False)