In [1]:
import sys

# nvdlib has to be installed (I suggest using a venv for that)
from nvdlib.nvd import NVD

# add the experimental project to the path
sys.path.append("../src")
from toolkit import pipelines
from toolkit.transformers.classifiers import NBClassifier

The most convenient way to integrate the classifier is to use a pre-trained model together with the `extract_features` function defined in the `pipelines` module.

Let's use the simple pre-trained classifier provided in the [examples](/examples) folder. In your application, you can use your own custom classifier or a released classifier trained by us.

### Use cases

#### 1) Predicting for a single description string

In [2]:
# Restore the pre-trained classifier from its checkpoint directory.
clf = NBClassifier.restore(
    checkpoint='export/',
)

# Sanity check: the restored object must be an NBClassifier instance.
assert isinstance(clf, NBClassifier)

In [3]:
# Suppose the only thing at hand is the free-text description of a CVE.
# The literal below is split across lines for readability only; adjacent
# string literals are concatenated into a single string.
description = (
    "Any authenticated user (valid client certificate but without ACL "
    "permissions) could upload a template which contained malicious code "
    "and caused a denial of service via Java deserialization attack. "
    "The fix to properly handle Java deserialization was applied on the "
    "Apache NiFi 1.4.0 release. "
    "Users running a prior 1.x release should upgrade to the appropriate "
    "release."
)

In [4]:
# A complete prediction pipeline is already pre-built in the `pipelines`
# module; it only needs the classifier and the feature hooks to use.
pipeline = pipelines.get_prediction_pipeline(
    feature_hooks=pipelines.FEATURE_HOOKS,
    classifier=clf,
)

In [5]:
# Only the `sample` argument (passed through as `classifier__sample`) has to
# be supplied: we are after the predicted project names, i.e. the candidates
# where prediction == True.
pipeline.fit_predict(
    [description],
    classifier__sample=True,
)

array([[[('NiFi', 'NOUN'), 0.9942779558315172],
        [('a', 'DET'), 0.9791180957409575],
        [('1.4.0', 'NUM'), 0.9745165900735117]]], dtype=object)

#### 2) Multiple description strings

In [6]:
# The input list may hold as many descriptions as the processor can handle;
# here the same description is simply repeated five times.
description_lst = [description for _ in range(5)]

# This time also limit the number of candidates via `classifier__n`.
pipeline.fit_predict(description_lst, classifier__n=1, classifier__sample=True)

array([[[('NiFi', 'NOUN'), 0.9942779558315172]],

       [[('NiFi', 'NOUN'), 0.9942779558315172]],

       [[('NiFi', 'NOUN'), 0.9942779558315172]],

       [[('NiFi', 'NOUN'), 0.9942779558315172]],

       [[('NiFi', 'NOUN'), 0.9942779558315172]]], dtype=object)

#### 3) Prediction from the CVE

Too lazy to parse the CVE and extract the description yourself?
No problem, there is a built-in function for that as well...

In [7]:
# Get a sample CVE from the database: obtain the feed via `NVD.from_recent()`
# and iterate over its CVEs.
feed = NVD.from_recent()
cve_iter = feed.cves()

# Pick the first CVE that has a non-empty description. `next()` is given a
# default so that an exhausted feed does not surface as a bare StopIteration.
cve = next((entry for entry in cve_iter if entry.description), None)
if cve is None:
    # Fail with an explicit message instead of letting later cells break on
    # a missing CVE.
    raise ValueError("no CVE with a description was found in the NVD feed")

In [8]:
# display the description text of the CVE selected above
cve.description

'Open Web Analytics (OWA) before 1.5.7 allows remote attackers to conduct PHP object injection attacks via a crafted serialized object in the owa_event parameter to queue.php.'

In [9]:
# Extract the features straight from the CVE object: only its `description`
# attribute is used, with the same feature hooks as in the pipeline examples.
featureset = pipelines.extract_features(
    feature_hooks=pipelines.FEATURE_HOOKS,
    attributes=['description'],
    data=[cve],
)

In [10]:
# Run the classifier directly on the extracted features; as before, the
# `sample` parameter has to be specified.
clf.fit_predict(
    featureset,
    sample=True,
)

array([[[('OWA', 'NOUN'), 0.9986374889276625],
        [('before', 'ADP'), 0.9942779558315172],
        [('Analytics', 'NOUN'), 0.9823971884715447]]], dtype=object)