In [1]:
import sys

# nvdlib has to be installed (I suggest using a venv for that)
from nvdlib.nvd import NVD

# add the experimental project to the path
sys.path.append("../src")
from toolkit import pipelines, utils
from toolkit.pipelines import FEATURE_HOOKS
from toolkit.transformers.classifiers import NBClassifier

A convenient way to integrate the classifier is to use the pre-trained model together with the `extract_features` function defined in the `pipelines` module.

Let's train the classifier (it will be provided in the [examples](/examples) folder). In your application, you can use your own custom classifier or the classifier trained by us.

### Training custom model

*The best way is to use the nvd-toolkit CLI, but there are many ways the training could be implemented, so find the one that suits you.*

__Note:__ This is **going to take a while**, so go grab a tea or coffee in the meantime.

In [2]:
from sklearn.model_selection import train_test_split

# Build the NVD feed object for 2018, update it and collect its CVEs into a list.
feed_years = [2018]
feed = NVD.from_feeds(feed_names=feed_years)
feed.update()
data = [entry for entry in feed.cves()]

# Map every CVE id to its CVE object.
cve_dict = dict((entry.cve_id, entry) for entry in data)

# set up default argument for vendor-product feature hook
hook_defaults = {'cve_dict': cve_dict}
FEATURE_HOOKS.vendor_product_match_hook.default_kwargs = hook_defaults

In [3]:
# Extract the labeled features from the feed; only the first element of the
# returned pair is used for training below.
labeled_features = pipelines.extract_labeled_features(
    data=data,
    nvd_attributes=['cve_id', 'description'],
    nltk_feed_attributes=['description'],
    feature_hooks=FEATURE_HOOKS,
    labeling_func=utils.find_,
)
X_train, _ = labeled_features

# Fit a fresh NBClassifier on the extracted training features.
nb_model = NBClassifier()
classifier = nb_model.fit(X_train)

In [4]:
# Export the trained classifier to the 'export/' directory; the same path is
# used later as the checkpoint to restore the classifier from.
classifier.export(export_dir='export/')

### Prediction filters

In [5]:
# Create prediction filters
from nltk.corpus import stopwords

# English stop words from the NLTK corpus, kept as a set for membership tests.
STOPWORDS = set(stopwords.words('english'))


def stopwords_filter(t):
    """Return True when the word of candidate `t` is not an English stop word.

    `t[0]` is unpacked as a `(word, tag)` pair; the word is lower-cased
    before it is looked up in `STOPWORDS`.
    """
    token, _tag = t[0]
    is_stopword = token.lower() in STOPWORDS
    return not is_stopword

def version_filter(t):
    """Return True unless the word of candidate `t` is the '<VERSION>' placeholder.

    `t[0]` is unpacked as a `(word, tag)` pair; only the word is inspected.
    """
    token, _tag = t[0]
    keep = token != '<VERSION>'
    return keep

def num_tag_filter(t):
    """Return True unless the tag of candidate `t` is 'NUM'.

    `t[0]` is unpacked as a `(word, tag)` pair; only the tag is inspected.
    """
    _token, pos_tag = t[0]
    keep = pos_tag != 'NUM'
    return keep

# Filters handed to the prediction calls through their `filter_hooks` argument.
# NOTE(review): presumably a candidate is kept only if every filter returns
# True — confirm against NBClassifier.
prediction_filters = [stopwords_filter, version_filter, num_tag_filter]

### Use cases

#### 1) predicting for a single description string

In [6]:
# restore the pre-trained classifier from the 'export/' checkpoint
clf = NBClassifier.restore(checkpoint='export/')

# sanity check: `restore` must hand back an NBClassifier instance
assert isinstance(clf, NBClassifier)

In [7]:
# Suppose this CVE description is the only piece of information available.
description = "".join([
    "Any authenticated user (valid client certificate but without ACL permissions) ",
    "could upload a template which contained malicious code and caused a denial ",
    "of service via Java deserialization attack. The fix to properly handle Java ",
    "deserialization was applied on the Apache NiFi 1.4.0 release. ",
    "Users running a prior 1.x release should upgrade to the appropriate release.",
])

In [8]:
from toolkit.transformers import feature_hooks

# Feature hooks that are passed to the prediction pipeline as its
# `feature_hooks` argument.
hooks = [
    feature_hooks.has_uppercase_hook,
    feature_hooks.is_alnum_hook,
    feature_hooks.ver_pos_hook,
    feature_hooks.word_len_hook
]

In [9]:
# there is a whole prediction pipeline pre-built for you; it only needs the
# restored classifier and the feature hooks to apply
pipeline = pipelines.get_prediction_pipeline(
    classifier=clf,
    feature_hooks=hooks
)

In [10]:
# Only the `sample` argument has to be provided; the goal is to predict the
# project names, i.e. the candidates for which the prediction == True.
# The prediction filters are passed along as `filter_hooks`.
pipeline.fit_predict(
    [description],
    classifier__sample=True,
    classifier__filter_hooks=prediction_filters,
)

array([[[('NiFi', 'NOUN'), 0.8583644939473858],
        [('Users', 'NOUN'), 0.7601474942350667],
        [('Apache', 'NOUN'), 0.6644349678248976]]], dtype=object)

#### 2) multiple description strings

In [11]:
# The list may hold as many descriptions as the processor can handle;
# here the same description is simply repeated five times.
description_lst = [description for _ in range(5)]

# let's limit the number of candidates here as well
pipeline.fit_predict(
    description_lst,
    classifier__sample=True,
    classifier__n=1,
    classifier__filter_hooks=prediction_filters,
)

array([[[('NiFi', 'NOUN'), 0.8583644939473858]],

       [[('NiFi', 'NOUN'), 0.8583644939473858]],

       [[('NiFi', 'NOUN'), 0.8583644939473858]],

       [[('NiFi', 'NOUN'), 0.8583644939473858]],

       [[('NiFi', 'NOUN'), 0.8583644939473858]]], dtype=object)

#### 3) Prediction from the CVE (recommended)

This is the recommended, full-stack approach.

In [12]:
# Pick the first CVE that has a non-empty description as the prediction sample.
# `next` with a default replaces a for/break scan, which would silently leave
# `cve` bound to the last entry (or unbound for an empty `data`) when no CVE
# has a description; that case is reported explicitly instead.
cve = next((entry for entry in data if entry.description), None)
if cve is None:
    raise ValueError("no CVE with a description found in `data`")

In [13]:
# display the description of the selected CVE
cve.description

'A remote, unauthenticated attacker may be able to execute code by exploiting a use-after-free defect found in older versions of PHP through injection of crafted data via specific PHP URLs within the context of the J-Web process. Affected releases are Juniper Networks Junos OS: 12.1X46 versions prior to 12.1X46-D67; 12.3 versions prior to 12.3R12-S5; 12.3X48 versions prior to 12.3X48-D35; 14.1 versions prior to 14.1R8-S5, 14.1R9; 14.1X53 versions prior to 14.1X53-D44, 14.1X53-D50; 14.2 versions prior to 14.2R7-S7, 14.2R8; 15.1 versions prior to 15.1R3; 15.1X49 versions prior to 15.1X49-D30; 15.1X53 versions prior to 15.1X53-D70.'

Using the same approach as above

In [14]:
# Gather the extraction settings first, then extract the features of the
# single selected CVE.
extraction_kwargs = dict(
    nvd_attributes=['cve_id', 'description'],
    nltk_feed_attributes=['description'],
    feature_hooks=FEATURE_HOOKS,  # default feature hooks
    labeling_func=utils.find_,
)
featureset = pipelines.extract_features(data=[cve], **extraction_kwargs)

In [15]:
# Predict on the extracted features; the `sample` parameter is specified again,
# at most five candidates are requested and the prediction filters are applied.
# (precision is not important ATM, the pre-trained classifier is very simple)
clf.fit_predict(
    featureset,
    sample=True,
    n=5,
    filter_hooks=prediction_filters,
)

array([[[('Junos', 'NOUN'), 0.9992976406681799],
        [('OS', 'NOUN'), 0.9945769102547221],
        [('Affected', 'NOUN'), 0.8356839359241723],
        [('PHP', 'NOUN'), 0.7991572830147051],
        [('process.', 'NOUN'), 0.42907284697637343]]], dtype=object)