/
naive.py
121 lines (92 loc) · 4.14 KB
/
naive.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
"""This module contains basic (naive) package name identifier."""
import re
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from collections import OrderedDict
from cvejob.config import Config
from cvejob import utils
from cvejob.cpe2pkg import (
run_cpe2pkg, PackageNameCandidate, build_cpe2pkg_query
)
class NaivePackageNameIdentifier(object):
"""Naive package name identifier.
All words from the first sentence of a CVE description that are starting with uppercase letter
are considered to be possible package names (minus stop words).
"""
def __init__(self, doc, ecosystem, pkgfile_path, cpe2pkg_path=Config.cpe2pkg_path):
"""Initialize Constructor."""
self.doc = doc
self.ecosystem = ecosystem
self.pkgfile_path = pkgfile_path
self.cpe2pkg_path = cpe2pkg_path
def _get_vendor_product_pairs(self):
"""Get (vendor, product) pairs from the CVE.
:return: a set containing (vendor, product) pairs
"""
result = set()
for cpe in utils.get_cpe(self.doc, cpe_type='application'):
vendor = cpe.get_vendor()[0]
product = cpe.get_product()[0]
result.add((vendor, product))
return result
def _get_candidates_from_description(self):
"""Try to identify possible package names from the description."""
pkg_name_candidates = set()
sentences = sent_tokenize(
utils.get_description_by_lang(self.doc)
)
first_sentence = sentences[0] if sentences else ''
names = self._guess_from_sentence(first_sentence)
pkg_name_candidates.update(set(names))
return pkg_name_candidates
# noinspection PyMethodMayBeStatic
def _guess_from_sentence(self, sentence):
"""Guess possible package name(s) from given description.
Very naive approach. Words starting with uppercase letter
are considered to be possible package names (minus stop words).
Returns a list of possible package names, without duplicates.
"""
stop_words = set()
try:
# Fails when no downloaded stopwords are available.
stop_words.update(stopwords.words('english'))
except LookupError:
# Download stopwords since they are not available.
nltk.download('stopwords')
stop_words.update(stopwords.words('english'))
# modified the logic to include keywords that have capital letter anywhere,
# not necessarily as the first character.
# Also, two words separated by hyphen are also important,
# even when there are no capital letters
regexp = re.compile('[A-Za-z0-9-:]*[A-Z][A-Za-z0-9-:]*|[A-Za-z0-9]+[-][A-Za-z0-9]+')
suspects = regexp.findall(sentence)
results = [x.lower() for x in suspects if x.lower() not in stop_words]
# get rid of duplicates, but keep order
results = list(OrderedDict.fromkeys(results))
return results
def identify(self):
"""Identify possible package name candidates."""
vp_pairs = self._get_vendor_product_pairs()
desc_candidates = self._get_candidates_from_description()
results = []
for vp_pair in vp_pairs:
if self.ecosystem == 'java':
# in java, vendor could help us to narrow down the groupId
vendor = [*vp_pair] + list(desc_candidates)
else:
vendor = [self.ecosystem]
product = [vp_pair[1]] + list(desc_candidates)
results.extend(self._run_cpe2pkg(vendor, product))
return results
def _run_cpe2pkg(self, vendor, product):
"""Run cpe2pkg tool.
:param vendor: list[str], a list of vendor strings
:param product: list[str], a list of product strings
:return: list[PackageNameCandidate], a list of package name candidates
"""
query_str = build_cpe2pkg_query(vendor, product)
output = run_cpe2pkg(query_str, self.pkgfile_path, self.cpe2pkg_path)
return [
PackageNameCandidate.from_cpe2pkg_output(x, self.ecosystem) for x in output if x
]