Skip to content
This repository has been archived by the owner on Feb 14, 2024. It is now read-only.

Commit

Permalink
Add NER to gather stakeholder executives
Browse files Browse the repository at this point in the history
  • Loading branch information
cduhn17 committed Aug 17, 2022
1 parent 5446a42 commit a10d13c
Show file tree
Hide file tree
Showing 3 changed files with 79 additions and 3 deletions.
4 changes: 4 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ def get_version(version_file):
"importlib_resources == 5.4.0",
"matplotlib == 3.3.4",
"mongo-db-from-config@http://github.com/cisagov/mongo-db-from-config/tarball/develop",
"nltk",
"openpyxl",
"pandas == 1.1.5",
"psutil",
Expand All @@ -125,6 +126,9 @@ def get_version(version_file):
"schema == 0.7.5",
"setuptools == 58.1.0",
"shodan ==1.27.0",
"spacy",

"beautifulsoup4",
"sublist3r",
"types-PyYAML == 6.0.4",
"urllib3 == 1.26.7",
Expand Down
3 changes: 2 additions & 1 deletion src/pe_reports/stakeholder/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ class InfoFormExternal(FlaskForm):
# "What is the sub-domain for this stakeholder?" " *comma separate entries"
# )
custExecutives = StringField(
"Who are the executives for this stakeholder? " "*comma separate entries"
"What is the url for the Executives for this stakeholder? "
"*comma separate entries"
)
submit = SubmitField("Submit", render_kw={"onclick": "loading()"})
75 changes: 73 additions & 2 deletions src/pe_reports/stakeholder/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
import psycopg2
import psycopg2.extras
import requests
from nltk import pos_tag, word_tokenize
from bs4 import BeautifulSoup

# cisagov Libraries
from pe_reports.data.config import config
Expand Down Expand Up @@ -523,6 +525,74 @@ def getalluserinfo():
"stakeholder", __name__, template_folder="templates/stakeholder_UI"
)

def getNames(url):
    """Run named-entity recognition over the text of the page at ``url``.

    The page text comes from ``getAbout(url)`` and is handed to the
    module-level ``nlp`` callable (presumably a spaCy pipeline; it is
    defined outside this function).

    Returns:
        A list of ``(label, text)`` tuples, one per entity in ``doc.ents``,
        in the order the entities are reported.
    """
    page_text = getAbout(url)
    parsed = nlp(page_text)
    return [(entity.label_, entity.text) for entity in parsed.ents]


def getAbout(url, timeout=10):
    """Fetch a stakeholder page and return its body text on a single line.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which would block the
            calling web request on an unresponsive site.

    Returns:
        The text content of the page's ``<body>`` with newlines, tabs,
        carriage returns and non-breaking spaces each replaced by a single
        space. An empty string is returned when the document has no
        ``<body>`` element.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    thepage = requests.get(url, timeout=timeout).text

    soup = BeautifulSoup(thepage, "lxml")

    # A response that is not a full HTML document (empty payload, bare
    # fragment) can leave ``soup.body`` as None.
    if soup.body is None:
        return ""

    # One pass instead of four chained ``replace`` calls; every listed
    # character maps to a plain space.
    whitespace_to_space = str.maketrans("\n\t\r\xa0", "    ")

    return soup.body.text.translate(whitespace_to_space)


def theExecs(URL):
    """Gather executive names from the entities found on a page.

    The entities come from ``getNames(URL)`` as ``(label, text)`` tuples.
    A text is kept when all of the following hold:

    * its label is ``PERSON``;
    * it is shorter than 50 characters;
    * it contains none of the symbols in the pattern below;
    * it has at least two whitespace-separated words;
    * it contains no run of two or more spaces;
    * it has not already been collected.

    Args:
        URL: Address of the stakeholder page listing the executives.

    Returns:
        A list of unique name strings in the order they were found. The
        list is empty when ``URL`` is empty or nothing passes the filters.
    """
    # NOTE(review): the earlier NLTK tokenize/tag/chunk pass was removed.
    # Its result was only printed, and it caused a second download of the
    # same page before getNames() fetched it again.

    # An empty form field would otherwise be passed on as a URL to fetch.
    if not URL:
        return []

    # Symbols that should not appear inside a plain personal name.
    regex_pattern = re.compile(r"[@_'’!#\-$%^&*()<>?/\|}{~:]")

    executives = []

    for label, text in getNames(URL):
        if label != "PERSON":
            continue
        if text in executives or len(text) >= 50:
            continue
        if regex_pattern.search(text):
            continue
        # Single tokens are too ambiguous to treat as a person's name.
        if len(text.split()) < 2:
            continue
        # NOTE(review): the previous check demanded several words and then
        # at most one space-separated part, so nothing was ever collected.
        # Presumably the aim was to drop entities glued together across
        # line breaks, which show up as runs of spaces - confirm.
        if "  " in text:
            continue
        executives.append(text)

    return executives


@stakeholder_blueprint.route("/stakeholder", methods=["GET", "POST"])
def stakeholder():
Expand All @@ -540,12 +610,13 @@ def stakeholder():
custDomainAliases = formExternal.custDomainAliases.data.split(",")
custRootDomain = formExternal.custRootDomain.data.split(",")
custRootDomainValue = custRootDomain[0]
custExecutives = formExternal.custExecutives.data.split(",")
custExecutives = formExternal.custExecutives.data
formExternal.cust.data = ""
formExternal.custDomainAliases = ""
formExternal.custRootDomain.data = ""
formExternal.custExecutives.data = ""
allDomain = getAgencies(cust)
allExecutives = list(theExecs(custExecutives))
allSubDomain = getSubdomain(custRootDomainValue)
allValidIP = getallsubdomainIPS(custRootDomainValue)

Expand Down Expand Up @@ -574,7 +645,7 @@ def stakeholder():
custDomainAliases,
custRootDomain,
allValidIP,
custExecutives,
allExecutives,
)

else:
Expand Down

0 comments on commit a10d13c

Please sign in to comment.