In [1]:
import json
import nltk
import requests 
from nltk.tokenize import word_tokenize
from nltk import word_tokenize, pos_tag, ne_chunk
import ssl
import itertools
from string import ascii_lowercase
import re

#### Here is just an example of tokenizer

In [14]:
needDetailsSample = "I need to determine if there is mortars in a known ISIS training camp near Raqqa, Syria"
tokens = word_tokenize(needDetailsSample)
print(tokens)

['I', 'need', 'to', 'determine', 'if', 'there', 'is', 'mortars', 'in', 'a', 'known', 'ISIS', 'training', 'camp', 'near', 'Raqqa', ',', 'Syria']


#### Sample for location stuff

In [16]:
locs = [('Elephant Carcasses', 'IN', 'Nairobi'),
...         ('ISIS Fighters', 'IN', 'Raqqa'),
...         ('Syrian militants', 'IN', 'Mosul'),
...         ('WMD', 'IN', 'Pyongyang'),
...         ('Terrorists', 'IN', 'Raqqa')]

In [17]:
locs

[('Elephant Carcasses', 'IN', 'Nairobi'),
 ('ISIS Fighters', 'IN', 'Raqqa'),
 ('Syrian militants', 'IN', 'Mosul'),
 ('WMD', 'IN', 'Pyongyang'),
 ('Terrorists', 'IN', 'Raqqa')]

In [18]:
query = [e1 for (e1, rel, e2) in locs if e2=='Raqqa']
print(query)

['ISIS Fighters', 'Terrorists']


#### Find out the grammar

In [19]:
nltk.pos_tag(tokens)

[('I', 'PRP'),
 ('need', 'VBP'),
 ('to', 'TO'),
 ('determine', 'VB'),
 ('if', 'IN'),
 ('there', 'EX'),
 ('is', 'VBZ'),
 ('mortars', 'NNS'),
 ('in', 'IN'),
 ('a', 'DT'),
 ('known', 'JJ'),
 ('ISIS', 'NNP'),
 ('training', 'NN'),
 ('camp', 'NN'),
 ('near', 'IN'),
 ('Raqqa', 'NNP'),
 (',', ','),
 ('Syria', 'NNP')]

In [20]:
tokens = nltk.word_tokenize(needDetailsSample)
tagged = nltk.pos_tag(tokens)
nouns = [word for word,pos in tagged \
	if (pos == 'NN' or pos == 'NNP' or pos == 'NNS' or pos == 'NNPS')]
downcased = [x.lower() for x in nouns]
joined = " ".join(downcased).encode('utf-8')
into_string = str(nouns)

In [21]:
nouns

['mortars', 'ISIS', 'training', 'camp', 'Raqqa', 'Syria']

In [22]:
joined

b'mortars isis training camp raqqa syria'

#### This is the function to obtain the OBJECTS that General Dynamics provides

In [57]:
def getGDData():
    all_data = []
    context = ssl._create_unverified_context()
    for c in ascii_lowercase:
        url = "https://gbca-integration-568734290.us-east-1.elb.amazonaws.com/objects/search/" + c
        webURL = urllib.request.urlopen(url, context = context)
        data = webURL.read()
        encoding = webURL.info().get_content_charset('utf-8')
        data = json.loads(data.decode(encoding))
        all_data.extend(data)
    objects = [x['definition'] for x in all_data]
    removeParentheses = [re.sub("[\(\[].*?[\)\]]", "", x) for x in objects]
    removeBrackets = [re.sub("[\{\[].*?[\}\]]", "", x) for x in removeParentheses]
    trimmedObjects = [x.rstrip() for x in removeBrackets]
    trimmedObjects.sort();
    gdSet = list(set(trimmedObjects))
    gdData = [x.lower() for x in gdSet]
    gdData.remove('')
    return gdData

#### This is the function to identify a VERB within the unstructured Need

In [28]:
def identifyVerbs(unprocessedDataObject):
    verbs = ['Assess', 'Associate','Characterize','Convey','Determine','Mensurate','Monitor','Navigate','Orient','Provide','Recognize','Track','Understand','Detect', 'Identify']
    downcasedVerbs = [x.lower() for x in verbs]
    tokens = word_tokenize(unprocessedDataObject['unstructuredNeed'])
    downcasedTokens = [y.lower() for y in tokens]
    identifiedVerbs = [token.capitalize() for token in downcasedTokens if token in downcasedVerbs]
    return identifiedVerbs[0]

#### This is the function to identify an OBJECT within the unstructured Need

In [30]:
def identifyObject(unprocessedDataObject):
#     use finalObjects from above
    unstructuredText = unprocessedDataObject['unstructuredNeed'];
    
    objectsFound= [x for x in finalObjects if re.findall(x, unstructuredText)]
    return objectsFound

#### This is the function to search and find the location within the unstructured Need

In [31]:
def identifyLocation(unprocessedDataObject):
    unstructuredText = unprocessedDataObject['unstructuredNeed'];
    
    numregex = '[-+]?[0-9]{0,3}(?:(?:\.[0-9]+)|(?:[0-9]+))';
    mgrsRegEx = "^\d{1,2}[^ABIOYZabioyz][A-Za-z]{2}([0-9][0-9])+$";
    
    pattern = '(' + numregex + ',\s{1}' + numregex + ')'
    
    tokens = word_tokenize(unprocessedDataObject['unstructuredNeed'])

    regex1 = re.compile(pattern);
    regex2 = re.compile(numregex);
    regex3 = re.compile(mgrsRegEx);
    
    for x in tokens:
        if regex3.findall(x):
            print (x + " MGRS")
        elif regex1.findall(x):
            print (x + " NO MGRS BUT LAT LONG");
        else:
            print ('')
        
        

    mgrsLocation = regex3.search(unstructuredText);
    searchLocation = regex2.search(unstructuredText);
    
    if (mgrsLocation):
        return mgrsLocation.g
    else:
        return searchLocation

## This is the MAIN function which calls the helper functions listed above

In [None]:
#### Get all the Unstructured Needs
import urllib.request
response=urllib.request.urlopen("https://needs-tool-api-dev.dev.dev.east.paas.geointservices.io/need/getAllUnstructuredNeeds").read()
output = response.decode('utf-8')
data = json.loads(output)

#only process the NEW data
unprocessedData= [item for item in data['needs'] if item['status'] != 'Processed']

# go get the GD data and use this list
finalObjects= getGDData();

#begin processing
for unprocessedDataObject in unprocessedData:
    #find the Verb
    unprocessedDataObject['Verb']= identifyVerbs(unprocessedDataObject);
    #find the object
    unprocessedDataObject['Object'] = identifyObject(unprocessedDataObject);
    #find the location
    unprocessedDataObject['Location'] = identifyObject(unprocessedDataObject);

###### This is just a sample of the data object called above

In [147]:
data

{'message': 'Success to get all user need data from the database. ',
 'needs': [{'UnstructuredNeedId': '1',
   'status': 'New',
   'submissionDate': 'Mon Feb 12 2018 15:33:34 GMT+0000 (UTC)',
   'unstructuredNeed': 'I need to detect elephant carcasses near a road in Nairobi at location MX12345678 as soon as possible.',
   'user': '[object Object]'},
  {'UnstructuredNeedId': '3',
   'status': 'New',
   'submissionDate': 'Mon Feb 12 2018 15:36:59 GMT+0000 (UTC)',
   'unstructuredNeed': 'Can you identify whether the truck stuck at 35.9594N, 38.9981E is carrying ISIS soldiers or is affiliated with ISIS in any way possible.',
   'user': '[object Object]'},
  {'UnstructuredNeedId': '2',
   'status': 'New',
   'submissionDate': 'Mon Feb 12 2018 15:34:26 GMT+0000 (UTC)',
   'unstructuredNeed': 'Please determine if there are ISIS encampments near Mosul, Iraq at grid location AB87654321.',
   'user': '[object Object]'},
  {'UnstructuredNeedId': '4',
   'status': 'New',
   'submissionDate': 'Mon 

###### Another sample from above

In [43]:
re.findall('elephant carcasses', testString)

['elephant carcasses']

###### This is a sample of the data object AFTER processing is finished

In [63]:
unprocessedData

[{'Object': ['road', 'elephant carcasses'],
  'UnstructuredNeedId': '1',
  'Verb': 'Detect',
  'email': 'chaddddddd111111122222@chad.com',
  'organization': 'Booz Allen',
  'role': 'customer',
  'status': 'New',
  'submissionDate': 'undefined',
  'unstructuredNeed': 'I need to detect elephant carcasses near a road in Nairobi at location MX12345678 as soon as possible.',
  'userFirstName': 'Chad',
  'userLastName': 'D',
  'username': 'chadd'},
 {'Object': ['camp'],
  'UnstructuredNeedId': '3',
  'Verb': 'Determine',
  'email': 'chaddddddd111111122222@chad.com',
  'organization': 'Booz Allen',
  'role': 'customer',
  'status': 'New',
  'submissionDate': 'undefined',
  'unstructuredNeed': 'Please determine if there are ISIS encampments near Mosul, Iraq at grid location AB87654321.',
  'userFirstName': 'Chad',
  'userLastName': 'D',
  'username': 'chadd'},
 {'Object': [],
  'UnstructuredNeedId': '2',
  'Verb': 'Identify',
  'email': 'chaddddddd111111122222@chad.com',
  'organization': 'Boo

In [206]:
m=identifyLocation(unprocessedData[0]).group(0)
m[m.start(g):m.end(g)]






















AttributeError: 'str' object has no attribute 'start'

In [176]:
identifyLocation(unprocessedData[1])


['Please', 'determine', 'if', 'there', 'are', 'ISIS', 'encampments', 'near', 'Mosul', ',', 'Iraq', 'at', 'grid', 'location', 'AB87654321', '.']
Please
nothing
determine
nothing
if
nothing
there
nothing
are
nothing
ISIS
nothing
encampments
nothing
near
nothing
Mosul
nothing
,
nothing
Iraq
nothing
at
nothing
grid
nothing
location
nothing
AB87654321
nothing
.
nothing


<_sre.SRE_Match object; span=(83, 91), match='87654321'>

In [144]:
identifyLocation(unprocessedData[2]).group(0)

'35.9594'

In [133]:
identifyLocation(unprocessedData[3])

IndexError: list index out of range

In [69]:
re.findall(^[-+]?([1-8]?\d(\.\d+)?|90(\.0+)?),\s*[-+]?(180(\.0+)?|((1[0-7]\d)|([1-9]?\d))(\.\d+)?)$,test)


SyntaxError: invalid syntax (<ipython-input-69-60aa3df4ee52>, line 1)