Permalink
Browse files

I think NER is done.

  • Loading branch information...
1 parent c86115c commit f30f59b4a38f2abe0e8fae28e46bf4fd1d7eb9cc @dmnapolitano committed Mar 13, 2013
View
@@ -0,0 +1,17 @@
+How to Get Named Entities from the Stanford Named Entity Recognizer (NER) via this Apache Thrift Server
+=======================================================================================================
+
+## How to Interact with the Methods and Data Structures
+
+The core return type here is a data structure called `NamedEntity` which has four members:
+
+* `entity`: A string containing the actual named entity itself, potentially a multi-word expression if that's what Stanford NER recognized.
+* `tag`: A string containing the tag assigned to this named entity (PERSON, LOCATION, etc.). Should always be upper-case.
+* `startOffset`: All named entities exist in some sentence. This integer represents the starting character offset of this named entity in its sentence.
+* `endOffset`: Like `startOffset`, but gives the character offset of the last character of the named entity in its sentence.
+
+In order to get these `NamedEntity` objects, you have three choices, depending on what kind of data you'd like to recognize named entities in. The return type for ALL of these is a Java `ArrayList`/Python list containing `NamedEntity` objects corresponding to entities recognized across the ENTIRETY of your text, no matter how many sentences, parse trees, etc. were passed in. If you'd like to recognize named entities in:
+
+* arbitrary (potentially several sentences' worth of) untokenized, un-parsed, un-tagged text, and you're cool with CoreNLP handling all of those tasks for you, call `get_entities_from_text(text)`. `text` is a Java `String`/Python `str` or `unicode`.
+* one sentence's worth of tokens (the output of a sentence tokenizer followed by a word tokenizer), call `get_entities_from_tokens(tokens)`, where `tokens` is a Java `List<String>`/Python list containing `str`/`unicode`. Since Stanford NER requires either parse trees or POS-tagged text, the Stanford Parser will be called.
+* one or more parse trees in Stanford Parser's "oneline" output format, call `get_entities_from_trees(trees)`, where `trees` is a Java `List<String>`/Python list containing `str`/`unicode`.
View
@@ -26,6 +26,7 @@ service StanfordCoreNLP
list<ParseTree> parse_text(1:string text, 2:list<string> outputFormat),
ParseTree parse_tokens(1:list<string> tokens, 2:list<string> outputFormat),
oneway void zip(),
- list<NamedEntity> getNamedEntitiesFromText(1:string text),
- list<NamedEntity> getNamedEntitiesFromTrees(1:list<string> trees)
+ list<NamedEntity> get_entities_from_text(1:string text),
+ list<NamedEntity> get_entities_from_tokens(1:list<string> tokens),
+ list<NamedEntity> get_entities_from_trees(1:list<string> trees)
}
@@ -27,8 +27,9 @@ if len(sys.argv) <= 1 or sys.argv[1] == '--help':
print ' parse_text(string text, outputFormat)'
print ' ParseTree parse_tokens( tokens, outputFormat)'
print ' void zip()'
- print ' getNamedEntitiesFromText(string text)'
- print ' getNamedEntitiesFromTrees( trees)'
+ print ' get_entities_from_text(string text)'
+ print ' get_entities_from_tokens( tokens)'
+ print ' get_entities_from_trees( trees)'
print ''
sys.exit(0)
@@ -104,17 +105,23 @@ elif cmd == 'zip':
sys.exit(1)
pp.pprint(client.zip())
-elif cmd == 'getNamedEntitiesFromText':
+elif cmd == 'get_entities_from_text':
if len(args) != 1:
- print 'getNamedEntitiesFromText requires 1 args'
+ print 'get_entities_from_text requires 1 args'
sys.exit(1)
- pp.pprint(client.getNamedEntitiesFromText(args[0],))
+ pp.pprint(client.get_entities_from_text(args[0],))
-elif cmd == 'getNamedEntitiesFromTrees':
+elif cmd == 'get_entities_from_tokens':
if len(args) != 1:
- print 'getNamedEntitiesFromTrees requires 1 args'
+ print 'get_entities_from_tokens requires 1 args'
sys.exit(1)
- pp.pprint(client.getNamedEntitiesFromTrees(eval(args[0]),))
+ pp.pprint(client.get_entities_from_tokens(eval(args[0]),))
+
+elif cmd == 'get_entities_from_trees':
+ if len(args) != 1:
+ print 'get_entities_from_trees requires 1 args'
+ sys.exit(1)
+ pp.pprint(client.get_entities_from_trees(eval(args[0]),))
else:
print 'Unrecognized method %s' % cmd
Oops, something went wrong.

0 comments on commit f30f59b

Please sign in to comment.