Skip to content

Commit

Permalink
added function to parse parser output
Browse files Browse the repository at this point in the history
  • Loading branch information
dasmith committed Feb 27, 2011
1 parent 9f544a8 commit 72b0db1
Show file tree
Hide file tree
Showing 3 changed files with 93 additions and 20 deletions.
21 changes: 15 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,16 @@

This is a Python wrapper for Stanford University's NLP group's Java-based [CoreNLP tools](http://nlp.stanford.edu/software/corenlp.shtml). It can either be imported as a module or run as a JSON-RPC server. Because it uses many large trained models (requiring 3GB RAM and usually a few minutes loading time), most applications will probably want to run it as a server.

This uses [jsonrpc](http://www.simple-is-better.org/rpc/) and [python-progressbar](http://code.google.com/p/python-progressbar/)
There's not much to this script.

It requires `pexpect`.

This uses [jsonrpc](http://www.simple-is-better.org/rpc/) and [python-progressbar](http://code.google.com/p/python-progressbar/), which are included in this repository.


## Download and Usage

You should have [downloaded](http://nlp.stanford.edu/software/corenlp.shtml#Download) and unpacked the tgz file. Then copy all of the python files from this repository into the `stanford-corenlp-2010-11-12` folder.
You should have [downloaded](http://nlp.stanford.edu/software/corenlp.shtml#Download) and unpacked the tgz file containing Stanford's Core-NLP package. Then copy all of the python files from this repository into the `stanford-corenlp-2010-11-12` folder.

Then, to launch a server:

Expand All @@ -21,11 +25,16 @@ To run a public JSON-RPC server on port 3456.

See `client.py` for an example of how to connect with a client.

## Questions
<!--
## Adding WordNet
I have only tested this on **version 1.0.2** released 2010-11-12.
Download WordNet-3.0 Prolog: http://wordnetcode.princeton.edu/3.0/WNprolog-3.0.tar.gz
-->

## Questions

If you think there may be a problem with this wrapper, first make sure can run the java program:
I have only tested this on **Core NLP tools version 1.0.2** released 2010-11-12.

java -cp stanford-corenlp-2010-11-12.jar:stanford-corenlp-models-2010-11-06.jar:xom-1.2.6.jar:xom.jar:jgraph.jar:jgrapht.jar -Xmx3g edu.stanford.nlp.pipeline.StanfordCoreNLP
If you think there may be a problem with this wrapper, first ensure you can run the Java program:

java -cp stanford-corenlp-2010-11-12.jar:stanford-corenlp-models-2010-11-06.jar:xom-1.2.6.jar:xom.jar:jgraph.jar:jgrapht.jar -Xmx3g edu.stanford.nlp.pipeline.StanfordCoreNLP -props default.properties
2 changes: 1 addition & 1 deletion default.properties
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
annotators = tokenize, ssplit, pos, lemma, ner, dcoref
annotators = tokenize, ssplit, pos, lemma, ner, parse, dcoref

# A true-casing annotator is also available (see below)
#annotators = tokenize, ssplit, pos, lemma, truecase
Expand Down
90 changes: 77 additions & 13 deletions server.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,74 @@
"""
This is a Python interface to Stanford Core NLP tools.
It can be imported as a module or run as a server.
Works with the 2010-11-12 release.
For more details:
https://github.com/dasmith/stanford-corenlp-python
Dustin Smith, 2011
By Dustin Smith, 2011
"""
import pexpect
from simplejson import loads, dumps
import optparse
import sys
import os

import time
import re
import jsonrpc

from progressbar import *


def remove_id(word):
    """Strip the trailing ``-<suffix>`` from a parser token, e.g. 'word-2' -> 'word'.

    Everything from the last hyphen onward is dropped, so hyphenated words
    keep their inner hyphens ('well-known-7' -> 'well-known').  A word with
    no hyphen at all, including the empty string, is returned unchanged.
    """
    # rpartition yields an empty separator when there is no hyphen, which
    # avoids the `cond and word or ...` idiom that breaks on falsy words.
    head, sep, _ = word.rpartition("-")
    return head if sep else word

def parse_parser_results(text):
    """Turn the raw console output of the CoreNLP shell into a list of dicts.

    The output is scanned line by line with a small state machine:

      0  nothing seen yet
      1  just saw a "Sentence #" header; the next line is the sentence text
      2  expecting the "[Text=... ]" token line
      3  skipping the constituency parse tree
      4  reading dependency relations
      5  inside the coreference section (not parsed yet)

    Returns a list with one dict per sentence.  Each dict may contain:
      'text'   -- the sentence text line,
      'words'  -- dict mapping a token's text to a dict of its remaining
                  attributes; a token whose text repeats within the sentence
                  overwrites the earlier entry,
      'tuples' -- list of (relation, left, right) dependency triples with
                  the trailing "-<index>" removed from each element.

    Raises Exception if the line following the sentence text is not a
    "[Text=" token line.
    """
    # One bracketed token description, e.g. "[Text=dog PartOfSpeech=NN]".
    # Anything up to the closing bracket is accepted so tokens containing
    # punctuation are not silently dropped.
    token_pattern = re.compile(r'\[([^\]]+)\]')
    # Separators of a dependency entry "rel(left-1, right-2".
    dependency_separator = re.compile(r"\(|, ")

    state = 0
    tmp = {}
    results = []
    # splitlines() copes with both "\n" and the "\r\n" endings a pty emits.
    for line in text.splitlines():
        if line.startswith("Sentence #"):
            state = 1
            # A new header closes the sentence collected so far.
            if tmp:
                results.append(tmp)
                tmp = {}
        elif state == 1:
            tmp['text'] = line
            state = 2
        elif state == 2:
            if not line.startswith("[Text="):
                raise Exception("Parse error: unexpected line %r" % (line,))
            tmp['words'] = {}
            for s in token_pattern.findall(line):
                av = re.split("=| ", s)  # flat [attr, value, attr, value, ...]
                # av[0] is "Text" and av[1] the word itself; the rest are
                # paired up as attribute -> value.
                tmp['words'][av[1]] = dict(zip(av[2::2], av[3::2]))
            state = 3
        elif state == 3:
            # Skip over the parse tree: it starts with "(ROOT" and continues
            # on indented lines.  The first other line ends it.
            if not (line.startswith(" ") or line.startswith("(ROOT")):
                state = 4
                tmp['tuples'] = []
        # Deliberately `if`, not `elif`: the line that ended the parse tree
        # is itself examined as a possible dependency entry.
        if state == 4:
            stripped = line.rstrip()
            if not line.startswith(" ") and stripped.endswith(")"):
                # Drop only the closing parenthesis, then split into
                # relation, governor and dependent.
                split_entry = dependency_separator.split(stripped[:-1])
                if len(split_entry) == 3:
                    rel, left, right = [remove_id(part) for part in split_entry]
                    tmp['tuples'].append((rel, left, right))
            elif "Coreference links" in line:
                state = 5
        # state 5: coreference links are not yet implemented; their lines
        # are ignored until the next "Sentence #" header.
    if tmp:
        results.append(tmp)
    return results

class StanfordCoreNLP(object):

def __init__(self):
Expand Down Expand Up @@ -58,10 +110,26 @@ def __init__(self):
self._server.expect("Entering interactive shell.")
pbar.finish()
print self._server.before

def parse(self, text):
    """
    Send `text` to the Stanford CoreNLP process and return its analysis.

    The text is written to the interactive shell held in `self._server`,
    output is collected for up to two seconds, and the collected output is
    handed to `parse_parser_results`.  The return value is a JSON string
    encoding a list with one dictionary per parsed sentence.
    """
    print("Request %s" % text)
    self._server.sendline(text)
    deadline = time.time() + 2
    chunks = []
    while True:
        remaining = deadline - time.time()
        if remaining <= 0:
            break
        try:
            # Never wait past the deadline for more output.
            chunks.append(self._server.read_nonblocking(2000, remaining))
        except pexpect.TIMEOUT:
            # Nothing more arrived in the time left: use what we have.
            break
        except pexpect.EOF:
            # The child closed its output; nothing further can be read.
            break
    return dumps(parse_parser_results("".join(chunks)))


if __name__ == '__main__':
Expand All @@ -73,15 +141,11 @@ def parse(self, text):
'-H', '--host', default='127.0.0.1',
help='Host to serve on (default localhost; 0.0.0.0 to make public)')
options, args = parser.parse_args()
parser.print_help()
#parser.print_help()
server = jsonrpc.Server(jsonrpc.JsonRpc20(),
jsonrpc.TransportTcpIp(addr=(options.host, int(options.port))))
corenlp = StanfordCoreNLP()
server.register_function(corenlp.parse)
#server.register_instance(StanfordCoreNLP())
print 'Serving on http://%s:%s' % (options.host, options.port)
server.serve()




0 comments on commit 72b0db1

Please sign in to comment.