
(1) log4j is now set up in a useful way πŸ˜€, (2) made start_server.sh less awkward to use by moving the optional argument to the end of the arguments list, (3) some Parser code clean-up/re-org, plus a way to lexicalize (find head words) in existing parse trees.
1 parent 7a01eb6 commit 8a0a81b8465d96161199a63cccb8e47b3ad8f43b @dmnapolitano committed Jul 17, 2013
@@ -8,7 +8,7 @@ The core return type here is a data structure called `ParseTree` which has two m
* `tree`: A string representing your parse tree (or, quite optionally, parse treeS; keep reading).
* `score`: A double representing the score for that parse.
-In order to get these `ParseTree` objects, you have two choices, depending on whether or not you'd like Stanford's tokenizer to do some of the work for you. The arguments are supplied in both Python and Java terms for ease of understanding, but again, see the clients if you're confused. Keep reading for more information on the `outputFormat` parameter to each of these methods.
+In order to get these `ParseTree` objects, you have three choices, depending on whether or not you'd like Stanford's tokenizer to do some of the work for you. The arguments are supplied in both Python and Java terms for ease of understanding, but again, see the clients if you're confused. Keep reading for more information on the `outputFormat` parameter to each of these methods.
* `parse_text(text, outputFormat)` where `text` is a Java `String`/Python `str` or `unicode`, `outputFormat` is a Java `List<String>`/Python list containing `str`/`unicode`.
Returns: Java `List<ParseTree>`/Python list containing `ParseTree` objects.
@@ -22,6 +22,7 @@ In order to get these `ParseTree` objects, you have two choices, depending on wh
Returns: A `ParseTree` object.
Given a single Penn Treebank part-of-speech-tagged sentence from the tokenizer and tagger combination of your choice, have Stanford generate a parse tree based on those tags.
+If you already have a `ParseTree` or a `String`/`str`/`unicode` that represents a valid parse tree, and you'd like to find the head words for each phrase, you can call `lexicalize_parse_tree(tree)`, where `tree` is a Java `String`/Python `str`/`unicode`. This method returns a `String` (etc.) identical to what you would get by generating a parse tree with any of the methods above and specifying `-outputFormatOptions lexicalize` in the `outputFormat` argument. Please note that if you pass in a tree that is already lexicalized, CoreNLP will simply re-lexicalize it, resulting in duplicate head-word information. Whatever format `tree` was in when you called this function will also be the format of your output; only the tree itself will be annotated with head-word information. See the sketch below.
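As a quick illustration, here is a minimal sketch of calling the new method from the generated Python client. The `client` object and the example tree string are assumptions for illustration, not part of this commit; the connection boilerplate is sketched after the example-client diff further below.

```python
# Assumes `client` is an already-connected StanfordCoreNLP Thrift client.
tree = "(ROOT (S (NP (DT The) (NN dog)) (VP (VBZ barks))))"  # hypothetical input tree

# Returns the same tree, annotated with a head word for each phrase,
# in whatever format the input tree was in.
lexicalized = client.lexicalize_parse_tree(tree)
print lexicalized
```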
##### What one can do with the `outputFormat` argument to both of these methods
@@ -33,6 +33,7 @@ service StanfordCoreNLP
list<ParseTree> parse_text(1:string text, 2:list<string> outputFormat),
ParseTree parse_tokens(1:list<string> tokens, 2:list<string> outputFormat),
ParseTree parse_tagged_sentence(1:string taggedSentence, 2:list<string> outputFormat, 3:string divider),
+ string lexicalize_parse_tree(1:string tree),
list<NamedEntity> get_entities_from_text(1:string text),
list<NamedEntity> get_entities_from_tokens(1:list<string> tokens),
list<NamedEntity> get_entities_from_trees(1:list<string> trees),
@@ -28,6 +28,7 @@ if len(sys.argv) <= 1 or sys.argv[1] == '--help':
print ' parse_text(string text, outputFormat)'
print ' ParseTree parse_tokens( tokens, outputFormat)'
print ' ParseTree parse_tagged_sentence(string taggedSentence, outputFormat, string divider)'
+ print ' string lexicalize_parse_tree(string tree)'
print ' get_entities_from_text(string text)'
print ' get_entities_from_tokens( tokens)'
print ' get_entities_from_trees( trees)'
@@ -120,6 +121,12 @@ elif cmd == 'parse_tagged_sentence':
sys.exit(1)
pp.pprint(client.parse_tagged_sentence(args[0],eval(args[1]),args[2],))
+elif cmd == 'lexicalize_parse_tree':
+ if len(args) != 1:
+ print 'lexicalize_parse_tree requires 1 args'
+ sys.exit(1)
+ pp.pprint(client.lexicalize_parse_tree(args[0],))
+
elif cmd == 'get_entities_from_text':
if len(args) != 1:
print 'get_entities_from_text requires 1 args'
@@ -49,6 +49,13 @@ def parse_tagged_sentence(self, taggedSentence, outputFormat, divider):
"""
pass
+ def lexicalize_parse_tree(self, tree):
+ """
+ Parameters:
+ - tree
+ """
+ pass
+
def get_entities_from_text(self, text):
"""
Parameters:
@@ -265,6 +272,36 @@ def recv_parse_tagged_sentence(self, ):
return result.success
raise TApplicationException(TApplicationException.MISSING_RESULT, "parse_tagged_sentence failed: unknown result");
+ def lexicalize_parse_tree(self, tree):
+ """
+ Parameters:
+ - tree
+ """
+ self.send_lexicalize_parse_tree(tree)
+ return self.recv_lexicalize_parse_tree()
+
+ def send_lexicalize_parse_tree(self, tree):
+ self._oprot.writeMessageBegin('lexicalize_parse_tree', TMessageType.CALL, self._seqid)
+ args = lexicalize_parse_tree_args()
+ args.tree = tree
+ args.write(self._oprot)
+ self._oprot.writeMessageEnd()
+ self._oprot.trans.flush()
+
+ def recv_lexicalize_parse_tree(self, ):
+ (fname, mtype, rseqid) = self._iprot.readMessageBegin()
+ if mtype == TMessageType.EXCEPTION:
+ x = TApplicationException()
+ x.read(self._iprot)
+ self._iprot.readMessageEnd()
+ raise x
+ result = lexicalize_parse_tree_result()
+ result.read(self._iprot)
+ self._iprot.readMessageEnd()
+ if result.success is not None:
+ return result.success
+ raise TApplicationException(TApplicationException.MISSING_RESULT, "lexicalize_parse_tree failed: unknown result");
+
def get_entities_from_text(self, text):
"""
Parameters:
@@ -607,6 +644,7 @@ def __init__(self, handler):
self._processMap["parse_text"] = Processor.process_parse_text
self._processMap["parse_tokens"] = Processor.process_parse_tokens
self._processMap["parse_tagged_sentence"] = Processor.process_parse_tagged_sentence
+ self._processMap["lexicalize_parse_tree"] = Processor.process_lexicalize_parse_tree
self._processMap["get_entities_from_text"] = Processor.process_get_entities_from_text
self._processMap["get_entities_from_tokens"] = Processor.process_get_entities_from_tokens
self._processMap["get_entities_from_trees"] = Processor.process_get_entities_from_trees
@@ -685,6 +723,17 @@ def process_parse_tagged_sentence(self, seqid, iprot, oprot):
oprot.writeMessageEnd()
oprot.trans.flush()
+ def process_lexicalize_parse_tree(self, seqid, iprot, oprot):
+ args = lexicalize_parse_tree_args()
+ args.read(iprot)
+ iprot.readMessageEnd()
+ result = lexicalize_parse_tree_result()
+ result.success = self._handler.lexicalize_parse_tree(args.tree)
+ oprot.writeMessageBegin("lexicalize_parse_tree", TMessageType.REPLY, seqid)
+ result.write(oprot)
+ oprot.writeMessageEnd()
+ oprot.trans.flush()
+
def process_get_entities_from_text(self, seqid, iprot, oprot):
args = get_entities_from_text_args()
args.read(iprot)
@@ -1492,6 +1541,149 @@ def __ne__(self, other):
return not (self == other)
+class lexicalize_parse_tree_args(object):
+ """
+ Attributes:
+ - tree
+ """
+
+ __slots__ = [
+ 'tree',
+ ]
+
+ thrift_spec = (
+ None, # 0
+ (1, TType.STRING, 'tree', None, None, ), # 1
+ )
+
+ def __init__(self, tree=None,):
+ self.tree = tree
+
+ def read(self, iprot):
+ if iprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None and fastbinary is not None:
+ fastbinary.decode_binary(self, iprot.trans, (self.__class__, self.thrift_spec))
+ return
+ iprot.readStructBegin()
+ while True:
+ (fname, ftype, fid) = iprot.readFieldBegin()
+ if ftype == TType.STOP:
+ break
+ if fid == 1:
+ if ftype == TType.STRING:
+ self.tree = iprot.readString().decode('utf-8')
+ else:
+ iprot.skip(ftype)
+ else:
+ iprot.skip(ftype)
+ iprot.readFieldEnd()
+ iprot.readStructEnd()
+
+ def write(self, oprot):
+ if oprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and self.thrift_spec is not None and fastbinary is not None:
+ oprot.trans.write(fastbinary.encode_binary(self, (self.__class__, self.thrift_spec)))
+ return
+ oprot.writeStructBegin('lexicalize_parse_tree_args')
+ if self.tree is not None:
+ oprot.writeFieldBegin('tree', TType.STRING, 1)
+ oprot.writeString(self.tree.encode('utf-8'))
+ oprot.writeFieldEnd()
+ oprot.writeFieldStop()
+ oprot.writeStructEnd()
+
+ def validate(self):
+ return
+
+
+ def __repr__(self):
+ L = ['%s=%r' % (key, getattr(self, key))
+ for key in self.__slots__]
+ return '%s(%s)' % (self.__class__.__name__, ', '.join(L))
+
+ def __eq__(self, other):
+ if not isinstance(other, self.__class__):
+ return False
+ for attr in self.__slots__:
+ my_val = getattr(self, attr)
+ other_val = getattr(other, attr)
+ if my_val != other_val:
+ return False
+ return True
+
+ def __ne__(self, other):
+ return not (self == other)
+
+
+class lexicalize_parse_tree_result(object):
+ """
+ Attributes:
+ - success
+ """
+
+ __slots__ = [
+ 'success',
+ ]
+
+ thrift_spec = (
+ (0, TType.STRING, 'success', None, None, ), # 0
+ )
+
+ def __init__(self, success=None,):
+ self.success = success
+
+ def read(self, iprot):
+ if iprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None and fastbinary is not None:
+ fastbinary.decode_binary(self, iprot.trans, (self.__class__, self.thrift_spec))
+ return
+ iprot.readStructBegin()
+ while True:
+ (fname, ftype, fid) = iprot.readFieldBegin()
+ if ftype == TType.STOP:
+ break
+ if fid == 0:
+ if ftype == TType.STRING:
+ self.success = iprot.readString().decode('utf-8')
+ else:
+ iprot.skip(ftype)
+ else:
+ iprot.skip(ftype)
+ iprot.readFieldEnd()
+ iprot.readStructEnd()
+
+ def write(self, oprot):
+ if oprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and self.thrift_spec is not None and fastbinary is not None:
+ oprot.trans.write(fastbinary.encode_binary(self, (self.__class__, self.thrift_spec)))
+ return
+ oprot.writeStructBegin('lexicalize_parse_tree_result')
+ if self.success is not None:
+ oprot.writeFieldBegin('success', TType.STRING, 0)
+ oprot.writeString(self.success.encode('utf-8'))
+ oprot.writeFieldEnd()
+ oprot.writeFieldStop()
+ oprot.writeStructEnd()
+
+ def validate(self):
+ return
+
+
+ def __repr__(self):
+ L = ['%s=%r' % (key, getattr(self, key))
+ for key in self.__slots__]
+ return '%s(%s)' % (self.__class__.__name__, ', '.join(L))
+
+ def __eq__(self, other):
+ if not isinstance(other, self.__class__):
+ return False
+ for attr in self.__slots__:
+ my_val = getattr(self, attr)
+ other_val = getattr(other, attr)
+ if my_val != other_val:
+ return False
+ return True
+
+ def __ne__(self, other):
+ return not (self == other)
+
+
class get_entities_from_text_args(object):
"""
Attributes:
@@ -65,18 +65,20 @@
# This list is for options for how we'd like the output formatted. See README.md for the full list of possible options.
# Note that the DEFAULT is what you would get if you specified "oneline" on the command line, or "None" here.
#outputOptions = ["-outputFormat", "typedDependencies,penn", "-outputFormatOptions", "basicDependencies"]
-#outputOptions = None
-#outputOptions = ["-outputFormat", "oneline"] # Same as specifying "None", as above.
-outputOptions = ["-outputFormat", "oneline,typedDependencies"]
+#outputOptions = []
+outputOptions = ["-outputFormat", "penn"]
+#outputOptions = ["-outputFormat", "oneline,typedDependencies"]
+
-'''
try:
parse_trees = client.parse_text(arbitrary_text, outputOptions)
for result in parse_trees:
sys.stdout.write(result.tree.strip() + " [" + str(result.score) + "]\n")
+ sys.stdout.write(client.lexicalize_parse_tree(result.tree.strip()) + "\n\n")
except Exception as e:
print e
+'''
print
for sentence in tokenized_sentences:
@@ -85,9 +87,11 @@
sys.stdout.write(tree.tree.strip() + " [" + str(tree.score) + "]\n")
except Exception as e:
print e
+'''
print
+'''
for sentence in more_tokenized_sentences:
try:
tree = client.parse_tokens(sentence, outputOptions)
@@ -96,14 +100,16 @@
print e
'''
+'''
try:
tree = client.parse_tokens(weird_sentence, outputOptions)
- sys.stdout.write(tree.tree.strip() + "\n")
+ sys.stdout.write(tree.tree.strip() + "\n\n")
except Exception as e:
print e
+'''
-#tree = client.parse_tagged_sentence(tagged_sentence, outputOptions, "/")
-#sys.stdout.write("\n" + tree.tree.strip() + "\n")
+tree = client.parse_tagged_sentence(tagged_sentence, outputOptions, "/")
+sys.stdout.write("\n" + tree.tree.strip() + "\n")
# All done
transport.close()
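For reference, the connection boilerplate that precedes calls like the ones above follows the standard Thrift Python pattern. A minimal sketch, assuming the server started by `start_server.sh` is listening on localhost:9999 and that the generated bindings are importable as shown (the import path is an assumption; adjust it to wherever the Thrift compiler put the generated module):

```python
from thrift.transport import TSocket, TTransport
from thrift.protocol import TBinaryProtocol
import StanfordCoreNLP  # hypothetical import path for the generated service module

# Open a buffered, binary-protocol connection to the wrapper server.
transport = TTransport.TBufferedTransport(TSocket.TSocket('localhost', 9999))
protocol = TBinaryProtocol.TBinaryProtocol(transport)
client = StanfordCoreNLP.Client(protocol)
transport.open()

# Parse some text, then lexicalize each resulting tree.
for result in client.parse_text("The dog barks.", ["-outputFormat", "oneline"]):
    print client.lexicalize_parse_tree(result.tree.strip())

transport.close()
```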
@@ -4,16 +4,16 @@ MAINDIR=$(dirname $0:A)/../
if [ $# -eq 3 ]; then
PORT=$1
- MODEL=$2
- HEAPSIZE=$3
+ HEAPSIZE=$2
+ MODEL=$3
java -cp $CLASSPATH:$MAINDIR/stanford-corenlp-wrapper.jar -Xmx$HEAPSIZE -XX:-UseGCOverheadLimit StanfordCoreNLPServer $PORT $MODEL
elif [ $# -eq 2 ]; then
PORT=$1
HEAPSIZE=$2
java -cp $CLASSPATH:$MAINDIR/stanford-corenlp-wrapper.jar -Xmx$HEAPSIZE -XX:-UseGCOverheadLimit StanfordCoreNLPServer $PORT
else
- echo "Usage: $(basename $0) <port> [<model>] <heapsize>"
- echo "e.g., $(basename $0) 9999 edu/stanford/nlp/models/lexparser/englishFactored.ser.gz 4G"
+ echo "Usage: $(basename $0) <port> <heapsize> [<model>]"
+ echo "e.g., $(basename $0) 9999 4G edu/stanford/nlp/models/lexparser/englishFactored.ser.gz"
echo "or, $(basename $0) 9999 4G"
echo "Parser model is optional; will use edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz by default."
fi
@@ -77,6 +77,11 @@ public ParseTree parse_tagged_sentence(String taggedSentence, List<String> outpu
}
return parser.parse_tagged_sentence(taggedSentence, outputFormat, divider);
}
+
+ public String lexicalize_parse_tree(String tree)
+ {
+ return parser.lexicalize_parse_tree(tree);
+ }
/* End Stanford Parser methods */
@@ -64,9 +64,9 @@ public static void main(String[] args)
if (args.length < 1 || args.length > 2)
{
System.err.println("Usage: StanfordCoreNLPServer <port> [<path to parser model file>]");
- System.err.println("You only need to specify the full path to a model if you wish to use a model "
+ System.err.println("You only need to specify the full path to a model if you wish to use a model "
+ "other than the English PCFG one.");
- System.err.println("English Factored model path = edu/stanford/nlp/models/lexparser/englishFactored.ser.gz");
+ System.err.println("English Factored model path = edu/stanford/nlp/models/lexparser/englishFactored.ser.gz");
System.exit(2);
}
else if (args.length == 1)
@@ -79,10 +79,12 @@ else if (args.length == 1)
parserModelFile = args[1];
}
+ org.apache.log4j.BasicConfigurator.configure();
+
try
{
handler = new StanfordCoreNLPHandler(parserModelFile);
- processor = new StanfordCoreNLP.Processor(handler);
+ processor = new StanfordCoreNLP.Processor(handler);
Runnable r = new ServerThread(processor, portNum);
new Thread(r).start();
}
@@ -19,6 +19,7 @@
import edu.stanford.nlp.pipeline.ParserAnnotatorUtils;
import edu.stanford.nlp.trees.EnglishGrammaticalStructureFactory;
import edu.stanford.nlp.trees.Tree;
+import edu.stanford.nlp.trees.TreePrint;
import edu.stanford.nlp.util.CoreMap;
public class CoreNLPThriftUtil