// FastNLPProcessor.scala
package org.clulab.processors.fastnlp
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations
import org.clulab.processors.{Document, OpenIEAnnotator}
import org.clulab.struct.GraphMap
import org.clulab.processors.corenlp.CoreNLPUtils
import org.clulab.processors.shallownlp.ShallowNLPProcessor
import edu.stanford.nlp.ling.CoreAnnotations
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation
import edu.stanford.nlp.parser.nndep.DependencyParser
import edu.stanford.nlp.pipeline.Annotation
import edu.stanford.nlp.semgraph.SemanticGraphFactory
import edu.stanford.nlp.trees.GrammaticalStructure
import java.util.Properties
import FastNLPProcessor._
import org.clulab.processors.clu.tokenizer.TokenizerStep
import scala.jdk.CollectionConverters._
/**
* Fast NLP tools
* Extends ShallowNLP with a dependency parser based on the Stanford NN dependency parser
* This means that constituent trees and coreference (which depends on constituent syntax) are not available
* The default setting is to use the Stanford parser with "basic" dependencies
* User: mihais
* Date: 1/4/14
* Last Modified: Update for Scala 2.12: java converters.
*/
/**
 * Fast NLP pipeline: shallow NLP (tokenization, POS, lemmas, optional chunks)
 * plus dependency parsing via Stanford's neural-network dependency parser.
 * Constituent trees and coreference are intentionally unavailable here.
 *
 * @param tokenizerPostProcessor optional extra tokenization step
 * @param internStrings whether to intern token strings (via the inherited `in`)
 * @param withChunks whether to run the shallow chunker
 * @param withRelationExtraction whether to run OpenIE relation extraction
 * @param withDiscourse discourse setting (no longer used by this processor)
 */
class FastNLPProcessor(
  tokenizerPostProcessor:Option[TokenizerStep],
  internStrings:Boolean,
  withChunks:Boolean,
  withRelationExtraction:Boolean,
  withDiscourse:Int)
  extends ShallowNLPProcessor(tokenizerPostProcessor, internStrings, withChunks, withRelationExtraction) with OpenIEAnnotator{

  /** Convenience constructor with no tokenizer post-processing step. */
  def this(internStrings:Boolean = true,
           withChunks:Boolean = true,
           withRelationExtraction:Boolean = false,
           withDiscourse:Int = ShallowNLPProcessor.NO_DISCOURSE) = {
    this(None, internStrings, withChunks, withRelationExtraction, withDiscourse)
  }

  /** Stanford's NN dependency parser, loaded lazily and shared process-wide. */
  lazy val stanfordDepParser: DependencyParser = fetchStanfordParser()

  /**
   * Parses all sentences in `doc`, storing basic and enhanced++ universal
   * dependencies on each sentence. POS tags and lemmas must already be present.
   */
  override def parse(doc:Document): Unit = {
    for (annotation <- basicSanityCheck(doc)) {
      // The parser needs POS tags and lemmas; fail fast if they are missing.
      val firstSentence = doc.sentences.head
      if (firstSentence.tags.isEmpty)
        throw new RuntimeException("ERROR: you have to run the POS tagger before parsing!")
      if (firstSentence.lemmas.isEmpty)
        throw new RuntimeException("ERROR: you have to run the lemmatizer before parsing!")
      parseWithStanford(doc, annotation)
    }
  }

  /** Runs the Stanford NN parser on every CoreNLP sentence and copies the resulting graphs into `doc`. */
  private def parseWithStanford(doc:Document, annotation:Annotation): Unit = {
    val coreSentences = annotation.get(classOf[SentencesAnnotation]).asScala
    for ((coreSentence, sentenceIndex) <- coreSentences.zipWithIndex) {
      // Convert parens to Penn Treebank symbols because this is what the parser has seen in training.
      val tokens = CoreNLPUtils.parensToSymbols(coreSentence.get(classOf[CoreAnnotations.TokensAnnotation]))
      coreSentence.set(classOf[CoreAnnotations.TokensAnnotation], tokens)
      val tokenCount = tokens.size()

      // The actual parsing job.
      val grammaticalStructure = stanfordDepParser.predict(coreSentence)

      // Convert to Stanford's semantic graph representation, once per dependency flavor.
      val basicGraph = SemanticGraphFactory.makeFromTree(grammaticalStructure, SemanticGraphFactory.Mode.BASIC, GrammaticalStructure.Extras.NONE, null)
      val enhancedGraph = SemanticGraphFactory.makeFromTree(grammaticalStructure, SemanticGraphFactory.Mode.ENHANCED_PLUS_PLUS, GrammaticalStructure.Extras.NONE, null)

      // Keep the CoreNLP annotation in sync with the parses we just produced.
      coreSentence.set(classOf[SemanticGraphCoreAnnotations.BasicDependenciesAnnotation], basicGraph)
      coreSentence.set(classOf[SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation], enhancedGraph)

      // Convert to our own directed-graph representation and attach to the matching sentence.
      // NOTE(review): `in` is the string-interning method inherited from the shallow processor.
      doc.sentences(sentenceIndex).setDependencies(GraphMap.UNIVERSAL_BASIC, CoreNLPUtils.toDirectedGraph(basicGraph, in, Some(tokenCount)))
      doc.sentences(sentenceIndex).setDependencies(GraphMap.UNIVERSAL_ENHANCED, CoreNLPUtils.toDirectedGraph(enhancedGraph, in, Some(tokenCount)))
    }
  }

  /** Discourse parsing is no longer supported by this processor; intentionally a no-op. */
  override def discourse(doc:Document): Unit = {
    // no longer used
  }

  /** Semantic role labeling — not implemented yet; intentionally a no-op. */
  override def srl(doc: Document): Unit = {
    // not implemented yet
  }
}
/** Companion object holding the process-wide, lazily-initialized Stanford dependency parser. */
object FastNLPProcessor {
  // Shared singleton; kept as a public var for backward compatibility with existing callers.
  var stanfordDependencyParser:Option[DependencyParser] = None

  /**
   * Returns the shared parser, loading the default model on first use.
   * Synchronized so concurrent processors load the model at most once.
   */
  def fetchStanfordParser():DependencyParser = this.synchronized {
    stanfordDependencyParser.getOrElse {
      val parser = DependencyParser.loadFromModelFile(DependencyParser.DEFAULT_MODEL, new Properties())
      stanfordDependencyParser = Some(parser)
      parser
    }
  }
}