-
Notifications
You must be signed in to change notification settings - Fork 120
/
functions.scala
166 lines (144 loc) · 4.95 KB
/
functions.scala
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
package com.databricks.spark.corenlp
import java.util.Properties
import scala.collection.JavaConverters._
import edu.stanford.nlp.ling.CoreAnnotations
import edu.stanford.nlp.neural.rnn.RNNCoreAnnotations
import edu.stanford.nlp.pipeline.{Annotation, CleanXmlAnnotator, StanfordCoreNLP, TokenizerAnnotator}
import edu.stanford.nlp.pipeline.CoreNLPProtos.Sentiment
import edu.stanford.nlp.sentiment.SentimentCoreAnnotations
import edu.stanford.nlp.simple.{Document, Sentence}
import edu.stanford.nlp.util.Quadruple
import org.apache.spark.sql.functions.udf
/**
 * A collection of Spark SQL UDFs that wrap CoreNLP annotators and simple functions.
 *
 * Every UDF takes a single string column. A null input produces a null result rather than
 * being passed on to CoreNLP.
 *
 * @see [[edu.stanford.nlp.simple]]
 */
object functions {

  /**
   * Sentiment pipeline shared by all invocations of [[sentiment]], built on first access.
   *
   * A `lazy val` is used because its initialization is synchronized: concurrent callers cannot
   * each observe an uninitialized field and build separate pipelines, which an unsynchronized
   * null check on a plain `var` would allow.
   */
  @transient private lazy val sentimentPipeline: StanfordCoreNLP = {
    val props = new Properties()
    props.setProperty("annotators", "tokenize, ssplit, parse, sentiment")
    new StanfordCoreNLP(props)
  }

  /** Returns the shared sentiment pipeline, creating it on the first call. */
  private def getOrCreateSentimentPipeline(): StanfordCoreNLP = sentimentPipeline

  /** One Open IE extraction: a (subject, relation, target) triple and its confidence. */
  private case class OpenIE(subject: String, relation: String, target: String, confidence: Double) {
    /** Builds an extraction from the quadruple form returned by `Sentence#openie`. */
    def this(quadruple: Quadruple[String, String, String, java.lang.Double]) =
      this(quadruple.first, quadruple.second, quadruple.third, quadruple.fourth)
  }

  /** A single mention inside a coref chain: its sentence number, start index and text span. */
  private case class CorefMention(sentNum: Int, startIndex: Int, mention: String)

  /** A coref chain: the span of its representative mention plus the mentions in textual order. */
  private case class CorefChain(representative: String, mentions: Seq[CorefMention])

  /** One dependency edge: source/target words with their token indices, relation and weight. */
  private case class SemanticGraphEdge(
      source: String,
      sourceIndex: Int,
      relation: String,
      target: String,
      targetIndex: Int,
      weight: Double)

  /**
   * Cleans XML tags in a document.
   * The document is tokenized, run through [[CleanXmlAnnotator]], and the remaining token words
   * are joined with single spaces. Returns null for a null document.
   */
  def cleanxml = udf { document: String =>
    Option(document).map { text =>
      val annotation = new Annotation(text)
      // CleanXmlAnnotator operates on tokens, so the tokenizer has to run first.
      new TokenizerAnnotator().annotate(annotation)
      new CleanXmlAnnotator().annotate(annotation)
      annotation.get(classOf[CoreAnnotations.TokensAnnotation])
        .asScala
        .map(_.word())
        .mkString(" ")
    }
  }

  /**
   * Tokenizes a sentence into words.
   * Returns null for a null sentence.
   * @see [[Sentence#words]]
   */
  def tokenize = udf { sentence: String =>
    Option(sentence).map(text => new Sentence(text).words().asScala)
  }

  /**
   * Splits a document into sentences.
   * Returns null for a null document.
   * @see [[Document#sentences]]
   */
  def ssplit = udf { document: String =>
    Option(document).map(text => new Document(text).sentences().asScala.map(_.text()))
  }

  /**
   * Generates the part of speech tags of the sentence.
   * Returns null for a null sentence.
   * @see [[Sentence#posTags]]
   */
  def pos = udf { sentence: String =>
    Option(sentence).map(text => new Sentence(text).posTags().asScala)
  }

  /**
   * Generates the word lemmas of the sentence.
   * Returns null for a null sentence.
   * @see [[Sentence#lemmas]]
   */
  def lemma = udf { sentence: String =>
    Option(sentence).map(text => new Sentence(text).lemmas().asScala)
  }

  /**
   * Generates the named entity tags of the sentence.
   * Returns null for a null sentence.
   * @see [[Sentence#nerTags]]
   */
  def ner = udf { sentence: String =>
    Option(sentence).map(text => new Sentence(text).nerTags().asScala)
  }

  /**
   * Generates the semantic dependencies of the sentence, one struct per edge in the order
   * given by `edgeListSorted`.
   * Returns null for a null sentence.
   * @see [[Sentence#dependencyGraph]]
   */
  def depparse = udf { sentence: String =>
    Option(sentence).map { text =>
      new Sentence(text).dependencyGraph().edgeListSorted().asScala.map { edge =>
        val from = edge.getSource
        val to = edge.getTarget
        SemanticGraphEdge(
          from.word(),
          from.index(),
          edge.getRelation.toString,
          to.word(),
          to.index(),
          edge.getWeight)
      }
    }
  }

  /**
   * Generates the coref chains of the document.
   * Each chain carries the span of its representative mention and its mentions in textual order.
   * Returns null for a null document.
   */
  def coref = udf { document: String =>
    Option(document).map { text =>
      new Document(text).coref().asScala.values.map { chain =>
        val representative = chain.getRepresentativeMention.mentionSpan
        val mentions = chain.getMentionsInTextualOrder.asScala.map { m =>
          CorefMention(m.sentNum, m.startIndex, m.mentionSpan)
        }
        CorefChain(representative, mentions)
      }.toSeq
    }
  }

  /**
   * Generates the Natural Logic notion of polarity for each token in a sentence,
   * returned as "up", "down", or "flat".
   * Returns null for a null sentence.
   * @see [[Sentence#natlogPolarities]]
   */
  def natlog = udf { sentence: String =>
    Option(sentence).map(text => new Sentence(text).natlogPolarities().asScala.map(_.toString))
  }

  /**
   * Generates a list of Open IE triples as flat (subject, relation, target, confidence) quadruples.
   * Returns null for a null sentence.
   * @see [[Sentence#openie]]
   */
  def openie = udf { sentence: String =>
    Option(sentence).map(text => new Sentence(text).openie().asScala.map(q => new OpenIE(q)).toSeq)
  }

  /**
   * Measures the sentiment of an input sentence on a scale of 0 (strong negative) to 4 (strong
   * positive).
   * If the input contains multiple sentences, only the first one is used.
   * Returns null when the input is null or when the pipeline finds no sentence in it (for
   * example an empty string), instead of failing on an empty sentence list.
   * @see [[Sentiment]]
   */
  def sentiment = udf { sentence: String =>
    for {
      text <- Option(sentence)
      annotation = getOrCreateSentimentPipeline().process(text)
      sentences <- Option(annotation.get(classOf[CoreAnnotations.SentencesAnnotation]))
      // headOption instead of head: there may be no sentence at all to score.
      first <- sentences.asScala.headOption
    } yield {
      val tree = first.get(classOf[SentimentCoreAnnotations.SentimentAnnotatedTree])
      RNNCoreAnnotations.getPredictedClass(tree)
    }
  }
}