Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Serialize JSON in a way that doesn't require duplicate calculations of Document hashes #789

Draft
wants to merge 12 commits into
base: master
Choose a base branch
from
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ import org.apache.commons.io.FilenameUtils
import org.clulab.reach.assembly.relations.corpus.{CorpusReader, EventPair}
import org.clulab.odin.Mention
import org.clulab.reach.PaperReader
import org.clulab.reach.mentions._
import org.clulab.reach.mentions.serialization.json._
import org.clulab.reach.mentions.{CorefMention, MentionOps => ImplicitMentionOps}
import org.clulab.reach.mentions.serialization.json.MentionsOps
import org.clulab.utils.Serializer

import com.typesafe.config.ConfigFactory
Expand Down Expand Up @@ -127,9 +127,6 @@ object RunAnnotationEval extends App with LazyLogging {
* Serialize each paper in a directory to json
*/
object SerializePapersToJSON extends App with LazyLogging {

import org.clulab.reach.mentions.serialization.json._

val config = ConfigFactory.load()
val papersDir = new File(config.getString("papersDir"))
val outDir = new File(config.getString("outDir"))
Expand All @@ -150,7 +147,8 @@ object SerializePapersToJSON extends App with LazyLogging {
val mentions = PaperReader.getMentionsFromPaper(paper)
val cms: Seq[CorefMention] = mentions.map(_.toCorefMention)
logger.info(s"extracted ${mentions.size} mentions for $paperID")
cms.saveJSON(outFile, pretty = true)

MentionsOps(cms).saveJSON(outFile, pretty = true)
logger.info(s"saved json to $outFile")
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,17 @@ import org.clulab.processors.Document
import org.clulab.reach.assembly.relations.classifier.AssemblyRelationClassifier
import org.clulab.reach.assembly.sieves.Constraints
import org.clulab.reach.mentions.CorefMention
import org.clulab.reach.mentions.serialization.json.{MentionJSONOps, REACHMentionSeq, JSONSerializer}
import org.clulab.reach.mentions.serialization.json.{JSONSerializer, MentionOps, MentionsOps}
import org.clulab.serialization.json.JSONSerialization
import org.json4s.jackson.JsonMethods._
import org.json4s.JsonDSL._
import org.json4s._

import scala.util.hashing.MurmurHash3._
import com.typesafe.scalalogging.LazyLogging
import org.apache.commons.io.FileUtils.forceMkdir
import ai.lum.common.FileUtils._

import java.io.File


Expand Down Expand Up @@ -44,8 +46,8 @@ case class EventPair(
// the seed (not counted in the length of finalizeHash)
val h0 = stringHash("org.clulab.assembly.TrainingInstance")
// get hashes for each event
val h1 = mix(h0, e1.equivalenceHash)
val h2 = mix(h1, e2.equivalenceHash)
val h1 = mix(h0, MentionOps(e1).equivalenceHash)
val h2 = mix(h1, MentionOps(e2).equivalenceHash)
// is it cross-sentence?
val h3 = mix(h2, isCrossSentence.hashCode)
// the text of the sentences containing the two event mentions
Expand All @@ -66,34 +68,37 @@ case class EventPair(


def jsonAST: JValue = {
val e1EventOps = new EventOps(e1)
val e2EventOps = new EventOps(e2)

// build json
("id" -> this.equivalenceHash) ~
("text" -> this.text) ~
("coref" -> this.coref) ~
// event 1
("e1-id" -> this.e1.id) ~
("e1-label" -> this.e1.eventLabel) ~
("e1-sentence-text" -> this.e1.sentenceText) ~
("e1-id" -> MentionOps(this.e1).id) ~
("e1-label" -> e1EventOps.eventLabel) ~
("e1-sentence-text" -> e1EventOps.sentenceText) ~
("e1-sentence-index" -> this.e1.sentence) ~
("e1-sentence-tokens" -> this.e1.sentenceObj.words.toList) ~
// can be used to highlight event span in annotation UI
("e1-start" -> this.e1.start) ~
("e1-end" -> this.e1.end) ~
("e1-trigger" -> this.e1.trigger.text) ~
("e1-trigger-start" -> this.e1.trigger.start) ~
("e1-trigger-end" -> this.e1.trigger.end) ~
("e1-trigger" -> e1EventOps.trigger.text) ~
("e1-trigger-start" -> e1EventOps.trigger.start) ~
("e1-trigger-end" -> e1EventOps.trigger.end) ~
// event 2
("e2-id" -> this.e2.id) ~
("e2-label" -> this.e2.eventLabel) ~
("e2-sentence-text" -> this.e2.sentenceText) ~
("e2-id" -> MentionOps(this.e2).id) ~
("e2-label" -> e2EventOps.eventLabel) ~
("e2-sentence-text" -> e2EventOps.sentenceText) ~
("e2-sentence-index" -> this.e2.sentence) ~
("e2-sentence-tokens" -> this.e2.sentenceObj.words.toList) ~
// can be used to highlight event span in annotation UI
("e2-start" -> this.e2.start) ~
("e2-end" -> this.e2.end) ~
("e2-trigger" -> this.e2.trigger.text) ~
("e2-trigger-start" -> this.e2.trigger.start) ~
("e2-trigger-end" -> this.e2.trigger.end) ~
("e2-trigger" -> e2EventOps.trigger.text) ~
("e2-trigger-start" -> e2EventOps.trigger.start) ~
("e2-trigger-end" -> e2EventOps.trigger.end) ~
// these will be filled out during annotation
("annotator-id" -> this.annotatorID) ~
("relation" -> this.relation) ~
Expand Down Expand Up @@ -155,7 +160,7 @@ case class Corpus(instances: Seq[EventPair]) extends JSONSerialization {
// for each doc, write doc + mentions to a json file
for ((paperID, cms) <- dmLUT) {
val of = new File(mentionDataDir, s"$paperID-mention-data.json")
of.writeString(cms.json(pretty), java.nio.charset.StandardCharsets.UTF_8)
of.writeString(MentionsOps(cms).json(pretty), java.nio.charset.StandardCharsets.UTF_8)
}
// write event pair info to json file
val epf = new File(corpusDir, s"${Corpus.EVENT_PAIRS}.json")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ object CorpusBuilder extends LazyLogging {
// create training instance
ep = EventPair(Set(m1, m2))
// triggers should not be the same
if ep.e1.trigger != ep.e2.trigger
if new EventOps(ep.e1).trigger != new EventOps(ep.e2).trigger
} yield ep

distinctEventPairs(eps.toSeq)
Expand All @@ -133,7 +133,7 @@ object CorpusBuilder extends LazyLogging {
def distinctEventPairs(eps: Seq[EventPair]): Seq[EventPair] = {
eps.distinct.groupBy(ep =>
// distinct by...
(ep.e1.sentence, ep.e2.trigger, ep.e1.label, ep.e1.text, ep.e2.sentence, ep.e2.trigger, ep.e2.label, ep.e2.text)
(ep.e1.sentence, new EventOps(ep.e2).trigger, ep.e1.label, ep.e1.text, ep.e2.sentence, new EventOps(ep.e2).trigger, ep.e2.label, ep.e2.text)
).values.map(_.head) // get one value for each key
.toSeq
.sortBy{ ep => (ep.doc.id.getOrElse(""), ep.sentenceIndices.head) }
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ package org.clulab.reach.assembly.relations

import org.clulab.odin.Mention
import org.clulab.reach.assembly.sieves.SieveUtils
import org.clulab.reach.mentions.serialization.json.{ CorefMentionOps, JSONSerializer => ReachJsonSerializer }
import org.clulab.reach.mentions._
import org.clulab.reach.mentions.CorefMention
import org.clulab.reach.mentions.serialization.json.JSONSerializer
import com.typesafe.scalalogging.LazyLogging
import scala.collection.GenSeq
import java.io.File
Expand All @@ -12,7 +12,7 @@ import java.io.File
package object corpus extends LazyLogging {

/** Additional attributes and methods for a [[CorefMention]] */
implicit class EventOps(mention: CorefMention) extends CorefMentionOps(mention) {
class EventOps(mention: CorefMention) {
val eventLabel: String = mention.label
val sentenceText: String = mention.sentenceObj.getSentenceText
// NOTE: if the mention is a TB (text-bound mention), the trigger is simply the mention itself (e.g., BioProcess)
Expand All @@ -29,7 +29,7 @@ package object corpus extends LazyLogging {
def datasetLUT(jsonFiles: GenSeq[File]): Map[String, Vector[CorefMention]] = {
val docMentionPairs = jsonFiles.filter(_.getName.endsWith(".json")).map{ f: File =>
logger.debug(s"parsing ${f.getName}")
val cms: Vector[CorefMention] = ReachJsonSerializer.toCorefMentions(f).toVector
val cms: Vector[CorefMention] = JSONSerializer.toCorefMentions(f).toVector
if (cms.nonEmpty) logger.debug(s"successfully parsed ${f.getName}")
val paperID = getPMID(cms.head)
paperID -> cms
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ import com.typesafe.scalalogging.Logger
import org.clulab.odin.Mention
import org.clulab.reach.FriesEntry
import org.clulab.reach.ReachConstants._
import org.clulab.odin.serialization.json._
import org.clulab.odin.serialization.json.MentionOps
import org.json4s.jackson.Serialization
import org.slf4j.LoggerFactory

Expand Down Expand Up @@ -154,7 +154,7 @@ object JsonOutputter {
else {
// "Gene_or_gene_product" is another possibility.
// Also "Family", "Disease", "Simple_chemical"
val json = mention.json(pretty = true)
val json = MentionOps(mention).json(pretty = true)
val message = s"""Unknown event type "$label" in event:\n$json"""
// throw new RuntimeException(message)
logger.warn(message)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,17 @@ package org.clulab.reach.export.indexcards
import java.io.File
import java.util.Date
import java.util.regex.Pattern

import scala.collection.mutable
import scala.collection.mutable.ListBuffer

import com.typesafe.scalalogging.LazyLogging
import org.clulab.odin.Mention
import org.clulab.reach.ReachConstants._
import org.clulab.reach.{FriesEntry, display}
import org.clulab.reach.export.JsonOutputter._
import org.clulab.reach.export.{JsonOutputter, OutputDegrader}
import org.clulab.reach.grounding.KBResolution
import org.clulab.reach.mentions._
import org.clulab.reach.mentions.serialization.json.mentionToJSON
import org.clulab.reach.mentions.{BioEventMention, CorefMention, Mutant, PTM, MentionOps => ImplicitMentionOps}
import org.clulab.reach.mentions.serialization.json.{JSONSerializer, MentionOps}
import org.clulab.reach.utils.MentionManager
import IndexCardOutput._

Expand Down Expand Up @@ -173,7 +171,7 @@ class IndexCardOutput extends JsonOutputter with LazyLogging {
case "amount" => mkSimpleEventIndexCard(mention, mention.label)
case _ =>
// "conversion" is one example of an eventType not handled.
val json = mentionToJSON(mention, pretty = true)
val json = MentionOps(mention).json(pretty = true)
val message = s"""Event type "$eventType" is not supported for indexcard output:\n$json"""
// throw new RuntimeException(message)
logger.warn(message)
Expand Down Expand Up @@ -211,7 +209,7 @@ class IndexCardOutput extends JsonOutputter with LazyLogging {
case "complex" => Some(new PropMapOrFrameList(mkComplexArgument(derefArg))) // FrameList
case _ => {
// "event" is a typical culprit.
val json = mentionToJSON(arg, pretty = true)
val json = MentionOps(arg).json(pretty = true)
val message = s"""Argument type "$argType" is not supported for indexcard output:\n$json"""
logger.warn(message)
None
Expand Down
Original file line number Diff line number Diff line change
@@ -1,21 +1,22 @@
package org.clulab.reach.export.serial

import java.io.File
import java.io.{File, PrintWriter}
import java.util.Date
import java.util.regex.Pattern

import java.nio.charset.Charset
import java.nio.charset.StandardCharsets.UTF_8

import ai.lum.common.FileUtils._

import com.fasterxml.jackson.databind.ObjectWriter
import com.typesafe.scalalogging.LazyLogging

import org.clulab.odin.Mention
import org.clulab.reach.FriesEntry
import org.clulab.reach.export.JsonOutputter
import org.clulab.reach.mentions._
import org.clulab.reach.mentions.serialization.json._
import org.clulab.reach.mentions.{MentionOps => ImplicitMentionOps}
import org.clulab.reach.mentions.serialization.json.MentionsOps
import org.clulab.serialization.json.stringify
import org.clulab.utils.Closer.AutoCloser
import org.clulab.utils.Sink
import org.json4s.JValue
import org.json4s.jackson.{JsonMethods, prettyJson, renderJValue}

/**
* Defines classes and methods used to output the serial-json output format.
Expand All @@ -28,6 +29,7 @@ class SerialJsonOutput (
encoding: Charset = UTF_8

) extends JsonOutputter with LazyLogging {
val objectWriter = JsonMethods.mapper.writerWithDefaultPrettyPrinter()

/** Returns the given mentions in the serial-json format, as one big string. */
override def toJSON (
Expand All @@ -39,7 +41,7 @@ class SerialJsonOutput (
outFilePrefix:String
): String = {
val mentions = allMentions.map(_.toCorefMention)
mentions.json(true) // true = pretty print
MentionsOps(mentions).json(pretty = true)
}

/**
Expand All @@ -54,15 +56,17 @@ class SerialJsonOutput (
endTime:Date,
outFilePrefix:String
): Unit = {
val f: File = new File(outFilePrefix + ".json")
val mentions = allMentions.map(_.toCorefMention)
val jsonAST = MentionsOps(mentions).jsonAST
// Code here has been modified so that no json string is produced.
// String lengths max out at 2GB, unlike files, and with large inputs
// we were crashing when output could not be stuffed into a string.
val renderedJsonAST = JsonMethods.render(jsonAST)
val file = new File(outFilePrefix + ".json")
val printWriter = new PrintWriter(new Sink(file, encoding.name, append = false))

f.writeString(
string = mentions.json(true),
charset = encoding,
append = false,
gzipSupport = false
)
printWriter.autoClose { printWriter =>
objectWriter.writeValue(printWriter, renderedJsonAST)
}
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ import akka.stream.{ActorMaterializer, Materializer}
import akka.stream.scaladsl._
import akka.util.ByteString

import org.clulab.reach.mentions._
import org.clulab.reach.mentions.serialization.json._
import org.clulab.reach.mentions.{MentionOps => ImplicitMentionOps}
import org.clulab.reach.mentions.serialization.json.MentionsOps
import org.clulab.reach.PaperReader


Expand Down Expand Up @@ -102,7 +102,7 @@ object FileProcessorWebUI extends App with FileUpload {
def processFile(tempFile: File, outputType: String): String = {
val cms = PaperReader.getMentionsFromPaper(tempFile).map(_.toCorefMention)
outputType match {
case JSON => cms.json(false)
case JSON => MentionsOps(cms).json(false)
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ package org.clulab.polarity.ml.data

import java.io.PrintWriter
import java.util.{Calendar, Date}

import com.typesafe.scalalogging.LazyLogging
import org.clulab.polarity.{NegativePolarity, Polarity, PositivePolarity}
import org.clulab.reach.{PaperReader, ReachSystem}
Expand All @@ -11,8 +10,8 @@ import org.clulab.reach.mentions.{BioEventMention, BioMention, CorefEventMention
import scala.collection.mutable.ArrayBuffer
import scala.io.Source
import scala.util.{Failure, Success, Try}
import org.clulab.reach.mentions.serialization.json._
import org.clulab.reach.mentions.{MentionOps => MOps}
import org.clulab.reach.mentions.serialization.json.{JSONSerializer, MentionsOps}
import org.clulab.reach.mentions.{MentionOps => ImplicitMentionOps}
import org.json4s.JsonAST.JValue
import org.json4s.JsonDSL._
import org.json4s._
Expand Down Expand Up @@ -119,7 +118,7 @@ object PolarityDatasetPreprocessor extends App with LazyLogging{
def saveOutput(digestedData: Seq[(BioEventMention, Polarity)], outputPath: String): Unit = {
val (evts, labels) = digestedData.unzip

val jsonEvts = evts.jsonAST
val jsonEvts = MentionsOps(evts).jsonAST


val json =
Expand Down
Loading