Merge pull request #247 from clulab/kwalcock/ghana

Recover the data from Ghana

kwalcock committed Mar 21, 2024
2 parents a7df3c5 + da4922d commit 53b25b8
Showing 10 changed files with 659 additions and 14 deletions.
6 changes: 3 additions & 3 deletions belief_pipeline/tpi_main.py
@@ -20,15 +20,15 @@ def get_in_and_out() -> Tuple[str, str]:
     belief_model_name: str = "maxaalexeeva/belief-classifier_mturk_unmarked-trigger_bert-base-cased_2023-4-26-0-34"
     sentiment_model_name: str = "hriaz/finetuned_beliefs_sentiment_classifier_experiment1"
     locations_file_name: str = "./belief_pipeline/GH.tsv"
-    input_file_name: str = "../corpora/ghana-regulations/ghana-regulations.tsv"
-    output_file_name: str = "../corpora/ghana-regulations/ghana-regulations-2.tsv"
+    input_file_name: str = "../corpora/ghana-elasticsearch/ghana-elasticsearch-4.tsv"
+    output_file_name: str = "../corpora/ghana-elasticsearch/ghana-elasticsearch-4a.tsv"
     # input_file_name, output_file_name = get_in_and_out()
     pipeline = Pipeline(
         TpiInputStage(input_file_name),
         [
             TpiResolutionStage(),
             TpiBeliefStage(belief_model_name),
-            # TpiSentimentStage(sentiment_model_name),
+            TpiSentimentStage(sentiment_model_name),
             TpiLocationStage(locations_file_name)
         ],
         PandasOutputStage(output_file_name)
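tpi_main.py now writes ghana-elasticsearch-4a.tsv, which is exactly the file that vector_main.py below reads, so the two scripts run as consecutive steps over the same Ghana corpus. A minimal sketch of how such a staged pipeline composes, assuming each stage exposes read/transform/write methods (the method names here are illustrative assumptions, not the repo's actual API):

class Pipeline:
    # Hypothetical minimal version of the Pipeline used above:
    # one input stage, a list of transform stages, one output stage.
    def __init__(self, input_stage, stages, output_stage):
        self.input_stage = input_stage
        self.stages = stages
        self.output_stage = output_stage

    def run(self):
        data = self.input_stage.read()    # e.g. a DataFrame loaded from the input TSV
        for stage in self.stages:
            data = stage.transform(data)  # each stage adds columns (beliefs, sentiment, locations)
        self.output_stage.write(data)     # the output stage writes the enriched TSV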
5 changes: 3 additions & 2 deletions belief_pipeline/vector_main.py
@@ -15,8 +15,9 @@ def get_in_and_out() -> Tuple[str, str]:

 if __name__ == "__main__":
     vector_model_name: str = "all-MiniLM-L6-v2"
-    input_file_name: str = "../corpora/uganda-local/uganda-2.tsv"
-    output_file_name: str = "../corpora/uganda-local/uganda-2-vectors.tsv"
+    input_file_name: str = "../corpora/ghana-elasticsearch/ghana-elasticsearch-4a.tsv"
+    output_file_name: str = "../corpora/ghana-elasticsearch/ghana-elasticsearch-4b.tsv"
+
     # input_file_name, output_file_name = get_in_and_out()
     pipeline = Pipeline(
         VectorInputStage(input_file_name),
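The vector model named above, all-MiniLM-L6-v2, is a standard sentence-embedding model distributed through the sentence-transformers library; assuming that library is what the vector stage wraps (the stage's internals are outside this diff), computing the sentence vectors looks like:

from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")
# encode() returns one 384-dimensional vector per input sentence
vectors = model.encode(["Farmers in the region adopted the new irrigation schedule."])
print(vectors.shape)  # (1, 384)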
@@ -13,7 +13,7 @@ import org.json4s.jackson.JsonMethods
 import java.io.File
 import scala.util.Using
 
-object Step2InputEidos1 extends App with Logging {
+object Step2InputEidos1App extends App with Logging {
   implicit val formats: DefaultFormats.type = org.json4s.DefaultFormats
   val contextWindow = 3
   val baseDirectory = "../corpora/uganda-local"
@@ -0,0 +1,144 @@
package org.clulab.habitus.apps.elasticsearch

import ai.lum.common.FileUtils._
import org.clulab.habitus.apps.utils.{AttributeCounts, JsonRecord}
import org.clulab.processors.{Document, Sentence}
import org.clulab.utils.Sourcer
import org.clulab.wm.eidos.document.AnnotatedDocument
import org.clulab.wm.eidos.serialization.jsonld.JLDDeserializer
import org.clulab.wm.eidoscommon.utils.{FileEditor, FileUtils, Logging, TsvReader, TsvWriter}
import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods

import java.io.File
import scala.util.Using

object Step2InputEidos1GhanaApp extends App with Logging {
implicit val formats: DefaultFormats.type = org.json4s.DefaultFormats
val contextWindow = 3
val datasetFilename = "../corpora/ghana-elasticsearch/dataset55k.tsv"
val baseDirectory = "/home/kwa/data/Corpora/habitus-project/corpora/multimix"
val outputFileName = "../corpora/ghana-elasticsearch/ghana-elasticsearch.tsv"
val deserializer = new JLDDeserializer()

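// Read the dataset TSV and collect the article URL from the first column of each row, skipping the header.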
def getDatasetUrls(): Set[String] = {
// TODO: Also get terms from here instead of from directory names.
val datasetUrls = Using.resource(Sourcer.sourceFromFilename(datasetFilename)) { source =>
val tsvReader = new TsvReader()
val datasetUrls = source.getLines.drop(1).map { line =>
val Array(url) = tsvReader.readln(line, 1)

url
}.toSet

datasetUrls
}

datasetUrls
}

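// Each article's .json file has a sibling .jsonld produced by Eidos; swap the extension to locate it.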
def jsonFileToJsonld(jsonFile: File): File =
FileEditor(jsonFile).setExt("jsonld").get

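// Parse the scraped article's JSON. Only the url is kept; title, dateline, byline, and text are dropped to save space.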
def jsonFileToRecord(jsonFile: File): JsonRecord = {
val json = FileUtils.getTextFromFile(jsonFile)
val jValue = JsonMethods.parse(json)
val url = (jValue \ "url").extract[String]
val titleOpt = (jValue \ "title").extractOpt[String]
val datelineOpt = (jValue \ "dateline").extractOpt[String]
val bylineOpt = (jValue \ "byline").extractOpt[String]
val text = (jValue \ "text").extract[String]

// Don't use them all in order to save space.
JsonRecord(url, None, None, None, "")
}

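// Deserialize an Eidos JSON-LD file into the single AnnotatedDocument it contains.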
def jsonldFileToAnnotatedDocument(jsonldFile: File): AnnotatedDocument = {
val json = FileUtils.getTextFromFile(jsonldFile)
val corpus = deserializer.deserialize(json)
val annotatedDocument = corpus.head

annotatedDocument
}

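// Normalize whitespace: newlines, tabs, Unicode line/paragraph separators, and other control characters become spaces so a sentence fits in one TSV field.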
def rawTextToCleanText(rawText: String): String = rawText
.trim
.replaceAll("\r\n", " ")
.replaceAll("\n", " ")
.replaceAll("\r", " ")
.replaceAll("\t", " ")
.replaceAll("\u2028", " ") // unicode line separator
.replaceAll("\u2029", " ") // unicode paragraph separator
.map { letter =>
if (letter.toInt < 32) ' '
else letter
}
.trim

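// Recover a sentence's raw text from its character offsets into the document, then clean it.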
def getSentenceText(document: Document, sentence: Sentence): String = {
val rawText = document.text.get.slice(sentence.startOffsets.head, sentence.endOffsets.last)
val cleanText = rawTextToCleanText(rawText)

cleanText
}

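// Write the four attribute counts as TSV columns. (Defined here but not called in the loop below.)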
def attributeCountsToTsvWriter(attributeCounts: AttributeCounts, tsvWriter: TsvWriter): Unit = {
tsvWriter.print(
attributeCounts.increaseCount.toString, attributeCounts.decreaseCount.toString,
attributeCounts.posChangeCount.toString, attributeCounts.negChangeCount.toString,
""
)
}

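// Keep only .json files that have a matching .jsonld, pair each with its URL, and deduplicate by URL.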
val datasetUrls: Set[String] = getDatasetUrls
val jsonFilesAndUrls: Seq[(File, String)] = {
val allJsonFiles = new File(baseDirectory).listFilesByWildcard("*.json", recursive = true).toVector
val jsonFilesWithJsonld = allJsonFiles.filter { jsonFile =>
jsonFileToJsonld(jsonFile).exists
}
val jsonFilesAndUrls: Seq[(File, String)] = jsonFilesWithJsonld.map { jsonFile =>
val record = jsonFileToRecord(jsonFile)

(jsonFile, record.url)
}
val headJsonFilesAndUrls = jsonFilesAndUrls.groupBy(_._2).map(_._2.head).toSeq

headJsonFilesAndUrls
}

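// Main loop: for each dataset URL, emit one TSV row per sentence, with a context window of the surrounding +/-3 sentences and the previous sentence.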
Using.resource(FileUtils.printWriterFromFile(outputFileName)) { printWriter =>
val tsvWriter = new TsvWriter(printWriter)

tsvWriter.println("url", "sentenceIndex", "sentence", "context", "prevSentence")
datasetUrls.zipWithIndex.foreach { case (url, index) =>
val jsonFile = jsonFilesAndUrls.find(_._2 == url).get._1

println(s"$index ${jsonFile.getPath}")
try {
val jsonldFile = jsonFileToJsonld(jsonFile)
val annotatedDocument = jsonldFileToAnnotatedDocument(jsonldFile)
val document = annotatedDocument.document
val sentences = document.sentences

sentences.zipWithIndex.foreach { case (sentence, sentenceIndex) =>
val cleanText = getSentenceText(document, sentence)
val context = sentences
.slice(sentenceIndex - contextWindow, sentenceIndex + contextWindow + 1)
.map(getSentenceText(document, _))
.mkString(" ")
val prevSentenceText = sentences
.lift(sentenceIndex - 1)
.map(getSentenceText(document, _))
.getOrElse("")

tsvWriter.println(url, sentenceIndex.toString, cleanText, context, prevSentenceText)
}
} catch {
case throwable: Throwable =>
logger.error(s"Exception for file $jsonFile", throwable)
}
}
}
}
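Scala's slice clamps out-of-range indices, so the context window above simply truncates at the start and end of a document instead of failing. A Python rendering of the same windowing (hypothetical, for illustration only) has to clamp the lower bound explicitly, because a negative index would wrap around to the end of the list:

def context_window(sentences, i, window=3):
    start = max(0, i - window)  # sentences[i - window:] would wrap for i < window
    return " ".join(sentences[start:i + window + 1])

sentences = ["s0", "s1", "s2", "s3", "s4"]
print(context_window(sentences, 0))  # s0 s1 s2 s3 (truncated on the left)
print(context_window(sentences, 4))  # s1 s2 s3 s4 (truncated on the right)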
@@ -19,7 +19,7 @@ import java.io.File
 import java.net.URL
 import scala.util.{Try, Using}
 
-object Step2InputEidos2 extends App with Logging {
+object Step2InputEidos2App extends App with Logging {
 
   case class LocalTsvRecord(
     sentenceIndex: Int,
@@ -263,7 +263,7 @@ object Step2InputEidos2 extends App with Logging {
       val contextLocations = parseLocations(contextLocationsString)
       val vector = normalize(parseVector(vectorString))
 
-      (url, sentenceIndex) -> new LocalTsvRecord(sentenceIndex, sentence, belief, sentimentScoreOpt, sentenceLocations, contextLocations, vector)
+      (url, sentenceIndex) -> LocalTsvRecord(sentenceIndex, sentence, belief, sentimentScoreOpt, sentenceLocations, contextLocations, vector)
     }.toMap
   }
   val restClient = Elasticsearch.mkRestClient(url, credentialsFilename)
(The remaining 5 changed files are not shown.)