Skip to content

Commit

Permalink
Merge pull request #246 from clulab/kwalcock/karamoja
Browse files Browse the repository at this point in the history
Submit some Karamoja code
  • Loading branch information
kwalcock committed Mar 12, 2024
2 parents 0533a3d + ceabef6 commit a7df3c5
Show file tree
Hide file tree
Showing 5 changed files with 207 additions and 7 deletions.
5 changes: 3 additions & 2 deletions belief_pipeline/es.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
from elasticsearch import Elasticsearch

client = Elasticsearch(
"http://localhost:9200/",
"https://elasticsearch.habitus.clulab.org/",
basic_auth=("user", "password")
# api_key="..."
)

# print(client.info())

result = client.search(index="habitus", q="Karamoja")
result = client.search(index="habitus3", q="Karamoja")

print(result)
116 changes: 116 additions & 0 deletions src/main/scala/org/clulab/habitus/apps/grid/Csv2Tsv.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
package org.clulab.habitus.apps.grid

import org.clulab.utils.{FileUtils, Sourcer}

import scala.util.Using

/** Converts a CSV file into a TSV file with a small character-level state machine.
  *
  * The input is read line by line and each character is dispatched on the current
  * [[State]].  Commas that separate fields are written as tabs, special characters
  * inside fields are written as the escape sequences in [[escapes]], and a quoted
  * field may continue across line boundaries, in which case the line break is
  * written as an escaped "\n" rather than a real end of line.
  *
  * Arguments: args(0) is the CSV file to read and args(1) is the TSV file to write.
  * Both fall back to the defaults below when absent.
  */
object Csv2Tsv extends App {
  val csvFilename = args.lift(0).getOrElse("../corpora/grid/uq500-only-karamoja/in/uq500_only_karamoja.csv")
  val tsvFilename = args.lift(1).getOrElse("../corpora/grid/uq500-only-karamoja/in/uq500-only-karamoja.tsv")
  // val csvFilename = args.lift(0).getOrElse("../corpora/grid/uq500-karamoja/csvcheck.csv")
  // val tsvFilename = args.lift(1).getOrElse("../corpora/grid/uq500-karamoja/csvcheck.tsv")
  // When true, fields that were quoted in the CSV stay quoted in the TSV and
  // doubled quotes stay doubled.  When false, the surrounding quotes are dropped
  // and a doubled quote is written as a single quote.
  val quoteUnnecessarily = true
  // Debugging aid: when true, every input line and every input character is
  // echoed to stdout as it is processed.  Set to false for large files.
  val echoInput = true

  // The states are sealed case objects so that the compiler can check the matches
  // below for exhaustiveness and so that error messages show the state's name.
  sealed trait State
  case object OutsideFieldState extends State
  case object InsideFieldState extends State
  case object InsideQuotedFieldState extends State
  case object InsideQuotedQuoteState extends State

  // The replacement text written for each special character found within a field.
  val escapes: Map[Char, String] = Map(
    '\n' -> "\\n",
    '\r' -> "\\r",
    '\t' -> "\\t",
    '\\' -> "\\\\",
    '"' -> "\"\""
  )

  Using.resource(Sourcer.sourceFromFilename(csvFilename)) { source =>
    val lines = source.getLines()

    Using.resource(FileUtils.printWriterFromFile(tsvFilename)) { printWriter =>

      def printChar(char: Char): Unit = printWriter.print(char)

      def printEscape(char: Char): Unit = printWriter.print(escapes(char))

      def printLine(): Unit = printWriter.println()

      def throwChar(char: Char, state: State): Nothing = {
        throw new RuntimeException(s"Char '$char' is invalid in state $state.")
      }

      def throwString(string: String): Nothing = {
        throw new RuntimeException(string)
      }

      // Writes whatever output the character calls for and returns the state that follows.
      def processChar(state: State, char: Char): State = state match {
        case OutsideFieldState =>
          // I should only see a quote or some char that starts the field.
          char match {
            case '\n' | '\r' | '\t' | '\\' => throwChar(char, state)
            case ',' => printChar('\t'); OutsideFieldState // The field was empty.
            case '"' =>
              if (quoteUnnecessarily) printChar(char)
              InsideQuotedFieldState
            case _ => printChar(char); InsideFieldState
          }
        case InsideFieldState =>
          // I need to escape special characters and watch for the looming comma.
          char match {
            case '\n' | '\r' | '\t' | '\\' => printEscape(char); InsideFieldState
            case ',' => printChar('\t'); OutsideFieldState
            case '"' => throwChar(char, state)
            case _ => printChar(char); InsideFieldState
          }
        case InsideQuotedFieldState =>
          // I need to escape special characters and watch for the looming end quote or doubled false alarms.
          char match {
            case '\n' | '\r' | '\t' | '\\' => printEscape(char); InsideQuotedFieldState
            case '"' => InsideQuotedQuoteState // Nothing is printed until the next char disambiguates.
            case _ => printChar(char); InsideQuotedFieldState // This includes the comma, which is just text here.
          }
        case InsideQuotedQuoteState =>
          // I just saw a quote while InsideQuotedFieldState and need to decide what to do.
          char match {
            case ',' =>
              // The quote closed the field, so we are now outside the field.
              if (quoteUnnecessarily) printChar('"')
              printChar('\t'); OutsideFieldState
            case '"' =>
              // It was a double quote and we're still inside the field.
              if (quoteUnnecessarily) printEscape(char) else printChar(char)
              InsideQuotedFieldState
            case _ => throwChar(char, state) // Only a comma or a second quote may follow.
          }
      }

      // Handles the end of an input line and returns the state in which the next line starts.
      def finishLine(state: State): State = state match {
        case OutsideFieldState | InsideFieldState => printLine(); OutsideFieldState
        // The quoted field continues on the next line, so the line break is part of the field.
        case InsideQuotedFieldState => printEscape('\n'); InsideQuotedFieldState
        case InsideQuotedQuoteState =>
          // The pending quote closed the last field of the line.
          if (quoteUnnecessarily) printChar('"')
          printLine(); OutsideFieldState
      }

      val finalState = lines.foldLeft(OutsideFieldState: State) { (lineStartState, line) =>
        if (echoInput) println(line)
        val lineEndState = line.foldLeft(lineStartState) { (state, char) =>
          if (echoInput) println(char)
          processChar(state, char)
        }
        // The line is finished.
        finishLine(lineEndState)
      }

      // The file is finished.
      finalState match {
        case OutsideFieldState => ()
        case InsideQuotedFieldState => throwString("Quoted field was not terminated correctly.")
        // finishLine never returns these two states, so reaching here means a programming error.
        case InsideFieldState | InsideQuotedQuoteState =>
          throwString(s"The file ended in unexpected state $finalState.")
      }
    }
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ import scala.util.{Random, Using}
case class Record(text: String, date: String, location: String)

object DatasetToGridApp extends App {
val datasetFileName = args.lift(0).getOrElse("../corpora/grid/uganda.tsv")
val datasetFileName = args.lift(0).getOrElse("../corpora/grid/uq500-karamoja/in/uq500-karamoja.tsv")
val gridFileNamePrefix = args.lift(1).getOrElse("../corpora/grid/uganda-")
val gridFileNameSuffix = args.lift(1).getOrElse(".txt")
val random = new Random(0)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,9 @@ object GridToDatasetApp extends App {
)(GridRecord)
val controlCharacters = " ’✳–\"“”/…½é’€‘—£¬âãé™\u009D"

val inputDatasetFileName = args.lift(0).getOrElse("../corpora/grid/uganda-all.tsv")
val gridFileName = args.lift(1).getOrElse("../corpora/grid/uq500/out2/uq500_zip_cells.csv")
val ouputDatasetFileName = args.lift(0).getOrElse("../corpora/grid/uq500/out2/uganda-uq500-rowcol.tsv")
val inputDatasetFileName = args.lift(0).getOrElse("../corpora/grid/uq500-karamoja/in/uq500-karamoja.tsv")
val gridFileName = args.lift(1).getOrElse("../corpora/grid/uq500-karamoja/out/uq500-karamoja_zip_cells.csv")
val ouputDatasetFileName = args.lift(0).getOrElse("../corpora/grid/uq500-karamoja/out/uganda-uq500-karamoja-rowcol.tsv")

val gridRecords: Seq[GridRecord] = {
val text = FileUtils.getTextFromFile(gridFileName)
Expand Down Expand Up @@ -64,7 +64,7 @@ object GridToDatasetApp extends App {
// if (locationOpt.isDefined) Some(DatasetRecord(line, text, "uganda-" + dateOpt.get)) else None
// if (locationOpt.isDefined) Some(DatasetRecord(line, text, "uganda-" + locationOpt.get)) else None
// For these last ones there was not necessarily a location.
Some(DatasetRecord(line, text, "uq500"))
Some(DatasetRecord(line, text, "uq500-karamoja"))
}.toVector
val gridAndDatasetRecordPairs = gridRecords.flatMap { gridRecord =>
val datasetRecordOpt = datasetRecords.find { datasetRecord =>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
package org.clulab.habitus.apps.grid

import org.clulab.utils.{FileUtils, Sourcer, StringUtils}
import org.clulab.wm.eidoscommon.utils.{CsvReader, CsvWriter, TsvReader, TsvWriter}
import zamblauskas.csv.parser._
import zamblauskas.functional._

import scala.util.Using

/** Adds "row" and "col" columns to a TSV dataset by matching its text against grid cells.
  *
  * The grid CSV is parsed into [[GridRecord]]s, the dataset TSV is read into
  * [[DatasetRecord]]s, and each dataset record is paired with the first grid record
  * that has the same row name and whose readable text is contained in the dataset
  * text.  Matched records are written to the output file with the grid row and col
  * appended.  The text of unmatched records is printed to stdout and the records
  * are left out of the output.
  *
  * Arguments: args(0) is the input dataset TSV, args(1) is the grid CSV, and args(2)
  * is the output dataset TSV.  Each falls back to the default below when absent.
  */
object GridToDatasetKaramojaApp extends App {

  case class GridRecord(row: String, col: String, readable: String)

  case class DatasetRecord(line: String, text: String, row: String)

  implicit val gridDocumentReads: ColumnReads[GridRecord] = (
    column("row").as[String] and
    column("col").as[String] and
    column("readable").as[String]
  )(GridRecord)
  // These characters are removed from both the grid text and the dataset text
  // before the two are compared.
  val controlCharacters = " ’✳–\"“”/…½é’€‘—£¬âãé™\u009D"

  val inputDatasetFileName = args.lift(0).getOrElse("../corpora/grid/uq500-only-karamoja/in/uq500-only-karamoja.tsv")
  val gridFileName = args.lift(1).getOrElse("../corpora/grid/uq500-only-karamoja/out/uq500-only-karamoja_cells.csv")
  // This must be read from its own argument.  Reading it from args(0) would make
  // the output file the same as the input file whenever arguments are supplied.
  val ouputDatasetFileName = args.lift(2).getOrElse("../corpora/grid/uq500-only-karamoja/out/uq500-only-karamoja-rowcol.tsv")

  val gridRecords: Seq[GridRecord] = {
    val text = FileUtils.getTextFromFile(gridFileName)
    val result = Parser.parse[GridRecord](text)
    val rawGridRecords = result.toOption.getOrElse {
      throw new NoSuchElementException(s"The grid file $gridFileName could not be parsed: $result")
    }
    val documents = rawGridRecords.map { gridRecord =>
      // Keep only what follows the first '.', drop one more character after it,
      // and then strip the characters that are ignored during matching.
      val readable = StringUtils.afterFirst(gridRecord.readable, '.').drop(1)
          .filterNot(controlCharacters.contains(_))

      gridRecord.copy(readable = readable)
    }

    // Records for the row called "all" are not used for matching.
    documents.filter(_.row != "all")
  }
  val header = Using.resource(Sourcer.sourceFromFilename(inputDatasetFileName)) { source =>
    val lines = source.getLines()

    if (lines.hasNext) lines.next()
    else throw new NoSuchElementException(s"The dataset file $inputDatasetFileName is empty and has no header.")
  }
  val gridAndDatasetRecordPairs = Using.resource(Sourcer.sourceFromFilename(inputDatasetFileName)) { source =>
    val lines = source.getLines().drop(1) // Skip the header.
    val tsvReader = new TsvReader()
    val datasetRecords = lines.map { line =>
      val fields = tsvReader.readln(line)
      // The text is taken from the eighth column.
      val text = fields.lift(7)
          .getOrElse(throw new NoSuchElementException(s"This line has fewer than 8 columns: $line"))
          .filterNot(controlCharacters.contains(_))

      DatasetRecord(line, text, "uq500-only-karamoja")
    }.toVector

    datasetRecords.flatMap { datasetRecord =>
      val gridRecordOpt = gridRecords.find { gridRecord =>
        gridRecord.row == datasetRecord.row &&
            datasetRecord.text.contains(gridRecord.readable)
      }

      // Report the records for which no grid cell could be found.
      if (gridRecordOpt.isEmpty)
        println(datasetRecord.text)
      gridRecordOpt.map { gridRecord => (gridRecord, datasetRecord) }
    }
  }

  Using.resource(FileUtils.printWriterFromFile(ouputDatasetFileName)) { printWriter =>
    printWriter.print(header)
    printWriter.println("\trow\tcol")
    gridAndDatasetRecordPairs.foreach { case (gridRecord, datasetRecord) =>
      printWriter.print(datasetRecord.line)
      printWriter.print("\t" + gridRecord.row)
      printWriter.print("\t" + gridRecord.col)
      printWriter.println()
    }
  }
}

0 comments on commit a7df3c5

Please sign in to comment.