-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #246 from clulab/kwalcock/karamoja
Submit some Karamoja code
- Loading branch information
Showing
5 changed files
with
207 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,12 +1,13 @@ | ||
from elasticsearch import Elasticsearch

# Connect to the hosted Elasticsearch instance.  The credentials below are
# placeholders and must be replaced (or swapped for an api_key) before use.
client = Elasticsearch(
    "https://elasticsearch.habitus.clulab.org/",
    basic_auth=("user", "password")
    # api_key="..."
)

# print(client.info())

# Query-string search of the habitus3 index for documents matching "Karamoja".
result = client.search(index="habitus3", q="Karamoja")

print(result)
116 changes: 116 additions & 0 deletions
116
src/main/scala/org/clulab/habitus/apps/grid/Csv2Tsv.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
package org.clulab.habitus.apps.grid | ||
|
||
import org.clulab.utils.{FileUtils, Sourcer} | ||
|
||
import scala.util.Using | ||
|
||
/** Converts a CSV file to a TSV file with a character-level state machine.
  *
  * Arguments (both optional): the CSV input filename and the TSV output filename.
  *
  * Commas that separate fields become tabs.  Newlines, carriage returns, tabs, and
  * backslashes found inside a field are written as two-character backslash escapes.
  * A quoted field that continues onto the next input line is joined into one
  * output line with an escaped newline in between.
  *
  * A RuntimeException is thrown when a character is not valid for the current state
  * or when the input ends inside a quoted field.
  */
object Csv2Tsv extends App {
  val csvFilename = args.lift(0).getOrElse("../corpora/grid/uq500-only-karamoja/in/uq500_only_karamoja.csv")
  val tsvFilename = args.lift(1).getOrElse("../corpora/grid/uq500-only-karamoja/in/uq500-only-karamoja.tsv")
  // val csvFilename = args.lift(0).getOrElse("../corpora/grid/uq500-karamoja/csvcheck.csv")
  // val tsvFilename = args.lift(1).getOrElse("../corpora/grid/uq500-karamoja/csvcheck.tsv")
  // When true, fields that were quoted in the CSV keep their surrounding quotes in the
  // TSV and embedded quotes stay doubled.  When false, the surrounding quotes are
  // dropped and a doubled quote is written as a single quote.
  val quoteUnnecessarily = true

  // Sealed so that the compiler checks the matches below for exhaustiveness, and
  // case objects so that states print by name in error messages.
  sealed trait State
  case object OutsideFieldState extends State
  case object InsideFieldState extends State
  case object InsideQuotedFieldState extends State
  case object InsideQuotedQuoteState extends State

  // What gets written in place of each special character.
  val escapes: Map[Char, String] = Map(
    '\n' -> "\\n",
    '\r' -> "\\r",
    '\t' -> "\\t",
    '\\' -> "\\\\",
    '"' -> "\"\""
  )

  Using.resource(Sourcer.sourceFromFilename(csvFilename)) { source =>
    Using.resource(FileUtils.printWriterFromFile(tsvFilename)) { printWriter =>

      def printChar(char: Char): Unit = printWriter.print(char)

      def printEscape(char: Char): Unit = printWriter.print(escapes(char))

      def printLine(): Unit = printWriter.println()

      def throwChar(char: Char, state: State): Nothing =
        throw new RuntimeException(s"Char '$char' is invalid in state $state.")

      def throwString(string: String): Nothing =
        throw new RuntimeException(string)

      // Consumes one character: writes whatever output it calls for and returns the next state.
      def step(state: State, char: Char): State = state match {
        case OutsideFieldState =>
          // Only a quote or some char that starts the field should be seen here.
          char match {
            case '\n' | '\r' | '\t' | '\\' => throwChar(char, state)
            case ',' => printChar('\t'); OutsideFieldState // The field was empty.
            case '"' =>
              if (quoteUnnecessarily) printChar(char)
              InsideQuotedFieldState
            case _ => printChar(char); InsideFieldState
          }
        case InsideFieldState =>
          // Escape special characters and watch for the comma that ends the field.
          char match {
            case '\n' | '\r' | '\t' | '\\' => printEscape(char); InsideFieldState
            case ',' => printChar('\t'); OutsideFieldState
            case '"' => throwChar(char, state)
            case _ => printChar(char); InsideFieldState
          }
        case InsideQuotedFieldState =>
          // Escape special characters and watch for the end quote or a doubled quote.
          char match {
            case '\n' | '\r' | '\t' | '\\' => printEscape(char); InsideQuotedFieldState
            case '"' => InsideQuotedQuoteState // Nothing is written until the next char is known.
            case _ => printChar(char); InsideQuotedFieldState // This includes commas.
          }
        case InsideQuotedQuoteState =>
          // A quote was just seen inside a quoted field; the next char says what it meant.
          char match {
            case ',' =>
              // It was the closing quote and the field is finished.
              if (quoteUnnecessarily) printChar('"')
              printChar('\t')
              OutsideFieldState
            case '"' =>
              // It was a doubled quote and the field continues.
              if (quoteUnnecessarily) printEscape(char)
              else printChar(char)
              InsideQuotedFieldState
            case _ => throwChar(char, state)
          }
      }

      // Finishes one input line and returns the state in which the next line starts.
      def endOfLine(state: State): State = state match {
        case OutsideFieldState | InsideFieldState =>
          printLine()
          OutsideFieldState
        case InsideQuotedFieldState =>
          // The quoted field continues on the next input line.
          printEscape('\n')
          InsideQuotedFieldState
        case InsideQuotedQuoteState =>
          // The pending quote closed the last field of the line.
          if (quoteUnnecessarily) printChar('"')
          printLine()
          OutsideFieldState
      }

      val finalState = source.getLines().foldLeft(OutsideFieldState: State) { (state, line) =>
        endOfLine(line.foldLeft(state)(step))
      }

      // The file is finished.  endOfLine only ever returns the first two states.
      finalState match {
        case OutsideFieldState => ()
        case InsideQuotedFieldState => throwString("Quoted field was not terminated correctly.")
        case InsideFieldState | InsideQuotedQuoteState =>
          throwString(s"Unexpected state $finalState at the end of the file.")
      }
    }
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
83 changes: 83 additions & 0 deletions
83
src/main/scala/org/clulab/habitus/apps/grid/GridToDatasetKaramojaApp.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
package org.clulab.habitus.apps.grid | ||
|
||
import org.clulab.utils.{FileUtils, Sourcer, StringUtils} | ||
import org.clulab.wm.eidoscommon.utils.{CsvReader, CsvWriter, TsvReader, TsvWriter} | ||
import zamblauskas.csv.parser._ | ||
import zamblauskas.functional._ | ||
|
||
import scala.util.Using | ||
|
||
/** Appends the matching grid row and column to each line of a TSV dataset.
  *
  * Arguments (all optional): the input dataset TSV filename, the grid cells CSV
  * filename, and the output dataset TSV filename.
  *
  * A dataset line is matched to the first grid record that has the same row name
  * and whose cleaned readable text is contained in the line's cleaned text.  Lines
  * without a match have their text printed to stdout and are left out of the output.
  */
object GridToDatasetKaramojaApp extends App {

  case class GridRecord(row: String, col: String, readable: String)

  case class DatasetRecord(line: String, text: String, row: String)

  implicit val gridDocumentReads: ColumnReads[GridRecord] = (
    column("row").as[String] and
    column("col").as[String] and
    column("readable").as[String]
  )(GridRecord)
  // Characters that are removed from both the grid text and the dataset text before
  // they are compared.
  val controlCharacters = " ’✳–\"“”/…½é’€‘—£¬âãé™\u009D"

  val inputDatasetFileName = args.lift(0).getOrElse("../corpora/grid/uq500-only-karamoja/in/uq500-only-karamoja.tsv")
  val gridFileName = args.lift(1).getOrElse("../corpora/grid/uq500-only-karamoja/out/uq500-only-karamoja_cells.csv")
  // This is the third argument.  Reading argument 0 here would make the output
  // file the same as the input dataset whenever an argument is supplied.
  val ouputDatasetFileName = args.lift(2).getOrElse("../corpora/grid/uq500-only-karamoja/out/uq500-only-karamoja-rowcol.tsv")

  // The grid records with cleaned readable text, excluding those of the "all" row.
  val gridRecords: Seq[GridRecord] = {
    val text = FileUtils.getTextFromFile(gridFileName)
    val result = Parser.parse[GridRecord](text)
    val rawGridRecords = result.toOption.getOrElse(
      throw new RuntimeException(s"Could not parse grid file $gridFileName: $result")
    )
    val cleanedGridRecords = rawGridRecords.map { gridRecord =>
      // Keep what follows the first '.', minus one more character, minus control characters.
      val readable = StringUtils.afterFirst(gridRecord.readable, '.').drop(1)
          .filterNot(controlCharacters.contains(_))

      gridRecord.copy(readable = readable)
    }

    cleanedGridRecords.filter(_.row != "all")
  }

  // The first line of the input dataset, reused as the start of the output header.
  val header = Using.resource(Sourcer.sourceFromFilename(inputDatasetFileName)) { source =>
    val lines = source.getLines()

    if (lines.hasNext) lines.next()
    else throw new RuntimeException(s"Dataset file $inputDatasetFileName has no header line.")
  }

  val gridAndDatasetRecordPairs = Using.resource(Sourcer.sourceFromFilename(inputDatasetFileName)) { source =>
    val tsvReader = new TsvReader()
    // Everything is materialized with toVector before the source is closed.
    val datasetRecords = source.getLines().drop(1).map { line =>
      val fields = tsvReader.readln(line)
      val text = fields.lift(7)
          .getOrElse(throw new RuntimeException(s"Line has no field at index 7: $line"))
          .filterNot(controlCharacters.contains(_))

      DatasetRecord(line, text, "uq500-only-karamoja")
    }.toVector

    datasetRecords.flatMap { datasetRecord =>
      val gridRecordOpt = gridRecords.find { gridRecord =>
        gridRecord.row == datasetRecord.row &&
            datasetRecord.text.contains(gridRecord.readable)
      }

      // Report the text of any line that could not be placed in the grid.
      if (gridRecordOpt.isEmpty)
        println(datasetRecord.text)
      gridRecordOpt.map { gridRecord => (gridRecord, datasetRecord) }
    }
  }

  Using.resource(FileUtils.printWriterFromFile(ouputDatasetFileName)) { printWriter =>
    printWriter.print(header)
    printWriter.println("\trow\tcol")
    gridAndDatasetRecordPairs.foreach { case (gridRecord, datasetRecord) =>
      printWriter.print(datasetRecord.line)
      printWriter.print("\t" + gridRecord.row)
      printWriter.print("\t" + gridRecord.col)
      printWriter.println()
    }
  }
}