
new file:   scala/DisambiguateGraphSemEval.scala
    - Adds an evaluation script for SemEval using graph models
new file:   scala/DisambiguateSemEval.scala
    - Adds an evaluation script for SemEval using feature vector models
new file:   scala/ExtractGraphWordsi.scala
    - Adds a new main for building graph-based wordsi data
new file:   scala/ExtractSemEvalKey.scala
    - Transforms a cluster solution into the SemEval key file format
new file:   scala/SplitSemEval07Matrix.scala
    - Splits a SemEval 2007 matrix into test and train sets

Taken together, the scripts form a small evaluation pipeline; see the sketch below.
1 parent 208f629 commit 49133d4d2550e9360aaea1dce506a4e65d693eae @fozziethebeat committed Mar 19, 2012
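The scripts chain into a rough pipeline. The ordering below is inferred from each script's argument list; every file name is a placeholder, not part of the commit.

    // Hypothetical end-to-end run (all file names are placeholders):
    //
    //  1. Split the full context matrix into test and train sets:
    //       scala SplitSemEval07Matrix full.mat all.headers test.key \
    //             test.headers test.mat train.mat
    //  2. Cluster train.mat with an external tool, producing a solution file
    //     whose first line is "numPoints numClusters", followed by one line
    //     of point ids per cluster (the format these scripts parse).
    //  3. Label the test contexts against the learned clusters:
    //       scala DisambiguateSemEval train.mat solution.txt test.mat \
    //             test.headers > answers.key
    //  4. Score answers.key with the standard SemEval evaluation tools.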
scala/DisambiguateGraphSemEval.scala
@@ -0,0 +1,54 @@
+import edu.ucla.sspace.basis.BasisMapping
+import edu.ucla.sspace.sim.JaccardIndex
+import edu.ucla.sspace.text.DependencyFileDocumentIterator
+import edu.ucla.sspace.util.SerializableUtil
+import edu.ucla.sspace.vector.CompactSparseVector
+
+import scala.collection.JavaConversions.asScalaIterator
+import scala.io.Source
+
+
+// Arguments:
+//   (1) solution file: first line is "numPoints numClusters"; each following
+//       line lists the point ids assigned to one cluster
+//   (2) basis file: a serialized BasisMapping
+//   (3) context file: dependency-parsed contexts, one document per instance
+
+// Read the clustering solution: after the "numPoints numClusters" header,
+// line i lists the point ids assigned to cluster i.  Each cluster becomes a
+// binary vector over the clustered points.
+val lines = Source.fromFile(args(0)).getLines
+val Array(numPoints, numClusters) = lines.next.split("\\s+").map(_.toInt)
+val clusters = Array.fill(numClusters)(new CompactSparseVector(numPoints))
+for ( (line, cid) <- lines.zipWithIndex;
+      if line != "";
+      point <- line.split("\\s+") )
+    clusters(cid).add(point.toInt, 1.0)
+
+// Read in the basis mapping and set it to read only.
+val basis: BasisMapping[String, String] = SerializableUtil.load(args(1))
+basis.setReadOnly(true)
+
+// Iterate through each document in the corpus and determine which cluster has
+// the highest Jaccard similarity to it.
+val sim = new JaccardIndex()
+for ( (header, v) <- readContexts(args(2), basis)) {
+    // Headers look like "term.instanceId"; strip the instance id suffix.
+    val term = header.replaceAll("\\.[0-9]+", "")
+    val label = clusters.zipWithIndex.map(x => (sim.sim(x._1, v), x._2)).max._2
+    printf("%s %s %s.%d\n", term, header, term, label)
+}
+
+// Reads each dependency-parsed context and folds it into a binary feature
+// vector with one dimension per word type recognized by the basis mapping.
+// Yields (header, vector) pairs.
+def readContexts(contextFile: String, basis: BasisMapping[String, String]) = {
+    val docIter = new DependencyFileDocumentIterator(contextFile)
+    for (doc <- docIter) yield {
+        val reader = doc.reader
+        val header = reader.readLine
+        val rowVector = new CompactSparseVector()
+        var line = reader.readLine
+        while (line != null) {
+            // The second column of each CoNLL line holds the token.
+            val d = basis.getDimension(line.split("\\s+")(1))
+            if (d >= 0)
+                rowVector.set(d, 1)
+            line = reader.readLine
+        }
+        (header, rowVector)
+    }
+}
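For intuition, the Jaccard index scores two of the binary vectors built above by the overlap of their active dimensions. A minimal standalone sketch of the same measure (illustrative only; the script uses edu.ucla.sspace.sim.JaccardIndex):

    // Jaccard index over sets of active dimensions: |a ∩ b| / |a ∪ b|.
    def jaccard(a: Set[Int], b: Set[Int]): Double =
        if (a.isEmpty && b.isEmpty) 0.0
        else (a & b).size.toDouble / (a | b).size

    // e.g. jaccard(Set(1, 2, 3), Set(2, 3, 4)) == 0.5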
scala/DisambiguateSemEval.scala
@@ -0,0 +1,36 @@
+import edu.ucla.sspace.matrix.MatrixIO
+import edu.ucla.sspace.matrix.MatrixIO.Format
+import edu.ucla.sspace.sim.CosineSimilarity
+import edu.ucla.sspace.vector.CompactSparseVector
+import edu.ucla.sspace.vector.VectorMath
+
+import scala.io.Source
+
+// Arguments:
+//   (1) training context matrix (SVDLIBC sparse text)
+//   (2) clustering solution over the training contexts
+//   (3) full context matrix to label (SVDLIBC sparse text)
+//   (4) headers file, one header per row of the matrix in (3)
+
+// Read in the feature representation of the contexts.
+val contexts = MatrixIO.readMatrix(args(2), Format.SVDLIBC_SPARSE_TEXT)
+
+// Read in the clustering solution and form the cluster centroids by summing
+// the assigned rows of the training context matrix.
+val trainContexts = MatrixIO.readMatrix(args(0), Format.SVDLIBC_SPARSE_TEXT)
+val solutionFile = Source.fromFile(args(1)).getLines
+val Array(numPoints, numClusters) = solutionFile.next.split("\\s+").map(_.toInt)
+val clusters = Array.fill(numClusters)(new CompactSparseVector(contexts.columns))
+for ( (clusterLine, id) <- solutionFile.zipWithIndex;
+      if clusterLine != "";
+      point <- clusterLine.split("\\s+") )
+    VectorMath.add(clusters(id), trainContexts.getRowVector(point.toInt))
+
+// Read in the headers. These must be in the same order as the rows in
+// contexts.
+val headers = Source.fromFile(args(3)).getLines.toList
+// Get the main term for this evaluation; headers look like "term.instanceId",
+// so strip the trailing ".<digits>" (note the escaped dot).
+val term = headers(0).replaceAll("\\.[0-9]+", "")
+
+// Iterate through each row in the full context matrix and label it with the
+// cluster that has the highest similarity.
+val sim = new CosineSimilarity()
+for ( (r, header) <- (0 until contexts.rows) zip headers ) {
+    val v = contexts.getRowVector(r)
+    val label = clusters.zipWithIndex.map(x => (sim.sim(x._1, v), x._2)).max._2
+    printf("%s %s %s.%d\n", term, header, term, label)
+}
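Here the scoring uses cosine similarity rather than the Jaccard index, since the centroids are summed count vectors instead of binary sets. A minimal sketch of the measure over plain arrays (illustrative only; the script uses edu.ucla.sspace.sim.CosineSimilarity):

    // Cosine similarity: dot(u, v) / (|u| |v|), or 0 for a zero vector.
    def cosine(u: Array[Double], v: Array[Double]): Double = {
        require(u.length == v.length)
        val dot   = u.zip(v).map { case (a, b) => a * b }.sum
        val normU = math.sqrt(u.map(x => x * x).sum)
        val normV = math.sqrt(v.map(x => x * x).sum)
        if (normU == 0.0 || normV == 0.0) 0.0 else dot / (normU * normV)
    }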
scala/ExtractGraphWordsi.scala
@@ -0,0 +1,36 @@
+import edu.ucla.sspace.basis.BasisMapping
+import edu.ucla.sspace.basis.FilteredStringBasisMapping
+import edu.ucla.sspace.dependency.CoNLLDependencyExtractor
+import edu.ucla.sspace.text.DependencyFileDocumentIterator
+import edu.ucla.sspace.wordsi.GraphWordsi
+
+import scala.collection.JavaConversions.asScalaIterator
+import scala.io.Source
+
+import java.util.HashMap
+import java.util.HashSet
+
+
+object ExtractGraphWordsi {
+    // Arguments:
+    //   (1) exclusion (stop word) list, one word per line
+    //   (2) reference likelihood file of "word likelihood" pairs
+    //   (3) dependency-parsed context file
+    //   (4) string forwarded to GraphWordsi (likely an output name)
+    def main(args: Array[String]) {
+        val excludeSet = new HashSet[String]()
+        Source.fromFile(args(0)).getLines.foreach(l => excludeSet.add(l.trim))
+
+        val referenceLikelihood = new HashMap[String, java.lang.Double]()
+        Source.fromFile(args(1), "ISO-8859-1").getLines.foreach(l => {
+            val parts = l.split("\\s+")
+            if (parts.size == 2)
+                referenceLikelihood.put(parts(0), parts(1).toDouble)
+        })
+
+        val basis: BasisMapping[String, String] =
+            new FilteredStringBasisMapping(excludeSet)
+
+        val extractor = new CoNLLDependencyExtractor()
+        val wordsi = new GraphWordsi(basis, extractor, referenceLikelihood, args(3))
+
+        for (doc <- new DependencyFileDocumentIterator(args(2)))
+            wordsi.processDocument(doc.reader)
+        wordsi.processSpace(System.getProperties)
+        println("Processing complete")
+    }
+}
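Both this script and DisambiguateGraphSemEval.scala lean on the BasisMapping contract: excluded or unknown words map to no dimension, and setReadOnly(true) freezes the vocabulary. A conceptual sketch of that contract (a hypothetical class, not the S-Space implementation):

    // Hypothetical stand-in for FilteredStringBasisMapping: excluded words
    // never receive a dimension; unseen words grow the space until it is made
    // read-only, after which they map to -1 (hence the `d >= 0` checks).
    class SimpleFilteredBasis(excluded: Set[String]) {
        private val dims = scala.collection.mutable.Map[String, Int]()
        private var readOnly = false
        def setReadOnly(b: Boolean) { readOnly = b }
        def getDimension(word: String): Int =
            if (excluded.contains(word)) -1
            else dims.get(word) match {
                case Some(d)          => d
                case None if readOnly => -1
                case None             => val d = dims.size; dims(word) = d; d
            }
    }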
scala/ExtractSemEvalKey.scala
@@ -0,0 +1,13 @@
+import scala.io.Source
+
+// Arguments:
+//   (1) headers file, one header per feature vector
+//   (2) clustering solution file
+
+// Read the headers for each feature vector; they look like "term.instanceId".
+val headers = Source.fromFile(args(0)).getLines.toList
+val term = headers(0).replaceAll("\\.[0-9]+", "")
+
+// Read the clustering solution and report the cluster id for each instance in
+// the SemEval key format, skipping the "numPoints numClusters" header line.
+val solution = Source.fromFile(args(1)).getLines
+solution.next
+for ((line, clusterId) <- solution.zipWithIndex;
+     if line != "";
+     x <- line.split("\\s+") )
+    printf("%s %s %s.%d\n", term, headers(x.toInt), term, clusterId)
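For reference, each emitted line follows the SemEval key format "term instanceId term.clusterLabel". With a hypothetical term whose first two instances fall into clusters 0 and 1, the output would look like:

    // explain.v explain.v.1 explain.v.0
    // explain.v explain.v.2 explain.v.1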
scala/SplitSemEval07Matrix.scala
@@ -0,0 +1,34 @@
+import edu.ucla.sspace.matrix.Matrices
+import edu.ucla.sspace.matrix.MatrixIO
+import edu.ucla.sspace.matrix.MatrixIO.Format
+import edu.ucla.sspace.vector.SparseDoubleVector
+
+import scala.collection.JavaConversions.bufferAsJavaList
+import scala.collection.mutable.ArrayBuffer
+import scala.io.Source
+
+import java.io.PrintWriter
+
+
+// Arguments:
+//   (1) full data matrix (SVDLIBC sparse text)
+//   (2) headers file, one header per row of the full matrix
+//   (3) key file whose second column names the test instance headers
+//   (4) output file for the test headers, in row order
+//   (5) output file for the test matrix
+//   (6) output file for the train matrix
+
+val fullData = MatrixIO.readSparseMatrix(args(0), Format.SVDLIBC_SPARSE_TEXT)
+val allHeaders = Source.fromFile(args(1)).getLines.toList
+val testHeaders = Source.fromFile(args(2)).getLines.map(_.split("\\s+")(1)).toSet
+
+// Route each row into the test or train split, recording the order of the
+// test headers so rows can be matched back to instances later.
+val testVectors = new ArrayBuffer[SparseDoubleVector]()
+val orderedTestHeaders = new PrintWriter(args(3))
+val trainVectors = new ArrayBuffer[SparseDoubleVector]()
+
+for (r <- 0 until fullData.rows) {
+    if (testHeaders.contains(allHeaders(r))) {
+        orderedTestHeaders.println(allHeaders(r))
+        testVectors += fullData.getRowVector(r)
+    } else {
+        trainVectors += fullData.getRowVector(r)
+    }
+}
+orderedTestHeaders.close()
+
+MatrixIO.writeMatrix(Matrices.asSparseMatrix(testVectors),
+                     args(4), Format.SVDLIBC_SPARSE_TEXT)
+MatrixIO.writeMatrix(Matrices.asSparseMatrix(trainVectors),
+                     args(5), Format.SVDLIBC_SPARSE_TEXT)
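A note on the collections bridge used above: the bufferAsJavaList import lets the Scala ArrayBuffers be passed directly to Matrices.asSparseMatrix, which expects a java.util.List. A minimal illustration of the same implicit conversion (the firstElement method is a hypothetical stand-in):

    import scala.collection.JavaConversions.bufferAsJavaList
    import scala.collection.mutable.ArrayBuffer

    // A method with a java.util.List parameter, standing in for
    // Matrices.asSparseMatrix.
    def firstElement(xs: java.util.List[String]): String = xs.get(0)

    val buffer = ArrayBuffer("a", "b", "c")
    // The implicit conversion wraps the buffer as a java.util.List.
    println(firstElement(buffer))   // prints "a"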
