Skip to content

Commit

Permalink
initial code for metal
Browse files Browse the repository at this point in the history
  • Loading branch information
ZhengTang1120 committed Sep 20, 2021
1 parent c9ec5b8 commit 9311763
Show file tree
Hide file tree
Showing 4 changed files with 181 additions and 4 deletions.
58 changes: 58 additions & 0 deletions main/src/main/python/pytorch/metal.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
from utils import Utils
from collections import Counter
# The committed module is sequences/rowReaders.py (plural); importing the
# singular `sequences.rowReader` would raise ImportError at import time.
from sequences.rowReaders import *
# NOTE(review): `Layers` is referenced in Metal.initialize() but never
# imported in this module — confirm where it is defined and add the import.

class Metal():
    """Multi-task learning (MTL) coordinator.

    Holds one Layers object per task, where model[0] contains the Layers
    shared between all tasks (if any) and model[i + 1] contains the layers
    for task i.  (Python port of the Scala `Metal` class — the original
    code here still used Scala syntax such as `s"..."`, `Some(...)`,
    `false`, and `()`-indexing, all of which are fixed below.)
    """

    def __init__(self, taskManager, parameters, modelOpt):
        # Store collaborators FIRST: initialize() reads self.taskManager
        # and self.parameters, so assigning them after the call (as the
        # original did) would raise AttributeError.
        self.taskManager = taskManager
        self.parameters = parameters
        # One Layers object per task; model[0] contains the Layers shared
        # between all tasks (if any).
        if modelOpt:
            self.model = modelOpt
        else:
            self.model = self.initialize()

    def initialize(self):
        """Build and return the list of per-task Layers.

        Index 0 holds the shared layers; index i + 1 holds the layers for
        task i.
        """
        taskWords, taskLabels = self.mkVocabularies()

        layersPerTask = [None for _ in range(self.taskManager.taskCount + 1)]

        # Shared layers: no labels, not dual, input size inferred.
        layersPerTask[0] = Layers(self.taskManager, "mtl.layers",
                                  self.parameters, taskWords[0], None,
                                  isDual=False, providedInputSize=None)

        inputSize = layersPerTask[0].outDim

        for i in self.taskManager.indices:
            layersPerTask[i + 1] = Layers(
                self.taskManager, f"mtl.task{i + 1}.layers", self.parameters,
                taskWords[i + 1], taskLabels[i + 1],
                isDual=self.taskManager.tasks[i].isDual,
                providedInputSize=inputSize)

        for i in range(len(layersPerTask)):
            print(f"Summary of layersPerTask[{i}]:")
            print(layersPerTask[i])

        return layersPerTask

    def mkVocabularies(self):
        """Count words and labels over the training sentences of all tasks.

        Returns (words, labels): two lists of Counter, where index 0 is the
        shared vocabulary and index tid + 1 corresponds to task tid.
        """
        # index 0 reserved for the shared Layers; tid + 1 corresponds to each task
        labels = [Counter() for _ in range(self.taskManager.taskCount + 1)]
        for i in range(1, len(labels)):  # labels[0] not used: only task-specific layers have a final layer
            labels[i][Utils.START_TAG] += 1
            labels[i][Utils.STOP_TAG] += 1

        words = [Counter() for _ in range(self.taskManager.taskCount + 1)]

        reader = MetalRowReader()

        for tid in self.taskManager.indices:
            for sentence in self.taskManager.tasks[tid].trainSentences:
                annotatedSentences = reader.toAnnotatedSentences(sentence)

                for annotatedSentence, sentenceLabels in annotatedSentences:
                    # Count each token's word (task-specific + shared) and
                    # its label; the label count belongs inside this loop,
                    # where `i` is in scope (it was dangling outside in the
                    # original).
                    for i, word in enumerate(annotatedSentence.words):
                        words[tid + 1][word] += 1
                        words[0][word] += 1
                        labels[tid + 1][sentenceLabels[i]] += 1

        return words, labels

9 changes: 5 additions & 4 deletions main/src/main/python/pytorch/taskManager.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import random
import math
from sequences.columnReader import ColumnReader
from dataclasses import dataclass

TYPE_BASIC = 0
TYPE_DUAL = 1
Expand Down Expand Up @@ -102,11 +103,11 @@ def debugTraversal(self):
print (f"Read {sentCount} sentences from task {taskId}")
print (f"Read {totalSents} sentences in epoch {epoch}.")

@dataclass
class Shard:
    """A contiguous slice of one task's training sentences.

    The scraped diff merged the removed hand-written __init__ with the
    added dataclass fields; this is the final dataclass form.
    """
    taskId: int          # index of the task this shard belongs to
    startPosition: int   # index of the first sentence in the shard
    endPosition: int     # index of the last sentence boundary — presumably exclusive; TODO confirm against traversal code

class Task:
def __init__(self,
Expand Down
22 changes: 22 additions & 0 deletions main/src/main/python/pytorch/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@

class Utils:
    """Project-wide constants.

    These are read as CLASS attributes elsewhere (e.g. `Utils.START_TAG`
    in metal.py), but the original defined them as instance attributes in
    __init__, so class-level access raised AttributeError.  They are
    defined at class level here; class attributes remain readable through
    instances, so `Utils().START_TAG` still works.
    """

    # Mutable bookkeeping counter for concatenate operations.
    concatenateCount = 0

    UNK_WORD = "<UNK>"
    EOS_WORD = "<EOS>"

    UNK_EMBEDDING = 0

    START_TAG = "<START>"
    STOP_TAG = "<STOP>"

    # used for both DyNet, and the JVM seed for shuffling data
    # (the Python 2 `L` long-literal suffix was a SyntaxError in Python 3
    # and has been removed; Python 3 ints are arbitrary precision anyway)
    RANDOM_SEED = 2522620396
    WEIGHT_DECAY = 1e-5

    LOG_MIN_VALUE = -10000.0

    DEFAULT_DROPOUT_PROBABILITY = 0.0  # no dropout by default

    IS_DYNET_INITIALIZED = False

96 changes: 96 additions & 0 deletions main/src/main/python/sequences/rowReaders.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@

class AnnotatedSentence:
    """A sentence plus optional per-token annotations.

    All annotation lists, when present, are parallel to `words`.
    """

    def __init__(self, words, posTags=None, neTags=None, headPositions=None):
        self.words = words
        self.posTags = posTags
        self.neTags = neTags
        self.headPositions = headPositions
        self.size = len(words)
        # Callers (e.g. Metal.mkVocabularies) read `.indices`, but the
        # original only defined the misspelled `.indicies`, causing an
        # AttributeError.  Provide the correct name and keep the old
        # misspelling as an alias for backward compatibility.
        self.indices = range(self.size)
        self.indicies = self.indices

class RowReader(object):
    """Abstract interface for converting raw input rows into annotated
    sentences.  Concrete subclasses (e.g. MetalRowReader) must define
    their own constructor and implement toAnnotatedSentences.
    """

    def __init__(self):
        # Abstract: raising here prevents direct instantiation of the base
        # class; subclasses supply their own __init__ without calling super.
        raise NotImplementedError

    def toAnnotatedSentences(self, rows):
        # Abstract: convert `rows` into (AnnotatedSentence, labels) pairs.
        raise NotImplementedError

class MetalRowReader(RowReader):
    """Reads sentences in the Metal column format.

    Dispatches on the number of columns in the first row:
      2   -> word, label                          (parseSimple)
      4   -> word, POS tag, NE label, label       (parseSimpleExtended)
      5+  -> word, POS tag, NE label, (label, head)+  (parseFull)

    All parse methods return the SAME shape: a list of
    (AnnotatedSentence, labels) tuples, as the caller in metal.py expects
    (the original parseSimple/parseSimpleExtended returned bare tuples,
    inconsistent with parseFull).
    """

    def __init__(self):
        # Column layout of the Metal format.
        self.WORD_POSITION = 0
        self.POS_TAG_POSITION = 1
        self.NE_LABEL_POSITION = 2
        self.LABEL_START_OFFSET = 3

    def toAnnotatedSentences(self, rows):
        """Dispatch on column count and RETURN the parsed sentences.

        The original called the parse methods without `self` and discarded
        their results, so this method always returned None.
        NOTE(review): the original used Scala's `rows.head`; `rows[0]`
        assumes `rows` is indexable — confirm against ColumnReader.
        """
        width = len(rows[0])
        if width == 2:
            return self.parseSimple(rows)
        elif width == 4:
            return self.parseSimpleExtended(rows)
        elif width >= 5:
            return self.parseFull(rows)
        else:
            raise RuntimeError("ERROR: the Metal format expects 2, 4, or 5+ columns!")

    # Parser for the simple format: word, label
    def parseSimple(self, rows):
        assert len(rows[0]) == 2
        words = []
        labels = []

        # NOTE(review): rows are assumed to expose a .get(column) accessor
        # (mirroring the Scala Row) — confirm against the Row class.
        for row in rows:
            words.append(row.get(self.WORD_POSITION))
            labels.append(row.get(self.WORD_POSITION + 1))

        # One-element list, so every parse* method has the same shape.
        return [(AnnotatedSentence(words), labels)]

    # Parser for the simple extended format: word, POS tag, NE label, label
    def parseSimpleExtended(self, rows):
        assert len(rows[0]) == 4
        words = []
        posTags = []
        neLabels = []
        labels = []

        for row in rows:
            words.append(row.get(self.WORD_POSITION))
            posTags.append(row.get(self.POS_TAG_POSITION))
            neLabels.append(row.get(self.NE_LABEL_POSITION))
            labels.append(row.get(self.LABEL_START_OFFSET))

        return [(AnnotatedSentence(words, posTags, neLabels), labels)]

    # Parser for the full format: word, POS tag, NE label, (label head)+
    def parseFull(self, rows):
        assert len(rows[0]) >= 5
        # Each annotated sentence contributes a (label, head) column pair
        # after the 3 fixed columns.  Integer division is required: the
        # original `/` produced a float, which breaks range() below.
        numSent = (len(rows[0]) - 3) // 2
        assert numSent >= 1

        words = []
        posTags = []
        neLabels = []
        headPositions = [[] for _ in range(numSent)]
        labels = [[] for _ in range(numSent)]

        for row in rows:
            words.append(row.get(self.WORD_POSITION))
            posTags.append(row.get(self.POS_TAG_POSITION))
            neLabels.append(row.get(self.NE_LABEL_POSITION))

            for j in range(numSent):
                labels[j].append(row.get(self.LABEL_START_OFFSET + (j * 2)))
                try:
                    headPositions[j].append(int(row.get(self.LABEL_START_OFFSET + (j * 2) + 1)))
                except ValueError as e:
                    # Narrow catch + chained cause instead of the original
                    # bare `except: raise RuntimeError`, which discarded
                    # all diagnostic information.
                    raise RuntimeError(f"ERROR: invalid head position in row: {row}") from e

        sentences = []
        for i in range(numSent):
            annotatedSent = AnnotatedSentence(words, posTags, neLabels, headPositions[i])
            sentences.append((annotatedSent, labels[i]))

        return sentences

0 comments on commit 9311763

Please sign in to comment.