In [2]:
#original
from collections import defaultdict
## defaultdict is a subclass of the built-in dict class.
## It overrides one method (__missing__) and adds one writable instance variable (default_factory).
import time
import random
# pip install dynet
import dynet as dy
import numpy as np

In [3]:
#original
# Functions to read in the corpus
# Vocabularies mapping words / tags to integer ids.
# defaultdict calls its default factory when a missing key is looked up and
# stores the result under that key. With `lambda: len(w2i)` as the factory,
# every previously unseen word receives the next free id (0, 1, 2, ...), so
# simply indexing the dictionary builds the vocabulary.
# (Other common factories: defaultdict(int) to count occurrences of each key,
# defaultdict(list) to collect arbitrary items per key.)
w2i = defaultdict(lambda: len(w2i))
t2i = defaultdict(lambda: len(t2i))
# Look up "<unk>" first so that the unknown-word token gets id 0.
UNK = w2i["<unk>"]


def read_dataset(filename):
  """Lazily read a labelled corpus and encode it as integer ids.

  Each non-blank line of the file must have the form ``tag ||| word word ...``.
  Lines are lower-cased and stripped before parsing; blank lines are skipped.

  Args:
    filename: Path of the UTF-8 text file to read.

  Yields:
    A ``(word_ids, tag_id)`` tuple per example, where ``word_ids`` is the list
    of ``w2i`` ids of the space-separated words and ``tag_id`` is the ``t2i``
    id of the tag. Looking up ids may add new entries to ``w2i`` / ``t2i``.

  Raises:
    ValueError: If a non-blank line does not contain exactly one " ||| ".
  """
  # Decode explicitly as UTF-8 so the result does not depend on the platform locale.
  with open(filename, "r", encoding="utf-8") as f:
    for line in f:
      line = line.lower().strip()
      if not line:
        # Tolerate blank lines (e.g. a trailing newline at the end of the file).
        continue
      tag, words = line.split(" ||| ")
      yield ([w2i[x] for x in words.split(" ")], t2i[tag])

In [6]:
# Show the initial vocabularies and the id reserved for unknown words, one per line.
print(w2i, t2i, UNK, sep="\n")

defaultdict(<function <lambda> at 0x10ad1b400>, {'<unk>': 0})
defaultdict(<function <lambda> at 0x10cf0af28>, {})
0


In [7]:
#original
# Read in the data
# Reading the training set grows w2i / t2i: each new word or tag gets the next free id.
train = list(read_dataset("../data/classes/train.txt"))
# Freeze the vocabulary: after the training dictionary is built, any new word is
# mapped to UNK (id 0) instead of receiving a fresh id. This lets us measure
# generalization, since the training data does not contain every word of the test data.
w2i = defaultdict(lambda: UNK, w2i)
dev = list(read_dataset("../data/classes/test.txt"))
# NOTE(review): a defaultdict stores the default value on lookup, so words that
# occur only in test.txt are inserted into w2i (with value UNK) and are therefore
# included in len(w2i) below. t2i is not frozen, so an unseen tag would still get a new id.
nwords = len(w2i)
ntags = len(t2i)

In [21]:
# Report vocabulary size, number of tags, and the train / dev example counts, one per line.
for size in (nwords, ntags, len(train), len(dev)):
  print(size)

18648
5
8544
2210


In [28]:
# Display the encoded training set: a list of (word-id list, tag id) pairs.
train

[([1,
   2,
   3,
   4,
   5,
   6,
   1,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   9,
   17,
   5,
   18,
   19,
   20,
   21,
   22,
   23,
   24,
   25,
   26,
   27,
   28,
   29,
   30,
   31,
   32,
   33],
  0),
 ([1,
   34,
   35,
   36,
   37,
   11,
   1,
   38,
   37,
   1,
   39,
   13,
   40,
   3,
   41,
   42,
   15,
   19,
   43,
   37,
   44,
   45,
   46,
   47,
   48,
   49,
   50,
   51,
   9,
   52,
   53,
   37,
   54,
   55,
   9,
   56,
   33],
  1),
 ([57,
   58,
   59,
   60,
   19,
   61,
   37,
   62,
   63,
   19,
   64,
   65,
   66,
   26,
   19,
   64,
   67,
   68,
   69,
   5,
   1,
   70,
   63,
   71,
   1,
   72,
   73,
   74,
   75,
   1,
   76,
   26,
   77,
   26,
   78,
   37,
   1,
   79,
   33],
  0),
 ([80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 37, 90, 91, 92, 93, 94, 37, 95, 33],
  2),
 ([96, 1, 97, 3, 98, 99, 100, 33], 0),
 ([101,
   30,
   46,
   80,
   102,
   103,
   83,
   104,
   37,
   105,
   9,
   106,
   1

In [27]:
# Display the encoded dev set: a list of (word-id list, tag id) pairs; id 0 marks an unknown word.
dev

[([1795, 71, 0, 448], 2),
 ([233,
   80,
   513,
   167,
   5,
   871,
   5,
   1,
   597,
   5,
   87,
   146,
   26,
   2966,
   3,
   19,
   145,
   1268,
   5,
   4296,
   33],
  0),
 ([2208,
   226,
   1690,
   1025,
   26,
   110,
   2286,
   309,
   15,
   9,
   41,
   821,
   14,
   3126,
   3127,
   15,
   132,
   944,
   316,
   1033,
   167,
   285,
   33],
  1),
 ([1,
   266,
   640,
   189,
   279,
   822,
   313,
   1,
   10168,
   5402,
   37,
   239,
   6809,
   63,
   21,
   562,
   395,
   87,
   10225,
   1,
   8131,
   5653,
   37,
   1,
   2737,
   33],
  2),
 ([1026, 15, 1025, 2698, 37, 1202, 14, 5904, 33], 1),
 ([1616,
   860,
   1058,
   669,
   484,
   245,
   67,
   12451,
   0,
   15,
   1,
   652,
   5,
   1527,
   3,
   12218,
   93,
   145,
   4620,
   33],
  0),
 ([2378,
   557,
   118,
   19,
   5728,
   137,
   15,
   0,
   171,
   1,
   3337,
   180,
   132,
   9,
   41,
   1669,
   80,
   506,
   5,
   3486,
   132,
   33],
  0),
 ([71, 16, 2100, 3145

In [26]:
# Word ids of the first dev example.
dev[0][0]

[1795, 71, 0, 448]

In [10]:
#original
# Tag id of the first training example.
train[0][1]

0

In [11]:
#original
# Start DyNet and define trainer
# dy.Model() creates the container that holds the model's Parameters and
# LookupParameters; it is used to create, load and save them.
# In current DyNet this class is called dynet.ParameterCollection.
model = dy.Model()
# The Adam optimizer is similar to RMSProp but uses unbiased estimates of the
# first and second moments of the gradient.
trainer = dy.AdamTrainer(model)

In [29]:
#original
# Define the model
# A lookup parameter is a table of parameters, used to embed a set of discrete
# objects (e.g. word embeddings); its rows are updated sparsely.
# Here the table has nwords rows of ntags values each, i.e. one score per tag for every word id.
W_sm = model.add_lookup_parameters((nwords, ntags)) # Word weights
# Plain parameters are the values that get optimized during training.
# Note that (ntags) is just the int ntags in parentheses, not a 1-tuple.
b_sm = model.add_parameters((ntags))                # Softmax bias

In [32]:
# Display the lookup-parameter object holding the word weights.
W_sm

LookupParameter /_0

In [31]:
# Display the parameter object holding the softmax bias.
b_sm

Parameter /_1

In [None]:
#original
# A function to calculate scores for one value
def calc_scores(words):
  """Build the score expression for one sentence.

  Args:
    words: Iterable of word ids used as row indices into W_sm.

  Returns:
    A DyNet expression: the elementwise sum of the W_sm rows of all given
    words, plus the bias b_sm.
  """
  # DyNet keeps a single global computation graph; renew_cg() discards the
  # current one and starts a fresh graph for this sentence.
  dy.renew_cg()
  # esum performs an elementwise sum over all the looked-up row expressions.
  summed_rows = dy.esum([dy.lookup(W_sm, word_id) for word_id in words])
  # Bring the bias parameter into the current graph and add it to the sum.
  return summed_rows + dy.parameter(b_sm)

In [None]:
#original
# Run 100 passes over the training set; after each pass, report the average
# training loss per sentence and the accuracy on dev.
for ITER in range(100):
  # Perform training
  # Shuffle in place so the examples are visited in a different order each pass.
  random.shuffle(train)
  train_loss = 0.0
  start = time.time()
  for words, tag in train:
    # Loss for one sentence: negative log of the softmax probability of the gold tag.
    my_loss = dy.pickneglogsoftmax(calc_scores(words), tag)
    # value() evaluates the loss; it is accumulated for the per-sentence average below.
    train_loss += my_loss.value()
    # Backpropagate, then let the trainer apply the parameter update for this sentence.
    my_loss.backward()
    trainer.update()
  print("iter %r: train loss/sent=%.4f, time=%.2fs" % (ITER, train_loss/len(train), time.time()-start))
  # Perform testing
  # test_correct is a float so the accuracy below is a true division.
  test_correct = 0.0
  for words, tag in dev:
    # Predict the tag with the highest score; no loss or update is computed here.
    scores = calc_scores(words).npvalue()
    predict = np.argmax(scores)
    if predict == tag:
      test_correct += 1
  print("iter %r: test acc=%.4f" % (ITER, test_correct/len(dev)))