In [1]:
import numpy as np
import random
import time 
import re
from collections import defaultdict

import torch
from torch import nn
from torch.autograd import Variable

In [4]:
# Toy language-identification examples: each entry is (list of tokens, language label).
train_data = [
    (sentence.split(), language)
    for sentence, language in (
        ("me gusta comer en la cafeteria", "SPANISH"),
        ("Give it to me", "ENGLISH"),
        ("No creo que sea una buena idea", "SPANISH"),
        ("No it is not a good idea to get lost at sea", "ENGLISH"),
    )
]

# Held-out examples in the same (tokens, label) layout.
test_data = [
    (sentence.split(), language)
    for sentence, language in (
        ("Yo creo que si", "SPANISH"),
        ("it is lost on me", "ENGLISH"),
        ("jon jones is the greatest", "ENGLISH"),
        ("Yo creo que si", "SPANISH"),
    )
]

In [2]:
# Functions to read in the corpus
# Word and tag vocabularies: looking up an unseen key assigns it the next free
# integer id, so ids are handed out in order of first appearance.
w2i = defaultdict(lambda: len(w2i))
t2i = defaultdict(lambda: len(t2i))
UNK = w2i["<unk>"]  # first lookup on the empty dict, so "<unk>" gets id 0


def read_dataset(filename):
    """Yield ``(word_ids, tag_id)`` pairs from a ``tag ||| sentence`` corpus file.

    Each non-blank line is lower-cased, stripped, and split once on the
    `` ||| `` separator. The sentence is tokenised on runs of whitespace and
    every token / tag is mapped to an integer through the module-level
    ``w2i`` / ``t2i`` dictionaries (which may grow as a side effect).

    Args:
        filename: path of the corpus file, read as UTF-8 text.

    Yields:
        A tuple ``(list_of_word_ids, tag_id)`` per non-blank line.

    Raises:
        ValueError: if a non-blank line does not contain the `` ||| `` separator.
    """
    with open(filename, "r", encoding="utf-8") as f:
        for line in f:
            line = line.lower().strip()
            if not line:
                # Blank lines (e.g. a trailing newline at EOF) carry no example.
                continue
            # maxsplit=1 keeps any later " ||| " inside the sentence text.
            tag, words = line.split(" ||| ", 1)
            # split() without an argument collapses repeated spaces, so no
            # empty-string "word" ever enters the vocabulary.
            yield ([w2i[x] for x in words.split()], t2i[tag])

# Load the training split first so every training word receives its own id.
train = [example for example in read_dataset("train.txt")]
# Freeze the vocabulary: from here on, a word that is not already in w2i maps to UNK.
w2i = defaultdict(lambda: UNK, w2i)
# The held-out split is read with the frozen vocabulary.
dev = [example for example in read_dataset("test.txt")]


In [14]:
class CBow(nn.Module):
  """Continuous bag-of-words classifier.

  Looks up an embedding for every word id, adds the embeddings together and
  maps the summed vector to one score per tag with a single linear layer.
  """

  def __init__(self, nwords, n_tags, emb_size):
    """Create the embedding table and the output layer.

    Args:
      nwords: number of rows in the embedding table.
      n_tags: number of scores produced by the linear layer.
      emb_size: width of each embedding vector.
    """
    super().__init__()
    # Both weight matrices use Xavier/Glorot uniform initialisation
    # (Glorot, X. & Bengio, Y. (2010)); the linear bias keeps its default init.
    self.emb = nn.Embedding(num_embeddings=nwords, embedding_dim=emb_size)
    nn.init.xavier_uniform_(self.emb.weight)
    self.linear = nn.Linear(in_features=emb_size, out_features=n_tags)
    nn.init.xavier_uniform_(self.linear.weight)

  def forward(self, words):
    """Score one sentence.

    Args:
      words: tensor of word ids; embeddings are summed over its first dimension.

    Returns:
      The linear layer's output for the summed embedding, flattened to a
      single row first (shape ``(1, n_tags)`` for a 1-D ``words`` tensor).
    """
    bag_of_words = self.emb(words).sum(dim=0).view(1, -1)
    return self.linear(bag_of_words)

In [15]:
EMB_SIZE = 64  # width of each word-embedding vector
SEED = 0       # fixed seed so weight initialisation and shuffling repeat across runs

# Seed the RNGs this notebook draws from: `random` (used by random.shuffle on the
# training set) and torch (used when the model's weights are initialised).
random.seed(SEED)
torch.manual_seed(SEED)

# Vocabulary and tag-set sizes, taken from the dictionaries filled while reading the corpus.
nwords = len(w2i)
ntags = len(t2i)

In [16]:
# Tensor type passed to `.type(...)` when index tensors are built in the training loop.
# NOTE(review): this name shadows Python's builtin `type` for the rest of the notebook;
# renaming it requires updating every cell that calls `.type(type)`.
type = torch.LongTensor

In [17]:
# Wire up the classifier, its loss function, and the optimiser (Adam with library defaults).
model = CBow(nwords=nwords, n_tags=ntags, emb_size=EMB_SIZE)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters())

In [None]:
# Train the classifier one sentence at a time, then measure accuracy on the
# held-out `dev` split after every epoch.
for epoch in range(100):
  # ---- training pass ----
  model.train()
  random.shuffle(train)  # reorders `train` in place, giving a new order each epoch
  train_loss = 0.0
  for words , tag in train:
    # dtype=torch.long yields the same int64 tensors as `.type(torch.LongTensor)`
    # without relying on the notebook-level `type` alias that shadows the builtin.
    words_tens = torch.tensor(words, dtype=torch.long)
    tags_tens = torch.tensor([tag], dtype=torch.long)
    scores = model(words_tens)
    loss = loss_fn(scores , tags_tens)
    train_loss += loss.item()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
  if epoch % 20 == 0:
    print(f"'epoch: {epoch}' : , train_loss : {train_loss / len(train)}" )

  # ---- evaluation pass ----
  # Score the held-out examples (not the training set), so the numerator matches
  # the len(dev) denominator and the "test acc" label.
  model.eval()
  test_correct = 0
  with torch.no_grad():  # no gradients are needed for prediction
    for words , tag in dev:
      words_tens = torch.tensor(words, dtype=torch.long)
      scores = model(words_tens)[0].cpu().numpy()
      predict = np.argmax(scores)
      if predict == tag:
        test_correct += 1
  # Reported once per epoch, after all dev examples have been scored.
  print("iter %r: test acc=%.4f" % (epoch, test_correct/len(dev)))

In [None]:
# Number of word ids in the example currently at index 2 of `train`
# (the list is shuffled in place by the training loop, so this varies between runs).
len(train[2][0])

(15314, 8544)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
36
9
29
10
11
27
32
41
9
20
24
13
12
20
28
17
11
21
17
24
14
6
15
21
27
15
6
5
23
7
33
23
10
12
10
28
30
19
19
13
14
19
19
37
22
9
22
19
17
25
21
10
24
45
15
16
25
17
17
18
42
34
27
21
19
26
3
9
10
11
10
18
17
12
14
10
29
28
28
8
14
5
11
11
29
14
23
18
20
8
17
16
16
12
28
32
24
15
19
8
14
14
26
12
7
17
37
23
25
20
22
13
23
26
20
11
17
13
15
17
18
10
11
21
37
6
6
15
13
38
16
30
23
15
20
13
12
13
18
13
16
26
19
32
19
33
22
9
21
27
17
18
23
14
29
14
22
22
13
27
18
19
7
22
7
18
12
29
22
8
17
23
12
15
25
27
16
16
6
16
25
10
23
17
20
30
30
22
17
12
16
12
27
22
5
11
20
22
18
34
43
11
22
32
17
20
24
20
23
15
20
11
6
24
11
13
14
16
33
8
10
28
19
6
21
10
7
3
34
14
15
6
25
11
37
5
18
22
29
18
13
7
8
26
18
11
38
16
20
13
42
20
14
6
19
22
16
11
34
19
20
19
26
14
22
20
37
15
11
4
23
16
14
40
15
30
13
4
28
20
10
30
18
31
20
26
27
6
28
25
18
14
25
16
24
26
14
9
32
33
23
19
36
26
12
30
19
33
20
4
23
6
20
23
21
5
17
32
15
12
15
17
15
27
32