In [8]:
import csv
from collections import Counter

import pandas as pd
import torch
from sklearn.metrics import accuracy_score
from torchtext.vocab import vocab

In [9]:
# Options shared by every split file: tab-separated, no header row, and
# quoting disabled. With pandas' default quoting a field that *starts* with a
# double-quote character is parsed as a quoted field, which drops the quote
# characters and would silently misalign tokens with their labels.
READ_TSV_KWARGS = dict(sep='\t', header=None, quoting=csv.QUOTE_NONE)

train_set = pd.read_csv('./train/train.tsv', names=['labels', 'text'], **READ_TSV_KWARGS)

val_set = pd.read_csv('./dev-0/expected.tsv', names=['labels'], **READ_TSV_KWARGS)
# Select the column explicitly instead of assigning a whole DataFrame to it.
val_set['text'] = pd.read_csv('./dev-0/in.tsv', names=['text'], **READ_TSV_KWARGS)['text']

test_set = pd.read_csv('./test-A/in.tsv', names=['text'], **READ_TSV_KWARGS)

In [10]:
# Preview the first five rows of the training frame.
train_set.head(n=5)

Unnamed: 0,labels,text
0,B-ORG O B-MISC O O O B-MISC O O O B-PER I-PER ...,EU rejects German call to boycott British lamb...
1,O B-PER O O O O O O O O O B-LOC O O O O O O O ...,"Rare Hendrix song draft sells for almost $ 17,..."
2,B-LOC O B-LOC O O O O O O B-LOC O O B-LOC O O ...,China says Taiwan spoils atmosphere for talks ...
3,B-LOC O O O O B-LOC O O O B-LOC O O B-LOC O O ...,China says time right for Taiwan talks . </S> ...
4,B-MISC O O O O O O O O O O O B-LOC O O B-MISC ...,German July car registrations up 14.2 pct yr /...


In [11]:
# Preview the first five rows of the validation frame.
val_set.head(n=5)

Unnamed: 0,labels,text
0,O O B-ORG O O O O O O O O O B-LOC O O B-MISC I...,CRICKET - LEICESTERSHIRE TAKE OVER AT TOP AFTE...
1,O O B-MISC I-MISC I-MISC O O O B-LOC O O O O O...,CRICKET - ENGLISH COUNTY CHAMPIONSHIP SCORES ....
2,O O O B-MISC O O O B-LOC O O B-LOC O O O B-MIS...,CRICKET - 1997 ASHES INTINERARY . </S> LONDON ...
3,O O B-PER O O B-LOC O O O B-LOC O O O O O O O ...,SOCCER - SHEARER NAMED AS ENGLAND CAPTAIN . </...
4,O O O O O O O B-LOC O O O O O O O O O O O O O ...,BASKETBALL - INTERNATIONAL TOURNAMENT RESULT ....


In [12]:
# Preview the first five rows of the test frame.
test_set.head(n=5)

Unnamed: 0,text
0,"SOCCER - JAPAN GET LUCKY WIN , CHINA IN SURPRI..."
1,RUGBY UNION - CUTTITTA BACK FOR ITALY AFTER A ...
2,SOCCER - LATE GOALS GIVE JAPAN WIN OVER SYRIA ...
3,FREESTYLE SKIING-WORLD CUP MOGUL RESULTS . </S...
4,SOCCER - ASIAN CUP GROUP C RESULTS . </S> AL-A...


In [13]:
# Split every document on whitespace into a list of tokens / tag strings.
train_tokens = train_set["text"].map(str.split)
train_labels = train_set["labels"].map(str.split)

val_tokens = val_set["text"].map(str.split)
val_labels = val_set["labels"].map(str.split)

test_tokens = test_set["text"].map(str.split)

assert len(train_tokens) == len(train_labels)
assert len(val_tokens) == len(val_labels)

# Equal document counts are not enough: each document must also have exactly
# one label per token, otherwise the loss computation fails (or, worse, the
# tags are silently shifted).
for split_name, split_tokens, split_labels in (
    ('train', train_tokens, train_labels),
    ('val', val_tokens, val_labels),
):
    for doc_idx, (doc_tokens, doc_labels) in enumerate(zip(split_tokens, split_labels)):
        assert len(doc_tokens) == len(doc_labels), (
            f'{split_name} document {doc_idx}: '
            f'{len(doc_tokens)} tokens vs {len(doc_labels)} labels'
        )

print('Length of train_tokens:', len(train_tokens))
print('Length of val_tokens:', len(val_tokens))
print('Length of test_tokens:', len(test_tokens))

Length of train_tokens: 945
Length of val_tokens: 215
Length of test_tokens: 230


In [14]:
def build_vocab(dataset):
    """Build a torchtext vocabulary from an iterable of tokenised documents.

    Token frequencies are counted over every document in ``dataset`` and the
    counter is passed to ``vocab`` together with the special symbols
    ``<unk>``, ``<pad>``, ``<bos>`` and ``<eos>``.
    """
    special_symbols = ["<unk>", "<pad>", "<bos>", "<eos>"]
    token_counts = Counter(token for document in dataset for token in document)
    return vocab(token_counts, specials=special_symbols)
    

In [15]:
# The vocabulary is built from the training tokens only; any token that is
# not in it is mapped to the <unk> index.
v = build_vocab(train_tokens)
unk_index = v["<unk>"]
v.set_default_index(unk_index)

# Index -> token list; show the first ten entries as a sanity check.
itos = v.get_itos()
itos[:10]

['<unk>',
 '<pad>',
 '<bos>',
 '<eos>',
 'EU',
 'rejects',
 'German',
 'call',
 'to',
 'boycott']

In [16]:
def data_process(dt):
    """Encode tokenised documents as ``torch.long`` id tensors.

    Every token is looked up in the module-level vocabulary ``v`` and each
    document is wrapped with the ``<bos>`` id in front and the ``<eos>`` id at
    the end. Returns a list holding one 1-D tensor per document.
    """
    encoded = []
    for document in dt:
        ids = [v["<bos>"]]
        ids.extend(v[token] for token in document)
        ids.append(v["<eos>"])
        encoded.append(torch.tensor(ids, dtype=torch.long))
    return encoded

def labels_process(dt):
    """Turn per-document lists of integer tag ids into ``torch.long`` tensors.

    A ``0`` is added before and after each document's ids, so the result has
    two more entries than the input document. Returns one 1-D tensor per
    document.
    """
    boundary_id = 0
    tensors = []
    for document in dt:
        padded = [boundary_id] + document + [boundary_id]
        tensors.append(torch.tensor(padded, dtype=torch.long))
    return tensors

In [17]:
# Tag string -> integer class id. The position of a tag in the list is its id.
names = {
    tag: index
    for index, tag in enumerate(
        [
            "O",
            "B-PER",
            "I-PER",
            "B-ORG",
            "I-ORG",
            "B-LOC",
            "I-LOC",
            "B-MISC",
            "I-MISC",
        ]
    )
}

In [18]:
def covert_to_int(dt, tags):
    """Map every tag string of every document to its integer id.

    Args:
        dt: iterable of documents, each an iterable of tag strings.
        tags: mapping from tag string to integer id.

    Returns:
        A list with one list of ids per document, in the original order.

    Raises:
        KeyError: if a tag is missing from ``tags``.
    """
    return [[tags[tag] for tag in document] for document in dt]

In [19]:
# Token ids for all three splits (each document wrapped in <bos>/<eos>).
train_tokens_ids, val_tokens_ids, test_tokens_ids = map(
    data_process, (train_tokens, val_tokens, test_tokens)
)

# Tag ids for the two labelled splits.
train_labels_ids, val_labels_ids = (
    labels_process(covert_to_int(split_labels, tags=names))
    for split_labels in (train_labels, val_labels)
)

In [20]:
class LSTM(torch.nn.Module):
    """Token tagger: embedding -> ReLU -> LSTM -> linear layer over tags.

    Args:
        num_tags: number of output classes predicted for every token.
        vocab_size: number of rows in the embedding table. When omitted, the
            size of the module-level vocabulary ``v`` is used.
        emb_dim: embedding dimension (default 100).
        hidden_dim: LSTM hidden size (default 256).
        num_layers: number of stacked LSTM layers (default 1).
    """

    def __init__(self, num_tags, vocab_size=None, emb_dim=100, hidden_dim=256, num_layers=1):
        super().__init__()
        if vocab_size is None:
            # Backward-compatible default: size the table from the global vocabulary.
            vocab_size = len(v.get_itos())
        self.emb = torch.nn.Embedding(vocab_size, emb_dim)
        self.rec = torch.nn.LSTM(emb_dim, hidden_dim, num_layers, batch_first=True)
        self.fc1 = torch.nn.Linear(hidden_dim, num_tags)

    def forward(self, x):
        """Return unnormalised tag scores for every position.

        Args:
            x: integer tensor of token ids, shaped ``(batch, seq_len)``
                (the LSTM is built with ``batch_first=True``).

        Returns:
            Tensor of shape ``(batch, seq_len, num_tags)``.
        """
        # ReLU is applied to the embeddings before the recurrent layer.
        emb = torch.relu(self.emb(x))
        # Only the per-step outputs are needed; the final states are discarded.
        lstm_output, _ = self.rec(emb)
        return self.fc1(lstm_output)

In [32]:
# Training configuration.
EPOCHS = 10              # full passes over the training documents
LR = 0.001               # learning rate passed to the optimizer
NUM_TAGS = len(names)    # one output class per entry of the tag mapping
SEED = 42                # fixed seed so weight initialisation is repeatable

# Seed torch's global RNG before the model is created.
torch.manual_seed(SEED)

In [21]:
# Tagger, loss and optimizer used by the training loop.
model = LSTM(NUM_TAGS)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=LR)

In [22]:
# Train one document at a time (batch size 1) and evaluate on the validation
# split after every epoch.
# NOTE: both the loss and the reported accuracies include the <bos>/<eos>
# positions, whose label is always 0.
for epoch in range(EPOCHS):
    print(f'\nEpoch: {epoch} \n')

    model.train()

    train_true, train_pred = [], []
    running_loss = 0.0

    # Distinct loop names: the document loop must not reuse the epoch counter.
    for token_ids, label_ids in zip(train_tokens_ids, train_labels_ids):
        optimizer.zero_grad()

        # (seq_len,) -> (1, seq_len) for the model, then drop the batch axis
        # again so the scores are (seq_len, num_tags) for the loss.
        tag_scores = model(token_ids.unsqueeze(0)).squeeze(0)
        loss = criterion(tag_scores, label_ids)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        train_true += label_ids.tolist()
        train_pred += torch.argmax(tag_scores, 1).tolist()

    print(f'Train loss: {running_loss / len(train_tokens_ids)}')
    print(f'Train accuracy: {accuracy_score(train_true, train_pred)}\n')

    model.eval()

    val_true, val_pred = [], []

    with torch.no_grad():
        for token_ids, label_ids in zip(val_tokens_ids, val_labels_ids):
            tag_scores = model(token_ids.unsqueeze(0)).squeeze(0)
            val_true += label_ids.tolist()
            val_pred += torch.argmax(tag_scores, 1).tolist()

    print(f'Val accuracy: {accuracy_score(val_true, val_pred)}')
    


Epoch: 0 

Train loss: 0.6237388463720442
Train accuracy: 0.8464758833069116

Val accuracy: 0.8575231080563709

Epoch: 1 

Train loss: 0.4015273840733306
Train accuracy: 0.8781045572987846

Val accuracy: 0.8939855244207033

Epoch: 2 

Train loss: 0.27967684826365224
Train accuracy: 0.9128263998979573

Val accuracy: 0.9130735993874314

Epoch: 3 

Train loss: 0.20315357111788615
Train accuracy: 0.9354215638040052

Val accuracy: 0.9168292282729577

Epoch: 4 

Train loss: 0.14831078523848817
Train accuracy: 0.9521948286229706

Val accuracy: 0.9239393994640025

Epoch: 5 

Train loss: 0.10755618058007073
Train accuracy: 0.9645037263798538

Val accuracy: 0.9312136515286868

Epoch: 6 

Train loss: 0.07571866279640407
Train accuracy: 0.974138559376082

Val accuracy: 0.9325627609341671

Epoch: 7 

Train loss: 0.05353337679863365
Train accuracy: 0.9810765502286849

Val accuracy: 0.9166651473993181

Epoch: 8 

Train loss: 0.0376968718130787
Train accuracy: 0.9861194628182001

Val accuracy: 0.9405

In [31]:
def save_prediction(test_tokens, test_pred, file_name):
    """Write predictions as ``token<TAB>tag`` lines, one blank line per document.

    Args:
        test_tokens: sequence of documents, each a sequence of token strings.
        test_pred: sequence of per-document tag-id sequences;
            ``test_pred[i][j]`` is written next to ``test_tokens[i][j]``.
            Ids are positions in the key order of the module-level ``names``.
        file_name: path of the output file (overwritten).

    NOTE: predictions made from ids produced by ``data_process`` carry extra
    ``<bos>``/``<eos>`` positions. These must be removed before calling this
    function, otherwise every tag is shifted by one token.
    """
    # Built once instead of once per written token.
    id_to_tag = list(names.keys())
    with open(file_name, 'w') as f:
        # Iterate positionally so a pandas Series with any index works.
        for doc_idx, doc_tokens in enumerate(test_tokens):
            doc_pred = test_pred[doc_idx]
            for tok_idx, token in enumerate(doc_tokens):
                f.write(f'{token}\t{id_to_tag[doc_pred[tok_idx]]}\n')
            f.write('\n')

In [30]:
def predict_tags(token_id_docs):
    """Predict one tag id per real token for every encoded document.

    Each document tensor starts with <bos> and ends with <eos> (added by
    ``data_process``). The predictions for those two positions are dropped so
    that entry ``j`` of a result lines up with token ``j`` of the document.

    Returns:
        A list with one 1-D numpy array of tag ids per document.
    """
    predictions = []
    with torch.no_grad():
        for token_ids in token_id_docs:
            tag_scores = model(token_ids.unsqueeze(0)).squeeze(0)
            # [1:-1] removes the <bos>/<eos> positions; without it every tag
            # in the output file is shifted one token to the right.
            predictions.append(torch.argmax(tag_scores, 1)[1:-1].numpy())
    return predictions


# Make sure inference runs in evaluation mode.
model.eval()

val_pred = predict_tags(val_tokens_ids)
test_pred = predict_tags(test_tokens_ids)

save_prediction(test_tokens, test_pred, 'test-A/out.tsv')
save_prediction(val_tokens, val_pred, 'dev-0/out.tsv')