going through http://web.stanford.edu/class/cs224n/ and https://github.com/bentrevett/pytorch-pos-tagging/blob/master/1%20-%20BiLSTM%20for%20PoS%20Tagging.ipynb

Collecting torchtext
  Using cached https://files.pythonhosted.org/packages/f2/17/e7c588245aece7aa93f360894179374830daf60d7ed0bbb59332de3b3b61/torchtext-0.6.0-py3-none-any.whl
Collecting torch (from torchtext)
[?25l  Downloading https://files.pythonhosted.org/packages/59/6f/cdee668c94f5efb3745e9485765fd6b4918a855f7d36c0514ddf38daaddf/torch-1.5.0-cp37-none-macosx_10_9_x86_64.whl (80.5MB)
[K    100% |████████████████████████████████| 80.5MB 437kB/s eta 0:00:011
Collecting tqdm (from torchtext)
  Using cached https://files.pythonhosted.org/packages/c9/40/058b12e8ba10e35f89c9b1fdfc2d4c7f8c05947df2d5eb3c7b258019fda0/tqdm-4.46.0-py2.py3-none-any.whl
Collecting sentencepiece (from torchtext)
[?25l  Downloading https://files.pythonhosted.org/packages/84/e3/2d755b55423787f438269a26d8bd9743698921fdcde748c6fb050b1c1b8c/sentencepiece-0.1.91-cp37-cp37m-macosx_10_6_x86_64.whl (1.1MB)
[K    100% |████████████████████████████████| 1.1MB 4.7MB/s ta 0:00:011
Collecting future (from torch->torchtext)

In [6]:
!{sys.executable} -m pip install random

Collecting random
[31m  Could not find a version that satisfies the requirement random (from versions: )[0m
[31mNo matching distribution found for random[0m


In [9]:
import sys
# !{sys.executable} -m pip install torchtext

import torch
import torch.nn as nn
import torch.optim as optim

from torchtext import data
from torchtext import datasets

import spacy
import numpy as np

import time
import random

In [5]:
# !pip3 install spacy

In [10]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [11]:
LAT = data.Field(sequential=True)
LON = data.Field(sequential=True)
MAN_TAGS = data.Field(sequential = True, unk_token = None)

In [20]:
#only uses lat and lon
fields = (("time", None), ("lat", LAT), ("lon", LON), ('sog', None), ('cog', None), ('boat_class', None), ('tag', MAN_TAGS), ('tag_name', None), ('heel', None), ('clew_load', None), ('pitch', None), ('rudder', None))

In [37]:
data_train = data.TabularDataset('/Users/cmkerner/Documents/sailing_ml/data/ML Training Resources/MLData_153.csv', 'CSV', fields, skip_header=True)
data_test = data.TabularDataset('/Users/cmkerner/Documents/sailing_ml/data/ML Training Resources/MLData104.csv', 'CSV', fields, skip_header=True)
data_val = data.TabularDataset('/Users/cmkerner/Documents/sailing_ml/data/ML Training Resources/MLData75.csv', 'CSV', fields, skip_header=True)

In [38]:
#working with one track, we can split it later or use another track to test
# train_data, valid_data, test_data = data_test.splits(fields)

In [39]:
data_test

<torchtext.data.dataset.TabularDataset at 0x12d27d5f8>

In [41]:
print(f"Number of training examples: {len(data_train)}")

Number of training examples: 7084


In [42]:
print(vars(data_test.examples[4]))

{'lat': ['-38.096078'], 'lon': ['144.403721'], 'tag': ['S']}


In [30]:
print(vars(data_test.examples[0])['lat'])


['33.746611']


In [15]:
# we will be using UD tags - universal dependancy 

In [52]:
MIN_FREQ = 2

LAT.build_vocab(data_train, 
                 min_freq = MIN_FREQ,
                 vectors = "glove.6B.100d",
                 unk_init = torch.Tensor.normal_)

LON.build_vocab(data_train, 
                 min_freq = MIN_FREQ,
                 vectors = "glove.6B.100d",
                 unk_init = torch.Tensor.normal_)


# LAT.build_vocab(data_train)
# LON.build_vocab(data_train)
MAN_TAGS.build_vocab(data_train)

In [53]:
print(f"Unique tokens in TEXT vocabulary: {len(LAT.vocab)}")
print(f"Unique tokens in UD_TAG vocabulary: {len(LON.vocab)}")
print(f"Unique tokens in PTB_TAG vocabulary: {len(MAN_TAGS.vocab)}")

Unique tokens in TEXT vocabulary: 1082
Unique tokens in UD_TAG vocabulary: 1260
Unique tokens in PTB_TAG vocabulary: 9


In [80]:
BATCH_SIZE = 128

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator= data.BucketIterator(
    data_train, 
    batch_size = BATCH_SIZE,
    device = device,
    shuffle=None)

valid_iterator = data.BucketIterator(
    data_val, 
    batch_size = BATCH_SIZE,
    device = device,
    shuffle=None)

test_iterator = data.BucketIterator(
    data_test, 
    batch_size = BATCH_SIZE,
    device = device,
    shuffle=None)

In [81]:

class BiLSTMPOSTagger(nn.Module):
    def __init__(self, 
                 input_dim, 
                 embedding_dim, 
                 hidden_dim, 
                 output_dim, 
                 n_layers, 
                 bidirectional, 
                 dropout, 
                 pad_idx):
        
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx = pad_idx)
        
        self.lstm = nn.LSTM(embedding_dim, 
                            hidden_dim, 
                            num_layers = n_layers, 
                            bidirectional = bidirectional,
                            dropout = dropout if n_layers > 1 else 0)
        
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):

        #text = [sent len, batch size]
        
        #pass text through embedding layer
        embedded = self.dropout(self.embedding(text))
        
        #embedded = [sent len, batch size, emb dim]
        
        #pass embeddings into LSTM
        outputs, (hidden, cell) = self.lstm(embedded)
        
        #outputs holds the backward and forward hidden states in the final layer
        #hidden and cell are the backward and forward hidden and cell states at the final time-step
        
        #output = [sent len, batch size, hid dim * n directions]
        #hidden/cell = [n layers * n directions, batch size, hid dim]
        
        #we use our outputs to make a prediction of what the tag should be
        predictions = self.fc(self.dropout(outputs))
        
        #predictions = [sent len, batch size, output dim]
        
        return predictions

In [82]:
INPUT_DIM = len(LAT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 32
OUTPUT_DIM = len(MAN_TAGS.vocab)
N_LAYERS = 2
BIDIRECTIONAL = False
DROPOUT = 0.25
PAD_IDX = LAT.vocab.stoi[LAT.pad_token]

model = BiLSTMPOSTagger(INPUT_DIM, 
                        EMBEDDING_DIM, 
                        HIDDEN_DIM, 
                        OUTPUT_DIM, 
                        N_LAYERS, 
                        BIDIRECTIONAL, 
                        DROPOUT, 
                        PAD_IDX)

In [83]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.normal_(param.data, mean = 0, std = 0.1)
        
model.apply(init_weights)

BiLSTMPOSTagger(
  (embedding): Embedding(1082, 100, padding_idx=1)
  (lstm): LSTM(100, 32, num_layers=2, dropout=0.25)
  (fc): Linear(in_features=32, out_features=9, bias=True)
  (dropout): Dropout(p=0.25, inplace=False)
)

In [84]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 134,097 trainable parameters


In [85]:
pretrained_embeddings = LAT.vocab.vectors

print(pretrained_embeddings.shape)

torch.Size([1082, 100])


In [86]:
model.embedding.weight.data.copy_(pretrained_embeddings)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)


In [87]:
optimizer = optim.Adam(model.parameters())


In [88]:
TAG_PAD_IDX = MAN_TAGS.vocab.stoi[MAN_TAGS.pad_token]

criterion = nn.CrossEntropyLoss(ignore_index = TAG_PAD_IDX)

In [89]:
model = model.to(device)
criterion = criterion.to(device)

In [90]:
def categorical_accuracy(preds, y, tag_pad_idx):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    max_preds = preds.argmax(dim = 1, keepdim = True) # get the index of the max probability
    non_pad_elements = (y != tag_pad_idx).nonzero()
    correct = max_preds[non_pad_elements].squeeze(1).eq(y[non_pad_elements])
    return correct.sum() / torch.FloatTensor([y[non_pad_elements].shape[0]])

In [91]:
def train(model, iterator, optimizer, criterion, tag_pad_idx):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        lat = batch.lat
        lon = batch.lon
        tags = batch.man_tags
        
        optimizer.zero_grad()
        
        #text = [sent len, batch size]
        
        predictions = model(lat, lon)
        
        #predictions = [sent len, batch size, output dim]
        #tags = [sent len, batch size]
        
        predictions = predictions.view(-1, predictions.shape[-1])
        tags = tags.view(-1)
        
        #predictions = [sent len * batch size, output dim]
        #tags = [sent len * batch size]
        
        loss = criterion(predictions, tags)
                
        acc = categorical_accuracy(predictions, tags, tag_pad_idx)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [92]:

def evaluate(model, iterator, criterion, tag_pad_idx):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            lat = batch.lat
            lon = batch.lon
            tags = batch.man_tags
            
            predictions = model(lat,lon)
            
            predictions = predictions.view(-1, predictions.shape[-1])
            tags = tags.view(-1)
            
            loss = criterion(predictions, tags)
            
            acc = categorical_accuracy(predictions, tags, tag_pad_idx)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [93]:

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [94]:
N_EPOCHS = 10

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion, TAG_PAD_IDX)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion, TAG_PAD_IDX)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

AttributeError: 'Batch' object has no attribute 'man_tags'