<a href="https://colab.research.google.com/github/georgialoukatou/childes_bert_tagging/blob/main/bert_pos_tagging.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [152]:
#https://pageperso.lis-lab.fr/benoit.favre/pstaln/09_embedding_evaluation.html
#https://colab.research.google.com/drive/1sbfjIapc1MDcQpGYlLkapjhWWv3yOR9t#scrollTo=hW7aY1VWtbzE
#https://colab.research.google.com/drive/1PHv-IRLPCtv7oTcIGbsgZHqrB5LPvB7S#scrollTo=IdU4YVqb7N8M

!pip install -qq transformers
!pip -q install conllu

import conllu
import pandas as pd
import numpy as np
import urllib.request
import time
import torch
import re
import collections
import torch.optim as optim
import torch.nn as nn
import sys


from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel


time: 5.49 s (started: 2022-01-21 00:13:10 +00:00)


In [153]:
device = torch.device('cuda' if torch.cuda.is_available else 'cpu')
#torch.cuda.is_available()
#torch.cuda.get_device_name(0)

time: 1.16 ms (started: 2022-01-21 00:13:16 +00:00)


In [154]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

!pip install ipython-autotime
%load_ext autotime

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 2.7 s (started: 2022-01-21 00:13:16 +00:00)


In [192]:
#import english hand annotated CHILDES sample
eng_gold_childes = pd.read_csv('https://raw.githubusercontent.com/georgialoukatou/childes_bert_tagging/main/eng_gold_childes.csv')
eng_gold_childes = eng_gold_childes.dropna(axis=0, subset=['correct_pos']) 
mapping = {'mod': 'AUX', 'prep': 'ADP','adj': 'ADJ', 'pro': 'PRON', 'v': 'VERB', 'adv': 'ADV', 'participle': 'VERB', 'n': 'NOUN', 'childes': 'X', 'nan': 'X', 'intj': 'INTJ', 'particle': 'PART', 'mod': 'AUX','mod ': 'AUX', 'spacy': 'X', 'on': 'X', 'participle': 'VERB', 'aux': 'AUX', 'num': 'NUM', 'det':'DET' }
#for English, missing PROPN, CCONJ, SCONJ, SYM, PUNCT
eng_gold_childes = eng_gold_childes.replace({'correct_pos': mapping})
eng_gold_childes_test = eng_gold_childes[['utterance_gloss','correct_pos', 'position']]
print(eng_gold_childes_test.head())
print(eng_gold_childes_test.correct_pos.unique())


                                     utterance_gloss correct_pos  position
0  the shortcut led right to where Harold thought...         AUX      11.0
1  I'm not allowing you to reach things onto the ...         ADP       9.0
2                                      I dare not go         AUX       1.0
3                        yyy knee behind the letters         ADP       4.0
4                    is that kind of a greenish blue         ADJ       5.0
['AUX' 'ADP' 'ADJ' 'PRON' 'VERB' 'ADV' 'NOUN' 'X' 'INTJ' 'PART' 'NUM'
 'DET']
time: 2.11 s (started: 2022-01-21 00:58:29 +00:00)


In [156]:
for filename in ['en_ewt-ud-train.conllu', 'en_ewt-ud-dev.conllu', 'en_ewt-ud-test.conllu']:
  urllib.request.urlretrieve('https://raw.githubusercontent.com/UniversalDependencies/UD_English-EWT/master/' + filename, filename)

def load_conllu(filename):
  with open(filename) as fp:
    data = conllu.parse(fp.read())
  sentences = [[token['form'] for token in sentence] for sentence in data]
  taggings = [[token['upos'] for token in sentence] for sentence in data]
  return sentences, taggings

train_sentences, train_taggings = load_conllu('en_ewt-ud-train.conllu')
valid_sentences, valid_taggings = load_conllu('en_ewt-ud-dev.conllu')
test_sentences, test_taggings = load_conllu('en_ewt-ud-test.conllu')

ud_train=pd.DataFrame(columns=list('AB'))
for i in zip(train_sentences, train_taggings):
  ud_train = ud_train.append({'A': i[0],'B': i[1]}, ignore_index=True)

ud_valid=pd.DataFrame(columns=list('AB'))
for i in zip(valid_sentences, valid_taggings):
  ud_valid = ud_valid.append({'A': i[0],'B': i[1]}, ignore_index=True)

ud_test=pd.DataFrame(columns=list('AB'))
for i in zip(test_sentences, test_taggings):
  ud_test = ud_test.append({'A': i[0],'B': i[1]}, ignore_index=True)

ud_train.head()

Unnamed: 0,A,B
0,"[Al, -, Zaman, :, American, forces, killed, Sh...","[PROPN, PUNCT, PROPN, PUNCT, ADJ, NOUN, VERB, ..."
1,"[[, This, killing, of, a, respected, cleric, w...","[PUNCT, DET, NOUN, ADP, DET, ADJ, NOUN, AUX, A..."
2,"[DPA, :, Iraqi, authorities, announced, that, ...","[PROPN, PUNCT, ADJ, NOUN, VERB, SCONJ, PRON, A..."
3,"[Two, of, them, were, being, run, by, 2, offic...","[NUM, ADP, PRON, AUX, AUX, VERB, ADP, NUM, NOU..."
4,"[The, MoI, in, Iraq, is, equivalent, to, the, ...","[DET, PROPN, ADP, PROPN, AUX, ADJ, ADP, DET, P..."


time: 40.7 s (started: 2022-01-21 00:13:21 +00:00)


In [157]:
def collate_gold(items):
  #max_len = max(len(item[0]) for item in items)
  max_len=512
  sentences = torch.zeros((len(items), max_len), device=items[0][0].device).long().to(device)
  #taggings = torch.zeros((len(items), max_len)).long().to(device)

  for i, sentence in enumerate(items):
    sentences[i][0:len(sentence)] = sentence
    #taggings[i][0:len(tagging)] = tagging

  return sentences#, taggings



time: 5.87 ms (started: 2022-01-21 00:14:02 +00:00)


In [158]:
# load tokenizer for a specific bert model (bert-base-cased)

# tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') # tokenizer from English model better for english than multilingual
# tokenize an example sentence
tokenizer.tokenize('This tokenizer is sooooo awesome.')

# load a specific bert model (bert-base-cased)
bert=BertModel.from_pretrained('bert-base-uncased')

# tensor length
max_input_length = tokenizer.max_model_input_sizes['bert-base-uncased']
max_input_length

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


512

time: 1.72 s (started: 2022-01-21 00:14:02 +00:00)


In [159]:
uniq_taggings=[]
for i in ud_train['B']:
  for y in i:
    if y not in uniq_taggings:
      uniq_taggings.append(y)

print(uniq_taggings)

['PROPN', 'PUNCT', 'ADJ', 'NOUN', 'VERB', 'DET', 'ADP', 'AUX', 'PRON', 'PART', 'SCONJ', 'NUM', 'ADV', 'CCONJ', '_', 'X', 'INTJ', 'SYM']
time: 67.6 ms (started: 2022-01-21 00:14:04 +00:00)


In [160]:
ud_train['A'][41]
ud_train.loc[[10010]]


Unnamed: 0,A,B
10010,"[I, also, understand, that, weekend, staffs, a...","[PRON, ADV, VERB, SCONJ, NOUN, NOUN, AUX, ADJ,..."


time: 17.6 ms (started: 2022-01-21 00:14:04 +00:00)


In [161]:
#for sentence,tagging in zip(df_long['A'],df_long['B']):
 # tokenized_s = tokenizer.tokenize(' '.join(sentence))

time: 741 µs (started: 2022-01-21 00:14:04 +00:00)


In [162]:

def align_tokenizations(sentences, taggings):
  bert_tokenized_sentences = []
  aligned_taggings = []

  for sentence, tagging in zip(sentences, taggings):
    sentence = (map(lambda x: x.lower(), sentence))
    sentence = list(sentence)
    # first generate BERT-tokenization
    bert_tokenized_sentence = tokenizer.tokenize(' '.join(sentence))
    aligned_tagging = []
    current_word = ''
    index = 0 # index of current word in sentence and tagging
    for token in bert_tokenized_sentence:
      current_word += re.sub(r'^##', '', token) # recompose word with subtoken
      sentence[index] = sentence[index].replace('\xad', '') # fix bug in data
      # note that some word factors correspond to unknown words in BERT
      assert token == '[UNK]' or sentence[index].startswith(current_word)
      if token == '[UNK]' or sentence[index] == current_word: # if we completed a word
        current_word = ''
        aligned_tagging.append(tagging[index])
        index += 1
      else: # otherwise insert padding
        aligned_tagging.append('<pad>')

    assert len(bert_tokenized_sentence) == len(aligned_tagging)

    bert_tokenized_sentences.append(bert_tokenized_sentence)
    aligned_taggings.append(aligned_tagging)

  return bert_tokenized_sentences, aligned_taggings

time: 21.8 ms (started: 2022-01-21 00:14:04 +00:00)


In [163]:
#remove sentences with problematic formatting
ud_train= ud_train.drop([157, 530, 7132])
ud_train.drop(ud_train.index[8207:8210], inplace=True)
ud_train.drop(ud_train.index[9974], inplace=True)

#align tokenized sentences and tagging for train
bert_tokenized_sentences, aligned_taggings = align_tokenizations(ud_train['A'], ud_train['B'])
len(ud_train)

12536

time: 5.61 s (started: 2022-01-21 00:14:04 +00:00)


In [164]:
bert_tokenized_sentences_test, aligned_taggings_test = align_tokenizations(ud_test['A'], ud_test['B'])

ud_valid = ud_valid.drop([97, 1622])
bert_tokenized_sentences_valid, aligned_taggings_valid = align_tokenizations(ud_valid['A'], ud_valid['B'])


time: 1.53 s (started: 2022-01-21 00:14:09 +00:00)


In [165]:
label_vocab = collections.defaultdict(lambda: len(label_vocab))
label_vocab['<pad>'] = 0

def convert_to_ids(sentences, taggings):
  sentences_ids = []
  taggings_ids = []
  for sentence, tagging in zip(sentences, taggings):
    sentence_tensor = torch.tensor(tokenizer.convert_tokens_to_ids(['[CLS]'] + sentence + ['[SEP]'])).long()
    tagging_tensor = torch.tensor([0] + [label_vocab[tag] for tag in tagging] + [0]).long()
    #sentences_ids.append(sentence_tensor.to(device))
    sentences_ids.append(sentence_tensor)
    #taggings_ids.append(tagging_tensor.to(device))
    taggings_ids.append(tagging_tensor)
  return sentences_ids, taggings_ids

#convert train sentences to ids and tensors
train_sentences_ids, train_taggings_ids = convert_to_ids(bert_tokenized_sentences, aligned_taggings)
#same for test, dev
test_sentences_ids, test_taggings_ids = convert_to_ids(bert_tokenized_sentences_test, aligned_taggings_test)
valid_sentences_ids, valid_taggings_ids = convert_to_ids(bert_tokenized_sentences_valid, aligned_taggings_valid)

output_dim=len(label_vocab) #dimensions of labels

time: 639 ms (started: 2022-01-21 00:14:11 +00:00)


In [166]:
for tag, count in sorted(label_vocab.items(), reverse=True, key=lambda x: x[1]):
  print(count, tag)

18 SYM
17 INTJ
16 X
15 _
14 CCONJ
13 ADV
12 NUM
11 SCONJ
10 PART
9 PRON
8 AUX
7 ADP
6 DET
5 VERB
4 NOUN
3 ADJ
2 PUNCT
1 PROPN
0 <pad>
time: 2.31 ms (started: 2022-01-21 00:14:11 +00:00)


In [167]:
#function to be fed to the collate_fn parameter of dataloader showing how to glue data

def collate_fn(items):
  #max_len = max(len(item[0]) for item in items)
  max_len=max_input_length
  sentences = torch.zeros((len(items), max_len), device=items[0][0].device).long().to(device)
  taggings = torch.zeros((len(items), max_len)).long().to(device)

  for i, (sentence, tagging) in enumerate(items):
    sentences[i][0:len(sentence)] = sentence
    taggings[i][0:len(tagging)] = tagging

  return sentences, taggings

#example
x, y = collate_fn([[torch.tensor([1, 2, 3]), torch.tensor([4, 5, 6])], [torch.tensor([1, 2]), torch.tensor([3, 4])]])
print(x.shape, y.shape)

torch.Size([2, 512]) torch.Size([2, 512])
time: 13.6 ms (started: 2022-01-21 00:14:11 +00:00)


In [168]:

class PosTaggingDataset(Dataset):
  def __init__(self, sentences, taggings):
    assert len(sentences) == len(taggings)
    self.sentences = sentences
    self.taggings = taggings

  def __getitem__(self, i):
    return self.sentences[i], self.taggings[i]

  def __len__(self):
    return len(self.sentences)

time: 2.82 ms (started: 2022-01-21 00:14:11 +00:00)


In [169]:
example=PosTaggingDataset(train_sentences_ids, train_taggings_ids)[0:2]
example

([tensor([  101,  2632,  1011, 23564,  2386,  1024,  2137,  2749,  2730, 21146,
          28209, 14093,  2632,  1011,  2019,  2072,  1010,  1996, 14512,  2012,
           1996,  8806,  1999,  1996,  2237,  1997,  1053,  4886,  2213,  1010,
           2379,  1996,  9042,  3675,  1012,   102]),
  tensor([  101,  1031,  2023,  4288,  1997,  1037,  9768, 29307,  2097,  2022,
           4786,  2149,  4390,  2005,  2086,  2000,  2272,  1012,  1033,   102])],
 [tensor([0, 1, 2, 0, 1, 2, 3, 4, 5, 0, 1, 1, 1, 2, 0, 1, 2, 6, 4, 7, 6, 4, 7, 6,
          4, 7, 0, 0, 1, 2, 7, 6, 3, 4, 2, 0]),
  tensor([ 0,  2,  6,  4,  7,  6,  3,  4,  8,  8,  5,  9,  4,  7,  4, 10,  5,  2,
           2,  0])])

time: 9.82 ms (started: 2022-01-21 00:14:11 +00:00)


In [170]:
#from sklearn.model_selection import train_test_split
#RANDOM_SEED=1660

df_train = PosTaggingDataset(train_sentences_ids, train_taggings_ids)

df_test = PosTaggingDataset(test_sentences_ids, test_taggings_ids)

df_valid = PosTaggingDataset(valid_sentences_ids, valid_taggings_ids)

print(len(df_train), len(df_test), len(df_valid))

12536 2077 1999
time: 1.82 ms (started: 2022-01-21 00:14:12 +00:00)


In [171]:
batch_size=8  ######### TO MODIDY

train_loader = DataLoader(df_train, batch_size=batch_size, collate_fn=collate_fn, shuffle=False)
test_loader = DataLoader(df_test, batch_size=batch_size, collate_fn=collate_fn, shuffle=False)
valid_loader = DataLoader(df_valid, batch_size=batch_size, collate_fn=collate_fn, shuffle=False)

time: 1.63 ms (started: 2022-01-21 00:14:12 +00:00)


In [172]:
next(iter(train_loader))[0][7]

tensor([  101,  1996,  4442,  2020,  4082,  1999,  1996,  1043,  3270, 16739,
         8717,  2232,  1998,  2632,  1011, 24815,  4733,  1997,  1996,  3007,
         1012,   102,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

time: 15.1 ms (started: 2022-01-21 00:14:12 +00:00)


In [173]:
o=0

for i in list(train_loader)[0]:
  o=o+1
  if o <2:
    for y in i:
      print(y)

tensor([  101,  2632,  1011, 23564,  2386,  1024,  2137,  2749,  2730, 21146,
        28209, 14093,  2632,  1011,  2019,  2072,  1010,  1996, 14512,  2012,
         1996,  8806,  1999,  1996,  2237,  1997,  1053,  4886,  2213,  1010,
         2379,  1996,  9042,  3675,  1012,   102,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

In [174]:
##probing: https://nlp.stanford.edu/~johnhew/interpreting-probes.html
#A probe is trained to predict properties we care about from representations of a model whose nature we'd like to know more about.

class BERTPoSTagger(nn.Module):
    def __init__(self,
                 bert,
                 output_dim, 
                 dropout):     
        super().__init__()       
        self.bert = bert      
        embedding_dim = bert.config.to_dict()['hidden_size']       
        self.fc = nn.Linear(embedding_dim, output_dim)      
        self.dropout = nn.Dropout(dropout)       
    def forward(self, text): 
        #text = [sent len, batch size] 
        text = text.permute(1, 0)     
        #text = [batch size, sent len]    
        embedded = self.dropout(self.bert(text)[0])     
        #embedded = [batch size, seq len, emb dim]             
        embedded = embedded.permute(1, 0, 2)                
        #embedded = [sent len, batch size, emb dim]   
        predictions = self.fc(self.dropout(embedded))    
        #predictions = [sent len, batch size, output dim]  
        return predictions

time: 7.4 ms (started: 2022-01-21 00:14:12 +00:00)


In [175]:
# check that model works on an arbitrary batch that contains two sentences of length 3
dropout=0
bert_linear_predict = BERTPoSTagger(bert,output_dim,dropout)

#example
with torch.no_grad():
  y = bert_linear_predict(torch.tensor([[0, 1, 2], [3, 4, 5]]))
#the expected shape is (batch size, max sentence length, number of labels)
print(y.shape)
print(y)


torch.Size([2, 3, 19])
tensor([[[ 0.3852,  0.3417, -0.3667, -0.1071, -0.1895, -0.2648, -0.0144,
           0.1781,  0.5827,  0.2060, -0.3581,  0.0288, -0.6476,  0.5256,
          -0.7290,  0.4005,  0.5893,  0.3409, -0.3148],
         [-0.5497, -0.3651, -0.0570, -0.2743, -0.0017, -0.6323, -0.0526,
           0.2110,  0.1679, -0.2160,  0.5215,  0.1663, -0.0519,  0.5612,
          -0.5882,  0.4555, -0.1510,  0.3690, -0.5117],
         [ 0.4365,  0.1640, -0.4560, -0.1359, -0.2931, -0.3708, -0.1136,
           0.3120,  0.4257,  0.2107, -0.4357,  0.0020, -0.6857,  0.5827,
          -0.7205,  0.4748,  0.6439,  0.2667, -0.4069]],

        [[ 0.1920,  0.1151, -0.3376,  0.0048, -0.0974, -0.0915, -0.1666,
           0.1724,  0.6870,  0.2012, -0.3685, -0.1423, -0.7775,  0.4537,
          -0.4959,  0.2590,  0.3446,  0.2713, -0.3996],
         [-0.0278, -0.2290, -0.1118, -0.0016, -0.0648,  0.3038, -0.0267,
           0.1740,  0.1199,  0.1916, -0.1960, -0.1829, -0.4788,  0.0745,
          -0.0723,  0

In [176]:
#load Eva's model
#childes_bert = torch.load('model.pt')
#childes_bert.eval()

time: 613 µs (started: 2022-01-21 00:14:12 +00:00)


In [177]:
bert_linear_predict = BERTPoSTagger(bert,output_dim,dropout)

bert_linear_predict = bert_linear_predict.to(device)

time: 177 ms (started: 2022-01-21 00:14:12 +00:00)


In [178]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(bert_linear_predict):,} trainable parameters')

The model has 109,496,851 trainable parameters
time: 2.63 ms (started: 2022-01-21 00:14:13 +00:00)


In [179]:
TAG_PAD_IDX = 0

criterion = nn.CrossEntropyLoss(ignore_index = TAG_PAD_IDX) #for multi class classification task

def perf(model, loader):
  model.eval() # do not apply training-specific steps such as dropout
  total_loss = correct = num_loss = num_perf = 0
  for x, y in loader:
    with torch.no_grad(): # no need to store computation graph for gradients
      # perform inference and compute loss
      y_scores = model(x)
      loss = criterion(y_scores.view(-1, output_dim), y.view(-1)) # requires tensors of shape (num-instances, num-labels) and (num-instances)

      # gather loss statistics
      total_loss += loss.item() #the loss is usually negative log-likelihood and residual sum of squares for classification and regression respectively. Then naturally, the main objective in a learning model is to reduce (minimize) the loss function's value with respect to the model's parameters by changing the weight vector values through different optimization methods, such as backpropagation in neural networks. Loss value implies how well or poorly a certain model behaves after each iteration of optimization. Ideally, one would expect the reduction of loss after each, or several, iteration(s).

      num_loss += 1

      # gather accuracy statistics
      y_pred = torch.max(y_scores, 2)[1] # compute highest-scoring tag
      mask = (y != 0) # ignore <pad> tags
      correct += torch.sum((y_pred == y) * mask) # compute number of correct predictions
      num_perf += torch.sum(mask).item()
  return total_loss / num_loss, correct.item() / num_perf

# without training, accuracy should be a bit less than 2% (chance of getting a label correct)


time: 13.5 ms (started: 2022-01-21 00:14:13 +00:00)


In [180]:
print('BERT representation (unsupervised) on train dataset (without training)', *perf(bert_linear_predict, train_loader))
print('BERT representation (unsupervised) on validation dataset (without training)', *perf(bert_linear_predict, valid_loader))
print('BERT representation (unsupervised) on test dataset (without training)', *perf(bert_linear_predict, test_loader))

BERT representation (unsupervised) on train dataset (without training) 2.9892431796548014 0.030330837961844964
BERT representation (unsupervised) on validation dataset (without training) 2.9903100442886354 0.028443642355525912
BERT representation (unsupervised) on test dataset (without training) 2.986672550898332 0.031230358265241988
time: 7min 58s (started: 2022-01-21 00:14:13 +00:00)


In [181]:
#need to Fine-tune a BERT model for POS-tagging in the fully supervised setting (use a learning rate of 2e-5, and use a linear warmup learning rate schedule on 10% of the updates as suggested by BERT authors)

#Next, we define our optimizer. Usually when fine-tuning you want to use a lower learning rate than normal, this is because we don't want to drastically change the parameters as it may cause our model to forget what it has learned. This phenomenon is called catastrophic forgetting.

#We pick 5e-5 (0.00005) as it is one of the three values recommended in the BERT paper. Again, there may be better values for this dataset.

time: 1.47 ms (started: 2022-01-21 00:22:11 +00:00)


In [182]:
#Training is very similar to evaluation as it also performs inference. In addition it uses an optimizer which modifies the parameters of the neural network to minimize the criterion thanks to the gradients accumulated through the forward pass of the model. At each epoch, we perform inference, modify model weights after each batch, and finally use perf to compute loss and accuracy on the validatin data.
#Note that training is successful when the training loss gets lower after every epoch. It might fluctuate on validation data because of overtraining or generalization noise.

LEARNING_RATE = 5e-5 #### TO MODIFY

optimizer = optim.Adam(bert_linear_predict.parameters(), lr = LEARNING_RATE)

def fit(model, epochs):
  best_valid_loss = float('inf')
  for epoch in range(epochs):
    model.train()
    total_loss = num = correct = num_loss = num_perf = 0
    start_time = time.time()
    for x, y in train_loader:

      optimizer.zero_grad() # start accumulating gradients
      y_scores = model(x)
      loss = criterion(y_scores.view(-1, output_dim), y.view(-1))
      loss.backward() # compute gradients though computation graph
      optimizer.step() # modify model parameters
      total_loss += loss.item()
      num += 1
      y_pred = torch.max(y_scores, 2)[1] # compute highest-scoring tag
      mask = (y != 0) # ignore <pad> tags
      correct += torch.sum((y_pred == y) * mask) # compute number of correct predictions
      num_perf += torch.sum(mask).item()

    train_loss=total_loss / num 
    train_acc= correct.item() / num_perf
      
    end_time = time.time()  
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    valid_loss, valid_acc = perf(model, valid_loader)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut-model.pt')

    #print(1 + epoch, total_loss / num, *perf(model, valid_loader))# valid loader instead of train loader SOS SOS
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(1 + epoch, f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(1 + epoch, f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

time: 33.7 ms (started: 2022-01-21 00:22:11 +00:00)


In [183]:
torch.cuda.empty_cache() 
#torch.cuda.memory_summary(device=None, abbreviated=False)

fit(bert_linear_predict, 1)

Epoch: 01 | Epoch Time: 17m 0s
1 	Train Loss: 0.539 | Train Acc: 82.10%
1 	 Val. Loss: 0.456 |  Val. Acc: 85.30%
time: 18min (started: 2022-01-21 00:22:11 +00:00)


In [184]:
#for batch in iter(train_loader):
  #print(batch)

time: 940 µs (started: 2022-01-21 00:40:11 +00:00)


In [185]:
saved_model = BERTPoSTagger(bert,output_dim,dropout)

saved_model = saved_model.to(device)

#saved_model.load_state_dict(torch.load('tut-model.pt'))

test_loss, test_acc = perf(saved_model, test_loader)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 2.864 | Test Acc: 11.93%
time: 1min 1s (started: 2022-01-21 00:40:11 +00:00)


In [186]:
ex=example[0]
to_tokens=[]
for t in ex:
    to_tokens.append(tokenizer.convert_ids_to_tokens(t))
print(to_tokens)

to_ids= tokenizer.convert_tokens_to_ids(["[CLS]","and", "house", "go","[SEP]"])
print(to_ids)

ex=torch.tensor([to_ids]).to(device)

[['[CLS]', 'al', '-', 'za', '##man', ':', 'american', 'forces', 'killed', 'sha', '##ikh', 'abdullah', 'al', '-', 'an', '##i', ',', 'the', 'preacher', 'at', 'the', 'mosque', 'in', 'the', 'town', 'of', 'q', '##ai', '##m', ',', 'near', 'the', 'syrian', 'border', '.', '[SEP]'], ['[CLS]', '[', 'this', 'killing', 'of', 'a', 'respected', 'cleric', 'will', 'be', 'causing', 'us', 'trouble', 'for', 'years', 'to', 'come', '.', ']', '[SEP]']]
[101, 1998, 2160, 2175, 102]
time: 4.2 ms (started: 2022-01-21 00:41:13 +00:00)


In [187]:
def tag_sentence(model, s): #tokenizer,tag_field,text_field, device,   
    model.eval()
    with torch.no_grad():
      y = saved_model(s)
      top_predictions = y.argmax(-1)
    return(top_predictions)

tag_sentence(saved_model, ex)

tensor([[13,  7, 11,  0,  0]], device='cuda:0')

time: 19.8 ms (started: 2022-01-21 00:41:13 +00:00)


In [195]:
#test on hand annotations
orig_stdout = sys.stdout
f = open('out.txt', 'w')
sys.stdout = f

label_vocab_inv = {y:x for x,y in label_vocab.items()}



eng_gold_childes=[]
tokenized_eng_gold_childes=[]
for sentence in eng_gold_childes_test['utterance_gloss']:
  word=sentence.split()
  eng_gold_childes.append(word)

for s in eng_gold_childes:
  tokenized_s = tokenizer.tokenize(' '.join(s))
  tokenized_eng_gold_childes.append(tokenized_s)

for sen in tokenized_eng_gold_childes:
    sen_ = tokenizer.convert_tokens_to_ids(['[CLS]'] + sen + ['[SEP]'])
    gold=torch.tensor([sen_]).to(device)
    res=tag_sentence(saved_model, gold)
    res_c=res.cpu().numpy()[0][2:-2]
    print(list(label_vocab_inv[f] for f in res_c))

output=[]
sys.stdout = orig_stdout
f.close()   

In [202]:
#evaluate human annotations

#df = df.dropna(axis=0, subset=['correct_pos']) 
mapping = {'mod': 'AUX', 'prep': 'ADP','adj': 'ADJ', 'pro': 'PRON', 'v': 'VERB', 'adv': 'ADV', 'participle': 'VERB', 'n': 'NOUN', 'childes': 'X', 'nan': 'X', 'intj': 'INTJ', 'particle': 'PART', 'mod': 'AUX','mod ': 'AUX', 'spacy': 'X', 'on': 'X', 'participle': 'VERB', 'aux': 'AUX', 'num': 'NUM', 'det':'DET' }
#for English, missing PROPN, CCONJ, SCONJ, SYM, PUNCT
df = eng_gold_childes_test.replace({'correct_pos': mapping})
df = df[['correct_pos', 'position']]

In [1]:
f = open('out.txt', 'r')
correct=0
c=0
for ind, line in enumerate(f):
  print(ind,line)
  c+=1
  if c<3:
    l=list(x.replace("[","").replace(" ", "").replace("\n","").replace("]","") for x in line.split(","))
    position=int(df.iloc[ind]['position'])
    print(position, ind)
    if l[position][1:-1]== df.iloc[ind]['correct_pos']:
      correct+=1
  acc=correct/len(df)*100  
  print(f'Accuracy on English gold CHILDES: {correct} / {len(df)} , {acc:.2f}%')




FileNotFoundError: ignored