# Assignment 3 : Sequence labelling with RNNs
In this assignement we will ask you to perform POS tagging.

You are asked to follow these steps:
*   Download the corpora and split it in training and test sets, structuring a dataframe.
*   Embed the words using GloVe embeddings
*   Create a baseline model, using a simple neural architecture
*   Experiment doing small modifications to the model
*   Evaluate your best model
*   Analyze the errors of your model

**Corpora**:
Ignore the numeric value in the third column, use only the words/symbols and its label.
https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/dependency_treebank.zip 

**Splits**: documents 1-100 are the train set, 101-150 validation set, 151-199 test set.

**Baseline**: two layers architecture: a Bidirectional LSTM and a Dense/Fully-Connected layer on top.

**Modifications**: experiment using a GRU instead of the LSTM, adding an additional LSTM layer, and using a CRF in addition to the LSTM. Each of this change must be done by itself (don't mix these modifications).

**Training and Experiments**: all the experiments must involve only the training and validation sets.

**Evaluation**: in the end, only the best model of your choice must be evaluated on the test set. The main metric must be F1-Macro computed between the various part of speech (without considering punctuation classes).

**Error Analysis** (optional) : analyze the errors done by your model, try to understand which may be the causes and think about how to improve it.

**Report**: You are asked to deliver a small report of about 4-5 lines in the .txt file that sums up your findings.

In [163]:
import os
import numpy as np
import matplotlib.pyplot as plt
import shutil
import tensorflow as tf
import time
from tensorflow.keras import Model, Sequential
from tensorflow.keras import layers, callbacks
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from sklearn.metrics import f1_score


In [164]:
print(f"Number of GPU available: {len(tf.config.experimental.list_physical_devices('GPU'))}")

Number of GPU available: 1


In [165]:
# get the download path 
print(os.getcwd())
download_path = os.path.join(os.getcwd(), "Datasets/treebank.tar.gz")

/content


In [166]:
# Download the zip file  and extract the dataset
import zipfile
# Conf
print(f"The current working directory is {os.getcwd()}")

dataset_folder = os.path.join(os.getcwd(), "Datasets")

if not os.path.exists(dataset_folder):
  os.makedirs(dataset_folder)
url = "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/dependency_treebank.zip"

dataset_path = os.path.join(dataset_folder, "treebank")
dataset = tf.keras.utils.get_file(dataset_path , url, untar=True, cache_dir='.', cache_subdir='')


# Extract dataset from the Zip file
def extract_dataset(download_path, extract_path):
    print("Extracting dataset... (it may take a while...)")
    with zipfile.ZipFile(download_path, "r") as loaded_tar:
        loaded_tar.extractall(extract_path)
    print("Extraction completed!")



# 
extract_dataset(download_path, dataset_folder)

The current working directory is /content
Extracting dataset... (it may take a while...)
Extraction completed!


In [167]:
# I have Created a flatten list function
def flatten_list(L):
    L0 = []
    for x in L:
        if isinstance(x, list):
            L0 += flatten_list(x)
        else: 
            L0.append(x)
    return L0

In [168]:
# download the dataset and partition it into train val and test set
download_path = os.path.join(os.getcwd(), "Datasets/dependency_treebank")

def train_val_test(download_path, start=0, end=100):
  data = []

  files = os.listdir(download_path)[start:end]
  #n = len(files)
  #indexes = [i for i in range(n)]
  #np.random.shuffle(indexes)
 
  #files = [files[i] for i,j in zip(range(n),indexes) if i==j][start:end]
  for f in files:
    file = os.path.join(download_path, f)
    with open(file, "r", encoding="utf-8") as read_file:
      text = read_file.read()
    data.append(text)
  dt = flatten_list([txt.split("\n") for txt in data])
  dt = [t.split("\t") for t in dt]
  dt = [t for t in dt if t[0]!='']
  dt = [t[:2] for t in dt]
  return [[L[0].lower(),L[1]] for L in dt]    

In [169]:
# This is an important step: reducing the number of classes to classified by 
 # for instance putting punctuations into one classes '.'
punc = ['.',',',';', ':','?','!',"''", 
        '``', '--', '`', "'", '&',
       
        ':', '...','-','-LRB-'] +  ['-LRB-','-RRB-']
Nouns = ["us$", "c$", "wa"]
Wp = ["WP$"]
def punctuation(L):
  res = []
  for x in L:
    if x[1] in punc:
      res.append([x[0], '.'])
    elif x[1] == 'NNS':
      res.append([x[0], "NN"])
      
    elif x[1]=="WBR":
      res.append([x[0], "RB"])

    elif x[1] in ["#","$","SYM"]:
      res.append([x[0], 'SYM'])

    elif x[1] in  ["FW","LS","NIL", "UH"]:
      res.append([x[0], "X"])

    elif x[1] == 'MD':
      res.append([x[0], 'VB'])

    elif x[1] in ['RBR', 'RBS']:
      res.append([x[0], 'RB'])

    elif x[1] == 'NNPS':
      res.append([x[0], "NNP"])

    elif x[1] == 'POS':
      res.append([x[0], "TO"])

    elif x[1] in ['VBD','VBG','VBN','VBP','VBZ']:
      res.append([x[0], "VB"])

    elif x[1] in ["EX", "WP"]:
      res.append([x[0], "PRP"])

    elif x[1] in ["PDT", "PRP$", "WDT","WP$"]:
      res.append([x[0], "DT"])
    else:
      res.append(x)
  return res

In [170]:
train_list = punctuation(train_val_test(download_path))
val_list = punctuation(train_val_test(download_path,100,150))
test_list = punctuation(train_val_test(download_path,150,200))

In [171]:
train_text =  [x[0] for x in train_list]
train_y =  [x[1] for x in train_list]

# Validation set
val_text = [x[0] for x in val_list]
val_y = [x[1] for x in val_list]

# Test set
test_text = [x[0] for x in test_list]
test_y =  [x[1] for x in test_list]

In [172]:
unique = np.unique(np.array(val_y+test_y+train_y))
idx = [i for i,_ in enumerate(unique)]
word_idx = dict(zip(unique, idx))

In [173]:
y_train = np.array([word_idx[w] for w in train_y])
y_val = np.array([word_idx[w] for w in val_y])
y_test = np.array([word_idx[w] for w in test_y])

In [174]:
from tensorflow.keras.utils import to_categorical
y_train_one_hot = to_categorical(y_train)
y_val_one_hot = to_categorical(y_val)
y_test_one_hot = to_categorical(y_test)

In [175]:
y_train_one_hot.shape

(48393, 18)

In [176]:
num_classes = len(unique)
num_classes               

18

In [177]:
import gensim
import gensim.downloader as gloader

def load_embedding_model(model_type, embedding_dimension=50):
    """
    Loads a pre-trained word embedding model via gensim library.

    :param model_type: name of the word embedding model to load.
    :param embedding_dimension: size of the embedding space to consider

    :return
        - pre-trained word embedding model (gensim KeyedVectors object)
    """

    download_path = ""

    # Find the correct embedding model name
    if model_type.strip().lower() == 'word2vec':
        download_path = "word2vec-google-news-300"

    elif model_type.strip().lower() == 'glove':
        download_path = "glove-wiki-gigaword-{}".format(embedding_dimension)

    else:
        raise AttributeError("Unsupported embedding model type! Available ones: word2vec, glove")

    # Check download
    try:
        emb_model = gloader.load(download_path)
    except ValueError as e:
        print("Invalid embedding model name! Check the embedding dimension:")
        print("Word2Vec: 300")
        print("Glove: 50, 100, 200, 300")
        raise e

    return emb_model


# Modify these variables as you wish!
# Glove -> 50, 100, 200, 300
# Word2Vec -> 300
embedding_model_type = "glove"
embedding_dimension = 50

embedding_model = load_embedding_model(embedding_model_type, embedding_dimension)
        

In [178]:
# Compute the embedding matrix
# for simplicity, Embedding vector for OOV are assumed to be zero
embedding_dim = 50
embedding_matrix = np.zeros((len(train_text),embedding_dim), dtype=np.float32)
oov_terms = []
for i,word in enumerate(train_text):
  if not word in embedding_model.vocab:
    oov_terms.append(word)
    embedding_matrix[i] = np.random.rand(embedding_dim)
  else:
    embedding_matrix[i] = embedding_model[word]
print(f"the proportion of OOV in the Corpus is {len(oov_terms)/len(train_text):.4f}%, it is negligeable")

the proportion of OOV in the Corpus is 0.0130%, it is negligeable


In [179]:
X_train = np.array(train_text)
X_val = np.array(val_text)
X_test = np.array(test_text)

In [180]:
# Vectorization
vectorize_layer = TextVectorization(max_tokens=1000,
                                    output_mode="int", 
                                    output_sequence_length= 10)

vectorize_layer.adapt(X_train)

In [181]:
batch_size = 32
embedding_layer = layers.Embedding(embedding_matrix.shape[0],50, 
                            trainable=False, 
                            weights = [embedding_matrix], name ="Emb"
                             )



In [182]:
Early_callback = callbacks.EarlyStopping(patience=5,
                                                  restore_best_weights=True, 
                                                  monitor='val_loss')

Baseline: two layers architecture: a Bidirectional LSTM and a Dense/Fully-Connected layer on top.

In [183]:
def baseline_model():
  inp = tf.keras.Input(shape=(1,), dtype=tf.string)
  x = vectorize_layer(inp)
  x = embedding_layer(x)
 
  x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)
  x = layers.Bidirectional(layers.LSTM(128))(x)
  x = layers.Dense(64)(x)
  x = layers.Dense(128)(x)
  x = layers.Dropout(0.5)(x)
  x = layers.Dense(num_classes)(x)
  model = Model(inp, x)
  #model.summary()
  model.compile(optimizer="adam", 
              loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True), 
              metrics=["acc"])
  return model

In [184]:
model_baseline = baseline_model()

In [185]:
model_baseline.summary()

Model: "functional_31"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_17 (InputLayer)        [(None, 1)]               0         
_________________________________________________________________
text_vectorization_6 (TextVe (None, 10)                0         
_________________________________________________________________
Emb (Embedding)              (None, 10, 50)            2419650   
_________________________________________________________________
bidirectional_39 (Bidirectio (None, 10, 128)           58880     
_________________________________________________________________
bidirectional_40 (Bidirectio (None, 256)               263168    
_________________________________________________________________
dense_34 (Dense)             (None, 64)                16448     
_________________________________________________________________
dense_35 (Dense)             (None, 128)             

In [186]:
%%time
model_baseline.fit(X_train,y_train_one_hot, 
          epochs=150, 
          validation_data=(X_val, y_val_one_hot), 
          callbacks=[Early_callback])

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
CPU times: user 10min 20s, sys: 1min 48s, total: 12min 9s
Wall time: 9min 26s


<tensorflow.python.keras.callbacks.History at 0x7fb4f8cde1d0>

 Experiment using a GRU instead of the LSTM



In [187]:
# Number of RNN units
rnn_units = 128

def GRU_model():
  inp = tf.keras.Input(shape=(1,),dtype = tf.string)
  x = vectorize_layer(inp)
  x = embedding_layer(x)
  x = layers.Bidirectional(layers.GRU(rnn_units,return_sequences=True))(x)
  x = layers.Bidirectional(layers.GRU(rnn_units))(x)
  x = layers.Dense(128, activation="relu")(x)
  x = layers.Dropout(0.5)(x)
  x = layers.Dense(num_classes)(x)
  model = Model(inp, x)
  model.summary()
  return model

In [188]:
gru_model = GRU_model()

Model: "functional_33"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_18 (InputLayer)        [(None, 1)]               0         
_________________________________________________________________
text_vectorization_6 (TextVe (None, 10)                0         
_________________________________________________________________
Emb (Embedding)              (None, 10, 50)            2419650   
_________________________________________________________________
bidirectional_41 (Bidirectio (None, 10, 256)           138240    
_________________________________________________________________
bidirectional_42 (Bidirectio (None, 256)               296448    
_________________________________________________________________
dense_37 (Dense)             (None, 128)               32896     
_________________________________________________________________
dropout_13 (Dropout)         (None, 128)             

In [189]:

gru_model.compile(optimizer="adam", 
              loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True), 
              metrics=["acc"])

In [190]:
%%time
gru_model.fit(X_train,y_train_one_hot, 
          epochs=150, 
          validation_data=(X_val, y_val_one_hot), 
          callbacks=[Early_callback])

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
CPU times: user 12min 6s, sys: 2min 12s, total: 14min 19s
Wall time: 12min 14s


<tensorflow.python.keras.callbacks.History at 0x7fb5158c6f60>

In [191]:
def baseline_model_Plus_LSTM():
  inp = tf.keras.Input(shape=(1,), dtype=tf.string)
  x = vectorize_layer(inp)
  x = embedding_layer(x)
  x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)
  x = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(x)
  x = layers.Bidirectional(layers.LSTM(128))(x)
  x = layers.Dense(64)(x)
  x = layers.Dense(128)(x)
  x = layers.Dropout(0.5)(x)
  x = layers.Dense(num_classes)(x)
  model = Model(inp, x)
  model.summary()
  return model

In [192]:
lstm_model_plus = baseline_model_Plus_LSTM()

Model: "functional_35"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_19 (InputLayer)        [(None, 1)]               0         
_________________________________________________________________
text_vectorization_6 (TextVe (None, 10)                0         
_________________________________________________________________
Emb (Embedding)              (None, 10, 50)            2419650   
_________________________________________________________________
bidirectional_43 (Bidirectio (None, 10, 128)           58880     
_________________________________________________________________
bidirectional_44 (Bidirectio (None, 10, 256)           263168    
_________________________________________________________________
bidirectional_45 (Bidirectio (None, 256)               394240    
_________________________________________________________________
dense_39 (Dense)             (None, 64)              

In [193]:

lstm_model_plus.compile(optimizer="adam", 
              loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True), 
              metrics=["acc"])

In [194]:
lstm_model_plus.fit(X_train,y_train_one_hot, 
          epochs=150, 
          validation_data=(X_val, y_val_one_hot), 
          callbacks=[Early_callback])

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150


<tensorflow.python.keras.callbacks.History at 0x7fb50aaa8c18>

In [195]:
# Number of RNN units
rnn_units = 128

def GRU_model_Plus_GRU():
  inp = tf.keras.Input(shape=(1,),dtype = tf.string)
  x = vectorize_layer(inp)
  x = embedding_layer(x)
  x = layers.Bidirectional(layers.GRU(rnn_units,return_sequences=True))(x)
  x = layers.Bidirectional(layers.GRU(rnn_units,return_sequences=True))(x)
  x = layers.Bidirectional(layers.GRU(rnn_units))(x)
  x = layers.Dense(128, activation="relu")(x)
  x = layers.Dropout(0.5)(x)
  x = layers.Dense(num_classes)(x)
  model = Model(inp, x)
  model.summary()
  return model

In [196]:
gru_model_plus = GRU_model_Plus_GRU()

Model: "functional_37"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_20 (InputLayer)        [(None, 1)]               0         
_________________________________________________________________
text_vectorization_6 (TextVe (None, 10)                0         
_________________________________________________________________
Emb (Embedding)              (None, 10, 50)            2419650   
_________________________________________________________________
bidirectional_46 (Bidirectio (None, 10, 256)           138240    
_________________________________________________________________
bidirectional_47 (Bidirectio (None, 10, 256)           296448    
_________________________________________________________________
bidirectional_48 (Bidirectio (None, 256)               296448    
_________________________________________________________________
dense_42 (Dense)             (None, 128)             

In [197]:
gru_model_plus.compile(optimizer="adam", 
              loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True), 
              metrics=["acc"])

In [198]:
%%time
gru_model_plus.fit(X_train,y_train_one_hot, 
          epochs=150, 
          validation_data=(X_val, y_val_one_hot), 
          callbacks=[Early_callback])

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
CPU times: user 15min 56s, sys: 3min 16s, total: 19min 13s
Wall time: 17min 40s


<tensorflow.python.keras.callbacks.History at 0x7fb4fd7c5630>

In [199]:
# Vectorization
vectorize_layer = TextVectorization(max_tokens=1000,
                                    output_mode="int", 
                                    output_sequence_length= num_classes)

vectorize_layer.adapt(X_train)

In [200]:
embedding_layer = layers.Embedding(embedding_matrix.shape[0],50, 
                            trainable=False, 
                            weights = [embedding_matrix], name ="Emb"
                             )


In [201]:
try:
  from tf_crf_layer.layer import CRF
except:
  !pip install tf_crf_layer
  from tf_crf_layer.layer import CRF
  print("done!")

In [202]:

from tf_crf_layer.metrics import crf_accuracy, crf_viterbi_accuracy
from tf_crf_layer.loss import crf_loss

In [203]:
def LSTM_CRF():
  inp = tf.keras.Input(shape=(1,), dtype=tf.string)
  x = vectorize_layer(inp)
  x = embedding_layer(x)
  x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)
  x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)
  x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)
  
  
  x = layers.Dense(64, activation=None)(x)
 
  
  x = CRF(num_classes, name='crf_loss')(x)
  model = Model(inp, x)
  #model = ModelWithCRFLoss(model)
  #model.compile('adam')
  
  return model

In [204]:
model_CRF = LSTM_CRF()

In [205]:
model_CRF.compile('adam', {'crf_loss': crf_loss}, metrics=['acc'])

In [206]:
model_CRF.summary()

Model: "functional_39"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_21 (InputLayer)        [(None, 1)]               0         
_________________________________________________________________
text_vectorization_7 (TextVe (None, 18)                0         
_________________________________________________________________
Emb (Embedding)              (None, 18, 50)            2419650   
_________________________________________________________________
bidirectional_49 (Bidirectio (None, 18, 128)           58880     
_________________________________________________________________
bidirectional_50 (Bidirectio (None, 18, 128)           98816     
_________________________________________________________________
bidirectional_51 (Bidirectio (None, 18, 128)           98816     
_________________________________________________________________
dense_44 (Dense)             (None, 18, 64)          

In [207]:
%%time
model_CRF.fit(X_train, y_train_one_hot, 
          epochs=150, 
          validation_data=(X_val, y_val_one_hot),
          callbacks=[Early_callback]
          )

Epoch 1/150
<tf_crf_layer.layer.CRF object at 0x7fb4fa183550> 0
<tf_crf_layer.layer.CRF object at 0x7fb4fa183550> 0
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
CPU times: user 2h 50min 34s, sys: 25min 41s, total: 3h 16min 16s
Wall time: 2h 4min 34s


<tensorflow.python.keras.callbacks.History at 0x7fb4f9f67c50>

In [208]:
test = [[x,y] for x,y in test_list if y!='.']
X_test = [x[0] for x in test]
y_test = [word_idx[y] for y in  [x[1] for x in test]]

# retrieve best indices
def pred_y(y):
  return tf.argmax(y, axis=1)

In [209]:
f1_score_macro = []
best_model = ''
best_f1_score = 0.
names = ["baseline model", "Model using GRU", "LSTM model plus", "GRU model plus","Model with CRF layer"]
list_models = [model_baseline, gru_model, lstm_model_plus, gru_model_plus, model_CRF]

for i, model in enumerate(list_models):
  y_pred = model.predict(X_test)
  y_pred = pred_y(y_pred)
  f1 =f1_score(y_test, y_pred, average='macro')
  if best_f1_score < f1:
    best_f1_score = f1
    best_model = names[i]
  print(f"f1 score for {names[i]} is {f1:.4f}")
  f1_score_macro.append(f1)


print("="*100)
print(f"The best model is {best_model} with f1 score: {best_f1_score:.4f}")  

f1 score for baseline model is 0.4518
f1 score for Model using GRU is 0.4440
f1 score for LSTM model plus is 0.4403
f1 score for GRU model plus is 0.4597
f1 score for Model with CRF layer is 0.3897
The best model is GRU model plus with f1 score: 0.4597
