In [None]:
# Code to download file into Colaboratory:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
# Authenticate the Colab user, then build the PyDrive client used by the download cells.
auth.authenticate_user()
gauth = GoogleAuth()
# Hand the application-default credentials to PyDrive.
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# **0 - Data Download and Load**

In [None]:
# Data Download
# Google Drive file id -> local filename for each dataset split.
DATASET_FILES = {
    '1by3IklROS_bxiz4gnC1oGg9KGghvXnhD': 'test_without_labels.csv',
    '1cDm7WUrE0BzUuT9O23Oz8vi9g5YsrSg7': 'train.csv',
    '1SQSoK4IuILwjPMmjpp9-jd_Gh6ad7CYl': 'val.csv',
}

# One loop instead of three copy-pasted stanzas; `file_id` avoids rebinding the builtin `id`.
for file_id, filename in DATASET_FILES.items():
  downloaded = drive.CreateFile({'id': file_id})
  downloaded.GetContentFile(filename)

In [None]:
# Data load
import pandas as pd
import numpy as np

# Read the three CSV files fetched by the download cell into DataFrames.
# NOTE(review): the name `train_dataset` is rebound later in this notebook by
# `def train_dataset(...)`, so cells that read this DataFrame must run before
# that definition.
train_dataset = pd.read_csv('train.csv')
val_dataset = pd.read_csv('val.csv')
test_dataset = pd.read_csv('test_without_labels.csv')

In [None]:
# Get labels

def get_labels(dataset):
  """Split every row's space-separated label string into a list of tags.

  Args:
    dataset: object exposing a `labels` attribute that iterates over strings.

  Returns:
    A list with one list of tag strings per row, in row order.
  """
  return [row.split(" ") for row in list(dataset.labels)]

# Labels
train_labels = get_labels(train_dataset)
val_labels = get_labels(val_dataset)

In [None]:
# Get Sentences

train_data = train_dataset.sents
val_data = val_dataset.sents
test_data = test_dataset.sents


# **1 - Data Preprocessing**

In [None]:
# Remove punctuation inside words

def remove_some_punctuation(sent_data):
  """Strip '.' and apostrophes from inside words, sentence by sentence.

  Each sentence is split on single spaces. A word longer than one character
  that contains a '.' or an apostrophe has all of both characters removed;
  every other word (including a lone '.' or apostrophe) is kept unchanged.

  Args:
    sent_data: iterable of sentence strings.

  Returns:
    A list with one list of words per sentence.
  """
  def _clean(word):
    has_mark = '.' in word or '\'' in word
    if has_mark and len(word) > 1:
      return word.replace('\'', '').replace('.', '')
    return word

  return [[_clean(word) for word in sentence.split(' ')] for sentence in sent_data]

test_words = remove_some_punctuation(test_data)
val_words = remove_some_punctuation(val_data)
train_words = remove_some_punctuation(train_data)

In [None]:
# Tokenization on space
# Lemmatization 

import re
import nltk

nltk.download('punkt')
nltk.download("wordnet")
from nltk.stem import WordNetLemmatizer


def get_lemmatized(tokens):
  """Lower-case every word and lemmatize it as a verb with WordNet.

  Args:
    tokens: iterable of sentences, each an iterable of word strings.

  Returns:
    A list of sentences (lists) holding the lemmatized, lower-cased words.
  """
  wordnet = WordNetLemmatizer()
  return [
      [wordnet.lemmatize(word.lower(), pos = 'v') for word in sentence]
      for sentence in tokens
  ]

train_tokens = get_lemmatized(train_words)
val_tokens = get_lemmatized(val_words)
test_tokens = get_lemmatized(test_words)
all_tokens = train_tokens + val_tokens + test_tokens 

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Generate word index

def get_word_idx(tokens):
  """Assign each distinct word an integer id in order of first appearance.

  Args:
    tokens: iterable of sentences, each an iterable of words.

  Returns:
    A dict mapping word -> id, with ids starting at 0.
  """
  vocab = {}
  for word in (w for sentence in tokens for w in sentence):
    # setdefault only inserts unseen words; the id is the size before insertion.
    vocab.setdefault(word, len(vocab))
  return vocab

# Generate label index
def get_tag_idx(labels):
  """Add every unseen tag in `labels` to the module-level `tag_to_ix` dict.

  The dict is updated in place (new tags get the next free id) and the same
  dict object is returned.

  Args:
    labels: iterable of tag sequences.

  Returns:
    The module-level `tag_to_ix` mapping after the update.
  """
  for tag in (t for tags in labels for t in tags):
    # setdefault leaves existing ids untouched and numbers new tags consecutively.
    tag_to_ix.setdefault(tag, len(tag_to_ix))
  return tag_to_ix

START_TAG = "<START>"
STOP_TAG = "<STOP>"
tag_to_ix = {START_TAG : 0, STOP_TAG : 1}

word_to_ix = get_word_idx(all_tokens)
word_list = list(word_to_ix.keys())
tag_to_ix = get_tag_idx(train_labels)



In [None]:
# Get words index

def get_index(dataset, to_idx):
  """Translate every item of every sequence through a lookup table.

  Args:
    dataset: iterable of sequences (e.g. tokenized sentences or tag lists).
    to_idx: mapping from item to integer id.

  Returns:
    A list of lists of ids, parallel to `dataset`.

  Raises:
    KeyError: if an item is missing from `to_idx`.
  """
  return [list(map(to_idx.__getitem__, sentence)) for sentence in dataset]

train_input_index = get_index(train_tokens, word_to_ix)
train_output_index = get_index(train_labels, tag_to_ix)
val_input_index = get_index(val_tokens, word_to_ix)
val_output_index = get_index(val_labels, tag_to_ix)
test_input_index = get_index(test_tokens, word_to_ix)

# **2 - Input Embedding**


















## **Static Embeddings**

### **POS**



In [None]:
# Generate POS tag
# lab 6

nltk.download('averaged_perceptron_tagger')

def get_pos_tag(tokens):
  """Tag every sentence with NLTK and keep only the tag strings.

  Args:
    tokens: iterable of tokenized sentences (lists of words).

  Returns:
    A list of tag lists, one per sentence, built from `nltk.pos_tag` output.
  """
  return [[tag for _, tag in nltk.pos_tag(sentence)] for sentence in tokens]

pos_tags = get_pos_tag(all_tokens)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:
# Get the word vector of POS tag

from gensim.models import Word2Vec
# Train a skip-gram Word2Vec model over the POS-tag sequences (one "sentence" per tag list).
wv_pos_model = Word2Vec(sentences = pos_tags, size = 50, window = 5, min_count = 1, workers = 4, sg = 1)

# Map each word to the vector of its POS tag. A word seen with several tags keeps
# the vector of the tag from its last occurrence.
wv_pos = {}
for i in range(len(all_tokens)):
  for j in range(len(all_tokens[i])):
    # Look vectors up through `.wv`: indexing the model object directly is
    # deprecated in gensim 3 and removed in gensim 4.
    wv_pos[all_tokens[i][j]] = wv_pos_model.wv[pos_tags[i][j]]

  if __name__ == '__main__':


In [None]:
# Generate POS embedding matrix

# One row per vocabulary word, in `word_list` order, holding that word's POS-tag vector.
pos_embedding_matrix = np.array([list(wv_pos[word]) for word in word_list])
pos_embedding_matrix.shape

(3351, 50)

### **Dependency parsing**

In [None]:
#lab 7

import spacy
nlp = spacy.load('en_core_web_sm')

# parse-tree
def get_parse_sentence(tokens):
  """Return the spaCy dependency labels for every sentence.

  Each sentence is joined with spaces, parsed with the module-level `nlp`
  pipeline, and the resulting `dep_` labels are truncated to the number of
  input words.

  NOTE(review): spaCy tokenizes the joined string itself, so its tokens may
  not line up one-to-one with the input words (the result can also be shorter
  than the sentence) — TODO confirm alignment.

  Args:
    tokens: iterable of tokenized sentences (lists of words).

  Returns:
    A list of dependency-label lists, one per sentence.
  """
  def _dep_labels(sentence):
    doc = nlp(" ".join(sentence))
    labels = [tok.dep_ for tok in doc]
    return labels[:len(sentence)]

  return [_dep_labels(sentence) for sentence in tokens]

parse_sentences = get_parse_sentence(all_tokens)

In [None]:
# Get the word vector of Dependency Parse

# Train a skip-gram Word2Vec model over the dependency-label sequences.
wv_parse_model = Word2Vec(sentences = parse_sentences, size = 50, window = 5, min_count = 1, workers = 4, sg = 1)

# Map each word to the vector of its dependency label. A word seen with several
# labels keeps the vector of the label from its last occurrence.
wv_parse = {}
for i in range(len(all_tokens)):
  for j in range(len(all_tokens[i])):
    # Look vectors up through `.wv`: indexing the model object directly is
    # deprecated in gensim 3 and removed in gensim 4.
    wv_parse[all_tokens[i][j]] = wv_parse_model.wv[parse_sentences[i][j]]

  


In [None]:
# Generate parse embedding matrix

# One row per vocabulary word, in `word_list` order, holding that word's dependency-label vector.
parse_embedding_matrix = np.array([list(wv_parse[word]) for word in word_list])
parse_embedding_matrix.shape

(3351, 50)

### **Word2Vec**

In [None]:
# Download pre-trained glove 

import gensim.downloader as api
w2v_embedding_model = api.load("glove-twitter-50")



In [None]:
#Generate word2vec embedding matrix

# One row per vocabulary word, in `word_list` order, taken from the pre-trained vectors.
w2v_embedding_matrix = []
EMBEDDING_DIM = 50

for word in word_list:
  try:
    w2v_embedding_matrix.append(list(w2v_embedding_model[word]))
  except KeyError:
    # Only a missing vocabulary entry falls back to an all-zero vector; any
    # other error now surfaces instead of being silently swallowed.
    w2v_embedding_matrix.append([0]* EMBEDDING_DIM)

w2v_embedding_matrix = np.array(w2v_embedding_matrix)
w2v_embedding_matrix.shape

(3351, 50)

## **Dynamic Embeddings**

### **Bert**

In [None]:
# https://github.com/flairNLP/flair/blob/master/resources/docs/embeddings/TRANSFORMER_EMBEDDINGS.md
# https://www.cnblogs.com/cuiyubo/p/10464504.html

import torch
!pip install flair
import flair

from flair.data import Sentence
from flair.embeddings import BertEmbeddings

from flair.embeddings import TransformerWordEmbeddings

# init embedding
bert_embedding = TransformerWordEmbeddings('bert-base-uncased')


Collecting flair
[?25l  Downloading https://files.pythonhosted.org/packages/f0/3a/1b46a0220d6176b22bcb9336619d1731301bc2c75fa926a9ef953e6e4d58/flair-0.8.0.post1-py3-none-any.whl (284kB)
[K     |████████████████████████████████| 286kB 6.5MB/s 
[?25hCollecting ftfy
[?25l  Downloading https://files.pythonhosted.org/packages/af/da/d215a091986e5f01b80f5145cff6f22e2dc57c6b048aab2e882a07018473/ftfy-6.0.3.tar.gz (64kB)
[K     |████████████████████████████████| 71kB 5.4MB/s 
[?25hCollecting langdetect
[?25l  Downloading https://files.pythonhosted.org/packages/0e/72/a3add0e4eec4eb9e2569554f7c70f4a3c27712f40e3284d483e88094cc0e/langdetect-1.0.9.tar.gz (981kB)
[K     |████████████████████████████████| 983kB 8.5MB/s 
Collecting mpld3==0.3
[?25l  Downloading https://files.pythonhosted.org/packages/91/95/a52d3a83d0a29ba0d6898f6727e9858fe7a43f6c2ce81a5fe7e05f0f4912/mpld3-0.3.tar.gz (788kB)
[K     |████████████████████████████████| 798kB 20.7MB/s 
Collecting gdown==3.12.2
  Downloading https:/

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
from tqdm import tqdm

# Build one BERT vector per vocabulary word, in `word_list` order.
bert_embedding_matrix = []
for word in tqdm(word_list):
  sent = Sentence(word)
  bert_embedding.embed(sent)
  token_vectors = [token.embedding.detach().cpu().numpy() for token in sent]
  if token_vectors:
    # flair may split one vocabulary entry into several tokens; average them so
    # the matrix keeps exactly one row per word (a single token is unchanged).
    bert_embedding_matrix.append(np.mean(token_vectors, axis=0))
  else:
    # No token produced for this entry: keep the row, filled with zeros.
    bert_embedding_matrix.append(np.zeros(bert_embedding.embedding_length, dtype=np.float32))

bert_embedding_matrix = np.array(bert_embedding_matrix)
# Row i must describe word_list[i]; the embedding lookup relies on this alignment.
assert bert_embedding_matrix.shape[0] == len(word_list)

100%|██████████| 3351/3351 [03:52<00:00, 14.38it/s]


## **Combination**

In [None]:
# concatenate both two matrix
all_embedding_matrix = np.concatenate((bert_embedding_matrix, w2v_embedding_matrix, pos_embedding_matrix, parse_embedding_matrix), axis = 1)

In [None]:
word_bert_embedding_matrix = np.concatenate((bert_embedding_matrix, w2v_embedding_matrix), axis = 1)

# **3 - NER Model**

## **Build Model**

In [None]:
# lab9
# lab 10

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F


def argmax(vec):
    """Return, as a Python int, the column index of the maximum of a 1 x N tensor."""
    return vec.max(dim=1).indices.item()


def log_sum_exp(vec):
    """Numerically stable log(sum(exp(vec))) for a 1 x N tensor.

    The maximum entry is subtracted before exponentiating and added back
    afterwards, which keeps the exponentials from overflowing in the CRF
    forward algorithm.
    """
    peak = vec[0, argmax(vec)]
    shifted = vec - peak.view(1, -1).expand(1, vec.size()[1])
    return peak + torch.exp(shifted).sum().log()

class Design_BiLSTM_CRF(nn.Module):
    """BiLSTM sequence tagger with optional self-attention and optional CRF decoding.

    The embedding layer is initialised from a pre-computed matrix, a (stacked)
    bidirectional LSTM encodes one sentence at a time (batch size 1), and a
    linear layer maps every position to tag scores. `forward` returns either
    the per-token argmax or, when `use_crf` is true, the Viterbi path over the
    learned transition scores.

    The class relies on the module-level names `device`, `START_TAG` and
    `STOP_TAG`.

    NOTE(review): the attention scores now use a real transpose of the LSTM
    output (the previous `.view` only reinterpreted the memory layout), so
    attention checkpoints trained before this fix should be retrained.

    Args:
        vocab_size: number of rows of the embedding table.
        tag_to_ix: dict mapping tag strings (including START_TAG / STOP_TAG) to ids.
        embedding_dim: width of the embedding vectors.
        hidden_dim: total BiLSTM output width (hidden_dim // 2 per direction).
        stacked_layers: number of stacked LSTM layers.
        use_crf: when true, `forward` decodes with Viterbi.
        attention_method: None disables attention; 'Dot-product' and
            'Scaled Dot-product' select those scores; any other truthy value
            selects the "General" (bilinear) score.
        pretrained_embeddings: optional numpy array used to initialise the
            embedding table. When omitted, the module-level `embedding_matrix`
            is used, exactly as before.
    """

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim, stacked_layers = 1, use_crf = False, attention_method = None, pretrained_embeddings = None):
        super(Design_BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        self.layers = stacked_layers
        self.use_crf = use_crf
        self.attention_method = attention_method

        # Bilinear weight of the "General" attention score.
        # https://pytorchnlp.readthedocs.io/en/latest/_modules/torchnlp/nn/attention.html
        # torch.Tensor(...) would hand back uninitialised memory, so the weight is
        # given an explicit Xavier initialisation instead.
        general_weight = nn.init.xavier_uniform_(torch.empty(self.hidden_dim, self.hidden_dim))
        self.general_attention_weight = nn.Parameter(general_weight.unsqueeze(0))

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        # Use the pre-computed embedding matrix as the initial weights of nn.Embedding.
        # Fall back to the module-level `embedding_matrix` for existing callers.
        initial_weights = embedding_matrix if pretrained_embeddings is None else pretrained_embeddings
        self.word_embeds.weight.data.copy_(torch.from_numpy(initial_weights))
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=self.layers, bidirectional=True)

        self.dropout = nn.Dropout(0.2) # After comparison, it is better than 0.5

        # Maps the output of the LSTM into tag space. With attention the context
        # vector is concatenated to the LSTM output, doubling the feature width.
        feature_dim = hidden_dim * 2 if attention_method else hidden_dim
        self.hidden2tag = nn.Linear(feature_dim, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()

    def init_hidden(self):
        """Return a fresh random (h0, c0) pair for a batch of one sentence."""
        state_shape = (2 * self.layers, 1, self.hidden_dim // 2)
        return (torch.randn(*state_shape).to(device),
                torch.randn(*state_shape).to(device))

    def _forward_alg(self, feats):
        """Run the CRF forward algorithm and return the log partition function."""
        init_alphas = torch.full((1, self.tagset_size), -10000.).to(device)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        forward_var = init_alphas

        # Iterate through the sentence
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the
                # scores.
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def cal_attention(self, hidden, input_embedding, method,left,right):
        """Return softmax-normalised attention weights for `left` against `right`.

        `hidden` and `input_embedding` are accepted for interface compatibility
        and are not used. `left` and `right` are batched matrices suitable for
        `torch.bmm`; the weights are normalised over the last dimension.
        """
        # https://pytorch.org/docs/master/generated/torch.bmm.html
        if method == 'Dot-product':
            attn_weights = F.softmax(torch.bmm(left, right),dim=-1)
        elif method == 'Scaled Dot-product':
            attn_weights = F.softmax(torch.bmm(left, right) / np.sqrt(self.hidden_dim),dim=-1)
        # General
        # https://github.com/lmthang/nmt.hybrid
        # https://nlp.stanford.edu/projects/nmt/
        # https://stackoverflow.com/questions/50571991/implementing-luong-attention-in-pytorch
        else: # General
            left_step = torch.bmm(left, self.general_attention_weight)
            right_step = torch.bmm(left_step, right)
            attn_weights = F.softmax(right_step, dim=-1)
        return attn_weights

    def _get_lstm_features(self, sentence):
        """Embed a sentence, run the BiLSTM (plus attention) and return tag scores.

        Args:
            sentence: 1-D LongTensor of word ids.

        Returns:
            Tensor with one row of tag scores per word.
        """
        self.hidden = self.init_hidden()
        embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)

        # with attention
        if self.attention_method:
            lstm_out = torch.squeeze(lstm_out, 1)
            left_self = lstm_out.view(1, lstm_out.size(0), lstm_out.size(1))
            # Self-attention needs the transpose of the LSTM output; a plain
            # .view() would only reinterpret the memory layout and scramble the
            # rows, so transpose the last two dimensions explicitly.
            right_self = left_self.transpose(1, 2)

            Attn_weight = self.cal_attention(lstm_out, embeds, self.attention_method,left_self,right_self)
            output = torch.bmm(Attn_weight, left_self)
            concat_output = torch.cat((output, left_self), dim = -1)
            lstm_out = concat_output.view(len(sentence), self.hidden_dim * 2)

        else:
            lstm_out = lstm_out.view(len(sentence), self.hidden_dim)

        lstm_out = self.dropout(lstm_out)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        """Return the CRF score (transitions plus emissions) of the given tag sequence."""
        score = torch.zeros(1).to(device)
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long).to(device), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        """Return (best path score, best tag-id path) for the emission scores `feats`."""
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000.).to(device)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        """Return the CRF negative log-likelihood of `tags` for `sentence`."""
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence):  # dont confuse this with _forward_alg above.
        """Predict tags for one sentence.

        Returns:
            Without CRF: (tag scores, list of per-token argmax tag ids).
            With CRF: (Viterbi path score, list of tag ids on the best path).
        """
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence)
        if self.use_crf == False:
            return lstm_feats, list(torch.argmax(lstm_feats, -1).cpu().numpy())

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

## **Train Model**

In [None]:
from sklearn.metrics import accuracy_score

#lab 9
def cal_acc(model, input_index, output_index):
  """Run the model over every sentence and compute token-level accuracy.

  Args:
    model: callable returning `(score, predicted_tag_ids)` for a LongTensor of word ids.
    input_index: list of sentences, each a list of word ids.
    output_index: list of gold tag-id lists, parallel to `input_index`.

  Returns:
    `(ground_truth, predicted, accuracy)`: the flattened gold ids, the
    flattened predicted ids, and the fraction of matching positions
    (0.0 when there are no tokens).
  """
  ground_truth = []
  predicted = []

  # Evaluation only: skip autograd bookkeeping to save time and memory.
  with torch.no_grad():
    for i, idxs in enumerate(input_index):
      ground_truth += output_index[i]
      _, pred = model(torch.tensor(idxs, dtype=torch.long).to(device))
      predicted += pred

  if not ground_truth:
    # Avoid dividing by zero on an empty dataset.
    return ground_truth, predicted, 0.0

  accuracy = np.mean(np.array(ground_truth) == np.array(predicted))
  return ground_truth, predicted, accuracy

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
import datetime
from tqdm.auto import tqdm


def train_dataset(model, epochs = 30, Lr = 0.015):
  """Train `model` with SGD on the CRF negative log-likelihood and log metrics per epoch.

  Training data comes from the module-level `input_index` / `output_index`
  lists; after every epoch, accuracy is measured on the train and validation
  index lists and the validation loss is summed.

  Args:
    model: network exposing `neg_log_likelihood(sentence, tags)`.
    epochs: number of passes over the training data.
    Lr: SGD learning rate.

  Returns:
    The same `model` object, trained in place.
  """
  optimizer = optim.SGD(model.parameters(), lr = Lr, weight_decay = 1e-4)
  for epoch in tqdm(range(epochs)):
      time1 = datetime.datetime.now()
      train_loss = 0

      model.train()
      for i, idxs in enumerate(input_index):
          tags_index = output_index[i]

          # Step 1. Remember that Pytorch accumulates gradients.
          # We need to clear them out before each instance
          model.zero_grad()

          # Step 2. Get our inputs ready for the network, that is,
          # turn them into Tensors of word indices.
          sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
          targets = torch.tensor(tags_index, dtype=torch.long).to(device)

          # Step 3. Run our forward pass.
          loss = model.neg_log_likelihood(sentence_in, targets)

          # Step 4. Compute the loss, gradients, and update the parameters by
          # calling optimizer.step()
          loss.backward()
          optimizer.step()

          train_loss += loss.item()

      model.eval()
      # Evaluation builds no gradients: no_grad avoids keeping autograd graphs alive.
      with torch.no_grad():
          _, _, train_acc = cal_acc(model, train_input_index, train_output_index)
          _, _, val_acc = cal_acc(model, val_input_index, val_output_index)

          val_loss = 0
          for i, idxs in enumerate(val_input_index):
              tags_index = val_output_index[i]
              sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
              targets = torch.tensor(tags_index, dtype=torch.long).to(device)
              loss = model.neg_log_likelihood(sentence_in, targets)
              val_loss+=loss.item()
      time2 = datetime.datetime.now()

      print("Epoch:%d, Training loss: %.2f, train acc: %.4f, val loss: %.2f, val acc: %.4f, time: %.2fs" %(epoch+1, train_loss,train_acc, val_loss, val_acc, (time2-time1).total_seconds()))

  return model


In [None]:
train_data = train_input_index 
train_label = train_output_index

In [None]:
# Shuffle dataset
from random import shuffle
# Shuffle sentences and their tag sequences together so the pairs stay aligned.
training_list = list(zip(train_data, train_label))
shuffle(training_list)
# Keep plain Python lists: sentences have different lengths, and wrapping ragged
# lists in np.array warns on older NumPy and raises on recent releases. The
# training loop only iterates over and indexes these sequences.
input_index = [sentence for sentence, _ in training_list]
output_index = [tags for _, tags in training_list]

  """
  


In [None]:
from sklearn.metrics import f1_score


def decode_output(output_list):
  """Map tag ids back to tag strings using the inverse of the module-level `tag_to_ix`."""
  ix_to_tag = dict((ix, tag) for tag, ix in tag_to_ix.items())
  return list(map(ix_to_tag.__getitem__, output_list))


def get_f1score(model):
  """Return the micro-averaged F1 of `model` on the validation index lists.

  Predictions come from `cal_acc`; gold and predicted ids are decoded to tag
  strings before scoring.
  """
  gold_ids, pred_ids, _ = cal_acc(model, val_input_index, val_output_index)
  return f1_score(decode_output(gold_ids), decode_output(pred_ids), average = 'micro')

# **4 - Evaluation**

The training process is set up with:

lr=0.015

epoch =30

## **Restore result**

In [None]:
ix_2_tag = {idx : tag for tag, idx in tag_to_ix.items()}

def predict(model, input_index):
  """Predict a tag string for every token of every sentence.

  Args:
    model: trained tagger returning `(score, tag_ids)` for a LongTensor of word ids.
    input_index: list of sentences, each a list of word ids.

  Returns:
    A flat list of tag strings, sentence after sentence, decoded through the
    module-level `ix_2_tag` mapping.
  """
  # Inference: switch to eval mode (disables dropout) and skip autograd bookkeeping.
  model.eval()
  predicted = []
  with torch.no_grad():
    for x in input_index:
      # Pin the dtype so the tensor is always valid input for the embedding layer.
      input_tensor = torch.tensor(x, dtype=torch.long).to(device)
      _, output = model(input_tensor)
      predicted.extend(ix_2_tag[idx] for idx in output)
  return predicted

def result_to_csv(model,index, path = 'best_result.csv'):
  """Predict tags for `index` and write them to a CSV file.

  Args:
    model: trained tagger passed on to `predict`.
    index: list of sentences, each a list of word ids.
    path: output CSV path; defaults to the previously hard-coded 'best_result.csv'.

  Returns:
    The DataFrame that was written, with columns `Id` (running token number)
    and `Predicted` (tag string).
  """
  prediction = predict(model, index)
  df = pd.DataFrame({'Id': range(len(prediction)), 'Predicted': prediction})
  df.to_csv(path, index=False)

  return df

## **Use Trained Model**

### **Best Model**


embedding = word2vec + bert

attention = General

crf = True

layers = 2

In [None]:
# load model
id = '1gFRJPxJ6_KHVwLl4tOeJCEa04p6SrhWy' 
downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('best_model_trian.pth')

In [None]:
# NOTE(review): torch.load unpickles the file, which can execute arbitrary code;
# only load checkpoints that come from a trusted source. The file stores a whole
# model object, so the Design_BiLSTM_CRF class must already be defined.
best_model_load = torch.load("best_model_trian.pth")

In [None]:
best_model_f1 = get_f1score(best_model_load)
print("The F1 score of the best model: %.4f" %(best_model_f1))

The F1 score of the best model: 0.8269


Predict in test dataset

In [None]:
result_to_csv(best_model_load, test_input_index)

Unnamed: 0,Id,Predicted
0,0,B-Person
1,1,O
2,2,B-Person
3,3,O
4,4,B-Temporal
...,...,...
5229,5229,O
5230,5230,B-Organisation
5231,5231,O
5232,5232,O


### **Other Attempts**

#### **Base**

In [None]:
# load model
id = '1jdA7ew4VrmfnK6af4K1Q6OY3sm6DRCXR'
downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('Base_train.pth')

In [None]:
Base_model_load = torch.load("Base_train.pth")

In [None]:
Base_model_f1 = get_f1score(Base_model_load)
print("The F1 score of the Base_model with using CRF: %.4f" %(Base_model_f1))

The F1 score of the Base_model with using CRF: 0.7154


#### **Different input embedding model**


Layer = 2

Input embedding = pos

Attention = General

Use Crf

In [None]:
# load model
id = '1_uB3crXU4wWSJZsW0Mif8-t8QksjcpHh'
downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('general_pos_trian.pth')

In [None]:
general_pos_load = torch.load("general_pos_trian.pth")

In [None]:
general_pos_f1 = get_f1score(general_pos_load)
print("The F1 score of the general method with pos: %.4f" %(general_pos_f1))

The F1 score of the general method with pos: 0.7842


Layer = 2

Input embedding =  dependency parse

Attention = General

Use Crf

In [None]:
# load model
id = '1pejxvtGVLfBeS699F6uuhjx80TeAcbIs'
downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('general_parse_trian.pth')

In [None]:
general_parse_load = torch.load("general_parse_trian.pth")

In [None]:
general_parse_f1 = get_f1score(general_parse_load)
print("The F1 score of the general method with parse: %.4f" %(general_parse_f1))

The F1 score of the general method with parse: 0.7685


Layer = 2

Input embedding =  Word2Vec

Attention = General

Use Crf

In [None]:
# load model
id = '1keJsOA4BJgS0g-8YKC-k_n4MVTFcdn6R'
downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('general_w2v_trian.pth')

In [None]:
general_w2v_load = torch.load("general_w2v_trian.pth")

In [None]:
general_w2v_f1 = get_f1score(general_w2v_load)
print("The F1 score of the general method with w2v: %.4f" %(general_w2v_f1))

The F1 score of the general method with w2v: 0.8125


Layer = 2

Input embedding =  Bert

Attention = General

Use Crf

In [None]:
# load model
id = '1SXnwlqx0zhiiE0xLzf1rxwcti1cGwJvW'
downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('general_bert_trian.pth')

In [None]:
general_bert_load = torch.load("general_bert_trian.pth")

In [None]:
general_bert_f1_ = get_f1score(general_bert_load)
print("The F1 score of the general method with bert: %.4f" %(general_bert_f1_))

The F1 score of the general method with bert: 0.8129


#### **Different attention strategy**

Layer = 2

Input embedding = word2vec + bert

Attention = Scaled Dot-product

Use Crf

In [None]:
# load model
id = '1OVTI_S6wsHTN9CNjZx8r08Jhwc4lqcy8'
downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('scale_model_train.pth')

In [None]:
scale_load = torch.load("scale_model_train.pth")

In [None]:
scale_model_f1 = get_f1score(scale_load)
print("The F1 score of the Scaled Dot-product method with using CRF: %.4f" %(scale_model_f1))

The F1 score of the Scaled Dot-product method with using CRF: 0.8267


Layer = 2

Input embedding = word2vec + bert

Attention = Dot-product

Use Crf

In [None]:
# load model
id = '1hBQWN6hC4jD6GU6A3JdSo6z77z7whjk-'
downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('dot_model_train.pth')

In [None]:
dot_load = torch.load("dot_model_train.pth")

In [None]:
dot_model_f1 = get_f1score(dot_load)
print("The F1 score of the Dot-product method with using CRF: %.4f" %(dot_model_f1))

The F1 score of the Dot-product method with using CRF: 0.8153


#### **Different Stacked layer**

Layer = 1

Input embedding = word2vec + bert

Attention = General

Use Crf

In [None]:
# load model
id = '1iEVFWkBgEsmoUduPuLvsJTT91spYWh2Q'
downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('layers1_model_train.pth')

In [None]:
layer1_load = torch.load("layers1_model_train.pth")

In [None]:
layer1_f1 = get_f1score(layer1_load)
print("The F1 score of the general method with using CRF (layer1): %.4f" %(layer1_f1))

The F1 score of the general method with using CRF (layer1): 0.8121


Layer = 4

Input embedding = word2vec + bert

Attention = General

Use Crf

In [None]:
# load model
id = '1mpJiH_w3rSPwGVFCmXa23kGl1JBoOfD2'
downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('layers4_model_train.pth')

In [None]:
layer4_load = torch.load("layers4_model_train.pth")

In [None]:
layer4_f1 = get_f1score(layer4_load)
print("The F1 score of the general method with using CRF (layer4): %.4f" %(layer4_f1))

The F1 score of the general method with using CRF (layer4): 0.7874


#### **with/without CRF**

Layer = 2

Input embedding = word2vec + bert

Attention = General

Without using Crf

In [None]:
# load model
id = '1NeknO3pF8Wn7z4w_NYAvurbzgJpVC9lD'
downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('withoutCrf_model_train.pth')

In [None]:
withoutCrf_load = torch.load("withoutCrf_model_train.pth")

In [None]:
withoutCrf_model_f1 = get_f1score(withoutCrf_load)
print("The F1 score of the general method without using CRF: %.4f" %(withoutCrf_model_f1))

The F1 score of the general method without using CRF: 0.7192


#  **5 - Train and Evaluation Log**

### **Best Model**


embedding = word2vec + bert

attention = General

crf = True

layers = 2

##### **Initialize Model**

In [None]:
EMBEDDING_DIM = word_bert_embedding_matrix.shape[1] 
HIDDEN_DIM = 200

embedding_matrix = word_bert_embedding_matrix
best_model = Design_BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM, stacked_layers = 2, use_crf = True, attention_method = "General").to(device)

##### **Train and Evaluation Model**

In [None]:
best_model_trian = train_dataset(best_model)

In [None]:
torch.save(best_model_trian, "best_model_trian.pth")

In [None]:
best_model_f1 = get_f1score(best_model_trian)
print("The F1 score of the best model: %.4f" %(best_model_f1))

##### **Restore the result**

In [None]:
# Export predictions from the model trained in this section, not from the
# checkpoint that was downloaded and loaded earlier.
result_to_csv(best_model_trian, test_input_index)

### **Other Attempts**

#### **Base**

##### **Initialize Model**

In [None]:
EMBEDDING_DIM = word_bert_embedding_matrix.shape[1]
HIDDEN_DIM = 200

# The constructor reads the module-level `embedding_matrix` to initialise its embedding layer.
embedding_matrix = word_bert_embedding_matrix
# Baseline: only the required arguments are passed, so the constructor defaults
# apply (stacked_layers = 1, use_crf = False, attention_method = None).
base_model = Design_BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM).to(device)

##### **Train and Evaluation Model**

In [None]:
Base_train = train_dataset(base_model)
torch.save(Base_train, "Base_train.pth")

In [None]:
Base_model_f1 = get_f1score(Base_train)
print("The F1 score of the Base_model with using CRF: %.4f" %(Base_model_f1))

#### **Different input embedding model**

##### **Initialize Model**

Static

In [None]:
# (vocab_size, tag_to_ix, embedding_dim, embedding_matrix, hidden_dim, stacked_layers, use_crf = True, attention_method = None)

HIDDEN_DIM = 200
EMBEDDING_DIM = 50


embedding_matrix = pos_embedding_matrix
general_pos_model = Design_BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM, stacked_layers = 2, use_crf = True, attention_method = "General").to(device)

embedding_matrix = parse_embedding_matrix
general_parse_model = Design_BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM, stacked_layers = 2, use_crf = True, attention_method = "General").to(device)

embedding_matrix =w2v_embedding_matrix
general_w2v_model = Design_BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM, stacked_layers = 2, use_crf = True, attention_method = "General").to(device)

Dynamic

In [None]:
HIDDEN_DIM = 200
# Take the width from the matrix itself instead of hard-coding 768.
EMBEDDING_DIM = bert_embedding_matrix.shape[1]

# The constructor reads the module-level `embedding_matrix` to initialise its embedding layer.
embedding_matrix = bert_embedding_matrix
# Vocabulary size comes from word_to_ix, as for every other model in this
# notebook; a row-count mismatch with the matrix then fails loudly at construction.
general_bert_model = Design_BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM, stacked_layers = 2, use_crf = True, attention_method = "General").to(device)

Combination

In [None]:
# Combined input embedding: every embedding concatenated (918 dimensions).
HIDDEN_DIM = 200
EMBEDDING_DIM = 918

embedding_matrix = all_embedding_matrix
general_all_model = Design_BiLSTM_CRF(
    len(word_to_ix),
    tag_to_ix,
    EMBEDDING_DIM,
    HIDDEN_DIM,
    stacked_layers=2,
    use_crf=True,
    attention_method="General",
).to(device)

##### **Train and Evaluation Model**


Layer = 2

Input embedding = pos

Attention = General

Use Crf

In [None]:
# Train the POS-embedding model, then write the result to "general_pos_trian.pth".
general_pos_trian = train_dataset(general_pos_model)
torch.save(general_pos_trian, f="general_pos_trian.pth")

In [None]:
# Score the trained POS-embedding model and report F1 to four decimals.
general_pos_f1 = get_f1score(general_pos_trian)
print("The F1 score of the general method with pos: %.4f" % general_pos_f1)

Layer = 2

Input embedding =  dependency parse

Attention = General

Use Crf

In [None]:
# Train the dependency-parse model, then write the result to "general_parse_trian.pth".
general_parse_trian = train_dataset(general_parse_model)
torch.save(general_parse_trian, f="general_parse_trian.pth")

In [None]:
# Score the trained dependency-parse model and report F1 to four decimals.
general_parse_f1 = get_f1score(general_parse_trian)
print("The F1 score of the general method with parse: %.4f" % general_parse_f1)

Layer = 2

Input embedding =  Word2Vec

Attention = General

Use Crf

In [None]:
# Train the Word2Vec model, then write the result to "general_w2v_trian.pth".
general_w2v_trian = train_dataset(general_w2v_model)
torch.save(general_w2v_trian, f="general_w2v_trian.pth")

In [None]:
# Score the trained Word2Vec model and report F1 to four decimals.
general_w2v_f1 = get_f1score(general_w2v_trian)
print("The F1 score of the general method with w2v: %.4f" % general_w2v_f1)

Layer = 2

Input embedding =  Bert

Attention = General

Use Crf

In [None]:
# Train the BERT-only model, then write the result to "general_bert_trian.pth".
general_bert_trian = train_dataset(general_bert_model)
torch.save(general_bert_trian, f="general_bert_trian.pth")

In [None]:
# Score the trained BERT-only model and report F1 to four decimals.
general_bert_f1_ = get_f1score(general_bert_trian)
print("The F1 score of the general method with bert: %.4f" % general_bert_f1_)

Layer = 2

Input embedding = word2vec + bert

Attention = General

Use Crf

Note: Best Model

In [None]:
# Train and save the word2vec + BERT model for this section.
#
# The previous version of this cell re-trained `general_bert_model` (BERT only)
# and saved `general_bert_trian` under the word2vec+BERT file name, so the
# "word2vec + bert" experiment was never actually run and its checkpoint held
# the wrong object.  No word2vec+BERT model is built in the initialisation cells
# of this section, so it is constructed here with the same settings as the other
# "General" models (2 stacked layers, CRF, General attention).
#
# NOTE(review): the constructor is presumably fed through the global
# `embedding_matrix` (it is reassigned before every construction elsewhere), so
# it is switched temporarily and restored afterwards to leave later cells
# unaffected.
_previous_embedding_matrix = embedding_matrix
embedding_matrix = word_bert_embedding_matrix
general_word_bert_model = Design_BiLSTM_CRF(
    len(word_to_ix),
    tag_to_ix,
    word_bert_embedding_matrix.shape[1],  # embedding width taken from the matrix
    HIDDEN_DIM,
    stacked_layers=2,
    use_crf=True,
    attention_method="General",
).to(device)
embedding_matrix = _previous_embedding_matrix

general_word_bert_trian = train_dataset(general_word_bert_model)
# Save the object that was just trained, not the BERT-only result.
torch.save(general_word_bert_trian, "general_word_bert_trian.pth")

In [None]:
# Score the word2vec + BERT run and report F1 to four decimals.
# The label previously read "with bert", identical to the BERT-only cell, which
# made the two printed results indistinguishable.
general_word_bert_f1_ = get_f1score(general_word_bert_trian)
print("The F1 score of the general method with word2vec + bert: %.4f" % general_word_bert_f1_)

Layer = 2

Input embedding = word2vec + pos + dependency parse + Bert

Attention = General

Use Crf

In [None]:
# Train the all-embeddings model, then write the result to "general_all_trian.pth".
general_all_trian = train_dataset(general_all_model)
torch.save(general_all_trian, f="general_all_trian.pth")

In [None]:
# Score the trained all-embeddings model and report F1 to four decimals.
general__all_f1 = get_f1score(general_all_trian)
print("The F1 score of the general method with all: %.4f" % general__all_f1)

#### **Different attention strategy**

##### **Initialize Model**

In [None]:
# Attention comparison: identical models (word2vec + BERT input, 2 stacked
# layers, CRF) that differ only in the attention method.
embedding_matrix = word_bert_embedding_matrix
EMBEDDING_DIM = 818  # 50 (word2vec) + 768 (BERT)
HIDDEN_DIM = 200

# Built in the order: Scaled Dot-product, General, Dot-product.
scale_model, general_model, dot_model = [
    Design_BiLSTM_CRF(
        len(word_to_ix),
        tag_to_ix,
        EMBEDDING_DIM,
        HIDDEN_DIM,
        stacked_layers=2,
        use_crf=True,
        attention_method=method,
    ).to(device)
    for method in ("Scaled Dot-product", "General", "Dot-product")
]

##### **Train and Evaluation Model**

Layer = 2

Input embedding = word2vec + bert

Attention = Scaled Dot-product

Use Crf

In [None]:
# Train the Scaled Dot-product model, then write the result to "scale_model_train.pth".
scale_model_train = train_dataset(scale_model)
torch.save(scale_model_train, f="scale_model_train.pth")

In [None]:
# Score the trained Scaled Dot-product model and report F1 to four decimals.
scale_model_f1 = get_f1score(scale_model_train)
print("The F1 score of the Scaled Dot-product method with using CRF: %.4f" % scale_model_f1)

Layer = 2

Input embedding = word2vec + bert

Attention = General

Use Crf

Note: Best Model

In [None]:
# Train the General-attention model, then write the result to "general_model_train.pth".
general_model_train = train_dataset(general_model)
torch.save(general_model_train, f="general_model_train.pth")

In [None]:
# Score the trained General-attention model and report F1 to four decimals.
general_model_f1 = get_f1score(general_model_train)
print("The F1 score of the general method with using CRF: %.4f" % general_model_f1)

Layer = 2

Input embedding = word2vec + bert

Attention = Dot-product

Use Crf

In [None]:
# Train the Dot-product model, then write the result to "dot_model_train.pth".
dot_model_train = train_dataset(dot_model)
torch.save(dot_model_train, f="dot_model_train.pth")

In [None]:
# Score the trained Dot-product model and report F1 to four decimals.
dot_model_f1 = get_f1score(dot_model_train)
print("The F1 score of the Dot-product method with using CRF: %.4f" % dot_model_f1)

#### **Different Stacked layer**

##### **Initialize Model**

In [None]:
# Depth comparison: identical models (word2vec + BERT input, CRF, General
# attention) that differ only in the number of stacked layers.
embedding_matrix = word_bert_embedding_matrix
EMBEDDING_DIM = 818  # 50 (word2vec) + 768 (BERT)
HIDDEN_DIM = 200

# Built in the order: 1, 2, 4 layers.
layers1_model, layers2_model, layers4_model = [
    Design_BiLSTM_CRF(
        len(word_to_ix),
        tag_to_ix,
        EMBEDDING_DIM,
        HIDDEN_DIM,
        stacked_layers=depth,
        use_crf=True,
        attention_method="General",
    ).to(device)
    for depth in (1, 2, 4)
]

##### **Train and Evaluation Model**

Layer = 1

Input embedding = word2vec + bert

Attention = General

Use Crf

In [None]:
# Train the 1-layer model, then write the result to "layers1_model_train.pth".
layers1_model_train = train_dataset(layers1_model)
torch.save(layers1_model_train, f="layers1_model_train.pth")

In [None]:
# Score the trained 1-layer model and report F1 to four decimals.
layer1_f1 = get_f1score(layers1_model_train)
print("The F1 score of the general method with using CRF (layer1): %.4f" % layer1_f1)

Layer = 2

Input embedding = word2vec + bert

Attention = General

Use Crf

Note: Best Model

In [None]:
# Train the 2-layer model, then write the result to "layers2_model_train.pth".
layers2_model_train = train_dataset(layers2_model)
torch.save(layers2_model_train, f="layers2_model_train.pth")

In [None]:
# Score the trained 2-layer model and report F1 to four decimals.
layer2_f1 = get_f1score(layers2_model_train)
print("The F1 score of the general method with using CRF (layer2): %.4f" % layer2_f1)

Layer = 4

Input embedding = word2vec + bert

Attention = General

Use Crf

In [None]:
# Train the 4-layer model, then write the result to "layers4_model_train.pth".
layers4_model_train = train_dataset(layers4_model)
torch.save(layers4_model_train, f="layers4_model_train.pth")

In [None]:
# Score the trained 4-layer model and report F1 to four decimals.
layer4_f1 = get_f1score(layers4_model_train)
print("The F1 score of the general method with using CRF (layer4): %.4f" % layer4_f1)

#### **with/without CRF**

##### **Initialize Model**

In [None]:
# CRF ablation: identical models (word2vec + BERT input, 2 stacked layers,
# General attention) built once with and once without the CRF layer.
embedding_matrix = word_bert_embedding_matrix
EMBEDDING_DIM = 818  # 50 (word2vec) + 768 (BERT)
HIDDEN_DIM = 200

# Built in the order: with CRF, without CRF.
crf_model, withoutCrf_model = [
    Design_BiLSTM_CRF(
        len(word_to_ix),
        tag_to_ix,
        EMBEDDING_DIM,
        HIDDEN_DIM,
        stacked_layers=2,
        use_crf=crf_enabled,
        attention_method="General",
    ).to(device)
    for crf_enabled in (True, False)
]

##### **Train and Evaluation**

Layer = 2

Input embedding = word2vec + bert

Attention = General

Use Crf

Note: Best Model

In [None]:
# Train the CRF model, then write the result to "crf_model_train.pth".
crf_model_train = train_dataset(crf_model)
torch.save(crf_model_train, f="crf_model_train.pth")

In [None]:
# Score the model built with use_crf = True and report F1 to four decimals.
# The label previously said "without using CRF", which mislabelled this result
# and made it indistinguishable from the no-CRF run printed further down.
crf_model_f1 = get_f1score(crf_model_train)
print("The F1 score of the general method with using CRF: %.4f" % crf_model_f1)

Layer = 2

Input embedding = word2vec + bert

Attention = General

Without using Crf

In [None]:
# Train the no-CRF model, then write the result to "withoutCrf_model_train.pth".
withoutCrf_model_train = train_dataset(withoutCrf_model)
torch.save(withoutCrf_model_train, f="withoutCrf_model_train.pth")

In [None]:
# Score the trained no-CRF model and report F1 to four decimals.
withoutCrf_model_f1 = get_f1score(withoutCrf_model_train)
print("The F1 score of the general method without using CRF: %.4f" % withoutCrf_model_f1)