### Load the libraries (change directories based on the system on which code is running)

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import string
import re
import nltk
import pickle
nltk.download('stopwords')
nltk.download('wordnet')
os.chdir('/content/drive/MyDrive/Work/Dan/BERT Model')
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
import matplotlib.pyplot as plt
import pandas as pd
!pip install transformers
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoModel, BertTokenizerFast

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/f9/54/5ca07ec9569d2f232f3166de5457b63943882f7950ddfcc887732fc7fb23/transformers-4.3.3-py3-none-any.whl (1.9MB)
[K     |████████████████████████████████| 1.9MB 18.6MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/71/23/2ddc317b2121117bf34dd00f5b0de194158f2a44ee2bf5e47c7166878a97/tokenizers-0.10.1-cp37-cp37m-manylinux2010_x86_64.whl (3.2MB)
[K     |████████████████████████████████| 3.2MB 55.7MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 53.8MB/s 
Bui

## Prepare Datasets (uncomment if datasets are changing)

use domain specific - for hiri, iris, and lcs - build 3 bert model

In [9]:
train=pd.read_excel('Input data/Model_training_all_06032021.xlsx',sheet_name='Sheet2').drop_duplicates().reset_index(drop=True)
train=train[train['domain']=='LCS']
train=train[train['language']=='EN'][['cantonese_text','intent']].rename(columns={'cantonese_text':'text','intent':'label'})
test=pd.read_excel('Input data/Gold_data_all_06032021.xlsx',sheet_name='Sheet2').drop_duplicates().reset_index(drop=True)
test=test[test['domain']=='LCS']
test=test[test['language']=='EN'][['utterance','intentid_expected']].rename(columns={'utterance':'text','intentid_expected':'label'})
test=test[test.label.isin(train.label)]


def remove_punct(text):
    text  = "".join([char for char in text if char not in string.punctuation])
    text = re.sub('[0-9]+', '', text)
    return text

stopword = nltk.corpus.stopwords.words('english')

def remove_stopwords(text):
    text = [word for word in re.split('\W+', text) if word not in stopword]
    return text

ps = nltk.PorterStemmer()

def stemming(text):
    text = [ps.stem(word) for word in text]
    return text

wn = nltk.WordNetLemmatizer()

def lemmatizer(text):
    text = [wn.lemmatize(word) for word in text]
    return ' '.join(text)

def spell_correction(text):           # spelling correction
    txt=TextBlob(text)
    return txt.correct()

def clean_data(x):
    x=x.lower()
    x=x.encode('ascii','ignore').decode() # remove texts other than english
    x=re.sub('https*\S+','',x) # remove urls
    #x=spell_correction(x)
    x=remove_punct(x) # remove punctuations
    x=remove_stopwords(x) # remove stopwords
    #x=stemming(x) # stemming
    #x=lemmatizer(x) # lemmatization
    return ' '.join(x)


train['text']=train['text'].apply(lambda x: clean_data(x))
test['text']=test['text'].apply(lambda x: clean_data(x))

In [10]:
from sklearn import preprocessing
import pickle
le = preprocessing.LabelEncoder()
le.fit(train['label'])
train['label']=le.transform(train['label'])
test['label']=le.transform(test['label'])
with open('model_files/label_encoder_lcs.pkl', 'wb') as handle:
    pickle.dump(le, handle)

from sklearn.model_selection import train_test_split
train,valid = train_test_split(train, test_size=0.2)

## Tokenization

In [12]:
with open('model_files/label_encoder_lcs.pkl', 'rb') as handle:
    le=pickle.load(handle)
train_text,train_labels=train['text'],train['label']
val_text,val_labels=valid['text'],valid['label']
test_text,test_labels=test['text'],test['label']
bert = AutoModel.from_pretrained('albert-base-v1')

# Load the BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
seq_len = [len(str(i).split()) for i in train_text]
tokens_train = tokenizer.batch_encode_plus(train_text.astype(str).tolist(),max_length=25,truncation=True,pad_to_max_length=True)

# tokenize and encode sequences in the validation set
tokens_val = tokenizer.batch_encode_plus(val_text.astype(str).tolist(),max_length=25,truncation=True,pad_to_max_length=True)

# tokenize and encode sequences in the test set
tokens_test = tokenizer.batch_encode_plus(test_text.astype(str).tolist(),max_length=25,truncation=True,pad_to_max_length=True)
## convert lists to tensors

train_seq = torch.tensor(tokens_train['input_ids'])
train_mask = torch.tensor(tokens_train['attention_mask'])
train_y = torch.tensor(train_labels.tolist())

val_seq = torch.tensor(tokens_val['input_ids'])
val_mask = torch.tensor(tokens_val['attention_mask'])
val_y = torch.tensor(val_labels.tolist())

test_seq = torch.tensor(tokens_test['input_ids'])
test_mask = torch.tensor(tokens_test['attention_mask'])
test_y = torch.tensor(test_labels.tolist())

## BERT model Config

In [13]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

#define a batch size
batch_size = 64

# wrap tensors
train_data = TensorDataset(train_seq, train_mask, train_y)

# sampler for sampling the data during training
train_sampler = RandomSampler(train_data)

# dataLoader for train set
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# wrap tensors
val_data = TensorDataset(val_seq, val_mask, val_y)

# sampler for sampling the data during training
val_sampler = SequentialSampler(val_data)

# dataLoader for validation set
val_dataloader = DataLoader(val_data, sampler = val_sampler, batch_size=batch_size)

## Pretrained BERT model

In [14]:

# freeze all the parameters
for param in bert.parameters():
    param.requires_grad = False
    
class BERT_Arch(nn.Module):

    def __init__(self, bert):
      
      super(BERT_Arch, self).__init__()

      self.bert = bert 
      
      # dropout layer
      self.dropout = nn.Dropout(0.1)
      
      # relu activation function
      self.relu =  nn.ReLU()

      # dense layer 1
      self.fc1 = nn.Linear(768,len(le.classes_))
      
      # dense layer 2 (Output layer)
      #self.fc2 = nn.Linear(512,len(le.classes_))

      #softmax activation function
      self.softmax = nn.LogSoftmax(dim=1)

    #define the forward pass
    def forward(self, sent_id, mask):

      #pass the inputs to the model  
      k = self.bert(sent_id, attention_mask=mask)
      cls_hs=k[1]
      x = self.fc1(cls_hs)

      #x = self.relu(x)

      #x = self.dropout(x)

      # output layer
      #x = self.fc2(x)
      
      # apply softmax activation
      x = self.softmax(x)

      return x

In [15]:
# pass the pre-trained BERT to our define architecture
model = BERT_Arch(bert)

# push the model to GPU
model = model.to(device)
from transformers import AdamW

# define the optimizer
optimizer = AdamW(model.parameters(),
                  lr = 0.01)          # learning rate
from sklearn.utils.class_weight import compute_class_weight

#compute the class weights
class_weights = compute_class_weight('balanced', np.unique(train_labels), train_labels)

# converting list of class weights to a tensor
weights= torch.tensor(class_weights,dtype=torch.float)

# push to GPU
weights = weights.to(device)

# define the loss function
cross_entropy  = nn.NLLLoss() 

# number of training epochs
epochs = 10

## Training of BERT model

In [16]:
def train():
  
  model.train()

  total_loss, total_accuracy = 0, 0
  
  # empty list to save model predictions
  total_preds=[]
  correct=0
  total=0
  # iterate over batches
  for step,batch in enumerate(train_dataloader):
    
    # progress update after every 50 batches.
    # if step % 100 == 0 and not step == 0:
    #   print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_dataloader)))

    # push the batch to gpu
    batch = [r.to(device) for r in batch]
 
    sent_id, mask, labels = batch

    # clear previously calculated gradients 
    model.zero_grad()        

    # get model predictions for the current batch
    preds = model(sent_id, mask)

    # compute the loss between actual and predicted values
    loss = cross_entropy(preds, labels)

    # add on to the total loss
    total_loss = total_loss + loss.item()

    # backward pass to calculate the gradients
    loss.backward()

    # clip the the gradients to 1.0. It helps in preventing the exploding gradient problem
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

    # update parameters
    optimizer.step()
    correct += (torch.max(preds, 1)[1] == labels).float().sum()
    total += labels.shape[0]
    # model predictions are stored on GPU. So, push it to CPU
    preds=preds.detach().cpu().numpy()

    # append the model predictions
    total_preds.append(preds)

  # compute the training loss of the epoch
  avg_loss = total_loss / len(train_dataloader)
  
  # predictions are in the form of (no. of batches, size of batch, no. of classes).
  # reshape the predictions in form of (number of samples, no. of classes)
  total_preds  = np.concatenate(total_preds, axis=0)

  #returns the loss and predictions
  return avg_loss, total_preds, correct/total

## Evaluate the BERT model

In [17]:
def evaluate():
  
  print("\nEvaluating...")
  
  # deactivate dropout layers
  model.eval()

  total_loss, total_accuracy = 0, 0
  
  # empty list to save the model predictions
  total_preds = []
  correct=0
  total=0
  # iterate over batches
  for step,batch in enumerate(val_dataloader):
    
    # Progress update every 50 batches.
    #if step % 50 == 0 and not step == 0:
      
      # Calculate elapsed time in minutes.
            
      # Report progress.
      #print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(val_dataloader)))

    # push the batch to gpu
    batch = [t.to(device) for t in batch]

    sent_id, mask, labels = batch

    # deactivate autograd
    with torch.no_grad():
      
      # model predictions
      preds = model(sent_id, mask)

      # compute the validation loss between actual and predicted values
      loss = cross_entropy(preds,labels)

      total_loss = total_loss + loss.item()
      correct += (torch.max(preds, 1)[1] == labels).float().sum()
      total += labels.shape[0]
      preds = preds.detach().cpu().numpy()

      total_preds.append(preds)

  # compute the validation loss of the epoch
  avg_loss = total_loss / len(val_dataloader) 

  # reshape the predictions in form of (number of samples, no. of classes)
  total_preds  = np.concatenate(total_preds, axis=0)

  return avg_loss, total_preds ,correct/total

## Run training

In [18]:
# set initial loss to infinite
best_valid_loss = float('inf')

# empty lists to store training and validation loss of each epoch
train_losses=[]
valid_losses=[]

#for each epoch
for epoch in range(epochs):
     
    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))
    
    #train model
    train_loss, _, train_acc = train()
    
    #evaluate model
    valid_loss, _, valid_acc = evaluate()
    
    #save the best model

    torch.save(model.state_dict(), 'model_files/saved_weights_lcs.pt')
    
    # append training and validation loss
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)
    
    print(f'\nTraining Loss: {train_loss:.3f}, Training Accuracy: {train_acc: .3f}')
    print(f'Validation Loss: {valid_loss:.3f}, Validation Accuracy: {valid_acc: .3f}')


 Epoch 1 / 10

Evaluating...

Training Loss: 5.990, Training Accuracy:  0.091
Validation Loss: 5.040, Validation Accuracy:  0.177

 Epoch 2 / 10

Evaluating...

Training Loss: 4.638, Training Accuracy:  0.178
Validation Loss: 4.018, Validation Accuracy:  0.290

 Epoch 3 / 10

Evaluating...

Training Loss: 4.342, Training Accuracy:  0.227
Validation Loss: 3.930, Validation Accuracy:  0.330

 Epoch 4 / 10

Evaluating...

Training Loss: 4.235, Training Accuracy:  0.239
Validation Loss: 3.666, Validation Accuracy:  0.336

 Epoch 5 / 10


KeyboardInterrupt: ignored

In [19]:
#load weights of best model
path = 'model_files/saved_weights_lcs.pt'
model.load_state_dict(torch.load(path))

<All keys matched successfully>

In [20]:
# get predictions for test data
with torch.no_grad():
  preds = model(test_seq.to(device), test_mask.to(device))
  preds = preds.detach().cpu().numpy()

preds = np.argmax(preds, axis = 1)
test['predicted']=le.inverse_transform(preds)
test['actual']=le.inverse_transform(test['label'])
from sklearn.metrics import accuracy_score
accuracy_score(test['actual'], test['predicted'])

0.22533136966126657

In [21]:
test.to_csv('Predictions/predicted_lcs.csv',index=None)