In [1]:
# Mount Google Drive at /content/drive (Colab-only module); the working directory
# used by the rest of the notebook lives underneath this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Import libraries and read data

In [None]:
import os
#os.chdir('/content/drive/MyDrive/Dan/Dan')
os.chdir('/content/drive/MyDrive/Work/Dan')
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
#library imports
import warnings
warnings.filterwarnings("ignore")
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from textblob import TextBlob
import re
import spacy
from collections import Counter
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import string
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import CountVectorizer
import nltk 
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Data Cleaning Functions

In [None]:
def remove_punct(text):
    """Strip punctuation and digits from ``text``.

    Every character found in ``string.punctuation`` is dropped first, then
    each run of the digits 0-9 is deleted from what is left.
    """
    kept_chars = filter(lambda ch: ch not in string.punctuation, text)
    return re.sub('[0-9]+', '', "".join(kept_chars))

# NLTK's English stop-word list; consulted by remove_stopwords().
stopword = nltk.corpus.stopwords.words('english')

def remove_stopwords(text):
    """Split ``text`` into words and drop the NLTK English stop words.

    The text is split on runs of non-word characters. Empty strings produced
    by leading or trailing separators are kept, because "" is not a stop word.

    Returns:
        list of the remaining tokens, in their original order.
    """
    # Raw string: in a normal literal '\W' is an invalid escape sequence, which
    # newer Python versions warn about and will eventually reject.
    return [word for word in re.split(r'\W+', text) if word not in stopword]

# Single Porter stemmer instance shared by every stemming() call.
ps = nltk.PorterStemmer()

def stemming(text):
    """Return a list holding ``ps.stem(word)`` for each word of ``text``, in order."""
    return list(map(ps.stem, text))

# Single WordNet lemmatizer instance shared by every lemmatizer() call.
wn = nltk.WordNetLemmatizer()

def lemmatizer(text):
    """Lemmatize each word of ``text`` and return them joined by single spaces."""
    lemmas = map(wn.lemmatize, text)
    return ' '.join(lemmas)

def spell_correction(text):           # spelling correction
    """Wrap ``text`` in a TextBlob and return the result of its ``correct()`` call.

    NOTE(review): the return value is whatever ``TextBlob.correct()`` yields,
    presumably a TextBlob rather than a plain ``str`` - confirm before
    re-enabling the call in clean_data().
    """
    return TextBlob(text).correct()

def clean_data(x):
    """Normalise one utterance into a space-separated string of kept tokens.

    Steps, in order: lower-case, drop non-ASCII characters, strip URLs,
    remove punctuation and digits (remove_punct), remove English stop words
    (remove_stopwords), then join the remaining tokens with single spaces.
    Spell correction, stemming and lemmatization are currently disabled.

    Args:
        x: the raw utterance as a ``str``.

    Returns:
        The cleaned utterance as a ``str``.
    """
    x = x.lower()
    x = x.encode('ascii', 'ignore').decode()  # drops every non-ASCII character
    # Raw string: in a normal literal '\S' is an invalid escape sequence.
    x = re.sub(r'https*\S+', '', x)  # remove urls
    #x=spell_correction(x)
    x = remove_punct(x)  # remove punctuation and digits
    x = remove_stopwords(x)  # remove stopwords (returns a list of tokens)
    #x=stemming(x) # stemming
    #x=lemmatizer(x) # lemmatization
    return ' '.join(x)

### Prepare Train Data

In [None]:
# sheet_name=None loads the workbook as a mapping of sheet name -> DataFrame.
data = pd.read_excel('RNN-Data_2.xlsx', sheet_name=None)
# Keep only the utterance/intent columns of the 'train data' sheet, renamed to text/label.
df_train = (
    data['train data']
    .rename(columns={'utterance': 'text', 'intent': 'label'})
    [['text', 'label']]
)
df_train['text'] = df_train['text'].apply(clean_data)
# Number of whitespace-separated tokens left after cleaning.
df_train['text_length'] = df_train['text'].apply(lambda cleaned: len(cleaned.split()))

In [None]:
# Preview the first rows of the cleaned training frame (text, label, text_length).
df_train.head()

Unnamed: 0,text,label,text_length
0,external storage,access_management.access_to_file_storage_sites...,2
1,z file opener,computer.how_to_decrypt_a_file,3
2,check bank locked,access_management.unlock_1bank_account,3
3,msg file opened another program outlook,microsoft_office_365.msg_file,6
4,msg file opened outlook,microsoft_office_365.msg_file,4


In [None]:
# (rows, columns) of the cleaned training frame.
df_train.shape

(19243, 3)

## Build the vocabulary

In [None]:
from sklearn import preprocessing
# Fit the encoder on the training intents, then replace each intent string by its
# integer class id. `le` is reused later to encode test labels and decode predictions.
le = preprocessing.LabelEncoder().fit(df_train['label'])
df_train['label'] = le.transform(df_train['label'])


In [None]:
#tokenization
reviews=df_train.text
# Only `tok.tokenizer` is used below, so a blank English pipeline is sufficient and
# needs no downloaded model. The 'en' shortcut previously passed to spacy.load()
# was removed in spaCy 3 and fails there.
tok = spacy.blank('en')
def tokenize (text):
    """Split ``text`` into a list of lower-cased spaCy token strings.

    Runs of non-ASCII characters are replaced by one space; every punctuation
    character, digit, carriage return, tab and newline is replaced by a space;
    the lower-cased result is then passed through the spaCy tokenizer.
    """
    text = re.sub(r"[^\x00-\x7F]+", " ", text)
    regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]') # remove punctuation and numbers
    nopunct = regex.sub(" ", text.lower())
    return [token.text for token in tok.tokenizer(nopunct)]

from collections import Counter
# Token frequencies over the whole (cleaned) training corpus.
counts = Counter()
for sentence in df_train['text']:
    counts.update(tokenize(sentence))
#deleting infrequent words
print("num_words before:",len(counts.keys()))
# Collect the rare words first so the Counter is not resized while being iterated.
for word in [w for w, freq in counts.items() if freq < 2]:
    del counts[word]
print("num_words after:",len(counts.keys()))
#creating vocabulary
# Position 0 is the empty string and position 1 is "UNK"; the retained words
# follow in the order they appear in `counts`.
words = ["", "UNK"] + list(counts)
vocab2index = {word: position for position, word in enumerate(words)}
def encode_sentence(text, vocab2index, N=10):
    """Turn ``text`` into a fixed-length vector of vocabulary indices.

    Args:
        text: sentence to encode; it is split with ``tokenize``.
        vocab2index: mapping from token to index; must hold an "UNK" entry,
            which is used for tokens missing from the mapping.
        N: length of the returned vector.

    Returns:
        ``(encoded, length)``: ``encoded`` is an int array of size ``N`` with
        the first ``length`` token ids followed by zeros, and ``length`` is
        ``min(N, number of tokens)``.
    """
    token_ids = [vocab2index.get(w, vocab2index["UNK"]) for w in tokenize(text)]
    padded = np.zeros(N, dtype=int)
    n_kept = min(N, len(token_ids))
    padded[:n_kept] = token_ids[:n_kept]
    return padded, n_kept
# encode_sentence returns a ragged (int array, int) pair. dtype=object stores it as a
# 2-element object array; without it NumPy >= 1.24 raises ValueError for ragged input.
df_train['encoded'] = df_train['text'].apply(lambda x: np.array(encode_sentence(x, vocab2index), dtype=object))


num_words before: 3665
num_words after: 2165


## Build the PyTorch dataset and the training and validation functions

In [None]:
X = list(df_train['encoded'])
y = list(df_train['label'])
from sklearn.model_selection import train_test_split
# Hold out 10% of the samples for validation. The fixed random_state makes the split
# identical on every run, so validation numbers are comparable between runs.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.1, random_state=42)

class ReviewsDataset(Dataset):
    """Dataset over pre-encoded sentences.

    ``X[i]`` is indexed as ``X[i][0]`` (a NumPy array of token ids) and
    ``X[i][1]`` (the sentence length); ``Y[i]`` is the matching label.
    """

    def __init__(self, X, Y):
        self.X, self.y = X, Y

    def __len__(self):
        # One sample per label.
        return len(self.y)

    def __getitem__(self, idx):
        """Return ``(int32 token-id tensor, label, length)`` for sample ``idx``."""
        sample = self.X[idx]
        token_ids = torch.from_numpy(sample[0].astype(np.int32))
        return token_ids, self.y[idx], sample[1]

# Dataset wrappers for the two splits.
train_ds, valid_ds = ReviewsDataset(X_train, y_train), ReviewsDataset(X_valid, y_valid)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

def train_model(model, epochs=10, lr=0.001):
    """Train ``model`` with Adam and cross-entropy loss on the global ``train_dl``.

    After every epoch the model is evaluated on the global ``val_dl`` through
    ``validation_metrics``; the metrics are printed only when the epoch index
    is a multiple of 5.

    Args:
        model: module called as ``model(x, l)`` that returns class logits.
        epochs: number of passes over ``train_dl``.
        lr: Adam learning rate.
    """
    trainable_params = (p for p in model.parameters() if p.requires_grad)
    optimizer = torch.optim.Adam(trainable_params, lr=lr) ### Adam optimizer
    for epoch in range(epochs):
        model.train()
        running_loss, n_seen = 0.0, 0
        for x, y, l in train_dl:
            x, y = x.long().to(device), y.long().to(device)
            logits = model(x, l)
            optimizer.zero_grad()
            batch_loss = F.cross_entropy(logits, y) # cross entropy loss
            batch_loss.backward()
            optimizer.step()
            # Weight the batch-mean loss by the batch size to get a per-sample average.
            n_batch = y.shape[0]
            running_loss += batch_loss.item()*n_batch
            n_seen += n_batch
        val_loss, val_acc = validation_metrics(model, val_dl)
        if epoch%5 != 0:
            continue
        print("train loss %.3f, test loss %.3f, test accuracy %.3f" % (running_loss/n_seen, val_loss, val_acc))

def validation_metrics (model, valid_dl):
    """Evaluate ``model`` on ``valid_dl`` and return ``(mean loss, accuracy)``.

    The model is switched to eval mode and left in that mode.

    Args:
        model: module called as ``model(x, l)`` that returns class logits.
        valid_dl: iterable yielding ``(x, y, l)`` batches.

    Returns:
        ``(loss, accuracy)``: the per-sample mean cross-entropy as a float, and
        the fraction of correct predictions as a 0-dim tensor.

    Raises:
        ZeroDivisionError: if ``valid_dl`` yields no samples.
    """
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    # Evaluation needs no gradients; disabling them avoids building the autograd
    # graph, which matters when a whole test set is passed as a single batch.
    with torch.no_grad():
        for x, y, l in valid_dl:
            x = x.long().to(device)
            y = y.long().to(device)
            y_hat = model(x, l)
            loss = F.cross_entropy(y_hat, y)
            pred = torch.max(y_hat, 1)[1]  # index of the largest logit per row
            correct += (pred == y).float().sum()
            total += y.shape[0]
            sum_loss += loss.item()*y.shape[0]
    return sum_loss/total, correct/total

In [None]:
batch_size = 1000  # samples per mini-batch, used by both loaders
vocab_size = len(words)  # counts the "" and "UNK" entries at the front of `words` too
# Only the training loader shuffles; the validation loader keeps DataLoader's default order.
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_dl = DataLoader(valid_ds, batch_size=batch_size)

## LSTM classifier (unidirectional; the bidirectional variant is commented out)

In [None]:
class LSTM_fixed_len(torch.nn.Module) :
    """Embedding -> dropout -> single-layer LSTM -> linear classifier.

    The LSTM is unidirectional (the bidirectional variant is left commented
    out) and is run over the full fixed-length, zero-padded sequence.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes=None) :
        """
        Args:
            vocab_size: number of rows in the embedding table.
            embedding_dim: size of each token embedding.
            hidden_dim: size of the LSTM hidden state.
            num_classes: number of output logits. When omitted, falls back to
                the number of distinct labels in the global ``df_train``.
        """
        super().__init__()
        if num_classes is None:
            # Backward-compatible default: one output per distinct training label.
            num_classes = df_train.label.nunique()
        # Index 0 is reserved for padding.
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        #self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True) # activation is taken as tanh
        self.linear = nn.Linear(hidden_dim, num_classes)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x, l):
        """Return class logits for a batch of token-id sequences.

        Args:
            x: integer tensor of token ids, batch dimension first.
            l: sequence lengths; accepted so the call matches the training
                loop, but not used by this model.
        """
        x = self.embeddings(x)
        x = self.dropout(x)
        lstm_out, (ht, ct) = self.lstm(x)
        # ht[-1] is the final hidden state of the last (here: only) LSTM layer.
        return self.linear(ht[-1])

# embedding_dim=50, hidden_dim=500; Module.to() returns the module itself.
model = LSTM_fixed_len(vocab_size, 50, 500).to(device)
train_model(model, epochs=100, lr=0.01)


train loss 6.040, test loss 5.572, test accuracy 0.036
train loss 1.056, test loss 0.980, test accuracy 0.757
train loss 0.232, test loss 0.592, test accuracy 0.868
train loss 0.125, test loss 0.633, test accuracy 0.866
train loss 0.088, test loss 0.643, test accuracy 0.861
train loss 0.077, test loss 0.653, test accuracy 0.878
train loss 0.068, test loss 0.624, test accuracy 0.878
train loss 0.059, test loss 0.657, test accuracy 0.874
train loss 0.060, test loss 0.692, test accuracy 0.877
train loss 0.055, test loss 0.682, test accuracy 0.874
train loss 0.061, test loss 0.645, test accuracy 0.882
train loss 0.052, test loss 0.656, test accuracy 0.885
train loss 0.050, test loss 0.700, test accuracy 0.878
train loss 0.056, test loss 0.693, test accuracy 0.879
train loss 0.052, test loss 0.719, test accuracy 0.882
train loss 0.050, test loss 0.711, test accuracy 0.879
train loss 0.055, test loss 0.706, test accuracy 0.881
train loss 0.054, test loss 0.660, test accuracy 0.883
train loss

## Glove pretrained embedding vector

In [None]:
#!wget http://nlp.stanford.edu/data/glove.6B.zip
#!unzip glove*.zip

In [None]:
# def load_glove_vectors(glove_file="./glove.6B.50d.txt"):
#     """Load the glove word vectors"""
#     word_vectors = {}
#     with open(glove_file) as f:
#         for line in f:
#             split = line.split()
#             word_vectors[split[0]] = np.array([float(x) for x in split[1:]])
#     return word_vectors

# def get_emb_matrix(pretrained, word_counts, emb_size = 50):
#     """ Creates embedding matrix from word vectors"""
#     vocab_size = len(word_counts) + 2
#     vocab_to_idx = {}
#     vocab = ["", "UNK"]
#     W = np.zeros((vocab_size, emb_size), dtype="float32")
#     W[0] = np.zeros(emb_size, dtype='float32') # adding a vector for padding
#     W[1] = np.random.uniform(-0.25, 0.25, emb_size) # adding a vector for unknown words 
#     vocab_to_idx["UNK"] = 1
#     i = 2
#     for word in word_counts:
#         if word in word_vecs:
#             W[i] = word_vecs[word]
#         else:
#             W[i] = np.random.uniform(-0.25,0.25, emb_size)
#         vocab_to_idx[word] = i
#         vocab.append(word)
#         i += 1   
#     return W, np.array(vocab), vocab_to_idx
# word_vecs = load_glove_vectors()
# pretrained_weights, vocab, vocab2index = get_emb_matrix(word_vecs, counts)

## Bidirectional LSTM with Glove embedding

In [None]:
# class LSTM_glove_vecs(torch.nn.Module) :
#     def __init__(self, vocab_size, embedding_dim, hidden_dim, glove_weights) :
#         super().__init__()
#         self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
#         self.embeddings.weight.data.copy_(torch.from_numpy(glove_weights))
#         self.embeddings.weight.requires_grad = False ## freeze embeddings
#         self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
#         self.linear = nn.Linear(hidden_dim, 465)
#         self.dropout = nn.Dropout(0.2)
        
#     def forward(self, x, l):
#         x = self.embeddings(x)
#         x = self.dropout(x)
#         lstm_out, (ht, ct) = self.lstm(x)
#         return self.linear(ht[-1])

# model = LSTM_glove_vecs(vocab_size, 50, 500, pretrained_weights)
# model.to(device)
# train_model(model, epochs=50, lr=0.01)


# GRU with Glove embedding

In [None]:
# class GRU_glove_vecs(torch.nn.Module) :
#     def __init__(self, vocab_size, embedding_dim, hidden_dim, glove_weights) :
#         super().__init__()
#         self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
#         self.embeddings.weight.data.copy_(torch.from_numpy(glove_weights))
#         self.embeddings.weight.requires_grad = False ## freeze embeddings
#         self.gru = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
#         self.linear = nn.Linear(hidden_dim, 465)
#         self.dropout = nn.Dropout(0.2)
        
#     def forward(self, x, l):
#         x = self.embeddings(x)
#         x = self.dropout(x)
#         output, ht = self.gru(x)
#         return self.linear(ht[-1])

# model = GRU_glove_vecs(vocab_size, 50, 500, pretrained_weights)
# train_model(model, epochs=5, lr=0.1)


# Prediction on Test data and calculation of accuracy

### Accuracy calculation function

In [None]:
# Re-read the raw training sheet: df_train['label'] was replaced by integer ids when the
# labels were encoded, while prep_data() matches test labels against the original strings.
df_train=data['train data'].rename(columns={'utterance':'text','intent':'label'})[['text','label']]
def prep_data(df_test):
  """Clean, filter and encode a test frame so it can be fed to the model.

  Args:
      df_test: DataFrame with a 'text' and a 'label' column. It is not
          modified; all work happens on copies.

  Returns:
      A new DataFrame holding only the rows whose label occurs in the global
      ``df_train``, with 'text' cleaned, 'label' converted to integer ids by
      the global ``le`` and an added 'encoded' column of
      ``(token_ids, length)`` pairs. It may be empty.
  """
  df_test = df_test.copy()  # keep the caller's frame untouched
  df_test['text'] = df_test['text'].apply(clean_data)
  # take only matching rows from train and test data; .copy() makes the filtered
  # frame independent so the assignments below do not write into a slice
  df_test = df_test[df_test['label'].isin(df_train['label'])].copy()
  df_test['label'] = le.transform(df_test['label'])
  # dtype=object: the (array, int) pair is ragged, which NumPy >= 1.24 rejects otherwise
  df_test['encoded'] = df_test['text'].apply(lambda x: np.array(encode_sentence(x, vocab2index), dtype=object))
  return df_test

def calc_acc(df_test):
  """Return the accuracy of the global ``model`` on a prepared test frame.

  Args:
      df_test: frame with 'encoded' and 'label' columns, as returned by prep_data().

  Returns:
      The accuracy as a NumPy float, or ``None`` when the frame is empty or
      the evaluation fails. A message is printed in both of those cases.
  """
  # An empty frame means no test label matched a training label; say so explicitly
  # instead of relying on a downstream exception.
  if len(df_test) == 0:
    print('Cannot Calculate the accuracy as test data labels are completely different from train data.')
    return None
  try:
    X_test = list(df_test['encoded'])
    y_test = list(df_test['label'])
    test_ds = ReviewsDataset(X_test, y_test)
    test_dl = DataLoader(test_ds, batch_size=len(X_test))  # whole set as one batch
    _,test_accuracy=validation_metrics(model,test_dl)
    test_accuracy=test_accuracy.cpu().detach().numpy().reshape(-1)[0]
    return test_accuracy
  except Exception as exc:
    # Still best-effort so the notebook keeps running, but report the real cause
    # rather than blaming every failure on a label mismatch.
    print('Cannot Calculate the accuracy: {}: {}'.format(type(exc).__name__, exc))
    return None

def predictions(df_test,model,le,filename):
  """Predict an intent for every row of ``df_test`` and save the result as CSV.

  Adds 'Predictions' and 'Label' columns (decoded label strings) to
  ``df_test`` and writes the 'text', 'Label' and 'Predictions' columns to
  ``filename``. Returns ``None``; a message is printed when the frame is
  empty or the prediction fails.

  Args:
      df_test: frame with 'text', 'encoded' and 'label' columns, as returned by prep_data().
      model: module called as ``model(x, l)`` that returns class logits.
      le: fitted label encoder used to turn class ids back into label strings.
      filename: path of the CSV file to write.
  """
  # An empty frame means no test label matched a training label.
  if len(df_test) == 0:
    print('Cannot Calculate the predictions as test data labels are completely different from train data.')
    return None
  try:
    X_test = list(df_test['encoded'])
    y_test = list(df_test['label'])
    test_ds = ReviewsDataset(X_test, y_test)
    test_dl = DataLoader(test_ds, batch_size=len(X_test))  # whole set as one batch
    x,y,l=next(iter(test_dl))
    x = x.long().to(device)
    # Inference: make sure dropout is disabled and skip gradient tracking.
    model.eval()
    with torch.no_grad():
      y_pred = model(x, l)
    y_pred=y_pred.cpu().detach().numpy()
    y_pred=np.argmax(y_pred,axis=1)
    y_pred=le.inverse_transform(y_pred)
    df_test['Predictions']=y_pred
    df_test['Label']=le.inverse_transform(y_test)
    df_test[['text','Label','Predictions']].to_csv(filename,index=None)
    print('The predictions are saved as {}'.format(filename))
  except Exception as exc:
    # Still best-effort so the notebook keeps running, but report the real cause
    # rather than blaming every failure on a label mismatch.
    print('Cannot Calculate the predictions: {}: {}'.format(type(exc).__name__, exc))
  return None

### Calculate accuracies on test datasets

In [None]:
# Unseen IRIS set: load, normalise the column names, prepare, then score.
df_test_unseen_iris = prep_data(
    pd.read_excel('Unseen - IRIS.xlsx')
    .rename(columns={'input_conversation': 'text', 'Corrected Intent': 'label'})[['text', 'label']]
)
calc_acc(df_test_unseen_iris)

0.24090463

In [None]:
# Golden-copy IRIS set: load, normalise the column names, prepare, then score.
df_test_golden_iris = prep_data(
    pd.read_excel('GoldenCopyDataFile - IRIS.xlsx')
    .rename(columns={'utterance': 'text', 'intent': 'label'})[['text', 'label']]
)
calc_acc(df_test_golden_iris)

0.8683955

In [None]:
# Golden-copy HIRI set: load, normalise the column names, prepare, then score.
df_test_golden_hiri = prep_data(
    pd.read_excel('GoldenCopyDataFile - HIRI.xlsx')
    .rename(columns={'utterance': 'text', 'intent': 'label'})[['text', 'label']]
)
calc_acc(df_test_golden_hiri)

Cannot Calculate the accuracy as test data labels are completely different from train data.


In [None]:
# Unseen HIRI set: load, normalise the column names, prepare, then score.
df_test_unseen_hiri = prep_data(
    pd.read_excel('Unseen data - HIRI.xlsx')
    .rename(columns={'Intent': 'label', 'Input Conversation': 'text'})[['text', 'label']]
)
calc_acc(df_test_unseen_hiri)

Cannot Calculate the accuracy as test data labels are completely different from train data.


# Predictions

In [None]:
# Export the model's predictions for the unseen IRIS set to a CSV file.
predictions(df_test_unseen_iris,model,le,'OutputunseenIRIS_out.csv')

The predictions are saved as OutputunseenIRIS_out.csv


In [None]:
# Export the model's predictions for the golden-copy IRIS set to a CSV file.
predictions(df_test_golden_iris,model,le,'OutputgoldenIRIS_out.csv')

The predictions are saved as OutputgoldenIRIS_out.csv


In [None]:
# Attempt the same CSV export for the golden-copy HIRI set.
predictions(df_test_golden_hiri,model,le,'OutputgoldenHIRI_out.csv')

Cannot Calculate the predictions as test data labels are completely different from train data.


In [None]:
# Attempt the same CSV export for the unseen HIRI set.
predictions(df_test_unseen_hiri,model,le,'OutputunseenHIRI_out.csv')

Cannot Calculate the predictions as test data labels are completely different from train data.


## The accuracies and predictions for the HIRI datasets could not be calculated because their test data is completely different from the train data.