**Name of Student** :- Gauri Kishor Damle\
**NUID**:- 002931881

In [None]:
#### Import required libraries
import os
import re
import time
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
!pip install contractions
import contractions
!pip install datasets
import datasets

### nltk
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords 
import string 
from nltk.stem import PorterStemmer
from nltk.corpus import movie_reviews
from nltk.stem import WordNetLemmatizer
from collections import Counter
from nltk.sentiment import SentimentIntensityAnalyzer
from random import shuffle

## sklearn
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MultiLabelBinarizer


## tensorflow
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Embedding, Bidirectional, Activation, Flatten
from tensorflow.keras.preprocessing.text import Tokenizer


## gensim 
import gensim
from gensim.models import Word2Vec
from gensim.models import FastText
from gensim.corpora import Dictionary
import gensim.downloader as api
from keras.preprocessing.sequence import pad_sequences
from gensim.models import KeyedVectors



In [None]:
from google.colab import drive
drive.mount('/content/drive')

#### Reading train and test sets 

In [None]:
# Load the ATIS train/test CSVs from Google Drive and label the two columns intent/text.
# NOTE(review): header=0 makes pandas treat the first row of each file as a header row
# and replace it with `names`. If the CSVs have no header row, the first sample of each
# file is silently dropped (header=None would keep it) — confirm against the files.
df_train_data = pd.read_csv('/content/drive/MyDrive/SemesterTwo/IE7374/Assignment 4/archive/atis_intents_train.csv', header=0, names=['intent','text'])
df_test_data = pd.read_csv('/content/drive/MyDrive/SemesterTwo/IE7374/Assignment 4/archive/atis_intents_test.csv', header=0, names=['intent','text'])

In [None]:
df_train_data

#### The above mentioned data contains 8 different intents. It has 4833 records with customer questions which can be classified into these 8 intents.

#### The ATIS dataset is a standard benchmark dataset widely used for intent classification. ATIS stands for Airline Travel Information System. Intent classification is an important component of Natural Language Understanding (NLU) systems in any chatbot platform.


#### The ATIS dataset provides a large number of messages and their associated intents that can be used to train a classifier. Within a chatbot, intent refers to the goal the customer has in mind when typing a question or comment. While an entity is the modifier the customer uses to describe their issue, the intent is what they really mean. 

In [None]:
df_train_data.info()

In [None]:
df_train_data.shape

# **Common functions for Data Cleaning and Preprocessing**

The input data is not clean.

To improve model performance, lower-casing, punctuation removal, and stop-word removal are performed as part of preprocessing.

#### Data processing functions 

In [None]:
#### Lower Casing
def lowerCasing(total_dataset):
  """Lower-case every sentence of a text Series.

  Each sentence is split on whitespace, every token is lower-cased, and the
  tokens are re-joined with single spaces (so whitespace runs collapse).
  """
  def _lower_sentence(sentence):
    lowered_tokens = [token.lower() for token in sentence.split()]
    return " ".join(lowered_tokens)

  return total_dataset.apply(_lower_sentence)

#### Removing contractions
def removeContraction(data):
  """Expand the contractions in one sentence.

  The sentence is split on whitespace, contractions.fix is applied to each
  token separately, and the results are joined back with single spaces.
  """
  return ' '.join(contractions.fix(token) for token in data.split())

#### Punctuation Removal
def removePunctuation(data):
  """Strip every character that is neither a word character nor whitespace.

  Args:
    data: pandas Series of strings.

  Returns:
    A new Series with punctuation removed from each entry.
  """
  # regex=True is required: recent pandas versions treat the pattern as a literal
  # string by default, which would leave the text unchanged. The raw string keeps
  # \w and \s from being parsed as (invalid) string escapes.
  return data.str.replace(r'[^\w\s]', '', regex=True)


#### Identifying and Removing Stop Words
def removeStopWords(data):
  """Drop English stop words and stand-alone single digits from every sentence.

  Args:
    data: pandas Series of strings; tokens are compared exactly as written.

  Returns:
    A new Series whose sentences contain only the remaining tokens, joined by
    single spaces.
  """
  # A set gives constant-time membership tests; the stop-word list would be
  # scanned linearly for every token of every sentence.
  stop_words = set(stopwords.words('english'))
  stop_words.update(str(digit) for digit in range(10))
  return data.apply(lambda sentence: " ".join(word for word in sentence.split() if word not in stop_words))


## text cleaning
# Preprocessing helpers that remove HTML tags, isolated letters, e-mail addresses,
# URLs, dollar signs and redundant whitespace.
TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    """Return text with every <...> tag removed."""
    return TAG_RE.sub('', text)
    
def preprocess_text(inputSentence):
    """Clean one sentence and return the cleaned string.

    Steps, in order: strip HTML tags, drop isolated single letters, drop e-mail
    addresses, drop http/https URLs, drop dollar signs, collapse whitespace runs
    to a single space.
    """
    # Removing html tags
    sentence = remove_tags(inputSentence)

    # Single character removal. Lookarounds leave the surrounding whitespace
    # unconsumed so that consecutive single letters ("a b c") are all removed;
    # a pattern that consumes the whitespace skips every other letter. The
    # leftover spaces are collapsed in the last step.
    sentence = re.sub(r"(?<=\s)[a-zA-Z](?=\s)", '', sentence)

    # Removing email addresses
    sentence = re.sub(r'[^\s]+@[^*()\s]+', ' ', sentence)

    # Removing http/https addresses
    sentence = re.sub(r'(http|https)://[^\s]*', ' ', sentence)

    # Removing dollar signs
    sentence = re.sub(r'[$]+', ' ', sentence)
    
    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)

    return sentence

# **Preprocessing training data**

In [None]:
df_train = df_train_data.copy()

In [None]:
df_train['text'] = lowerCasing(df_train['text'])

In [None]:
df_train['text'] = df_train.text.apply(lambda x : removeContraction(x))

In [None]:
df_train['text'] = removePunctuation(df_train['text'])

In [None]:
df_train['text'] = removeStopWords(df_train['text'])

df_train.text = df_train.text.apply(lambda x : preprocess_text(x))
df_train

In [None]:
x_train = df_train.text.apply(lambda x : preprocess_text(x))
x_train.shape

In [None]:
# Number of distinct intents (value_counts().nunique() would instead count how
# many distinct frequencies occur, which is smaller whenever two intents tie)
df_train['intent'].nunique()

In [None]:
df_train['intent'].value_counts()

In [None]:
# Binarize the training labels into an indicator matrix.
# NOTE(review): MultiLabelBinarizer expects every sample to be a collection of labels.
# Here each sample is a plain string, which it iterates character by character, so the
# columns presumably correspond to distinct characters of the intent names rather than
# to the intents themselves. The 21-unit output layer in LSTM_model_build looks sized to
# that character count — confirm, and consider LabelBinarizer / one label list per row.
y_train = df_train['intent']
one_hot = MultiLabelBinarizer()
y_train = one_hot.fit_transform(y_train)

In [None]:
y_train.shape

# **EDA on preprocessed data**


In [None]:
# Word count per utterance: number of whitespace-separated tokens
# (.str.len() alone would count characters, not words)
df_train['WordCount'] = df_train['text'].str.split().str.len()
# Display only: reset_index returns a new frame and df_train itself is not modified
df_train.reset_index(drop=True)

In [None]:
df_train

#### Distribution of Word Counts

In [None]:
plt.figure(figsize=(14,7))

# Intents in drawing order; the legend below lists them in the same order
histogram_intents = ["atis_flight","atis_airfare","atis_ground_service","atis_airline","atis_abbreviation","atis_aircraft","atis_flight_time","atis_quantity"]

for intent_name in histogram_intents:
    # The last histogram is drawn fully opaque, all others semi-transparent
    opacity = 1 if intent_name == "atis_quantity" else 0.7
    plt.hist(df_train[df_train['intent']==intent_name]['WordCount'], bins = 30, alpha = opacity)


plt.legend(histogram_intents)
plt.show()


#### WordCloud generation

In [None]:


#Import packages
%matplotlib inline

import matplotlib
import matplotlib.pyplot as plt
import numpy as np

#Set defaults for graph formats
plt.rcParams['figure.figsize']=(14,7) #Sets default for the size of the graph
matplotlib.rcParams['axes.titlesize']=24 #Sets default for the size of the title
matplotlib.rcParams['axes.labelsize']=20 #Sets default for the size of the x/y axis labels

In [None]:
# Total word count per intent, aggregated once
words_per_intent = df_train.groupby(['intent'])['WordCount'].sum().reset_index()
x = words_per_intent['intent']
y = words_per_intent['WordCount']

# Tick labels come from the aggregated data, so each label always belongs to the
# bar it sits under even if the set of intents in the data changes
intent = x.tolist()

x_pos = np.arange(len(x))
plt.style.use('ggplot')
# Eight colours so that every intent bar gets a distinct colour
barchart=plt.bar(x_pos, y, color=['#DC8458', '#950702', '#8E067D', '#2E8C44', '#395196', '#60A619','#ECA10A', '#1B9AAA'])
plt.xlabel("intent")
plt.ylabel("Word Count")
plt.title("Total Number of Words in Each Intent")
plt.xticks(x_pos, intent)

# Add the count above each bar
for bar in barchart:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2.0, height, '%d' % int(height), ha='center', va='bottom')

#### As can be seen in the histogram and bar graph, the dataset is biased towards the atis_flight intent: approximately 78% of the questions have the flight intent.

In [None]:
# Pass the column by keyword: recent seaborn versions no longer interpret a
# positional Series as the x variable
chart = sns.countplot(x=df_train.intent)
plt.title("Number of texts rows per intent")
chart.set_xticklabels(chart.get_xticklabels(), rotation=30, horizontalalignment='right');

In [None]:
# Intent names (not referenced below; the pie chart takes its labels from the data)
intent=["atis_abbreviation", "atis_aircraft", "atis_airfare", "atis_airline",
           "atis_flight", "atis_flight_time", "atis_ground_service","atis_quantity"]


plt.figure(figsize = (14, 7))
# Number of rows per intent
data = df_train.intent.value_counts()

# Pie chart of the intent shares, each slice annotated with its percentage
ax  = data.plot.pie(autopct = '%1.1f%%', labels = data.index,  fontsize = 14)
ax.set_title('Intent Distribution', fontsize = 18)
plt.axis('off')
ax.legend(labels = data.index, loc = "upper left", fontsize = 14, fancybox = True, labelspacing = 1, framealpha = 1, shadow=True, borderpad=1)
plt.show()

In [None]:
# Concatenate every preprocessed training sentence into one corpus string
text = " ".join(x_train)

# Render a word cloud of the whole corpus
wordcloud = WordCloud().generate(text)
plt.figure(figsize=(14, 7))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()


# **Preprocessing test data**

In [None]:
df_test = df_test_data.copy()

In [None]:
df_test['text'] = lowerCasing(df_test['text'])

In [None]:
df_test['text']  = df_test.text.apply(lambda x : removeContraction(x))

In [None]:
df_test['text'] = removePunctuation(df_test['text'])

In [None]:
df_test['text'] = removeStopWords(df_test['text'])

df_test.text = df_test.text.apply(lambda x : preprocess_text(x))

In [None]:
x_test = df_test.text.apply(lambda x : preprocess_text(x))


In [None]:
# Number of distinct intents in the test set (value_counts().nunique() would
# instead count how many distinct frequencies occur)
df_test['intent'].nunique()

In [None]:
df_test['intent'].value_counts()

In [None]:
y_test = df_test['intent']
one = MultiLabelBinarizer()


In [None]:
# Encode the test labels with the binarizer that was fitted on the training labels
# (`one_hot`), so the columns of y_test line up with those of y_train. A binarizer
# fitted separately on the test labels can produce a different column set or order.
y_test = one_hot.transform(y_test)

In [None]:
y_test.shape

# **Creating Word Vectors for Model Build**

In [None]:
x_train = [d.split() for d in x_train]


In [None]:
y_train = np.array(y_train)


In [None]:
y_train.dtype

In [None]:
OUTPUT_DIM = 300

In [None]:
## TensorFlow Keras tokenizer: build the vocabulary from the training token lists,
## then replace every token with its integer index
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train)
x_train = tokenizer.texts_to_sequences(x_train)
x_train

In [None]:
len(tokenizer.word_index)

In [None]:
plt.hist([len(x) for x in x_train], bins =30)

In [None]:
#### According to the histogram, messages contain at most 21 words, so we can keep maxlen = 21.
#### Count the sequences that are longer than 21 tokens.
array = np.array([len(x) for x in x_train])
len(array[array>21])

## Almost all sentences contain at most 21 words, so the sequence length is fixed at maxlen = 21

In [None]:
input_length = 21
x_train = pad_sequences(x_train, maxlen=input_length)



In [None]:
print(x_train.shape)
print(y_train.shape)

In [None]:
INPUT_VOAB_LEN = len(tokenizer.word_index)+1
vocab = tokenizer.word_index
INPUT_VOAB_LEN

In [None]:

x_test = [d.split() for d in x_test]


In [None]:
y_test = np.array(y_test)

In [None]:
y_test.shape

In [None]:
x_test = tokenizer.texts_to_sequences(x_test)
x_test

In [None]:
x_test = pad_sequences(x_test, maxlen=input_length)
x_test.shape

# **Common Functions for LSTM**

In [None]:
##### Common functionS

def show_performance_plot(mdl):
  """Plot the accuracy and loss curves stored in a training history.

  mdl must expose a `history` mapping with the keys 'acc', 'val_acc', 'loss'
  and 'val_loss'. Two figures are shown: accuracy first, then loss.
  """
  # (training key, validation key, figure title, y-axis label)
  panels = (
      ('acc', 'val_acc', 'model accuracy', 'accuracy'),
      ('loss', 'val_loss', 'model loss', 'loss'),
  )

  for train_key, val_key, title, y_label in panels:
    plt.plot(mdl.history[train_key])
    plt.plot(mdl.history[val_key])

    plt.title(title)
    plt.ylabel(y_label)
    plt.xlabel('epoch')
    plt.legend(['train','test'], loc='upper left')
    plt.show()



def LSTM_model_build(input_dim, output_dim, embedding_vectors, input_length, x_train, y_train, x_test, y_test): 
  """Train and evaluate an LSTM classifier on top of a frozen pretrained embedding.

  Args:
    input_dim: vocabulary size (number of rows of the embedding matrix).
    output_dim: embedding dimensionality (number of columns of the matrix).
    embedding_vectors: initial weights of the Embedding layer; not trained.
    input_length: length of the padded input sequences.
    x_train: padded training sequences.
    y_train: 2-D label-indicator matrix for training; its column count sets the
      width of the output layer.
    x_test: padded test sequences.
    y_test: label-indicator matrix for the test set.

  Prints the model summary and the test accuracy, and plots the training curves.
  Returns None.
  """
  # One output unit per label column, instead of a hard-coded width that silently
  # breaks as soon as the label encoding changes
  num_outputs = y_train.shape[1]

  model_LSTM = Sequential()
  model_LSTM.add(Embedding(input_dim = input_dim, output_dim = output_dim, weights = [embedding_vectors], input_length = input_length, trainable = False))
  model_LSTM.add(LSTM(128))
  model_LSTM.add(Dense(128, activation='sigmoid'))
  model_LSTM.add(Dense(num_outputs, activation='softmax'))

  # NOTE(review): softmax is paired with binary_crossentropy here. For single-label
  # one-hot targets the usual pairing is softmax + categorical_crossentropy; for
  # multi-label targets it is sigmoid + binary_crossentropy. Confirm which is
  # intended once the label encoding is settled; the reported 'acc' depends on it.
  model_LSTM.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
  model_LSTM.summary()

  # Stop once the validation loss has not improved for 3 epochs and roll back to
  # the best weights seen
  model_LSTM_plt = model_LSTM.fit(x_train, y_train, verbose=1, validation_split =0.3, epochs=100,callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=3,
            restore_best_weights=True)],  batch_size = 128)

  score, accuracy = model_LSTM.evaluate(x_test, y_test, verbose=2, batch_size = 128)
  print("accuracy value = {}".format(accuracy*100))
 
  show_performance_plot(model_LSTM_plt)




  

  


 


# **Pretrained Model word2vec**

In [None]:
# https://code.google.com/archive/p/word2vec/
DIR = "/content/drive/MyDrive/SemesterTwo/IE7374/Project1/"

model_w2v = KeyedVectors.load_word2vec_format(DIR+'GoogleNews-vectors-negative300.bin.gz', binary=True)
print("word2vec model loaded")

In [None]:
# Embedding matrix for the tokenizer vocabulary; row i holds the word2vec vector of
# the word with index i, and words missing from word2vec keep an all-zero row
embedding_vectors_pretrained = np.zeros((INPUT_VOAB_LEN, OUTPUT_DIM))

# load_word2vec_format returns a KeyedVectors object, which is itself the
# word -> vector mapping; it has no `.wv` attribute in current gensim releases
for word, i in vocab.items():
  if word in model_w2v:
    embedding_vectors_pretrained[i] = model_w2v[word]

In [None]:
LSTM_model_build(INPUT_VOAB_LEN, OUTPUT_DIM, embedding_vectors_pretrained, input_length, x_train, y_train, x_test, y_test)

# **Pretrained Model FastText**

In [None]:
# Pretrained fastText English vectors (wiki-news-300d-1M.vec):
# https://fasttext.cc/docs/en/english-vectors.html
DIR = "/content/drive/MyDrive/SemesterTwo/IE7374/Project1/"


# The .vec file is plain text in word2vec format, so the default binary=False applies
model_fasttext = KeyedVectors.load_word2vec_format(DIR+'wiki-news-300d-1M.vec')
print("FastText model loaded")

In [None]:
# Initial embedding weights taken from the fastText vectors; row i holds the vector
# of the word with tokenizer index i, and unknown words keep an all-zero row
embedding_vectors_pretrained = np.zeros((INPUT_VOAB_LEN, OUTPUT_DIM))

# model_fasttext is a KeyedVectors object: index it directly, it has no `.wv`
# attribute in current gensim releases
for word, i in vocab.items():
  if word in model_fasttext:
    embedding_vectors_pretrained[i] = model_fasttext[word]

In [None]:
LSTM_model_build(INPUT_VOAB_LEN, OUTPUT_DIM, embedding_vectors_pretrained, input_length, x_train, y_train, x_test, y_test)

# **Pretrained Model Glove**

In [None]:
# https://nlp.stanford.edu/projects/glove/
DIR = "/content/drive/MyDrive/SemesterTwo/IE7374/Project1/"

# word -> vector lookup built from the pretrained GloVe text file
embeddings_index = {}

# Each line is "<word> <v1> <v2> ...". The encoding is given explicitly so that
# parsing does not depend on the platform default.
with open(os.path.join(DIR, 'glove.6B.300d.txt'), encoding='utf-8') as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, 'f', sep=' ')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

In [None]:

print('Preparing embedding matrix.')
# prepare embedding matrix: row i holds the GloVe vector of the word with tokenizer index i

# rows start as zeros, so words not found in the embedding index stay all-zeros
embedding_matrix = np.zeros((INPUT_VOAB_LEN, OUTPUT_DIM))
for word, i in vocab.items():

    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # copy the pretrained vector into the row of this word
        embedding_matrix[i] = embedding_vector
print(embedding_matrix.shape)

In [None]:
LSTM_model_build(INPUT_VOAB_LEN, OUTPUT_DIM, embedding_matrix, input_length, x_train, y_train, x_test, y_test)

# **Bert Model**

In [None]:
!pip install transformers

In [None]:
from transformers import AutoTokenizer
checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
print(tokenizer.vocab)

In [None]:
#size of vocabulary
print(len(tokenizer.vocab))

In [None]:
sentence = "I like NLP"
print(sentence)

# bert model uses word piece tokenization GPT2 uses byte pair encoding tokenization
tokens = tokenizer.tokenize(sentence)
print(tokens)


ids = tokenizer.encode(sentence)
print(ids)
print(tokenizer.decode(ids))

In [None]:
## checking for class imbalance in multiclass classification
df_train_data['intent'].value_counts()

In [None]:
import torch
from transformers import AutoModelForSequenceClassification

## use GPU for faster runtime
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## for multiclass classification
num_labels = 8 

model = (
    AutoModelForSequenceClassification.from_pretrained(
        checkpoint,
        num_labels = num_labels

    ).to(device))

In [None]:
def bert_tokenize_function(batch):
  """Tokenize the "text" entries of a batch with the module-level tokenizer.

  Padding and truncation are both enabled; the tokenizer's output is returned
  unchanged.
  """
  texts = batch["text"]
  return tokenizer(texts, padding = True, truncation=True)

In [None]:
df_train["label"] = df_train["intent"]
df_train = df_train.drop("intent", axis=1)


In [None]:
le = LabelEncoder()
df_train["label"] = le.fit_transform(df_train["label"])



In [None]:
df_train

In [None]:
df_test["label"] = df_test["intent"]
df_test = df_test.drop("intent", axis=1)

In [None]:
# Reuse the LabelEncoder fitted on the training labels so that an intent maps to the
# same integer id in both splits. Refitting on the test labels can assign different
# ids when the two label sets differ. transform raises ValueError for an intent that
# was never seen during fitting, which makes such a mismatch visible.
df_test["label"] = le.transform(df_test["label"])



In [None]:
from datasets import Dataset

# Hold the last rows of the training frame out for validation and keep them out of
# the training split. Evaluating on rows the model was also trained on would make
# the validation metrics optimistic.
VALIDATION_ROWS = 100
train_dataset = Dataset.from_dict(df_train.iloc[:-VALIDATION_ROWS])
test_dataset = Dataset.from_dict(df_test)
validation_dataset = Dataset.from_dict(df_train.tail(VALIDATION_ROWS))

my_dataset_dict = datasets.DatasetDict({"train":train_dataset,"test":test_dataset, "validation":validation_dataset})

In [None]:

encoded_dataset_test = my_dataset_dict.map(bert_tokenize_function, batched=True, batch_size=None)

In [None]:
encoded_dataset_test

In [None]:
encoded_dataset_test['train'][0]
    


In [None]:
# Import the Trainer class (lower-case `trainer` is the transformers submodule)
from transformers import Trainer, TrainingArguments

batch_size = 8
# Log about once per epoch: an epoch takes len(train) // batch_size optimizer steps.
# Using the raw sample count would exceed the total number of steps, so the training
# loss would never be logged.
logging_steps = len(encoded_dataset_test["train"]) // batch_size

In [None]:
# Output directory (and model name) for the fine-tuned checkpoint
model_name =f"{checkpoint}-finetuned-model"
# Fine-tuning configuration: 2 epochs, AdamW, evaluation at the end of every epoch.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir = model_name,
    num_train_epochs = 2,
    learning_rate = 2e-5,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    weight_decay = 0.01,
    evaluation_strategy = "epoch",
    disable_tqdm = False,
    logging_steps = logging_steps,
    log_level = "error",
    optim = 'adamw_torch'
)

In [None]:
from transformers import Trainer

torch.cuda.empty_cache()

trainer = Trainer(model = model,
                  args = training_args,
                  train_dataset = encoded_dataset_test['train'],
                  eval_dataset = encoded_dataset_test['validation'],
                  tokenizer = tokenizer)



In [None]:
trainer.train()

In [None]:
trainer.predict(encoded_dataset_test['test'])

In [None]:
preds = trainer.predict(encoded_dataset_test['test'])
preds


In [None]:
preds.predictions.shape

In [None]:
def get_accuracy(preds):
  """Return {'accuracy': value} for a prediction object.

  preds must expose `.predictions` (per-class scores; the highest-scoring class
  along the last axis is the predicted label) and `.label_ids` (true labels).
  """
  predicted_labels = preds.predictions.argmax(axis=-1)
  true_labels = preds.label_ids
  return {'accuracy': accuracy_score(true_labels, predicted_labels)}

In [None]:
from transformers import Trainer

# Free cached GPU memory before creating a new Trainer
torch.cuda.empty_cache()

# Same setup as the earlier Trainer, now reporting accuracy at evaluation time.
# NOTE(review): `model` is the object that was already fine-tuned by the earlier
# trainer.train() call, so this run continues training it rather than starting
# from the pretrained checkpoint — confirm this is intended.
trainer = Trainer(model=model, 
                  compute_metrics=get_accuracy,
                  args=training_args, 
                  train_dataset=encoded_dataset_test["train"],
                  eval_dataset = encoded_dataset_test['validation'],
                  tokenizer=tokenizer)
trainer.train();

#### Training run

In [None]:
# Final training configuration with a larger batch size
batch_size = 16
# Log about once per epoch (number of optimizer steps per epoch)
logging_steps = len(encoded_dataset_test["train"]) // batch_size
# NOTE(review): the output name says "imdb" although the data is ATIS; it is also
# the directory the text-classification pipeline later loads from — rename both
# together if it is changed.
model_name = f"{checkpoint}-finetuned-imdb"
training_args = TrainingArguments(output_dir=model_name,
                                  num_train_epochs=2,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size = batch_size,
                                  weight_decay=0.01,
                                  evaluation_strategy="epoch",
                                  disable_tqdm=False,
                                  logging_steps=logging_steps,
                                  log_level="error",
                                  optim='adamw_torch'
                                  )

In [None]:
from transformers import Trainer

torch.cuda.empty_cache()

trainer = Trainer(model=model, 
                  args=training_args, 
                  compute_metrics=get_accuracy,
                  train_dataset=encoded_dataset_test["train"],
                  eval_dataset = encoded_dataset_test['validation'],
                  tokenizer=tokenizer)
trainer.train();

In [None]:
trainer.evaluate()

In [None]:
trainer.save_model()

In [None]:
model_name

In [None]:
from transformers import pipeline
classifier = pipeline('text-classification', model=model_name)
classifier('what flights are available from here')

In [None]:
classifier('This was ticket ')


# **Final Conclusion**

#### Accuracy comparison for following models
#### LSTM model with following word embedding vectors

#### Word2Vec

1.   Pretrained - 90%

#### FastText

1.   Pretrained - 94%

#### Glove Pretrained

1.   Pretrained - 89%


#### BERT - fine-tuned with Hugging Face Transformers

1.   Pretrained - 97%


---



As the size of the data increases, Colab can crash while building models. To improve performance in such cases, we can move the processing to a GPU for faster results.