## Q1. POS Tagging using NN

In [87]:
import tensorflow as tf
import nltk
from nltk.corpus import treebank
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras import regularizers
from keras.utils import np_utils
from keras.utils import plot_model
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import LabelEncoder

In [88]:
tagged_sentences = treebank.tagged_sents()
np.random.seed(1)

In [89]:
print(tagged_sentences[0])
print("Tagged sentences: ", len(tagged_sentences))
print("Tagged words:", len(nltk.corpus.brown.tagged_words()))

[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]
Tagged sentences:  3914
Tagged words: 1161192


In [90]:
def features(sentence, index):
    return {
        'word': sentence[index],
        'is_first': index == 0,
        'is_last': index == len(sentence) - 1,
        'is_capitalized': sentence[index][0].upper() == sentence[index][0],
        'is_all_caps': sentence[index].upper() == sentence[index],
        'is_all_lower': sentence[index].lower() == sentence[index],
        'prefix-1': sentence[index][0],
        'prefix-2': sentence[index][:2],
        'prefix-3': sentence[index][:3],
        'suffix-1': sentence[index][-1],
        'suffix-2': sentence[index][-2:],
        'suffix-3': sentence[index][-3:],
        'prev_word': '' if index == 0 else sentence[index - 1],
        'next_word': '' if index == len(sentence) - 1 else sentence[index + 1],
        'has_hyphen': '-' in sentence[index],
        'is_numeric': sentence[index].isdigit(),
        'capitals_inside': sentence[index][1:].lower() != sentence[index][1:]
    }
def untag(tagged_sentence):
    return [w for w, t in tagged_sentence]

### Train test split and preprocessing:

In [91]:
split = int(len(tagged_sentences)*.7)
training_sentences = tagged_sentences[:split]
test_sentences = tagged_sentences[split:]

print(len(training_sentences))
print(len(test_sentences))

2739
1175


In [92]:
def get_dataset(tagged_sentences):
    X, y = [], []

    for tagged in tagged_sentences:
        for index in range(len(tagged)):
            X.append(features(untag(tagged), index))
            y.append(tagged[index][1])

    return X, y

### Vectorizing Dataset:

In [93]:
X, y = get_dataset(training_sentences)
X_test, y_test = get_dataset(test_sentences)
dict_vectorize = DictVectorizer()
dict_vectorize.fit(X + X_test)
X = dict_vectorize.transform(X)
X_test = dict_vectorize.transform(X_test)
print(X)

  (0, 0)	0.0
  (0, 1)	0.0
  (0, 2)	0.0
  (0, 3)	0.0
  (0, 4)	1.0
  (0, 5)	1.0
  (0, 6)	0.0
  (0, 7)	0.0
  (0, 4125)	1.0
  (0, 12011)	1.0
  (0, 12493)	1.0
  (0, 14470)	1.0
  (0, 16147)	1.0
  (0, 28603)	1.0
  (0, 29206)	1.0
  (0, 31509)	1.0
  (0, 35593)	1.0
  (1, 0)	0.0
  (1, 1)	0.0
  (1, 2)	0.0
  (1, 3)	0.0
  (1, 4)	1.0
  (1, 5)	0.0
  (1, 6)	0.0
  (1, 7)	0.0
  :	:
  (71055, 11981)	1.0
  (71055, 12081)	1.0
  (71055, 12855)	1.0
  (71055, 27718)	1.0
  (71055, 28559)	1.0
  (71055, 28656)	1.0
  (71055, 29418)	1.0
  (71055, 32292)	1.0
  (71056, 0)	0.0
  (71056, 1)	0.0
  (71056, 2)	1.0
  (71056, 3)	1.0
  (71056, 4)	1.0
  (71056, 5)	0.0
  (71056, 6)	1.0
  (71056, 7)	0.0
  (71056, 8)	1.0
  (71056, 11977)	1.0
  (71056, 12054)	1.0
  (71056, 12816)	1.0
  (71056, 16614)	1.0
  (71056, 28555)	1.0
  (71056, 28633)	1.0
  (71056, 29296)	1.0
  (71056, 31832)	1.0


In [94]:
label_encoder = LabelEncoder()
label_encoder.fit(y + y_test)
y = label_encoder.transform(y)
y_test = label_encoder.transform(y_test)
print(y)

[21 21  3 ... 20  7  2]


In [95]:
y = np_utils.to_categorical(y)
y_test = np_utils.to_categorical(y_test)
print('x_train shape: ', X.shape)
print('y_train shape: ', y.shape)
print('x_test shape: ', X_test.shape)
print('y_test shape: ', y_test.shape)

x_train shape:  (71057, 44234)
y_train shape:  (71057, 46)
x_test shape:  (29619, 44234)
y_test shape:  (29619, 46)


### Training neural net (3 - Layered):

In [96]:
model = Sequential()
model.add(Dense(units=512, input_dim=X.shape[1], activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(units=512, activation='relu', kernel_regularizer=regularizers.l2(0.05)))
model.add(Dropout(0.2))
model.add(Dense(units=y.shape[1], activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(X, y, validation_split=0.2, epochs=5, batch_size=256, verbose=1)

Train on 56845 samples, validate on 14212 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### Model Evaluation and accuracy

In [97]:
test_loss, accuracy = model.evaluate(X_test, y_test)
print(accuracy)

0.9411188763969074


## Q2. Dialogue Act Recognition

In [79]:
from nltk.stem.lancaster import LancasterStemmer
import csv
from keras.regularizers import l2

### Training Data Description:
The data has been taken from switchboard dialogue act corpus and multiple transcripts are combined with just text and a particular class of dialogue in a single csv file.

The data is taken such that currently only 3 classes i.e Greetings, Goodbye and Requests have been considered as per annotations from SwDA corpora

In [80]:
training_data= []
act_tags = {'fp': 'Greet', 'fc': 'Goodbye', 'qy' : 'Request', 'qw' : 'Request', 'qo' : 'Request', 'qr' : 'Request', 'qrr': 'Request'}#greeting, goodby, and request
corpusReader = csv.reader(open('./swda/combined.csv', newline=''), delimiter =',')
for row in corpusReader:
    if row[1] in act_tags:
        training_data.append({'class': act_tags[row[1]], 'sentence': row[2]})

### Extraction and Stemming

In [81]:
words= []
classes = []
sentences = []
dialogues = []
ignore_words = ['?']
for pattern in training_data:
    w = nltk.word_tokenize(pattern['sentence'])
    words.extend(w)
    dialogues.append(w)
    sentences.append(pattern['sentence'])
    classes.append(pattern['class'])

stemmer = LancasterStemmer()
words = [stemmer.stem(w.lower()) for w in words if w not in ignore_words]
words = list(set(words))

### Generating bag of words

In [82]:
def bof(dialogues):
    dataset = []
    for dialoge in dialogues:
        bag = []
        sent = [stemmer.stem(w.lower()) for w in dialoge if w not in ignore_words]
        for w in words:
            bag.append(1) if w in sent else bag.append(0)
        dataset.append(bag)
    dataset = np.array(dataset)
    return dataset
dataset = bof(dialogues)

In [83]:
#One-Hot Encode
label_encoder = LabelEncoder()
label_encoder.fit(classes)
labels = label_encoder.transform(classes)
labels = np_utils.to_categorical(labels, num_classes=3)

### Generating model (3 - layer NN model) 

In [84]:
model = Sequential()
model.add(Dense(units=512, input_dim=dataset.shape[1], activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(units=256, activation='relu', kernel_regularizer=l2(0.03)))
model.add(Dense(units=labels.shape[1], activation='softmax'))
https://techblog.cdiscount.com/part-speech-tagging-tutorial-keras-deep-learning-library/
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(dataset, labels, validation_split=0.2, epochs=5, batch_size=32, verbose=1)

Train on 7478 samples, validate on 1870 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### Making predictions

In [85]:
input_vectors = ['hello', 'How are you?', 'Excuse me', 'HI', 'Would you like to join?', '']
input_vectors = [nltk.word_tokenize(w) for w in input_vectors]
input_vectors = bof(input_vectors)
label_prob = model.predict(input_vectors)
label_pred = label_prob.argmax(axis=-1)
print(label_encoder.inverse_transform(label_pred))

['Greet' 'Request' 'Request' 'Greet' 'Request' 'Request']


  if diff:


#### References:
1. http://www.aclweb.org/anthology/C94-1027
2. https://techblog.cdiscount.com/part-speech-tagging-tutorial-keras-deep-learning-library/
3. https://machinelearnings.co/text-classification-using-neural-networks-f5cd7b8765c6