# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from scipy import spatial
from nltk.corpus import stopwords

In [4]:
from nltk.tokenize import word_tokenize
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

In [5]:
from keras.layers.embeddings import Embedding
from keras.layers import Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers import Input, Dense
from keras.models import Sequential

In [7]:
from sklearn.metrics import classification_report
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.manifold import TSNE

In [None]:
train = pd.read_csv("../input/atis-airlinetravelinformationsystem/atis_intents_train.csv", header=None)
test = pd.read_csv("../input/atis-airlinetravelinformationsystem/atis_intents_test.csv", header=None)

# Data Cleaning

In [8]:
words = set(stopwords.words("english"))

In [11]:
words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [18]:
train.head()

Unnamed: 0,0,1
0,atis_flight,i want to fly from boston at 838 am and arriv...
1,atis_flight,what flights are available from pittsburgh to...
2,atis_flight_time,what is the arrival time in san francisco for...
3,atis_airfare,cheapest airfare from tacoma to orlando
4,atis_airfare,round trip fares from pittsburgh to philadelp...


In [19]:
test.head()

Unnamed: 0,0,1
0,atis_flight,i would like to find a flight from charlotte ...
1,atis_airfare,on april first i need a ticket from tacoma to...
2,atis_flight,on april first i need a flight going from pho...
3,atis_flight,i would like a flight traveling one way from ...
4,atis_flight,i would like a flight from orlando to salt la...


## Stopwords Corpus

In [27]:
train['text'] = train[1].apply(lambda x: ' '.join([word for word in x.split() if word not in (words)]))
test['text'] = test[1].apply(lambda x: ' '.join([word for word in x.split() if word not in (words)]))

## Digits Removal \d+

In [28]:
train['text'] = train['text'].str.replace('\d+', '')
test['text'] = test['text'].str.replace('\d+', '')

In [29]:
train

Unnamed: 0,0,1,text
0,atis_flight,i want to fly from boston at 838 am and arriv...,want fly boston arrive denver morning
1,atis_flight,what flights are available from pittsburgh to...,flights available pittsburgh baltimore thursda...
2,atis_flight_time,what is the arrival time in san francisco for...,arrival time san francisco flight leaving was...
3,atis_airfare,cheapest airfare from tacoma to orlando,cheapest airfare tacoma orlando
4,atis_airfare,round trip fares from pittsburgh to philadelp...,round trip fares pittsburgh philadelphia dollars
...,...,...,...
4829,atis_airfare,what is the airfare for flights from denver t...,airfare flights denver pittsburgh delta airline
4830,atis_flight,do you have any flights from denver to baltim...,flights denver baltimore via dallas
4831,atis_airline,which airlines fly into and out of denver,airlines fly denver
4832,atis_flight,does continental fly from boston to san franc...,continental fly boston san francisco stop denver


In [30]:
text = train['text']
labels = train[0]
test_text = test['text']
test_labels = test[0]

In [32]:
labels.nunique()

8

## Tokenize and Padding

In [38]:
from keras.preprocessing.text import Tokenizer
tok = Tokenizer()
tok.fit_on_texts(text)
word_index = tok.word_index

Indexed each words as there are 631 chars, words are listed to 0-631

In [39]:
word_index

{'flights': 1,
 'flight': 2,
 'boston': 3,
 'show': 4,
 'san': 5,
 'denver': 6,
 'francisco': 7,
 'atlanta': 8,
 'pittsburgh': 9,
 'dallas': 10,
 'baltimore': 11,
 'philadelphia': 12,
 'like': 13,
 'list': 14,
 'airlines': 15,
 'washington': 16,
 'leaving': 17,
 'please': 18,
 'morning': 19,
 'pm': 20,
 'would': 21,
 'fly': 22,
 'fare': 23,
 'first': 24,
 'oakland': 25,
 'wednesday': 26,
 "i'd": 27,
 'ground': 28,
 'trip': 29,
 'transportation': 30,
 'cheapest': 31,
 'need': 32,
 'class': 33,
 'round': 34,
 'arriving': 35,
 'available': 36,
 'city': 37,
 'afternoon': 38,
 'american': 39,
 'one': 40,
 'give': 41,
 'want': 42,
 'fares': 43,
 'way': 44,
 'new': 45,
 'leave': 46,
 'dc': 47,
 'york': 48,
 'earliest': 49,
 'thursday': 50,
 'nonstop': 51,
 'arrive': 52,
 'monday': 53,
 'united': 54,
 'tuesday': 55,
 'go': 56,
 'information': 57,
 'st': 58,
 'milwaukee': 59,
 'find': 60,
 'twenty': 61,
 'miami': 62,
 'sunday': 63,
 'evening': 64,
 'vegas': 65,
 'las': 66,
 'noon': 67,
 'delta'

In [40]:
max_vocab_size = len(word_index) + 1
input_length = 25

In [46]:
train_data_tokens = tok.texts_to_sequences(text)
test_data_tokens = tok.texts_to_sequences(test_text)

### Tokenized each word based off of word index

In [47]:
train_data_tokens

[[42, 22, 3, 52, 6, 19],
 [1, 36, 9, 11, 50, 19],
 [310, 139, 5, 7, 2, 17, 16],
 [31, 218, 137, 93],
 [34, 29, 43, 9, 12, 129],
 [32, 2, 100, 172, 133],
 [169, 101, 163, 2, 102, 10],
 [4, 1, 9, 104, 105, 50],
 [1, 3, 16],
 [169, 28, 30, 36, 6],
 [4, 1, 10, 5, 7],
 [4, 1, 5, 80, 69, 44, 90],
 [31, 2, 3, 234],
 [1, 11, 20],
 [4, 24, 33, 43, 3, 6],
 [4, 28, 30, 6],
 [1, 6, 9, 17, 20, 20],
 [32, 57, 1, 55, 17, 11, 10, 10, 3, 3, 11],
 [18, 41, 1, 3, 9, 50, 82, 206],
 [21, 13, 22, 6, 9, 54, 15],
 [4, 1, 5, 80, 69],
 [18, 14, 24, 33, 1, 54, 6, 11],
 [357, 283, 163, 39, 15],
 [27, 13, 57, 119, 6, 9, 8],
 [27, 13, 142, 2, 8, 6],
 [83, 170, 6, 9, 8],
 [4, 1, 3, 9, 26, 82, 206, 46, 3, 72, 20],
 [8, 28, 30],
 [207, 32, 152, 10, 3, 35, 67],
 [4, 31, 34, 29, 23, 11, 10],
 [4, 1, 45, 48, 59, 168, 15, 132],
 [1, 45, 48, 62],
 [18, 60, 2, 68, 12, 5, 7, 41, 2, 466],
 [32, 2, 10, 5, 7],
 [43, 1, 9, 12],
 [4, 15, 24, 33, 1],
 [1, 58, 134, 183, 63, 64],
 [27, 13, 295, 2, 9, 8],
 [4, 1, 11, 5, 7, 20, 20, 81

In [48]:
train_input = pad_sequences(train_data_tokens, input_length)
test_input = pad_sequences(test_data_tokens, input_length)

### Padded each sentence (text) to the same size of 25

In [49]:
train_input

array([[ 0,  0,  0, ..., 52,  6, 19],
       [ 0,  0,  0, ..., 11, 50, 19],
       [ 0,  0,  0, ...,  2, 17, 16],
       ...,
       [ 0,  0,  0, ..., 15, 22,  6],
       [ 0,  0,  0, ...,  7, 78,  6],
       [ 0,  0,  0, ...,  6,  5,  7]], dtype=int32)

## One Hot Encode with LabelEncoder

In [50]:
label_transformer = preprocessing.LabelEncoder()
label_transformer.fit(labels)

LabelEncoder()

In [53]:
# from sklearn.externals import joblib
# joblib.dump(label_transformer, 'atis-airlinetravelinformationsystem/label_encoder.pk1')


In [54]:
labels = label_transformer.transform(labels)
test_labels = label_transformer.transform(test_labels)

In [55]:
labels

array([4, 4, 5, ..., 3, 4, 4])

In [56]:
labels = to_categorical(np.asarray(labels))
test_labels = to_categorical(np.asarray(test_labels))

In [57]:
labels

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

## Train Validation Split

In [60]:
X_train, X_val, y_train, y_val = train_test_split(train_input, labels, test_size=0.2, random_state=1)

In [62]:
X_train

array([[  0,   0,   0, ..., 203, 201,  33],
       [  0,   0,   0, ...,  42, 110,   6],
       [  0,   0,   0, ..., 176, 155,  70],
       ...,
       [  0,   0,   0, ...,   6,  12,  76],
       [  0,   0,   0, ...,  81,  75, 230],
       [  0,   0,   0, ...,  50, 167,  81]], dtype=int32)

# Word Embeddings - Vector Representations

Embedded Index saves the info from pretrained GloVe model which can be later used for word embedding in terms of its
application to our specific model. Our embedded matrix is first matrix of zeros, and then updated according to the 
our dataset-GloVe dataset comparison.

In [65]:
embedded_dim = 300
embedded_index = dict()

with open('../input/glove42b300dtxt/glove.42B.300d.txt', 'r', encoding='utf-8') as glove:
    for line in glove:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        embedded_index[word] = vector

In [66]:
glove.close

<function TextIOWrapper.close()>

In [69]:
embedded_matrix = np.zeros((max_vocab_size, embedded_dim))
for x, i in word_index.items():
    vector = embedded_index.get(x)
    if vector is not None:
        embedded_matrix[i] = vector

# CNN for NLP task

As words and their sequence are important for NLP solutions, pixels and their order are also essential and something valubale to keep in mind while training

In [70]:
model = Sequential()
model.add(Embedding(max_vocab_size, 300, input_length=input_length, weights=[embedded_matrix], trainable=False))

In [71]:
model.add(Conv1D(filters=32, kernel_size=8, activation='selu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(10, activation='selu'))
model.add(Dense(8, activation='sigmoid'))

In [72]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 25, 300)           189600    
_________________________________________________________________
conv1d (Conv1D)              (None, 18, 32)            76832     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 9, 32)             0         
_________________________________________________________________
flatten (Flatten)            (None, 288)               0         
_________________________________________________________________
dense (Dense)                (None, 10)                2890      
_________________________________________________________________
dense_1 (Dense)              (None, 8)                 88        
Total params: 269,410
Trainable params: 79,810
Non-trainable params: 189,600
_____________________________________________

In [73]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=5, verbose=2)

Epoch 1/5
121/121 - 1s - loss: 0.7109 - accuracy: 0.8273
Epoch 2/5
121/121 - 1s - loss: 0.2438 - accuracy: 0.9403
Epoch 3/5
121/121 - 1s - loss: 0.1174 - accuracy: 0.9716
Epoch 4/5
121/121 - 1s - loss: 0.0503 - accuracy: 0.9902
Epoch 5/5
121/121 - 1s - loss: 0.0259 - accuracy: 0.9956


<tensorflow.python.keras.callbacks.History at 0x7f3644236050>

In [74]:
model.evaluate(X_val, y_val)



[0.1357804387807846, 0.9689761996269226]

In [81]:
def acc(y_true, y_pred):
    return np.equal(np.argmax(y_true, axis=-1), np.argmax(y_pred, axis=-1)).mean()

In [82]:
predictions = model.predict(test_input)

In [85]:
print(acc(test_labels, predictions))

0.98375


# # Thanks for reading it to the end. Credits to the OpenSourceCommunity