In [5]:
import json
import os
import random
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
import nltk
from nltk.corpus import words
import re
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.preprocessing import OneHotEncoder
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Bidirectional, Embedding, Dropout
from keras.callbacks import ModelCheckpoint
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

In [6]:
with open(r"dataset.json.txt") as f:
    data=json.load(f)

In [7]:
intent_classes=[]
unique_intent=[]
vocab=[]

#print(data.keys())
for i in data["train"]:
        if i[1] not in intent_classes:
            intent_classes.append(i[1])
            
unique_intent=random.sample(intent_classes, 20)
print(unique_intent)

['jump_start', 'reminder', 'whisper_mode', 'what_are_your_hobbies', 'tire_pressure', 'vaccines', 'reset_settings', 'sync_device', 'time', 'timezone', 'travel_notification', 'update_playlist', 'balance', 'mpg', 'w2', 'restaurant_suggestion', 'book_flight', 'who_made_you', 'change_ai_name', 'calculator']


In [8]:
intent=[]
sentences=[]
intent_val=[]
sentences_val=[]
intent_test=[]
sentences_test=[]

for i in data["train"]:
    if i[1] in unique_intent:
        intent.append(i[1])
        sentences.append(i[0])

for i in data["val"]:
    if i[1] in unique_intent:
        intent_val.append(i[1])
        sentences_val.append(i[0])

In [9]:
stemmer=SnowballStemmer("english", ignore_stopwords=False)
max_length=0

def cleaning(sentences):
    words=[]
    for s in sentences:
        clean = re.sub(r'[^ a-z A-Z 0-9]', " ", s)
        w = word_tokenize(clean)
        #stemming
        words.append([stemmer.stem(i.lower()) for i in w])
    
    return words

cleaned_words=cleaning(sentences)
cleaned_words_val=cleaning(sentences_val)

In [10]:
for sent in cleaned_words:
    for i in sent:
        if i not in vocab:
            vocab.append(i)

for sent in cleaned_words_val:
    for i in sent:
        if i not in vocab:
            vocab.append(i)

vocab.sort()
n=len(vocab)
print(n)

1461


In [11]:
train_X,val_X=[],[]

for sent in cleaned_words:
    max_length=max(max_length,len(sent))
    list1=[0]*n
    for word in sent:
        index=vocab.index(word)
        list1[index]=1
    train_X.append(list1)
    
Train_X=np.array(train_X)
print(Train_X.shape)


(2000, 1461)


In [12]:
for sent in cleaned_words_val:
    #max_length=max(max_length,len(sent))
    list1=[0]*n
    for word in sent:
        index=vocab.index(word)
        list1[index]=1
    val_X.append(list1)
    
Val_X=np.array(val_X)
print(Val_X.shape)

(400, 1461)


In [13]:
train_Y,val_Y=[],[]
for val in intent:
    list1=[0]*len(unique_intent)
    index=unique_intent.index(val)
    list1[index]=1
    train_Y.append(list1)
train_Y=np.array(train_Y)

In [14]:
for val in intent_val:
    list1=[0]*len(unique_intent)
    index=unique_intent.index(val)
    list1[index]=1
    val_Y.append(list1)

val_Y=np.array(val_Y)

max_features=Train_X.shape[1]

In [15]:
def create_model(max_features, max_length):
   # Input for variable-length sequences of integers
   inputs = keras.Input(shape=(None,), dtype="int32")
   # Embed each integer in a 128-dimensional vector
   x = layers.Embedding(max_features, 128)(inputs)

   x = layers.Bidirectional(layers.LSTM(128))(x)
# Add a classifier
   outputs = layers.Dense(32, activation="relu")(x)
   outputs=  layers.Dropout(0.5)
   outputs=  layers.Dense(20, activation="softmax")(x)
   model = keras.Model(inputs, outputs)
   return model

In [16]:
model = create_model(max_features, max_length)

model.compile(loss = "categorical_crossentropy", optimizer = "adam", metrics = ["accuracy"])
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding (Embedding)        (None, None, 128)         187008    
_________________________________________________________________
bidirectional (Bidirectional (None, 256)               263168    
_________________________________________________________________
dense_1 (Dense)              (None, 20)                5140      
Total params: 455,316
Trainable params: 455,316
Non-trainable params: 0
_________________________________________________________________


In [None]:
filename = 'model.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

hist=model.fit(Train_X, train_Y , batch_size=32, epochs=2, validation_data=(Val_X,val_Y))



Train on 2000 samples, validate on 400 samples
Epoch 1/2