<a href="https://colab.research.google.com/github/esvm/IF704-chatbot/blob/main/training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Basic Instructions

1. Create an account in [Kaggle](https://www.kaggle.com/)
2. Go to your account (https://www.kaggle.com/<yourusername>/account)
3. Generate a new API Token if you don't have one
4. Upload the downloaded `kaggle.json` in this notebook folder

# Installing [Kaggle](https://www.kaggle.com/)

In [1]:
! pip install kaggle



In [11]:
! mkdir ~/.kaggle

In [12]:
! cp kaggle.json ~/.kaggle/

! chmod 600 ~/.kaggle/kaggle.json

# Downloading Dataset

In [77]:
! kaggle datasets download -d stefanlarson/outofscope-intent-classification-dataset

Downloading outofscope-intent-classification-dataset.zip to /content
  0% 0.00/285k [00:00<?, ?B/s]
100% 285k/285k [00:00<00:00, 36.1MB/s]


In [14]:
if not os.path.exists('./dataset'):
        os.makedirs('./dataset')

In [78]:
! mv outofscope-intent-classification-dataset.zip ./dataset

In [79]:
import zipfile
with zipfile.ZipFile('./dataset/outofscope-intent-classification-dataset.zip', 'r') as zip_ref:
    zip_ref.extractall('./dataset')

# Setup Dependencies

In [414]:
import pandas as pd

# ignore words
import nltk
nltk.download('stopwords')

# tokenize and vetorize text
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# one-hot encoding labels
from sklearn import preprocessing
import numpy as np
from tensorflow.keras.utils import to_categorical

# deep learning
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation, Conv2D
from tensorflow.keras.layers import Input, Dense
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [222]:
from nltk.corpus import stopwords
words = set(stopwords.words("english"))
print(words)

{'about', 'above', 'being', 'as', 'just', "didn't", 'd', 'to', "wasn't", 'any', 'you', 'himself', 'a', 'in', 'no', "aren't", 'his', 'wouldn', "it's", 'were', 'what', 'not', 'through', 'wasn', "she's", 've', 'myself', "you'll", 'haven', 'didn', 'if', 'him', "don't", 'am', 'out', "mustn't", 'can', 'too', 'been', 'such', 'itself', 'weren', "you're", 'its', 'was', 'had', 'our', 'both', 'couldn', 'now', 'there', 'or', 'which', 'where', 'with', "haven't", 'so', 'up', 't', 'their', 'into', 'most', 'he', 'those', 'o', 'these', 'because', 'by', 'them', 'i', 'mightn', "hadn't", 'over', 'shan', 'during', "that'll", 'than', 'doesn', "doesn't", 'other', 'do', 'for', 'on', "isn't", 'aren', 'whom', 'theirs', 'have', "shouldn't", "wouldn't", 'ma', 're', 'ours', 'the', 'of', 'own', "you've", 'yourselves', 'then', 'has', 'again', 'few', 'against', 'isn', 'ourselves', "needn't", 'having', 'll', 'while', 'm', 'an', 'hadn', 'is', 'does', 'and', "hasn't", 'nor', 'who', 'further', 's', 'shouldn', 'from', "sh

# Reading Data

In [415]:
# Read movies conversations
scopeTestJSON = pd.read_json("./dataset/is_test.json")
scopeTrainJSON = pd.read_json("./dataset/is_train.json")
scopeValJSON = pd.read_json("./dataset/is_val.json")

In [416]:
# scopeTestJSON[0] = scopeTestJSON[0].apply(lambda x: ' '.join([word for word in x.split() if word not in (words)]))
# scopeTrainJSON[0] = scopeTestJSON[0].apply(lambda x: ' '.join([word for word in x.split() if word not in (words)]))
# scopeValJSON[0] = scopeTestJSON[0].apply(lambda x: ' '.join([word for word in x.split() if word not in (words)]))

# Extract only dialog texts
scopeTestText = [str(line).strip() for line in scopeTestJSON[0]]
scopeTrainText = [str(line).strip() for line in scopeTrainJSON[0]]

modelText = scopeTrainText + scopeTestText
scopeValText = [str(line).strip() for line in scopeValJSON[0]]

In [417]:
# Extract only labels
scopeTestLabels = [str(line).strip() for line in scopeTestJSON[1]]
scopeTrainLabels = [str(line).strip() for line in scopeTrainJSON[1]]

modelLabels = scopeTrainLabels + scopeTestLabels
scopeValLabels = [str(line).strip() for line in scopeValJSON[1]]

# Tokenize words from dialogues

In [418]:
tok = Tokenizer()
tok.fit_on_texts(modelText)
wordIndex = tok.word_index

# Vetorizing dialogues

In [432]:
modelTokens = tok.texts_to_sequences(modelText)

maxVocabSize = len(wordIndex) + 1
inputLength = 20 # max(map(lambda x: len(x), modelTokens))

In [420]:
inputLength

28

In [520]:
modelInput = pad_sequences(modelTokens, inputLength)

validationTokens = tok.texts_to_sequences(scopeValText)
validationInput = pad_sequences(validationTokens, inputLength)

# One-hot encoding labels

In [434]:
label_transformer = preprocessing.LabelEncoder()
label_transformer.fit(modelLabels)

encodedValidationLabels = label_transformer.transform(scopeValLabels)
encodedModelLabels = label_transformer.transform(modelLabels)

In [435]:
categoricalValidationLabels = to_categorical(np.asarray(encodedValidationLabels))
categoricalModelLabels = to_categorical(np.asarray(encodedModelLabels))

In [436]:
print(categoricalModelLabels)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


# Split train data to isolate test dataset

In [437]:
X_train, X_val, y_train, y_val = train_test_split(modelInput, categoricalModelLabels, test_size=0.2, random_state=13)

# Learning

In [509]:
model = Sequential([
  Embedding(maxVocabSize, 300, input_length=inputLength),                 
  Conv1D(filters=32, kernel_size=8, activation='relu'),
  MaxPooling1D(pool_size=3),
  Flatten(),
  Dense(180, activation='relu'),
  Dense(150, activation='sigmoid')
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [510]:
model.summary()

Model: "sequential_21"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_21 (Embedding)     (None, 20, 300)           1787400   
_________________________________________________________________
conv1d_16 (Conv1D)           (None, 13, 32)            76832     
_________________________________________________________________
max_pooling1d_16 (MaxPooling (None, 4, 32)             0         
_________________________________________________________________
flatten_14 (Flatten)         (None, 128)               0         
_________________________________________________________________
dense_40 (Dense)             (None, 180)               23220     
_________________________________________________________________
dense_41 (Dense)             (None, 150)               27150     
Total params: 1,914,602
Trainable params: 1,914,602
Non-trainable params: 0
___________________________________________

In [511]:
model.fit(X_train, y_train, epochs=6, verbose=1)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<tensorflow.python.keras.callbacks.History at 0x7efcd37a9110>

In [512]:
model.evaluate(X_val, y_val)



[0.8798507452011108, 0.8079487085342407]

In [521]:
predictions = model.predict(validationInput)

In [522]:
def acc(y_true, y_pred):
    return np.equal(np.argmax(y_true, axis=-1), np.argmax(y_pred, axis=-1)).mean()

print(acc(categoricalValidationLabels, predictions))

0.7366666666666667


In [523]:
def get_intent(sentence):
  data = [[sentence]]
  df = pd.DataFrame(data)
  input = df[0]
  input = tok.texts_to_sequences(input)
  input = pad_sequences(input, inputLength)
  prediction = model.predict(input)
  # return np.argmin(prediction)
  return modelLabels[np.where(encodedModelLabels == np.argmax(prediction))[0][0]]

In [524]:
get_intent("in spanish, meet me tomorrow is said how")

'translate'