## Importing Libraries
---

In [5]:
import numpy as np
import pandas as pd
import json

In [8]:
f = open('Intent.json')
data = json.load(f)
df = pd.DataFrame(data['intents'])
df.head()

Unnamed: 0,intent,text,responses
0,greetings,"[hello, hey, hi, good day, greetings, what's u...","[hello, hey!, what can i do for you?]"
1,goodbye,"[cya, see you later, goodbye, have a good day,...","[have a nice day, goodbye]"
2,age,"[how old, how old are you?, what is your age, ...","[I get reborn after every compilation, hey!, m..."
3,name,"[what is your name, what should i call you, wh...","[you can call me Medbot!, i am Medbot!, i am M..."
4,common cold symptoms,"[Runny or stuffy nose, Sore throat, Cough, Con...",[It seems that you are suffering from common c...


## Data Preprocessing
---

In [9]:
df_patterns = df[['text', 'intent']]
df_responses = df[['responses', 'intent']]
df_patterns.head()

Unnamed: 0,text,intent
0,"[hello, hey, hi, good day, greetings, what's u...",greetings
1,"[cya, see you later, goodbye, have a good day,...",goodbye
2,"[how old, how old are you?, what is your age, ...",age
3,"[what is your name, what should i call you, wh...",name
4,"[Runny or stuffy nose, Sore throat, Cough, Con...",common cold symptoms


In [10]:
df_patterns.shape

(81, 2)

In [11]:
df_patterns = df_patterns.explode('text')
df_patterns.head()

Unnamed: 0,text,intent
0,hello,greetings
0,hey,greetings
0,hi,greetings
0,good day,greetings
0,greetings,greetings


In [12]:
df_patterns.describe()

Unnamed: 0,text,intent
count,348,348
unique,348,80
top,hello,sexual_health
freq,1,8


In [13]:
df_patterns.drop_duplicates(inplace= True)

In [14]:
df_patterns.describe()

Unnamed: 0,text,intent
count,348,348
unique,348,80
top,hello,sexual_health
freq,1,8


In [15]:
df_patterns['intent'].value_counts()

sexual_health           8
fever symptoms          8
common cold symptoms    8
goodbye                 7
Depression symptoms     7
                       ..
insurance               4
mental_health           4
exercise                4
vaccination             4
aging_gracefully        4
Name: intent, Length: 80, dtype: int64

## Data Balancing
---
We will balance the unbalanced classes by randomly duplicating rows from such classes

In [16]:
def balance_data(df_patterns):
    df_intent = df_patterns['intent']
    max_counts = df_intent.value_counts().max() #max number of examples for a class
    
    new_df = df_patterns.copy()
    for i in df_intent.unique():
        i_count = int(df_intent[df_intent == i].value_counts())
        if i_count < max_counts:
            i_samples = df_patterns[df_intent == i].sample(max_counts - i_count, replace = True, ignore_index = True)
            new_df = pd.concat([new_df, i_samples])
    return new_df

In [17]:
df_patterns = balance_data(df_patterns)

In [18]:
df_patterns['intent'].value_counts()

greetings                 8
goodbye                   8
nutrition_for_athletes    8
cancer_support            8
caregiving_tips           8
                         ..
allergies                 8
stress_management         8
home_remedies             8
sexual_health             8
aging_gracefully          8
Name: intent, Length: 80, dtype: int64

## Token_ID, Attention_Mask, Labels
---

In [19]:
seq_len = 256
num_samples = len(df_patterns)

Xids = np.zeros((num_samples, seq_len)) #Token ids
Xmask = np.zeros((num_samples, seq_len)) #attention mask

In [20]:
Xids.shape

(640, 256)

In [21]:
Xids

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [22]:
%%time
# we will populate the above zero arrays with actual values for each token
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

for i, phrase in enumerate(df_patterns['text']):
    tokens = tokenizer.encode_plus(phrase, max_length= seq_len, truncation= True, padding= 'max_length', add_special_tokens = True, return_tensors= 'tf')
    
    Xids[i, :] = tokens['input_ids']
    Xmask[i, :] = tokens['attention_mask']

CPU times: total: 1.48 s
Wall time: 6.9 s


In [23]:
Xids

array([[  101., 19082.,   102., ...,     0.,     0.,     0.],
       [  101., 23998.,   102., ...,     0.,     0.,     0.],
       [  101., 20844.,   102., ...,     0.,     0.,     0.],
       ...,
       [  101., 18653., 26348., ...,     0.,     0.,     0.],
       [  101., 18036.,  3007., ...,     0.,     0.,     0.],
       [  101.,   138.,  3375., ...,     0.,     0.,     0.]])

one-hot encoding the labels  
first we will label encode the current values

In [24]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [25]:
arr = le.fit_transform(df_patterns['intent'])
arr

array([38, 38, 38, 38, 38, 38, 38, 37, 37, 37, 37, 37, 37, 37,  5,  5,  5,
        5,  5, 54, 54, 54, 54, 54, 19, 19, 19, 19, 19, 19, 19, 19, 34, 34,
       34, 34, 34, 34, 34, 34,  3,  3,  3,  3,  3,  3,  2,  2,  2,  2,  2,
        2,  2,  0,  0,  0,  0,  0,  0, 18, 18, 18, 18, 33, 33, 33, 33, 23,
       23, 23, 23, 22, 22, 22, 22, 10, 10, 10, 10,  1,  1,  1,  1,  1, 32,
       32, 32, 32, 55, 55, 55, 55, 69, 69, 69, 69, 41, 41, 41, 41, 74, 74,
       74, 74, 35, 35, 35, 35,  7,  7,  7,  7, 66, 66, 66, 66, 42, 42, 42,
       42, 70, 70, 70, 70,  8,  8,  8,  8, 62, 62, 62, 62, 14, 14, 14, 14,
       27, 27, 27, 27, 50, 50, 50, 50, 63, 63, 63, 63, 66, 66, 66, 66, 71,
       71, 71, 71, 16, 16, 16, 16, 20, 20, 20, 20, 72, 72, 72, 72, 29, 29,
       29, 29, 47, 47, 47, 47,  9,  9,  9,  9, 44, 44, 44, 44, 49, 49, 49,
       49, 24, 24, 24, 24, 30, 30, 30, 30, 75, 75, 75, 75, 45, 45, 45, 45,
       65, 65, 65, 65, 57, 57, 57, 57, 68, 68, 68, 68, 43, 43, 43, 43, 31,
       31, 31, 31, 77, 77

In [26]:
le.classes_

array(['Asthma symptoms', 'Consultation', 'Depression symptoms',
       'Diabetes symptoms', 'addiction_recovery', 'age',
       'aging_gracefully', 'aging_health', 'allergies', 'appointment',
       'asthma prevention', 'cancer_support', 'caregiver_support',
       'caregiving_tips', 'child_health', 'child_nutrition',
       'chronic_conditions', 'cognitive_health', 'common cold prevention',
       'common cold symptoms', 'dental_care', 'dental_health',
       'depression prevention', 'diabetes prevention', 'diet_nutrition',
       'digital_health', 'disease_prevention', 'elderly_care',
       'elderly_nutrition', 'emergency', 'exercise', 'exercise_injuries',
       'exercise_recommendation', 'fever prevention', 'fever symptoms',
       'first_aid', 'fitness_motivation', 'goodbye', 'greetings',
       'gut_health', 'heart_health', 'holistic_health', 'home_remedies',
       'immune_system', 'insurance', 'insurance_claim',
       'lifestyle_changes', 'medication', 'menstrual_health',
  

In [27]:
labels = np.zeros((num_samples, arr.max()+1))
labels

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [28]:
labels[np.arange(num_samples), arr] = 1
labels 

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [29]:
len(labels)

640

## Data Pipeline
---

In [30]:
import tensorflow as tf

In [31]:
dataset = tf.data.Dataset.from_tensor_slices((Xids, Xmask, labels))
dataset.take(1)

<TakeDataset element_spec=(TensorSpec(shape=(256,), dtype=tf.float64, name=None), TensorSpec(shape=(256,), dtype=tf.float64, name=None), TensorSpec(shape=(80,), dtype=tf.float64, name=None))>

In [32]:
dataset.as_numpy_iterator().next()

(array([  101., 19082.,   102.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0., 

Tensorflow expects a tuple with 2 inputs only, examples at index 0 and Labels at index 1

In [33]:
def map_function(input_ids, masks, labels):
    return {'input_ids': input_ids, 'attention_mask': masks}, labels

In [34]:
dataset = dataset.map(map_function)

In [35]:
dataset.take(1)

<TakeDataset element_spec=({'input_ids': TensorSpec(shape=(256,), dtype=tf.float64, name=None), 'attention_mask': TensorSpec(shape=(256,), dtype=tf.float64, name=None)}, TensorSpec(shape=(80,), dtype=tf.float64, name=None))>

In [36]:
batch_size = 8

dataset = dataset.shuffle(1000)
dataset = dataset.batch(batch_size, drop_remainder= True)

In [37]:
dataset.take(1)

<TakeDataset element_spec=({'input_ids': TensorSpec(shape=(8, 256), dtype=tf.float64, name=None), 'attention_mask': TensorSpec(shape=(8, 256), dtype=tf.float64, name=None)}, TensorSpec(shape=(8, 80), dtype=tf.float64, name=None))>

In [38]:
split = 0.9 

size = int((num_samples / batch_size) * split)

In [39]:
train_ds = dataset.take(size)
val_ds = dataset.skip(size)

## Training the Tranformer model
---

In [40]:
from transformers import TFAutoModel

bert = TFAutoModel.from_pretrained('bert-base-cased')

Downloading tf_model.h5:   0%|          | 0.00/527M [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [45]:
bert.summary()

Model: "tf_bert_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      TFBaseModelOutputWithPoo  108310272 
                             lingAndCrossAttentions(l            
                             ast_hidden_state=(None,             
                             256, 768),                          
                              pooler_output=(None, 76            
                             8),                                 
                              past_key_values=None, h            
                             idden_states=None, atten            
                             tions=None, cross_attent            
                             ions=None)                          
                                                                 
Total params: 108,310,272
Trainable params: 108,310,272
Non-trainable params: 0
_______________________________________

In [46]:
input_ids = tf.keras.layers.Input(shape= (seq_len,), name= 'input_ids', dtype='int32') #input layer for tokens
mask = tf.keras.layers.Input(shape= (seq_len,), name= 'attention_mask', dtype='int32') #input layer for attention masks

embeddings = bert.bert(input_ids, attention_mask= mask)[1]

x = tf.keras.layers.Dense(1024, activation= 'relu')(embeddings)
y = tf.keras.layers.Dense(arr.max()+1, activation= 'softmax', name= 'outputs')(x)

In [47]:
model = tf.keras.Model(inputs= [input_ids, mask], outputs = y)

In [48]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 256)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 256)]        0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  108310272   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 256,                                         

In [49]:
optimizer = tf.keras.optimizers.Adam(learning_rate= 1e-5)
loss = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')

In [50]:
model.compile(optimizer= optimizer, loss= loss, metrics = [acc])

In [51]:
%%time
history = model.fit(train_ds, validation_data= val_ds, epochs= 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: total: 5h 25min 2s
Wall time: 1h 21min 36s


## Saving the model
---

In [47]:
model.save('intent_prediction_model.h5')
