<a href="https://colab.research.google.com/github/evergreenllc2020/Deep-Learning-For-Hackers/blob/master/cchome_intent_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## CC Home Intent Recognition with BERT using Keras and TensorFlow 2

## Setup

In [0]:
# List the NVIDIA GPU(s) visible to this runtime.
!nvidia-smi

In [0]:
# NOTE(review): unpinned install, and a later cell also installs tensorflow-gpu —
# confirm which build is intended and pin its version.
!pip install tensorflow

In [0]:
!pip install pip install tensorflow-tensorboard

In [0]:
# TensorBoard is used further down (Keras TensorBoard callback and %tensorboard magic).
!pip install tensorboard

In [0]:

# NOTE(review): installed in addition to the plain `tensorflow` package above —
# confirm that both are really needed.
!pip install tensorflow-gpu 

In [0]:
# Presumably the source of the `bert` package imported below — TODO confirm.
!pip install --upgrade bert-for-tf2 

In [0]:
!pip install --upgrade grpcio 

In [0]:
# tqdm is imported below and wraps the tokenization loop in IntentDetectionData.
!pip install tqdm  

In [0]:
# Installer output is redirected to /dev/null to keep the cell quiet.
!pip install sentencepiece >> /dev/null

In [0]:
#from google.colab import drive
#drive.mount('/content/drive/')

In [0]:
#import bert
#from bert import BertModelLayer

In [0]:
# NOTE(review): lists a Google Drive folder, but the drive.mount(...) cell above is
# commented out, so this path only exists if Drive was mounted some other way.
!ls "/content/drive/My Drive/NLP"

In [0]:
import os
import math
import datetime

from tqdm import tqdm

import pandas as pd
import numpy as np

import tensorflow as tf
#import tensorflow.compat.v1 as tf
#tf.disable_v2_behavior()
from tensorflow import keras

import bert               

from bert import BertModelLayer
from bert.loader import StockBertConfig, map_stock_config_to_params, load_stock_weights
from bert.tokenization.bert_tokenization import FullTokenizer

import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from matplotlib import rc

from sklearn.metrics import confusion_matrix, classification_report

# Render matplotlib figures inline, using the 'retina' figure format.
%matplotlib inline
%config InlineBackend.figure_format='retina'

sns.set(style='whitegrid', palette='muted', font_scale=1.2)

HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]

# Applied after sns.set, so this palette replaces the 'muted' one chosen there.
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))

# Default figure size.
rcParams['figure.figsize'] = 12, 8

RANDOM_SEED = 42

# Seed both NumPy and TensorFlow with the same value.
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

# Data

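This notebook appears to follow a tutorial written for the Snips NLU benchmark: user queries categorized into seven intents, hosted on [GitHub](https://github.com/snipsco/nlu-benchmark/tree/master/2017-06-custom-intent-engines) and first presented in [this paper](https://arxiv.org/abs/1805.10190). Here the same pipeline is applied to the CC Home data instead: the cells below download its train/validation/test CSV files, and the model built further down has eight output classes.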

In [0]:
# Earlier downloads, kept disabled:
#!gdown --id 1OlcvGWReJMuyYQuOZm149vHWwPtlboR6 --output train.csv
#!gdown --id 1Oi5cRlTybuIF2Fl5Bfsr-KkqrXrdt77w --output valid.csv
#!gdown --id 1ep9H6-HvhB4utJRLVcLzieWNUSG3P_uF --output test.csv

# Download the CC Home train / validation / test CSV files from Google Drive.
# NOTE(review): the validation and test downloads use the SAME file id, so
# cchome_test.csv is a copy of cchome_valid.csv. A later cell appends `valid`
# to `train`, which means the "test" rows are also trained on and the reported
# test metrics are not a clean hold-out — confirm the intended test file id.
!gdown --id 1ofsxq2O6wHBd9CYiAElEZDPsglTHBDQF --output cchome_train.csv
!gdown --id 1y17bzuVZQkRVYA6hptvmLvbgBfiMhi_V --output cchome_valid.csv
!gdown --id 1y17bzuVZQkRVYA6hptvmLvbgBfiMhi_V --output cchome_test.csv



In [0]:
# Read the three header-less CSV files; the columns get their names in later cells.
train, valid, test = (
    pd.read_csv(csv_path, header=None)
    for csv_path in ("cchome_train.csv", "cchome_valid.csv", "cchome_test.csv")
)

In [0]:
# First rows of the raw training frame, before the columns are named.
train.head()

In [0]:
# Name the two CSV columns, then keep only the third "__"-separated field of
# each raw label as the intent name.
train.columns = ['intent', 'text']
# Column-wise map instead of a row-wise DataFrame.apply(axis=1): same values,
# without building a Series object for every row. The former mid-cell
# `train.head()` was dropped because only a cell's last expression is displayed.
train["intent"] = train["intent"].map(lambda raw_label: raw_label.split("__")[2])
train.describe()

In [0]:
# Name the two CSV columns, then keep only the third "__"-separated field of
# each raw label as the intent name.
test.columns = ['intent', 'text']
# Column-wise map instead of a row-wise DataFrame.apply(axis=1): same values,
# without building a Series object for every row. The former mid-cell
# `test.head()` was dropped because only a cell's last expression is displayed.
test["intent"] = test["intent"].map(lambda raw_label: raw_label.split("__")[2])
test.describe()

In [0]:
# Name the two CSV columns, then keep only the third "__"-separated field of
# each raw label as the intent name.
valid.columns = ['intent', 'text']
# Column-wise map instead of a row-wise DataFrame.apply(axis=1): same values,
# without building a Series object for every row. The former mid-cell
# `valid.head()` was dropped because only a cell's last expression is displayed.
valid["intent"] = valid["intent"].map(lambda raw_label: raw_label.split("__")[2])
valid.describe()

In [0]:
# (rows, columns) of the training frame before the validation rows are added.
train.shape

## Combine training and validation data

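The validation rows are appended to the training rows below; `model.fit` later holds out its own validation fraction through `validation_split`.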

In [0]:
# DataFrame.append was removed in pandas 2.0; pd.concat with ignore_index=True
# produces the same frame with a fresh 0..n-1 index.
# Note: this cell is not idempotent — re-running it adds the validation rows again.
train = pd.concat([train, valid], ignore_index=True)

In [0]:
# (rows, columns) after the validation rows were added.
train.shape

In [0]:
train.head()

In [0]:
# Bar chart of how many texts each intent has.
# The Series is passed as `x=`: in current seaborn the first positional
# argument of countplot is `data`, not the variable to count.
chart = sns.countplot(x=train.intent, palette=HAPPY_COLORS_PALETTE)
plt.title("Number of texts per intent")
# Rotate the existing tick labels in place instead of re-setting them.
plt.setp(chart.get_xticklabels(), rotation=30, horizontalalignment='right');

In [0]:
# Down-sample the "apps" and "support" intents to at most 20000 rows each and
# leave every other intent untouched.
MAX_ROWS_PER_CAPPED_INTENT = 20000

apps_df = train[train["intent"]=="apps"]
# min(...) keeps sample() from raising when an intent has fewer rows than the cap;
# random_state makes a re-run of this cell pick the same rows.
apps_df = apps_df.sample(n=min(len(apps_df), MAX_ROWS_PER_CAPPED_INTENT), random_state=RANDOM_SEED)
support_df = train[train["intent"]=="support"]
support_df = support_df.sample(n=min(len(support_df), MAX_ROWS_PER_CAPPED_INTENT), random_state=RANDOM_SEED)

train_df_2 = train[(train["intent"]!="apps") & (train["intent"]!="support")]

# Shuffle the combined frame. Without this, all "support" and "apps" rows sit
# at the end of `train`, and model.fit(validation_split=...) — which holds out
# the LAST fraction of the arrays before shuffling — would validate on those
# two intents only and never train on "apps".
train = (
    pd.concat([train_df_2, support_df, apps_df])
    .sample(frac=1, random_state=RANDOM_SEED)
    .reset_index(drop=True)
)
train.describe()



In [0]:
# Bar chart of how many texts each intent has after down-sampling.
# The Series is passed as `x=`: in current seaborn the first positional
# argument of countplot is `data`, not the variable to count.
chart = sns.countplot(x=train.intent, palette=HAPPY_COLORS_PALETTE)
plt.title("Number of texts per intent")
# Rotate the existing tick labels in place instead of re-setting them.
plt.setp(chart.get_xticklabels(), rotation=30, horizontalalignment='right');

# Intent Recognition with BERT

In [0]:
# Download the pretrained BERT archive (uncased, L-12 / H-768 / A-12).
!wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip

In [0]:
# Unpack the archive in the working directory.
!unzip uncased_L-12_H-768_A-12.zip

In [0]:
#import shutil
#shutil.rmtree('model')
# Create ./model unless it already exists.
os.makedirs("model", exist_ok=True)

In [0]:
# Move the unpacked checkpoint directory under ./model/.
!mv uncased_L-12_H-768_A-12/ model

In [0]:
# Locations of the pretrained BERT checkpoint and its config under ./model/.
bert_model_name = "uncased_L-12_H-768_A-12"
bert_ckpt_dir = os.path.join("model/", bert_model_name)
bert_ckpt_file, bert_config_file = (
    os.path.join(bert_ckpt_dir, file_name)
    for file_name in ("bert_model.ckpt", "bert_config.json")
)

## Preprocessing

In [0]:
class IntentDetectionData:
  """Tokenizes intent-labelled text and builds fixed-length token-id arrays.

  Attributes set by the constructor:
    train_x, test_x: 2-D integer arrays, one row of token ids per text; every
      row is exactly `max_seq_len` long (zero-padded on the right).
    train_y, test_y: 1-D arrays with each text's index into `classes`.
    max_seq_len: the smaller of the longest tokenized text (including the
      [CLS] and [SEP] tokens) and the `max_seq_len` constructor argument.
  """
  DATA_COLUMN = "text"
  LABEL_COLUMN = "intent"

  def __init__(self, train, test, tokenizer: "FullTokenizer", classes, max_seq_len=192):
    """Tokenizes and pads both frames.

    Args:
      train, test: DataFrames with a "text" and an "intent" column.
      tokenizer: object providing `tokenize` and `convert_tokens_to_ids`.
      classes: list of intent names; a label's position is its integer id.
      max_seq_len: upper bound for the padded sequence length.
    """
    self.tokenizer = tokenizer
    self.max_seq_len = 0  # grows to the longest sequence seen in _prepare
    self.classes = classes

    ((self.train_x, self.train_y), (self.test_x, self.test_y)) = map(self._prepare, [train, test])

    print("max seq_len", self.max_seq_len)

    self.max_seq_len = min(self.max_seq_len, max_seq_len)
    self.train_x, self.test_x = map(self._pad, [self.train_x, self.test_x])

  def _prepare(self, df):
    """Converts every row of `df` to ([CLS] + token ids + [SEP], class index).

    Returns:
      (x, y): x is a list of token-id lists of varying length, y is a 1-D
      array of class indices.

    Raises:
      ValueError: if a label is not contained in `self.classes`.
    """
    x, y = [], []
    for _, row in tqdm(df.iterrows(), total=len(df)):
      text = row[IntentDetectionData.DATA_COLUMN]
      label = row[IntentDetectionData.LABEL_COLUMN]

      tokens = ["[CLS]"] + self.tokenizer.tokenize(text) + ["[SEP]"]
      token_ids = self.tokenizer.convert_tokens_to_ids(tokens)

      self.max_seq_len = max(self.max_seq_len, len(token_ids))
      x.append(token_ids)
      y.append(self.classes.index(label))

    # x stays a plain list of lists: the rows still differ in length here, and
    # recent NumPy versions refuse to build an array from ragged nested lists.
    return x, np.array(y)

  def _pad(self, ids):
    """Brings every id sequence to exactly `self.max_seq_len` entries.

    Shorter sequences are right-padded with 0. Longer sequences are cut, but
    their final id (the [SEP] id appended by _prepare) is kept as the last
    entry. Sequences that already fit are left untouched — the previous
    `max_seq_len - 2` cut removed [SEP] and one real token even from those.

    Returns:
      A 2-D array with one padded row per input sequence.
    """
    limit = self.max_seq_len
    padded = []
    for input_ids in ids:
      input_ids = list(input_ids)
      if len(input_ids) > limit:
        input_ids = (input_ids[:max(limit - 1, 0)] + input_ids[-1:])[:limit]
      input_ids = input_ids + [0] * (limit - len(input_ids))
      padded.append(np.array(input_ids))
    return np.array(padded)

In [0]:
# Distinct intent names found in the training frame.
classes = train.intent.unique().tolist()
#data = IntentDetectionData(train[0:2], test[0:2], tokenizer, classes, max_seq_len=128)

In [0]:
# Tokenizer built from the vocab.txt that sits next to the BERT checkpoint.
tokenizer = FullTokenizer(vocab_file=os.path.join(bert_ckpt_dir, "vocab.txt"))

In [0]:
# Tokenize one example sentence and look at the tokens as NumPy arrays of
# different shapes.
tokens = tokenizer.tokenize("I can't wait to visit Bulgaria again!")
tokens = ["[CLS]"] + tokens + ["[SEP]"]
tokens2 = np.array([(tokens)])  # one row holding all tokens
print(tokens)
print(tokens2.shape)
tokens2 = tokens2[0,:]  # back to a 1-D array
print(tokens2)
print(tokens2.shape)
print(tokens2)
# -1 lets NumPy infer the row count instead of hard-coding this sentence's
# token count (12), so the cell keeps working if the sentence changes.
tokens3 = tokens2.reshape(-1, 1)
print(tokens3)

In [0]:
# Same example sentence: wrap the tokens in [CLS] ... [SEP] and show the
# vocabulary ids they map to.
tokens = tokenizer.tokenize("I can't wait to visit Bulgaria again!")
tokens = ["[CLS]"] + tokens + ["[SEP]"]
print(tokens)
tokenizer.convert_tokens_to_ids(tokens)

In [0]:
def create_model(max_seq_len, bert_ckpt_file):
  """Builds a BERT-based intent classifier and loads the pretrained BERT weights.

  Reads the notebook-level variables `bert_config_file` (BERT config JSON) and
  `classes` (its length sets the number of output units).

  Args:
    max_seq_len: length of the padded token-id sequences fed to the model.
    bert_ckpt_file: checkpoint path handed to `load_stock_weights`.

  Returns:
    A keras.Model taking int32 token ids of shape (batch, max_seq_len) and
    returning one raw score (logit) per class.
  """
  with tf.io.gfile.GFile(bert_config_file, "r") as reader:
    bc = StockBertConfig.from_json_string(reader.read())
  bert_params = map_stock_config_to_params(bc)
  bert_params.adapter_size = None
  bert = BertModelLayer.from_params(bert_params, name="bert")

  input_ids = keras.layers.Input(shape=(max_seq_len, ), dtype='int32', name="input_ids")
  bert_output = bert(input_ids)

  print("bert shape", bert_output.shape)

  # Keep only the output at sequence position 0, i.e. the [CLS] token.
  cls_out = keras.layers.Lambda(lambda seq: seq[:, 0, :])(bert_output)
  cls_out = keras.layers.Dropout(0.4)(cls_out)

  logits = keras.layers.Dense(units=768, activation="relu")(cls_out)
  logits = keras.layers.Dropout(0.4)(logits)

  # No activation on the output layer: the notebook compiles the model with
  # SparseCategoricalCrossentropy(from_logits=True), which applies the softmax
  # itself. The previous sigmoid squashed the scores into (0, 1) before the
  # loss treated them as logits. argmax-based predictions are unaffected.
  logits = keras.layers.Dense(units=len(classes))(logits)

  model = keras.Model(inputs=input_ids, outputs=logits)
  model.build(input_shape=(None, max_seq_len))

  # Loaded after the model is built so the BERT layer's weights exist.
  load_stock_weights(bert, bert_ckpt_file)

  return model

In [0]:
# Number of intents; create_model uses it as the size of the output layer.
len(classes)

## Training

In [0]:
# Distinct intent names in the training frame; IntentDetectionData uses a
# label's position in this list as its integer class id.
classes = train.intent.unique().tolist()

# Tokenize and pad both splits; sequences are capped at 128 token ids.
# NOTE(review): a test label that does not occur in `train` makes
# classes.index(label) raise ValueError inside IntentDetectionData.
data = IntentDetectionData(train, test, tokenizer, classes, max_seq_len=128)

In [0]:
# (number of training texts, padded sequence length)
data.train_x.shape

In [0]:
# Padded token ids of the first training text.
data.train_x[0]

In [0]:
data.train_y.shape

In [0]:
# Sequence length actually used: min(longest tokenized text, 128).
data.max_seq_len

In [0]:
# Build the classifier for that sequence length and load the pretrained BERT weights.
model = create_model(data.max_seq_len, bert_ckpt_file)

In [50]:
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_ids (InputLayer)       [(None, 59)]              0         
_________________________________________________________________
bert (BertModelLayer)        (None, 59, 768)           108890112 
_________________________________________________________________
lambda (Lambda)              (None, 768)               0         
_________________________________________________________________
dropout (Dropout)            (None, 768)               0         
_________________________________________________________________
dense (Dense)                (None, 768)               590592    
_________________________________________________________________
dropout_1 (Dropout)          (None, 768)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 8)                 6152  

In [0]:
# Fine-tuning setup: Adam with a small learning rate, sparse categorical
# cross-entropy on integer class ids, accuracy reported under the name "acc".
# from_logits=True means the loss treats the model outputs as raw,
# unnormalised scores and applies the softmax itself.
model.compile(
  loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
  optimizer=keras.optimizers.Adam(learning_rate=1e-5),
  metrics=[keras.metrics.SparseCategoricalAccuracy(name="acc")],
)





In [0]:
# One TensorBoard log directory per run, named by start time.
# "%S" (seconds) replaces the original "%s", a non-portable directive that
# yields epoch seconds on some platforms and fails on others.
log_dir = "log/intent_detection/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=log_dir)

# Keras takes the validation_split fraction from the END of x/y before any
# shuffling; shuffle=True only reorders the remaining training rows each epoch.
# The row order of data.train_x therefore decides what ends up in validation.
history = model.fit(
  x=data.train_x, 
  y=data.train_y,
  validation_split=0.25,
  batch_size=32,
  shuffle=True,
  epochs=15,
  callbacks=[tensorboard_callback]
)

Train on 71352 samples, validate on 23784 samples
Epoch 1/15


## Evaluation

In [0]:
# Load the TensorBoard notebook extension ...
%load_ext tensorboard

In [0]:
# ... and open it on the log directory written by the TensorBoard callback.
%tensorboard --logdir log

In [0]:
# Training vs. validation loss per epoch, as recorded by model.fit.
fig, ax = plt.subplots()
ax.xaxis.set_major_locator(MaxNLocator(integer=True))  # epochs are whole numbers

ax.plot(history.history['loss'])
ax.plot(history.history['val_loss'])
ax.set_ylabel('Loss')
ax.set_xlabel('Epoch')
# 'val_loss' comes from the validation_split hold-out of model.fit, not from
# the test set, so the curve is labelled accordingly.
ax.legend(['train', 'validation'])
ax.set_title('Loss over training epochs')
plt.show();

In [0]:
# Training vs. validation accuracy per epoch, as recorded by model.fit.
fig, ax = plt.subplots()
ax.xaxis.set_major_locator(MaxNLocator(integer=True))  # epochs are whole numbers

ax.plot(history.history['acc'])
ax.plot(history.history['val_acc'])
ax.set_ylabel('Accuracy')
ax.set_xlabel('Epoch')
# 'val_acc' comes from the validation_split hold-out of model.fit, not from
# the test set, so the curve is labelled accordingly.
ax.legend(['train', 'validation'])
ax.set_title('Accuracy over training epochs')
plt.show();

In [0]:
# model.evaluate returns the loss followed by the compiled metric ("acc");
# only the accuracy is kept.
# Note: data.train_x also contains the rows that model.fit held out through
# validation_split, so "train acc" mixes fitted and held-out rows.
_, train_acc = model.evaluate(data.train_x, data.train_y)
_, test_acc = model.evaluate(data.test_x, data.test_y)

print("train acc", train_acc)
print("test acc", test_acc)

In [0]:
# Predicted class id per test text: the index of the highest model output.
y_pred = model.predict(data.test_x).argmax(axis=-1)

In [0]:
print(y_pred[0])

In [0]:
# labels= pins the report to every class id in `classes`; without it,
# target_names stops matching as soon as one intent is missing from both the
# test labels and the predictions.
print(classification_report(data.test_y, y_pred, labels=list(range(len(classes))), target_names=classes))

In [0]:
# Rows are true intents, columns are predicted intents.
# labels= forces a len(classes) x len(classes) matrix even if an intent is
# missing from the test data, so the index/columns below always fit.
cm = confusion_matrix(data.test_y, y_pred, labels=list(range(len(classes))))
df_cm = pd.DataFrame(cm, index=classes, columns=classes)

In [0]:
# Heatmap of the confusion matrix with integer cell annotations.
hmap = sns.heatmap(df_cm, annot=True, fmt="d")
# Horizontal y tick labels, slanted x tick labels.
hmap.yaxis.set_ticklabels(hmap.yaxis.get_ticklabels(), rotation=0, ha='right')
hmap.xaxis.set_ticklabels(hmap.xaxis.get_ticklabels(), rotation=30, ha='right')
plt.ylabel('True label')
plt.xlabel('Predicted label');

In [0]:
sentences = [
  "I love singing ",
  "Rate this book as awful"
]

# Tokenize each sentence the same way as the training data:
# [CLS] + tokens + [SEP], converted to ids, right-padded with 0.
pred_token_ids = []
for sentence in sentences:
  sentence_tokens = ["[CLS]"] + tokenizer.tokenize(sentence) + ["[SEP]"]
  tids = tokenizer.convert_tokens_to_ids(sentence_tokens)
  # A sentence longer than the model input used to stay unpadded and
  # over-long, giving ragged rows; cut it and keep the final [SEP] id.
  if len(tids) > data.max_seq_len:
    tids = tids[:data.max_seq_len - 1] + tids[-1:]
  pred_token_ids.append(tids + [0] * (data.max_seq_len - len(tids)))
pred_token_ids = np.array(pred_token_ids)

# Index of the highest model output per sentence.
predictions = model.predict(pred_token_ids).argmax(axis=-1)

for text, label in zip(sentences, predictions):
  print("text:", text, "\nintent:", classes[label])
  print()





In [0]:
# Raw model outputs for the sample sentences: one row per sentence, one value per class.
predictions = model.predict(pred_token_ids)

print(predictions)

# References

- https://mccormickml.com/2019/07/22/BERT-fine-tuning/
- https://github.com/snipsco/nlu-benchmark/tree/master/2017-06-custom-intent-engines
- https://jalammar.github.io/illustrated-bert/
- https://towardsdatascience.com/bert-for-dummies-step-by-step-tutorial-fb90890ffe03
- https://www.reddit.com/r/MachineLearning/comments/ao23cp/p_how_to_use_bert_in_kaggle_competitions_a/