In [2]:
#!pip3 install bert-for-tf2
#!pip3 install sentencepiece
#!pip3 install lime
#!pip3 instll bert-tensorflow 

In [3]:
import tensorflow as tf
from tensorflow import keras
import tensorflow_hub as hub


from tensorflow.keras import layers
from keras.models import Sequential
import bert
import pandas as pd
import numpy as np
import re, random, math, os

In [4]:
#import nltk
#nltk.download('stopwords')

from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))

In [6]:
bbc_text = pd.read_csv("bbc-text.csv")

bbc_text.head(20)

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...
5,politics,howard hits back at mongrel jibe michael howar...
6,politics,blair prepares to name poll date tony blair is...
7,sport,henman hopes ended in dubai third seed tim hen...
8,sport,wilkinson fit to face edinburgh england captai...
9,entertainment,last star wars not for children the sixth an...


In [8]:
itr_cnt = 1000
train_subset = bbc_text[0:itr_cnt]
test_subset = bbc_text[1925:2226]

In [9]:
values = bbc_text.category.value_counts()
print(values)

sport            511
business         510
politics         417
tech             401
entertainment    386
Name: category, dtype: int64


In [10]:
def preprocess_text(sen):
    #Remove STOPWORDS
    for word in STOPWORDS:
      token = ' ' + word + ' '
      sen = sen.replace(token, ' ')
      sen = sen.replace(' ', ' ')

    # Removing html tags
    sentence = remove_tags(sen)

    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)

    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)

    return sentence

In [11]:
TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    return TAG_RE.sub('', text)

In [12]:
bbc_train = []
bbc_test = []
train_sen = list(train_subset['text'])
test_sen = list(test_subset['text'])
for sen in train_sen:
    bbc_train.append(preprocess_text(sen))

for sen in test_sen:
    bbc_test.append(preprocess_text(sen))

In [14]:
bbc_labels = bbc_text.category.unique()
print(type(bbc_labels))
bbc_labels = np.sort(bbc_labels)
print(bbc_labels)

<class 'numpy.ndarray'>
['business' 'entertainment' 'politics' 'sport' 'tech']


In [15]:
from sklearn.preprocessing import LabelEncoder
train_df = pd.DataFrame()
train_df['text'] = train_subset["text"]
train_df['category'] = LabelEncoder().fit_transform(train_subset["category"])

test_df = pd.DataFrame()
test_df['text'] = test_subset["text"]
test_df['category'] = LabelEncoder().fit_transform(test_subset["category"])

In [16]:
BertTokenizer = bert.bert_tokenization.FullTokenizer
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=False)
vocabulary_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
to_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = BertTokenizer(vocabulary_file, to_lower_case)

In [50]:
def tokenize_data(text_reviews):
    return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text_reviews))

In [18]:
tokenized_train = [tokenize_reviews(text) for text in bbc_train] 
tokenized_test = [tokenize_reviews(text) for text in bbc_test]

In [19]:
train_texts = [[train_text, train_df['category'][i]] for i,  train_text in enumerate(tokenized_train)]
test_texts = [[test_text, test_df['category'][i+1925]] for i, test_text in enumerate(tokenized_test)]

In [20]:
sorted_train_labels = [(text_lab[0], text_lab[1]) for text_lab in train_texts]
sorted_test_labels = [(text_lab[0], text_lab[1]) for text_lab in test_texts]

In [21]:
processed_train = tf.data.Dataset.from_generator(lambda: sorted_train_labels, output_types=(tf.int32, tf.int32))
processed_test = tf.data.Dataset.from_generator(lambda: sorted_test_labels, output_types = (tf.int32, tf.int32))

In [22]:
BATCH_SIZE = 32
batched_train_dataset = processed_train.padded_batch(BATCH_SIZE, padded_shapes=((None, ), ()))
print(type(batched_train_dataset))
batched_test_dataset = processed_test.padded_batch(BATCH_SIZE, padded_shapes=((None, ), ()))

<class 'tensorflow.python.data.ops.dataset_ops.PaddedBatchDataset'>


In [23]:
class TEXT_MODEL(tf.keras.Model):
    
    def __init__(self,
                 vocabulary_size,
                 embedding_dimensions=128,
                 cnn_filters=100,
                 dnn_units=512,
                 model_output_classes=5,
                 dropout_rate=0.2,
                 training=False,
                 name="text_model"):
        super(TEXT_MODEL, self).__init__(name=name)
        
        self.embedding = layers.Embedding(vocabulary_size,
                                          embedding_dimensions)
        self.cnn_layer1 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=2,
                                        padding="valid",
                                        activation="relu")
        self.cnn_layer2 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=3,
                                        padding="valid",
                                        activation="relu")
        self.cnn_layer3 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=4,
                                        padding="valid",
                                        activation="relu")
        self.pool = layers.GlobalMaxPool1D()
        
        self.dense_1 = layers.Dense(units=dnn_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        if model_output_classes == 2:
            self.last_dense = layers.Dense(units=1,
                                           activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=model_output_classes,
                                           activation="softmax")
    
    def call(self, inputs, training):
        l = self.embedding(inputs)
        l_1 = self.cnn_layer1(l) 
        l_1 = self.pool(l_1) 
        l_2 = self.cnn_layer2(l) 
        l_2 = self.pool(l_2)
        l_3 = self.cnn_layer3(l)
        l_3 = self.pool(l_3) 
        
        concatenated = tf.concat([l_1, l_2, l_3], axis=-1) # (batch_size, 3 * cnn_filters)
        concatenated = self.dense_1(concatenated)
        concatenated_dropout = self.dropout(concatenated, training)
        model_output = self.last_dense(concatenated_dropout)
        
        return model_output

In [24]:
VOCAB_LENGTH = len(tokenizer.vocab)
EMB_DIM = 200
CNN_FILTERS = 100
DNN_UNITS = 256
OUTPUT_CLASSES = 5  
DROPOUT_RATE = 0.2

NB_EPOCHS = 5

In [25]:
text_model = TEXT_MODEL(vocabulary_size=VOCAB_LENGTH,
                        embedding_dimensions=EMB_DIM,
                        cnn_filters=CNN_FILTERS,
                        dnn_units=DNN_UNITS,
                        model_output_classes=OUTPUT_CLASSES,
                        dropout_rate=DROPOUT_RATE)

In [26]:
text_model.compile(loss='sparse_categorical_crossentropy',
                    optimizer="adam",
                    metrics=["sparse_categorical_accuracy"])

In [28]:
text_model.summary()

Model: "text_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  6104400   
_________________________________________________________________
conv1d (Conv1D)              multiple                  40100     
_________________________________________________________________
conv1d_1 (Conv1D)            multiple                  60100     
_________________________________________________________________
conv1d_2 (Conv1D)            multiple                  80100     
_________________________________________________________________
global_max_pooling1d (Global multiple                  0         
_________________________________________________________________
dense (Dense)                multiple                  77056     
_________________________________________________________________
dropout (Dropout)            multiple                  0

In [27]:
text_model.fit(batched_train_dataset, epochs=NB_EPOCHS)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f92fc555550>

In [29]:
test_loss, test_acc = text_model.evaluate(batched_test_dataset)
print(test_acc)

0.9700000286102295


In [30]:
y_pred_confid = text_model.predict(batched_test_dataset)
y_pred_confid_np = [];
print(y_pred_confid[3])
print(type(y_pred_confid))

for i in range(len(y_pred_confid)):
  confid_list = [];
  for j in range(len(y_pred_confid[i])):
    y_pred_confid[i][j] = f"{y_pred_confid[i][j]:.3f}"
    confid_list.append(y_pred_confid[i][j])
  confid_list = str(confid_list)  
  y_pred_confid_np.append(confid_list)
  
print(type(y_pred_confid_np))
y_pred_confid_np = np.array(y_pred_confid_np)
print(type(y_pred_confid_np))
print(y_pred_confid_np.shape)
print(list(y_pred_confid[3]))
print(type(y_pred_confid))
print(str(y_pred_confid[77]))

[9.8927748e-01 1.7196689e-03 4.8641330e-03 3.2091371e-04 3.8177362e-03]
<class 'numpy.ndarray'>
<class 'list'>
<class 'numpy.ndarray'>
(300,)
[0.989, 0.002, 0.005, 0.0, 0.004]
<class 'numpy.ndarray'>
[0.658 0.023 0.313 0.002 0.004]


In [31]:
pred = tf.nn.softmax(text_model.predict(batched_test_dataset))
y_pred_argmax = tf.math.argmax(pred, axis=1)
type(pred)
y_pred_argmax_np = y_pred_argmax.numpy();
print(y_pred_argmax_np)

count = 0;
y_true = tf.Variable([], dtype=tf.int32)
for features, label in batched_test_dataset:
    y_true = tf.concat([y_true, label], 0)
    count += 1;

print(y_true)
print(features)

[0 4 3 0 1 0 3 3 4 4 2 4 2 2 3 3 3 0 4 1 0 3 0 3 0 4 0 0 0 0 3 3 3 0 0 1 0
 0 0 3 3 3 4 4 3 3 3 0 1 4 4 4 2 0 0 4 0 4 3 3 2 2 2 3 3 2 2 4 0 3 3 3 3 0
 3 0 0 0 0 3 2 3 4 4 3 4 0 1 0 3 0 4 4 2 1 0 1 0 4 2 1 0 1 1 4 0 4 4 0 2 3
 1 0 2 1 0 4 3 4 2 3 2 0 2 2 0 0 0 4 2 0 2 0 1 2 3 2 2 3 1 4 4 0 4 3 4 0 0
 3 4 4 4 3 1 3 2 0 2 2 1 4 0 4 3 1 1 3 0 1 4 4 3 1 0 2 4 2 4 4 0 2 0 4 2 1
 3 4 0 4 1 4 4 3 2 3 3 2 1 1 0 2 2 3 0 0 4 0 4 4 3 0 2 3 0 0 3 4 3 4 1 3 3
 1 0 4 3 3 2 4 0 2 3 3 2 1 4 4 4 0 3 1 1 4 0 2 4 3 3 4 4 2 0 3 1 2 3 1 4 4
 0 0 0 3 3 4 3 0 4 0 0 3 0 2 0 0 4 2 4 2 4 1 2 4 1 3 2 1 0 4 0 4 1 4 3 0 0
 2 1 2 3]
tf.Tensor(
[0 4 3 0 1 0 3 3 1 4 2 4 2 2 3 3 3 0 4 1 0 3 0 3 0 4 0 0 0 0 3 3 3 0 0 1 0
 0 0 3 3 3 4 0 3 3 3 0 1 4 4 4 2 0 0 4 0 4 3 3 2 2 2 3 3 2 2 4 0 3 3 3 3 0
 3 0 0 0 0 3 2 3 4 4 3 4 0 1 0 3 0 4 4 2 1 0 1 0 4 2 1 2 1 1 4 0 4 4 0 2 3
 1 0 2 1 0 4 3 4 2 3 2 0 2 2 0 0 0 4 2 0 2 0 1 2 3 2 2 3 1 4 4 0 4 3 0 0 2
 3 4 4 4 3 1 3 2 0 2 2 1 4 0 4 3 1 1 3 0 1 4 4 3 1 0 2 2 2 4 4 0 2 0 2 2 1
 3 4

In [33]:
origin_proc_test = [];
count = 0;
for i, j in enumerate(processed_test):
  back_to = tokenizer.convert_ids_to_tokens(j[0].numpy())
  origin_proc_test.append(back_to);
  count += 1;

In [35]:
from sklearn.metrics import classification_report, confusion_matrix
y_pred = tf.nn.softmax(text_model.predict(batched_test_dataset))

y_pred_argmax = tf.math.argmax(y_pred, axis=1)
y_pred_argmax_np = y_pred_argmax.numpy()

y_true = tf.Variable([], dtype=tf.int32)

In [36]:
for features, label in batched_test_dataset:
    y_true = tf.concat([y_true, label], 0)

origin_proc_test = [];
for i, j in enumerate(processed_test):
    back_to = tokenizer.convert_ids_to_tokens(j[0].numpy())
    origin_proc_test.append(back_to);

origin_proc_test_str = [];
for i in range(len(origin_proc_test)):
    listToStr = ' '.join([str(elem) for elem in origin_proc_test[i]])
    origin_proc_test_str.append(listToStr)

In [38]:
print(origin_proc_test[1])

['norway', 'uphold', '##s', 'nap', '##ster', 'ruling', 'norwegian', 'student', 'ran', 'website', 'linked', 'downloadable', 'mp', 'files', 'ordered', 'pay', 'compensation', 'country', 'supreme', 'court', 'frank', 'allan', 'br', '##u', '##vik', 'ordered', 'pay', 'k', '##rone', '##r', 'music', 'industry', 'norway', 'student', 'set', 'nap', '##ster', 'no', 'site', 'allowed', 'users', 'submit', 'receive', 'links', 'mp', 'files', 'br', '##u', '##vik', 'earlier', 'cleared', 'appeal', 'lower', 'court', 'found', 'music', 'industry', 'music', 'industry', 'bosses', 'norway', 'said', 'ruling', 'would', 'help', 'build', 'confidence', 'internet', 'distribution', 'medium', 'frank', 'allan', 'br', '##u', '##vik', 'set', 'nap', '##ster', 'no', 'website', 'part', 'school', 'project', 'studying', 'computer', 'engineering', 'norwegian', 'town', 'lille', '##hammer', 'website', 'associated', 'nap', '##ster', 'com', 'site', 'usa', 'operating', 'since', 'already', 'facing', 'legal', 'action', 'br', '##u', '##

In [39]:
le = LabelEncoder().fit(test_subset['category'])
inversed_y_pred = le.inverse_transform(y_pred_argmax_np)
y_true_np = y_true.numpy();
inversed_y_true = le.inverse_transform(y_true_np)
origin_proc_test_np = np.asarray(origin_proc_test_str)

In [42]:
index_list = list(range(0, 300))
df_inv = {'index': index_list, 'true_label' : inversed_y_true, 'predicted_label': inversed_y_pred, 'confidence_score': y_pred_confid_np, 'text' : origin_proc_test_np}
test_result_inv_df = pd.DataFrame(df_inv)
test_result_inv_df.head(30)

Unnamed: 0,index,true_label,predicted_label,confidence_score,text
0,0,business,business,"[0.984, 0.0, 0.002, 0.0, 0.014]",warning us pensions deficit taxpayers may bail...
1,1,tech,tech,"[0.036, 0.17, 0.012, 0.01, 0.772]",norway uphold ##s nap ##ster ruling norwegian ...
2,2,sport,sport,"[0.0, 0.001, 0.0, 0.997, 0.002]",nad ##al puts spain result nad ##al rod ##dick...
3,3,business,business,"[0.989, 0.002, 0.005, 0.0, 0.004]",ukraine strikes turk ##men gas deal ukraine ag...
4,4,entertainment,entertainment,"[0.001, 0.995, 0.0, 0.001, 0.003]",da vinci film star tom hank ##s actor tom hank...
5,5,business,business,"[0.999, 0.0, 0.0, 0.0, 0.001]",circuit city gets takeover offer circuit city ...
6,6,sport,sport,"[0.0, 0.0, 0.0, 0.999, 0.0]",venus stunned far ##ina eli ##a venus williams...
7,7,sport,sport,"[0.021, 0.012, 0.038, 0.908, 0.021]",an ##el ##ka ap ##olo ##gis ##es criticism man...
8,8,entertainment,tech,"[0.121, 0.167, 0.032, 0.123, 0.557]",fears raised ballet future fewer children uk f...
9,9,tech,tech,"[0.001, 0.0, 0.0, 0.0, 0.999]",consumer concern rf ##id tags consumers concer...


In [44]:

cm = confusion_matrix(y_true, y_pred_argmax)
print(cm)


cr = classification_report(y_true, y_pred_argmax, target_names = bbc_labels)
print(cr)


[[75  0  1  0  2]
 [ 0 36  1  0  1]
 [ 2  0 46  0  2]
 [ 0  0  0 69  0]
 [ 0  0  0  0 65]]
               precision    recall  f1-score   support

     business       0.97      0.96      0.97        78
entertainment       1.00      0.95      0.97        38
     politics       0.96      0.92      0.94        50
        sport       1.00      1.00      1.00        69
         tech       0.93      1.00      0.96        65

     accuracy                           0.97       300
    macro avg       0.97      0.97      0.97       300
 weighted avg       0.97      0.97      0.97       300



In [45]:
test_acc = float(test_acc)
test_acc = f"{test_acc:.2f}"

In [46]:
from lime import lime_text
from lime.lime_text import LimeTextExplainer

explainer = LimeTextExplainer(class_names = bbc_labels)


In [51]:
def get_confidence_score(bbc_test_np):
    print("get_confidence_score")
    bbc_test = bbc_test_np
    tokenized_test = [tokenize_data(text) for text in bbc_test];
    test_texts = [[text, test_df['category'][i+1925]] for i, text in enumerate(tokenized_test)]
    sorted_test_labels = [(text_lab[0], text_lab[1]) for text_lab in test_texts]
    processed_test = tf.data.Dataset.from_generator(lambda: sorted_test_labels, output_types = (tf.int32, tf.int32))
    batched_test_dataset = processed_test.padded_batch(BATCH_SIZE, padded_shapes=((None, ), ()))
    y_pred_confid = text_model.predict(batched_test_dataset)

    return y_pred_confid;

In [52]:
misclassified_idx = [];
for idx in range(len(test_result_inv_df)):
    if(test_result_inv_df['true_label'][idx] != test_result_inv_df['predicted_label'][idx]):
        misclassified_idx.append(idx)
print("misclassified_idx", misclassified_idx)
print()

misclassified_idx [8, 43, 101, 145, 147, 175, 182, 254, 276]



In [53]:
idx = 8
exp = explainer.explain_instance(test_result_inv_df['text'][idx], get_confidence_score, num_features=20, top_labels=5, num_samples=len(bbc_test))
print('True class: %s' % test_result_inv_df['true_label'][idx])
print('Predicted class: %s' % test_result_inv_df['predicted_label'][idx])
print('Confidence score: %s' % test_result_inv_df['confidence_score'][idx])
exp.show_in_notebook(text=True)

get_confidence_score
True class: entertainment
Predicted class: tech
Confidence score: [0.121, 0.167, 0.032, 0.123, 0.557]


In [54]:
from collections import Counter
import itertools

appended_tokenized_test = list(itertools.chain.from_iterable(tokenized_test))
print(type(appended_tokenized_test))

<class 'list'>


In [56]:
print('appended tokenized test' , len(appended_tokenized_test))
print(appended_tokenized_test[23])

appended tokenized test 72499
2497


In [58]:
counting_toked_test = Counter(appended_tokenized_test)
print(dict(counting_toked_test))
print('counting_toked_test', len(counting_toked_test))

{5432: 12, 2149: 326, 22024: 10, 15074: 28, 26457: 6, 2089: 90, 15358: 3, 4034: 22, 18227: 1, 3667: 26, 11550: 6, 5029: 15, 2877: 18, 22171: 7, 7420: 27, 5770: 15, 19739: 3, 20486: 1, 3723: 7, 3840: 6, 1052: 31, 2497: 62, 18195: 5, 24869: 154, 3361: 43, 2461: 24, 10880: 2, 10768: 6, 2099: 43, 4122: 35, 3519: 15, 2552: 22, 2612: 26, 4060: 6, 3021: 15, 3549: 12, 2689: 41, 4804: 8, 3513: 9, 2758: 97, 9786: 66, 3039: 14, 5547: 9, 5427: 2, 12882: 5, 2015: 616, 3477: 30, 4636: 16, 7499: 12, 2375: 51, 4861: 33, 2772: 22, 2195: 19, 2372: 25, 2421: 35, 10501: 1, 5543: 11, 17656: 1, 2520: 3, 22147: 2, 2056: 1002, 20006: 1, 2098: 77, 3663: 18, 2058: 20, 22299: 6, 6094: 9, 5356: 20, 18019: 1, 3316: 79, 2164: 52, 8582: 13, 2482: 44, 2437: 45, 3886: 3, 6088: 4, 5275: 17, 7927: 15, 3627: 11, 6815: 9, 23613: 2, 2052: 362, 2486: 23, 2000: 37, 3013: 39, 5841: 28, 2210: 37, 3813: 92, 2121: 58, 2192: 12, 3314: 31, 2228: 72, 2071: 211, 4468: 15, 2735: 21, 26980: 1, 14422: 3, 2067: 135, 3031: 18, 10107: 1, 

In [59]:
inversed_toked_test = tokenizer.convert_ids_to_tokens(appended_tokenized_test)
inversed_count_toked = Counter(inversed_toked_test)
print('inversed count tokenized')
print(dict(inversed_count_toked))

inversed count tokenized


In [62]:
sorted_toked_test = sorted(counting_toked_test.items(), key = lambda x:x[1], reverse=True)
sorted_inversed_toked = sorted(inversed_count_toked.items(), key = lambda x:x[1], reverse=True)
print(len(sorted_inversed_toked))

10541


In [64]:
inversed_count_without_sw = []
for i in range(len(sorted_inversed_toked)):
  if sorted_inversed_toked[i][0] not in STOPWORDS:
    inversed_count_without_sw.append(sorted_inversed_toked[i])

print(len(inversed_count_without_sw))

inversed_count_without_sw = dict(inversed_count_without_sw)

10440


In [66]:
common_occurences_dict = {key: value for key, value in inversed_count_without_sw.items() if 50 < value < 500}
print(common_occurences_dict)
sorted_common_val = sorted(common_occurences_dict.items(), key = lambda x:x[1], reverse=True)

{'mr': 394, 'would': 362, 'us': 326, 'people': 293, 'new': 286, 'year': 286, 'also': 266, 'one': 256, 'could': 211, 'mobile': 191, 'time': 188, 'first': 186, 'uk': 176, 'two': 169, 'world': 164, 'bn': 154, 'last': 150, 'get': 140, 'back': 135, 'told': 128, 'three': 124, '##ing': 123, 'like': 123, 'set': 122, 'music': 120, 'make': 115, 'many': 114, 'high': 112, 'technology': 111, 'game': 110, 'years': 110, 'use': 109, 'government': 108, 'added': 107, 'market': 106, 'way': 105, 'blair': 105, 'made': 104, 'games': 100, 'take': 99, 'says': 97, 'well': 97, 'next': 97, 'sales': 97, 'since': 96, 'bbc': 96, 'still': 94, 'see': 94, 'england': 94, 'phone': 93, 'firm': 92, 'players': 92, 'may': 90, 'going': 89, 'film': 89, 'used': 88, 'company': 87, 'go': 87, 'number': 87, 'bank': 87, 'home': 86, 'good': 85, 'labour': 85, 'play': 83, 'british': 83, 'data': 83, 'party': 83, 'second': 82, 'tv': 82, 'minister': 82, 'much': 81, 'part': 80, 'companies': 79, 'want': 78, 'europe': 78, '##ed': 77, '##a':