<a href="https://colab.research.google.com/github/cs145442/nlp-projects-with-tf2/blob/master/multi_class_text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Setting up libraries and dependencies

In [0]:
!pip install bert-for-tf2
!pip install sentencepiece



In [0]:
try:
    %tensorflow_version 2.x
except Exception:
    pass
import tensorflow as tf

import tensorflow_hub as hub

from tensorflow.keras import layers
import bert

In [0]:
import re
import pandas as pd
import random
import math
import numpy as np

In [0]:
! ls -l

total 22228
drwxr-xr-x 1 root root     4096 May 29 18:19 sample_data
-rw-r--r-- 1 root root 22751429 Jun 10 15:31 train.csv


### Pre Processing the textual data

*computation or processing per se, with numbers only. Hence, language modeling.*

In [0]:
BertTokenizer = bert.bert_tokenization.FullTokenizer
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=False)

*we're using the bert layer for tokenisation only.*

In [0]:
vocabulary_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
to_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = BertTokenizer(vocabulary_file, to_lower_case)

In [0]:
def encode_sentence(sent):
    return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sent))

*our encoder is ready, let's setup helper functions*

In [0]:
def preprocess_text(sen):
    # Removing html tags
    sentence = remove_tags(sen)

    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)

    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)

    return sentence

In [0]:
TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    return TAG_RE.sub('', text)

*Processing manager is ready! Let's get the data.*

### Loading the training data

In [0]:
train_data = pd.read_csv("/content/train.csv")

train_data.isnull().values.any()

train_data.shape

(11314, 3)

In [0]:
train_data.head()

Unnamed: 0,id,message,topic
0,0,From: lerxst@wam.umd.edu (where's my thing)\r\...,7
1,1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4
2,2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4
3,3,From: jgreen@amber (Joe Green)\r\nSubject: Re:...,1
4,4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14


*this is how the raw data looks like, let's pre-process the training data!*

In [0]:
messages = []
message = list(train_data['message'])
for msg in message:
    messages.append(preprocess_text(msg))

In [0]:
y = train_data['topic']

In [0]:
tokenizer.tokenize("don't be so judgmental")

['don', "'", 't', 'be', 'so', 'judgment', '##al']

*some magic happened over there, email me for any queries*

In [0]:
tokenizer.convert_tokens_to_ids(tokenizer.tokenize("dont be so judgmental"))

[2123, 2102, 2022, 2061, 8689, 2389]

*Alas! Now we have a representation of textual information in numerical form.*

In [0]:
def tokenize_message(text_message):
    return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text_message))

In [0]:
tokenized_messages = [tokenize_message(message) for message in messages]

### Preparing data for training

In [0]:
messages_with_len = [[message, y[i], len(message)]
                 for i, message in enumerate(tokenized_messages)]

In [0]:
random.shuffle(messages_with_len)

*shuffling, something we should always do for better tangling*

In [0]:
messages_with_len.sort(key=lambda x: x[2])

*to handle the dimension for each sequence model, we pad the sequence as per batch size.*

In [0]:
sorted_messages_labels = [(message_lab[0], message_lab[1]) for message_lab in messages_with_len]

In [0]:
processed_dataset = tf.data.Dataset.from_generator(lambda: sorted_messages_labels, output_types=(tf.int32, tf.int32))

In [0]:
BATCH_SIZE = 32
batched_dataset = processed_dataset.padded_batch(BATCH_SIZE, padded_shapes=((None, ), ()))

In [0]:
next(iter(batched_dataset))

(<tf.Tensor: shape=(32, 33), dtype=int32, numpy=
 array([[ 3029,  5334,  2110, ...,     0,     0,     0],
        [ 2013,  1044,  2140, ...,     0,     0,     0],
        [ 3029,  9502,  2110, ...,     0,     0,     0],
        ...,
        [ 3029,  2103,  2118, ...,  3521,  4717,     0],
        [ 2013, 14722,  3103, ...,  8583,  3212, 23689],
        [ 2013, 27859,  5051, ...,  2012, 12864,  3210]], dtype=int32)>,
 <tf.Tensor: shape=(32,), dtype=int32, numpy=
 array([18,  1,  8,  4,  4,  7, 17, 10, 10,  6, 15, 10,  6,  4,  3,  3,  7,
         7,  8, 15, 10, 15,  1, 14,  4,  9,  7,  6,  1, 18,  5,  2],
       dtype=int32)>)

In [0]:
TOTAL_BATCHES = math.ceil(len(sorted_messages_labels) / BATCH_SIZE)
TEST_BATCHES = TOTAL_BATCHES // 10
batched_dataset.shuffle(TOTAL_BATCHES)
test_data = batched_dataset.take(TEST_BATCHES)
train_data = batched_dataset.skip(TEST_BATCHES)

*keeping the 10% of the batched dataset for evaluation*

### Setting up the model

In [0]:
class MCTC_MODEL(tf.keras.Model):
    
    def __init__(self,
                 vocabulary_size,
                 embedding_dimensions=128,
                 cnn_filters=50,
                 dnn_units=512,
                 model_output_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="mctc_model"):
        super(MCTC_MODEL, self).__init__(name=name)
        
        self.embedding = layers.Embedding(vocabulary_size,
                                          embedding_dimensions)
        self.cnn_layer1 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=2,
                                        padding="valid",
                                        activation="relu")
        self.cnn_layer2 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=3,
                                        padding="valid",
                                        activation="relu")
        self.cnn_layer3 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=4,
                                        padding="valid",
                                        activation="relu")
        self.pool = layers.GlobalMaxPool1D()
        
        self.dense_1 = layers.Dense(units=dnn_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        if model_output_classes == 2:
            self.last_dense = layers.Dense(units=1,
                                           activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=model_output_classes,
                                           activation="softmax")
    
    def call(self, inputs, training):
        l = self.embedding(inputs)
        l_1 = self.cnn_layer1(l) 
        l_1 = self.pool(l_1) 
        l_2 = self.cnn_layer2(l) 
        l_2 = self.pool(l_2)
        l_3 = self.cnn_layer3(l)
        l_3 = self.pool(l_3) 
        
        concatenated = tf.concat([l_1, l_2, l_3], axis=-1) # (batch_size, 3 * cnn_filters)
        concatenated = self.dense_1(concatenated)
        concatenated = self.dropout(concatenated, training)
        model_output = self.last_dense(concatenated)
        
        return model_output

*our model is ready to invoke. please email me, for any queries!*

*initialising all the hyper parameters*

In [0]:
VOCAB_LENGTH = len(tokenizer.vocab)
EMB_DIM = 200
CNN_FILTERS = 100
DNN_UNITS = 256
OUTPUT_CLASSES = 20

DROPOUT_RATE = 0.2

NB_EPOCHS = 4

*loading the model*

In [0]:
text_model = MCTC_MODEL(vocabulary_size=VOCAB_LENGTH,
                        embedding_dimensions=EMB_DIM,
                        cnn_filters=CNN_FILTERS,
                        dnn_units=DNN_UNITS,
                        model_output_classes=OUTPUT_CLASSES,
                        dropout_rate=DROPOUT_RATE)

*adding the configuration for loss and optimisation metrics*

In [0]:
if OUTPUT_CLASSES == 2:
    text_model.compile(loss="binary_crossentropy",
                       optimizer="adam",
                       metrics=["accuracy"])
else:
    text_model.compile(loss="sparse_categorical_crossentropy",
                       optimizer="adam",
                       metrics=["sparse_categorical_accuracy"])

In [0]:
text_model.fit(train_data, epochs=NB_EPOCHS)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<tensorflow.python.keras.callbacks.History at 0x7f7383869f98>

*after some hyper parameter tuning we got a training accuracy of 97% at the 3rd epoch*

In [0]:
results = text_model.evaluate(test_data)
print(results)

[0.960477352142334, 0.731249988079071]


*loading the test dataset*

In [0]:
! ls

sample_data  test.csv  train.csv


In [0]:
raw_test_data = pd.read_csv('test.csv')

In [0]:
raw_test_data.head()

Unnamed: 0,id,message
0,0,From: v064mb9k@ubvmsd.cc.buffalo.edu (NEIL B. ...
1,1,From: Rick Miller <rick@ee.uwm.edu>\r\nSubject...
2,2,From: mathew <mathew@mantis.co.uk>\r\nSubject:...
3,3,From: bakken@cs.arizona.edu (Dave Bakken)\r\nS...
4,4,From: livesey@solntze.wpd.sgi.com (Jon Livesey...


In [0]:
raw_test_data.shape

(7532, 2)

*writing a simple prediction function*

In [0]:
def get_prediction(sentence):
    tokens = encode_sentence(sentence)
    inputs = tf.expand_dims(tokens, 0)

    output = text_model(inputs, training=False)

    pred_class = np.argmax(np.array(output))
    return pred_class

*same textual data pre-processing, pre-processing*

In [0]:
test_messages = []
test_message = list(raw_test_data['message'])
for msg in test_message:
    test_messages.append(preprocess_text(msg))

In [0]:
get_prediction('How is the topic?')

15

In [0]:
for i in range(0,3):
  print(f"{get_prediction(message[i])}, {y[i]}")

7, 7
4, 4
4, 4


*validated the labelling, adding the predictions to a csv format file*

In [0]:
test_pred_classes = list()
for msg in test_messages:
  test_pred_classes.append(get_prediction(msg))

In [0]:
submission_data = {'id': raw_test_data['id'], 'topic': test_pred_classes}
submission_data = pd.DataFrame(submission_data)

In [0]:
submission_data.head()

Unnamed: 0,id,topic
0,0,14
1,1,18
2,2,0
3,3,18
4,4,0


In [0]:
submission_data.to_csv('submission.csv')