# 20 News Group Data Classification

Embeddings are generated with a pretrained Hugging Face BERT model, and a newsgroup classifier is trained on them with multi-backend Keras.  
The trained model is then served with konduit-serving for REST inference.

Konduit-Serving: https://github.com/KonduitAI/konduit-serving  
Hugging Face NLP Library: https://github.com/huggingface/transformers  
Data: http://qwone.com/~jason/20Newsgroups/

In [1]:
from __future__ import print_function
import re
import os
import numpy as np
import pandas as pd
import transformers as ppb #!python -m pip install transformers
import pickle

## Data Vectorization

In [2]:
# Root of the extracted "20news-bydate" archive. The original hard-coded
# location stays as the default; set NEWSGROUP_DATA_ROOT to run elsewhere.
data_root_path = os.environ.get(
    'NEWSGROUP_DATA_ROOT',
    'D:\\Users\\chiawei\\konduit\\Github\\newsgroup_data\\20news-bydate\\')
train_folder = '20news-bydate-train'
test_folder = '20news-bydate-test'
# NOTE(review): file_path is not referenced by any cell visible in this notebook.
file_path = 'D:\\Users\\chiawei\\konduit\\Github\\rpa-email-forwarder\\files\\'
# Upper bound on the number of characters kept per cleaned document.
MAX_TOKENIZE_LEN = 512

# One sub-directory per newsgroup. os.listdir() returns entries in arbitrary
# order, so sort them to make the label -> index mapping reproducible, and
# skip anything that is not a directory so stray files never become classes.
train_root = os.path.join(data_root_path, train_folder)
class_label = sorted(
    entry for entry in os.listdir(train_root)
    if os.path.isdir(os.path.join(train_root, entry))
)
class_index = list(range(len(class_label)))

total_class = len(class_index)

# Forward (label -> index) and reverse (index -> label) lookups.
label_index_pair = dict(zip(class_label, class_index))
index_label_pair = dict(zip(class_index, class_label))

for label, index in zip(class_label, class_index):
    print(label, index)

# Persist the reverse lookup so predicted indices can be mapped back to labels.
print('Save index label')
label_path = "labelclass.pickle"
with open(label_path, 'wb') as labelhandler:
    pickle.dump(index_label_pair, labelhandler)

alt.atheism 0
comp.graphics 1
comp.os.ms-windows.misc 2
comp.sys.ibm.pc.hardware 3
comp.sys.mac.hardware 4
comp.windows.x 5
misc.forsale 6
rec.autos 7
rec.motorcycles 8
rec.sport.baseball 9
rec.sport.hockey 10
sci.crypt 11
sci.electronics 12
sci.med 13
sci.space 14
soc.religion.christian 15
talk.politics.guns 16
talk.politics.mideast 17
talk.politics.misc 18
talk.religion.misc 19
Save index label


In [3]:
def preprocess_regex(text):
    """Normalise raw text before stop-word removal.

    Steps, in order:
      1. strip surrounding whitespace and lowercase;
      2. drop ``<...>`` spans (non-greedy, does not cross newlines);
      3. delete punctuation characters;
      4. delete runs of digits.

    Characters are deleted rather than replaced by a space, so tokens that
    were separated only by punctuation are joined ("a.b" becomes "ab").

    Args:
        text (str): raw document text.

    Returns:
        str: the cleaned text.
    """
    # Raw string: the original literal contained "\," which is an invalid
    # escape sequence. The character set itself is unchanged and already
    # includes the backslash and both quote characters, so no separate
    # passes are needed for those.
    punctuations = r'''!()-[]{};:'"\,<>./?@#$%^&*_~'''

    text = text.strip().lower()

    # remove HTML tags
    text = re.sub(r'<.*?>', '', text)

    # remove punctuation marks in a single pass
    text = text.translate(str.maketrans('', '', punctuations))

    # remove numbers
    text = re.sub(r"\d+", "", text)

    return text

In [4]:
# Words dropped from every document: common English function words plus
# e-mail boilerplate ("from", "subject", "regards", ...).
stop_words = [
    "from", "to", "subject", "title", "request", "looking", "look", "forward",
    "cheers", "regards", "thank", "thanks", "hi", "all", "since", "mentioned",
    "free", "ourselves", "hers", "between", "yourself", "but", "again", "there",
    "about", "once", "during", "out", "very", "having", "with", "they", "own",
    "an", "be", "some", "for", "do", "its", "yours", "such", "into", "of",
    "most", "itself", "other", "off", "is", "s", "am", "or", "who", "as",
    "from", "him", "each", "the", "themselves", "until", "below", "are", "we",
    "these", "your", "his", "through", "don", "nor", "me", "were", "her",
    "more", "himself", "this", "down", "should", "our", "their", "while",
    "above", "both", "up", "to", "ours", "had", "she", "all", "no", "when",
    "at", "any", "before", "them", "same", "and", "been", "have", "in", "will",
    "on", "does", "yourselves", "then", "that", "because", "what", "over",
    "why", "so", "can", "did", "not", "now", "under", "he", "you", "herself",
    "has", "just", "where", "too", "only", "myself", "which", "those", "i",
    "after", "few", "whom", "t", "being", "if", "theirs", "my", "against", "a",
    "by", "doing", "it", "how", "further", "was", "here", "than",
]


def remove_stop_words(input_str):
    """Drop stop words from ``input_str`` and cap the result's length.

    The input is split on whitespace, tokens found in ``stop_words`` are
    discarded, and the survivors are re-joined with single spaces. The
    joined string is then cut to at most ``MAX_TOKENIZE_LEN`` characters
    (characters, not tokens, so the last word may be cut mid-way).

    Args:
        input_str (str): text to filter; matching is exact and
            case-sensitive.

    Returns:
        str: the filtered, length-capped text.
    """
    # Set membership is O(1); testing against the list scanned every stop
    # word for every token. Built per call so edits to stop_words are seen.
    stop_set = set(stop_words)

    filtered_words = [w for w in input_str.split() if w not in stop_set]

    # Slicing past the end returns the whole string, so no length check is needed.
    return " ".join(filtered_words)[:MAX_TOKENIZE_LEN]



In [5]:
def get_dfs(data_path, class_dict):
    """Load every document under ``data_path`` into a DataFrame.

    ``data_path`` is expected to contain one sub-directory per key of
    ``class_dict``. Each file in a sub-directory is read, cleaned with
    ``preprocess_regex`` followed by ``remove_stop_words``, and becomes
    one row.

    Args:
        data_path (str): directory holding the per-class sub-directories.
        class_dict (dict): maps class label (sub-directory name) to its
            integer class index.

    Returns:
        pandas.DataFrame: one row per file with columns ``text``,
        ``classindex`` and ``classlabel``.
    """
    text = []
    class_index = []
    class_label = []

    # Iterate the mapping that was passed in rather than the notebook-level
    # global, so the function only depends on its arguments.
    for label, index in class_dict.items():

        class_path = os.path.join(data_path, label)

        # Sorted so the row order does not depend on the file system.
        for f in sorted(os.listdir(class_path)):

            # An explicit encoding keeps the result independent of the
            # platform default; latin-1 maps every byte to a character, so
            # no file can raise UnicodeDecodeError.
            with open(os.path.join(class_path, f), "r", encoding="latin-1") as reader:
                text.append(remove_stop_words(preprocess_regex(reader.read())))

            class_label.append(label)
            class_index.append(index)

    return pd.DataFrame(
        {"text": text, "classindex": class_index, "classlabel": class_label},
        columns=["text", "classindex", "classlabel"],
    )

                    
# Build one DataFrame per split (train first, then test), both labelled with
# the same label -> index mapping.
train_data, test_data = (
    get_dfs(os.path.join(data_root_path, split_folder), label_index_pair)
    for split_folder in (train_folder, test_folder)
)

In [6]:
#Shuffle data
train_data = train_data.reindex(np.random.permutation(train_data.index))
test_data = test_data.reindex(np.random.permutation(test_data.index))

print("Number of training data: {}".format(train_data.shape[0]))
print("Number of testing data: {}".format(test_data.shape[0]))

train_data.head(20)['text']


Number of training data: 11314
Number of testing data: 7532


6311     mmblamarcolostateedu michael burger tv info di...
5279     derichnetcomcom scottytissue day day day disab...
11161    stevepcadencecom steve peterson re question sa...
10123    masajsdccucsdedu system operator moment silenc...
9913     serazumauucp serdar argic day night armenians ...
4215     hagenjdwfuedu jeff hagen improvements automati...
10867    blhuiboiseidbsuedu broward l horne month xrece...
551      email michael abrash gmontemeiscalstateedu geo...
2343     ziacastleedacuk zia manji help please hand sca...
9150     jsleddssdcsasupennedu james sledd afterlife or...
10183    bdmcsritedu brendan d mckay re deir yassin nnt...
6944     smbresearchattcom steven bellovin re shelf che...
5533     kingcogsciucsdedu jonathan king re zanerescue ...
1378     kjetilkstudcsuitno kjetil kolin proteced mode ...
10285    waldocybernetcsefauedu todd j dicker re israel...
6845     rdippoldqualcommcom ron asbestos dippold re ta...
6714     jfcathenamitedu john f carr re screw people cr.

# Use a subset of the data to avoid running out of memory

In [7]:
# Keep only the leading rows of each split (positional slice).
train_data = train_data.iloc[:6000]  # 6000
test_data = test_data.iloc[:2000]  # 2000
# Optional: persist the subsets for later inspection.
#train_data.to_csv(os.path.join(data_root_path, "train_data.csv"))
#test_data.to_csv(os.path.join(data_root_path, "test_data.csv"))

## Embeddings Generation:  
Load the pretrained Hugging Face Transformers model (DistilBERT by default) and its tokenizer.

In [8]:
# For DistilBERT:
# DistilBERT is the default model/tokenizer pair.
model_class = ppb.DistilBertModel
tokenizer_class = ppb.DistilBertTokenizer
pretrained_weights = 'distilbert-base-uncased'

## To use BERT instead of DistilBERT, uncomment the line below
#model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# Instantiate the tokenizer and the model from the named pretrained weights.
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

## Text Tokenization
Tokenize the sentences -- break them up into words and subwords in the format BERT expects.

In [9]:
# Encode every document to a list of token ids, with the special tokens added.
tokenized_train_data = train_data['text'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))
tokenized_test_data = test_data['text'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

# Pad to the longest sequence found in EITHER split. Using the training
# maximum alone breaks when a test document is longer: the pad count goes
# negative, `[0] * negative` is the empty list, and the rows no longer have
# equal length, so they cannot form a rectangular array.
max_len = max(
    max(map(len, tokenized_train_data.values), default=0),
    max(map(len, tokenized_test_data.values), default=0),
)

# Right-pad with id 0 so every row has exactly max_len entries.
padded_train_data = np.array([i + [0]*(max_len-len(i)) for i in tokenized_train_data.values])
padded_test_data = np.array([i + [0]*(max_len-len(i)) for i in tokenized_test_data.values])

#print("Shape of input data: {}".format(padded_train_data.shape))

## Masking of padded data  
Masking tells the NLP model to ignore (mask) the padding added when it's processing its input.  
That's what attention_mask is:

In [10]:
# 1 wherever the id is non-zero, 0 wherever it equals the pad value 0.
train_attention_mask = (padded_train_data != 0).astype(int)
test_attention_mask = (padded_test_data != 0).astype(int)
train_attention_mask.shape

(6000, 317)

# Get embeddings from Hugging Face BERT using PyTorch

In [11]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable

from torch.utils import data
from torchsummary import summary

from tensorflow.keras import backend
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Dropout
from keras.optimizers import RMSprop

batch_size = 512  # mini-batch size handed to classifier.fit
# NOTE(review): the fit call in the training cell passes epochs=50 literally,
# so this value is currently not used there.
epoch_count = 20
# Number of output classes. Derived from the label listing instead of a
# literal 20 so the one-hot width and the softmax layer follow the data.
labels = total_class

Using TensorFlow backend.


In [12]:
# Build the tensors straight from the NumPy arrays. Wrapping an existing
# tensor in torch.tensor() again copy-constructs it and emits a UserWarning.
pytrain_input_ids = torch.tensor(padded_train_data, dtype=torch.int64)
pytrain_attention_mask = torch.tensor(train_attention_mask)

# Run the transformer in mini-batches: only one batch of hidden states is
# alive at a time instead of the activations for the whole split.
embed_batch_size = 64
train_cls_chunks = []
with torch.no_grad():
    for start in range(0, pytrain_input_ids.shape[0], embed_batch_size):
        stop = start + embed_batch_size
        last_hidden_states = model(pytrain_input_ids[start:stop],
                                   attention_mask=pytrain_attention_mask[start:stop])
        # Element 0 of the output is the last hidden state; keep only the
        # vector at sequence position 0 for each document.
        train_cls_chunks.append(last_hidden_states[0][:, 0, :])

train_features = torch.cat(train_cls_chunks).numpy()

# Column vector of class indices -> one-hot matrix with `labels` columns.
train_labels = np.expand_dims(train_data["classindex"].values, axis = 1)

train_labels = keras.utils.to_categorical(train_labels, labels)

  after removing the cwd from sys.path.


In [13]:

# Build the tensors straight from the NumPy arrays. Wrapping an existing
# tensor in torch.tensor() again copy-constructs it and emits a UserWarning.
pytest_input_ids = torch.tensor(padded_test_data, dtype=torch.int64)
pytest_attention_mask = torch.tensor(test_attention_mask)

# Run the transformer in mini-batches: only one batch of hidden states is
# alive at a time instead of the activations for the whole split.
embed_batch_size = 64
test_cls_chunks = []
with torch.no_grad():
    for start in range(0, pytest_input_ids.shape[0], embed_batch_size):
        stop = start + embed_batch_size
        last_hidden_states = model(pytest_input_ids[start:stop],
                                   attention_mask=pytest_attention_mask[start:stop])
        # Element 0 of the output is the last hidden state; keep only the
        # vector at sequence position 0 for each document.
        test_cls_chunks.append(last_hidden_states[0][:, 0, :])

test_features = torch.cat(test_cls_chunks).numpy()

# Column vector of class indices -> one-hot matrix with `labels` columns.
test_labels = np.expand_dims(test_data["classindex"].values, axis = 1)

test_labels = keras.utils.to_categorical(test_labels, labels)


  after removing the cwd from sys.path.


# Train a classifier on the embeddings with TensorFlow-backend Keras

In [16]:
# Two hidden ReLU layers with light dropout on top of the 768-wide
# embedding vectors, ending in a softmax over the `labels` classes.
classifier = Sequential([
    Dense(250, activation='relu', input_shape=(768,)),
    Dropout(0.1),
    Dense(250, activation='relu'),
    Dropout(0.1),
    Dense(labels, activation='softmax'),
])

classifier.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# The test split is passed as validation data, so it is scored after every epoch.
classifier.fit(
    train_features,
    train_labels,
    epochs=50,  # epoch_count
    batch_size=batch_size,
    validation_data=(test_features, test_labels),
)

# Save the trained model to an HDF5 file.
classifier.save('bert-embeddings-keras-mlp.h5')


Train on 6000 samples, validate on 2000 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
