In [19]:
import re
import numpy as np
import transformers as ppb #!python -m pip install transformers
import torch
import pickle
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable

from torch.utils import data

from torchsummary import summary
import warnings
warnings.filterwarnings('ignore')

In [20]:
def preprocess_regex(text):
    
    # Applies preprocessing on text
    
    #remove leading & end white spaces and convert text to lowercase
    text = text.strip().lower()
    
    # remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    
    # remove punctuation marks 
    punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    for i in text:
        if i in punctuations: 
                text = text.replace(i, "")
            
    # remove the characters [\], ['] and ["]
    text = re.sub(r"\\", "", text)    
    text = re.sub(r"\'", "", text)    
    text = re.sub(r"\"", "", text)
    
    #remove number
    text = re.sub(r"\d+", "", text)
    
    return text

In [21]:
stop_words = ["from", "to", "subject", "title", "request", "looking", "look", "forward", "cheers", "regards", "thank", "thanks", "hi", "all", "since", "mentioned", "free", "ourselves", "hers", "between", "yourself", "but", "again", "there", "about", "once", "during", "out", "very", "having", "with", "they", "own", "an", "be", "some", "for", "do", "its", "yours", "such", "into", "of", "most", "itself", "other", "off", "is", "s", "am", "or", "who", "as", "from", "him", "each", "the", "themselves", "until", "below", "are", "we", "these", "your", "his", "through", "don", "nor", "me", "were", "her", "more", "himself", "this", "down", "should", "our", "their", "while", "above", "both", "up", "to", "ours", "had", "she", "all", "no", "when", "at", "any", "before", "them", "same", "and", "been", "have", "in", "will", "on", "does", "yourselves", "then", "that", "because", "what", "over", "why", "so", "can", "did", "not", "now", "under", "he", "you", "herself", "has", "just", "where", "too", "only", "myself", "which", "those", "i", "after", "few", "whom", "t", "being", "if", "theirs", "my", "against", "a", "by", "doing", "it", "how", "further", "was", "here", "than"]
MAX_TOKENIZE_LEN = 512
def remove_stop_words(input_str):
    
    tokenized_words = input_str.split()
    
    filtered_words = [w for w in tokenized_words if not w in stop_words]
    
    output = " ".join(filtered_words)
    
    if len(output) > MAX_TOKENIZE_LEN:
        return output[0: MAX_TOKENIZE_LEN]
    
    return output  #return as string

In [22]:
print('Load index label')
label_path = "labelclass.pickle"
labelhandler = open(label_path, 'rb')
labelhandler = pickle.load(labelhandler)

Load index label


In [23]:
# For DistilBERT:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

## uncomment below for  BERT instead of distilBERT
#model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): s3.amazonaws.com:443
DEBUG:urllib3.connectionpool:https://s3.amazonaws.com:443 "HEAD /models.huggingface.co/bert/bert-base-uncased-vocab.txt HTTP/1.1" 200 0
INFO:transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at C:\Users\Admin\.cache\torch\transformers\26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): s3.amazonaws.com:443
DEBUG:urllib3.connectionpool:https://s3.amazonaws.com:443 "HEAD /models.huggingface.co/bert/distilbert-base-uncased-config.json HTTP/1.1" 200 0
INFO:transformers.configuration_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json from cache at C:\Users\Admin\.cache\torch\transformers\a41e817d5c0743e29e86ff85edc

In [24]:
# Load model
from keras.models import load_model

classifier = load_model('bert-embeddings-keras-mlp.h5')

In [25]:
# D:\\Users\\chiawei\\konduit\\Github\\newsgroup_data\\20news-bydate\\20news-bydate-test\\alt.atheism\\53257 alt.atheism
# D:\\Users\\chiawei\\konduit\\Github\\newsgroup_data\\20news-bydate\\20news-bydate-test\\comp.sys.ibm.pc.hardware\\60817 comp.sys.ibm.pc.hardware

test_file_input = 'D:\\Users\\chiawei\\konduit\\Github\\newsgroup_data\\20news-bydate\\20news-bydate-test\\comp.sys.ibm.pc.hardware\\60817'

with open(test_file_input, "r") as file_iterator:
    raw_input = file_iterator.read()
    
processed_input = remove_stop_words(preprocess_regex(raw_input))

In [26]:

tokenized_test_data = tokenizer.encode(processed_input, add_special_tokens=True)

max_len = 512
max_len_add = max_len

if len(tokenized_test_data) > max_len:
    max_len_add = len(tokenized_test_data)
    
padded_test_data = np.array([tokenized_test_data + [0]*(max_len_add-len(tokenized_test_data))])

attention_test_data = np.where(padded_test_data != 0, 1, 0)

input_test_ids = torch.tensor(padded_test_data)  
attention_test_mask = torch.tensor(attention_test_data)

input_test_ids = torch.tensor(input_test_ids).to(torch.int64)

with torch.no_grad():
    last_hidden_states = model(input_test_ids, attention_mask=attention_test_mask)


test_feature = last_hidden_states[0][:,0,:].numpy()

test_output = classifier.predict(test_feature)

local_index = int(np.argmax(test_output, 1)[0])

print("Class: {}".format(labelhandler[local_index]))
print("Probabilities: {}".format(np.max(test_output)))

Class: comp.sys.ibm.pc.hardware
Probabilities: 0.6100162863731384


In [27]:
import io
import logging
import time
from konduit.load import client_from_file

logging.basicConfig(level='DEBUG')
logging.info("Test")

client = client_from_file("config.yaml")

responses = []

start = time.time()
for i in range(10):
    response = client.predict({"default": test_feature})
    responses.append(response)

end = time.time()

print("%f seconds elapsed for %d requests (%d RPS)" % (end - start, len(responses), (10.0 / (end - start))))

INFO:root:Test
DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): localhost:65322
DEBUG:urllib3.connectionpool:http://localhost:65322 "GET /healthcheck HTTP/1.1" 204 0
DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): localhost:65322
DEBUG:urllib3.connectionpool:http://localhost:65322 "GET /config HTTP/1.1" 200 923
INFO:root:Retrieved config is
DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): localhost:65322
DEBUG:urllib3.connectionpool:http://localhost:65322 "POST /classification/numpy HTTP/1.1" 200 608
DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): localhost:65322
DEBUG:urllib3.connectionpool:http://localhost:65322 "POST /classification/numpy HTTP/1.1" 200 608
DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): localhost:65322
DEBUG:urllib3.connectionpool:http://localhost:65322 "POST /classification/numpy HTTP/1.1" 200 608
DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): localhost:65322
DEBUG:urllib3.conn

20.389534 seconds elapsed for 10 requests (0 RPS)


In [28]:
response = client.predict({"default": test_feature})

results = response["output"]["probabilities"]
index = int(np.argmax(response['output']['probabilities'], 1)[0])

print("Class: {}".format(labelhandler[index]))
print("Probabilities: {}".format(np.max(response['output']['probabilities'])))


DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): localhost:65322
DEBUG:urllib3.connectionpool:http://localhost:65322 "POST /classification/numpy HTTP/1.1" 200 608


Class: comp.sys.ibm.pc.hardware
Probabilities: 0.6100168228149414
