In [None]:
# Ask the user to upload files into the Colab runtime; the next cell expects
# kaggle.json to be among them.
from google.colab import files
uploaded = files.upload()

In [None]:
! cp kaggle.json /root/.kaggle/
! chmod 600 /root/.kaggle/kaggle.json
! kaggle datasets download -d PromptCloudHQ/flipkart-products
! unzip flipkart-products.zip

In [None]:
# this covers company names, address and location
! wget http://download.companieshouse.gov.uk/BasicCompanyDataAsOneFile-2020-06-01.zip
! unzip BasicCompanyDataAsOneFile-2020-06-01.zip

### Gather datasets

In [None]:
# company, address and location dataset
# Loads the full CSV into `df`; later cells read CompanyName and the
# RegAddress.* columns from it.
import pandas as pd
df = pd.read_csv('BasicCompanyDataAsOneFile-2020-06-01.csv')
df.head()

In [None]:
%%time
df_add = df[["RegAddress.AddressLine1", " RegAddress.AddressLine2", "RegAddress.PostTown", "RegAddress.County", "RegAddress.Country", "RegAddress.PostCode"]]
df_add["add"] = df_add.apply(lambda x : x.to_string(index=False, na_rep=""),axis=1).replace({"\n":','}, regex=True)
df_loc = df[["RegAddress.PostTown", "RegAddress.County", "RegAddress.Country"]]
df_loc["loc"] = df_loc.apply(lambda x : x.to_string(index=False, na_rep=""),axis=1).replace({"\n":','}, regex=True)

In [None]:
%%time
names = df.CompanyName.unique()
company_df = pd.DataFrame(columns=["text", "labels"])
company_df["text"] = names
company_df["labels"] = ["CompanyName"] * len(names)
add = df_add["add"].unique()
add_df = pd.DataFrame(columns=["text", "labels"])
add_df["text"] = add
add_df["labels"] = ["CompanyAdd"] * len(add)
loc = df_loc["loc"].unique()
loc_df = pd.DataFrame(columns=["text", "labels"])
loc_df["text"] = loc
loc_df["labels"] = ["CompanyLoc"] * len(loc)

In [None]:
# goods
# NOTE: this rebinds `df`, replacing the company frame loaded earlier; cells that
# read the company columns from `df` cannot be re-run after this one without
# reloading that CSV.
df = pd.read_csv("flipkart_com-ecommerce_sample.csv")

In [None]:
%%time
goods = []
cat = df["product_category_tree"].apply(lambda x : x.replace("[", "").replace("]", "").replace('\"', "").split(" >> "))
flat_cat = [item for subitem in cat for item in subitem]
for x in flat_cat:
    if x not in goods:
        goods.append(x)
goods_df = pd.DataFrame(columns=["text", "labels"])
goods_df["text"] = goods
goods_df["labels"] = ["Goods"] * len(goods)

In [None]:
# serial number
import random
random.seed(42)
def _random_chars(alphabet, low, high):
    """Return between `low` and `high` (inclusive) characters drawn at random from `alphabet`."""
    return "".join(random.choice(alphabet) for _ in range(random.randint(low, high)))


def artificial_serials(n_delimited=15000, n_numeric=1000, n_alpha=1000):
    """Generate synthetic serial-number strings in three shapes.

    Uses the module-level `random` generator, so seed it beforehand for
    reproducible output.

    Args:
        n_delimited: number of three-part alphanumeric serials such as
            ``ab12-Xy9-7Q``; one separator from ``|-/`` is picked per serial
            and used at both joins.
        n_numeric: number of digit-only serials, 6 to 10 digits long.
        n_alpha: number of two-part letter-only serials such as ``abC/dEf``.

    Returns:
        A list of ``n_delimited + n_numeric + n_alpha`` strings, in that order.
    """
    alnum = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
    digits = "0123456789"
    letters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
    separators = '|-/'
    st = []

    for _ in range(n_delimited):
        head = _random_chars(alnum, 3, 5)
        sep = random.choice(separators)
        middle = _random_chars(alnum, 2, 5)
        tail = _random_chars(alnum, 2, 4)
        st.append(head + sep + middle + sep + tail)

    for _ in range(n_numeric):
        st.append(_random_chars(digits, 6, 10))

    for _ in range(n_alpha):
        head = _random_chars(letters, 3, 5)
        sep = random.choice(separators)
        tail = _random_chars(letters, 3, 5)
        # Append only after the whole serial is assembled. The earlier version
        # appended before adding the separator and tail, so they were lost.
        st.append(head + sep + tail)

    return st

In [None]:
%%time
st = artificial_serials()
serial = []
for s in st:
    serial.append(s)
serial_df = pd.DataFrame(columns=["text", "labels"])
serial_df["text"] = serial
serial_df["labels"] = ["Serial"] * len(serial)

In [None]:
# Stack the five labelled frames into one dataset with a fresh 0..n-1 index.
data = pd.concat([company_df, add_df, loc_df, goods_df, serial_df], ignore_index=True)
# NOTE: this is not the last expression of the cell, so the preview is not displayed.
data.head()
# Number of rows per class.
print (data["labels"].value_counts())

CompanyName    4594492
CompanyAdd     2601715
CompanyLoc      114019
Serial           17000
Goods             9045
Name: labels, dtype: int64


In [None]:
# Persist the combined dataset (without the index column); the preprocessing
# section reloads it from this file.
data.to_csv("dataset.csv", index=False)

In [None]:
# Mount Google Drive and copy a previously saved dataset.csv into the runtime.
from google.colab import drive
drive.mount('/content/drive')
# Reverse direction (runtime -> Drive), left commented out; enable it to back up
# a freshly built dataset.csv.
# ! cp "/content/dataset.csv" "/content/drive/My Drive/Colab Notebooks/dataset.csv"
# NOTE: this overwrites /content/dataset.csv with the copy stored on Drive.
! cp "/content/drive/My Drive/Colab Notebooks/dataset.csv" "/content/dataset.csv"

### Preprocessing

In [None]:
# unidecode is imported by the preprocessing cells below.
!pip install unidecode

In [None]:
from unidecode import unidecode
import string
import pandas as pd
import nltk
import re
from tqdm import tqdm
from nltk import word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
# Punctuation characters that are dropped as tokens during cleaning.
# "&", "-", "/" and "|" are excluded from the list, i.e. they are kept in the text.
stopset = [ch for ch in string.punctuation if ch not in ("&", "-", "/", "|")]

In [None]:
# Reload the saved dataset and split the raw strings into one Python list per class.
data = pd.read_csv("dataset.csv")
c_names = data.loc[data["labels"] == "CompanyName", "text"].tolist()
c_add = data.loc[data["labels"] == "CompanyAdd", "text"].tolist()
c_loc = data.loc[data["labels"] == "CompanyLoc", "text"].tolist()
c_serial = data.loc[data["labels"] == "Serial", "text"].tolist()
c_goods = data.loc[data["labels"] == "Goods", "text"].tolist()
print (data["labels"].value_counts())

CompanyName    4594492
CompanyAdd     2601715
CompanyLoc      114019
Serial           17000
Goods             9045
Name: labels, dtype: int64


In [None]:
def clean(arr):
    """Apply `clean_word` to every string in `arr`, showing a tqdm progress bar.

    Args:
        arr: iterable of raw strings.

    Returns:
        A list with the cleaned strings, in the same order as `arr`.
    """
    return [clean_word(word) for word in tqdm(arr)]

def clean_word(word):
    """Normalise one raw string into the lower-case, space-separated training form.

    Steps: transliterate to ASCII, lower-case, replace every run of characters
    other than letters, digits, "&", "-", "|" and "/" with a single space,
    tokenise with NLTK and drop tokens listed in `stopset`.

    Args:
        word: the raw string.

    Returns:
        The cleaned string with tokens joined by single spaces.
    """
    # Transliterate before filtering: if the regex ran first it would already
    # have replaced accented letters with spaces, leaving unidecode nothing to do.
    word = unidecode(word).lower()
    # The hyphen sits last in the class so it is a literal "-". Written as "&-|"
    # it formed a range from "&" to "|" that matched most ASCII punctuation,
    # so almost nothing was stripped.
    word = re.sub("[^A-Za-z0-9&|/-]+", " ", word)
    return " ".join([i for i in word_tokenize(word) if i not in stopset])

In [None]:
# Keep only company names that are at least 15 characters long and do not start
# with punctuation, a digit, or the prefix "and".
# NOTE(review): names have not been lower-cased at this point, so the "and"
# prefix test only matches names that are literally lower-case - confirm intent.
print ("Old Length:", len(c_names))
new_c_names = []
for name in tqdm(c_names):
    first = name[0]
    if first in stopset or name.startswith("and"):
        continue
    # "-", "&", "/" and "|" were removed from stopset, so they are checked separately.
    if first in ["-", "&", "/", "|"] or first.isdigit():
        continue
    if len(name) < 15:
        continue
    new_c_names.append(name)
c_names = new_c_names
print ("New Length:", len(c_names))

In [None]:
# Preview the first five raw strings of each class (before cleaning).
c_names[:5], c_add[:5], c_loc[:5], c_serial[:5], c_goods[:5]

In [None]:
%%time
# Clean every class in place. NOTE: each list is overwritten with its cleaned
# version, so re-running this cell cleans already-cleaned text.
c_names = clean(c_names)
c_add = clean(c_add)
c_loc = clean(c_loc)
c_serial = clean(c_serial)
c_goods = clean(c_goods)

In [None]:
# Preview the first five strings of each class after cleaning.
c_names[:5], c_add[:5], c_loc[:5], c_serial[:5], c_goods[:5]

### Model

In [None]:
# bert-for-tf2 provides the `bert` package imported in the next cell;
# sentencepiece is installed alongside it.
! pip install bert-for-tf2
! pip install sentencepiece

In [None]:
import bert
import tensorflow_hub as hub
from bert import bert_tokenization

# Build a tokenizer from the vocabulary shipped with the TF-Hub BERT module.
BertTokenizer = bert_tokenization.FullTokenizer
# The layer is loaded frozen (trainable=False); in the cells shown here it is only
# used to read the vocabulary file and the lower-casing flag below.
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=False)
# Vocabulary asset path and lower-casing flag stored with the module.
vocabulary_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
to_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = BertTokenizer(vocabulary_file, to_lower_case)

In [None]:
def sample_token(arr):
    """Print the first five entries of `arr` with their tokens and vocabulary ids.

    Relies on the module-level `tokenizer`; `arr` must hold at least five strings.
    """
    for idx in range(5):
        pieces = tokenizer.tokenize(arr[idx])
        ids = tokenizer.convert_tokens_to_ids(pieces)
        print (arr[idx], pieces, ids)

# Show a few tokenised examples from every class.
for corpus in (c_names, c_add, c_loc, c_serial, c_goods):
    sample_token(corpus)

a m g engineering solutions limited ['a', 'm', 'g', 'engineering', 'solutions', 'limited'] [1037, 1049, 1043, 3330, 7300, 3132]
a & a properties south coast limited ['a', '&', 'a', 'properties', 'south', 'coast', 'limited'] [1037, 1004, 1037, 5144, 2148, 3023, 3132]
a & a property management services limited ['a', '&', 'a', 'property', 'management', 'services', 'limited'] [1037, 1004, 1037, 3200, 2968, 2578, 3132]
a & m school of motoring limited ['a', '&', 'm', 'school', 'of', 'motor', '##ing', 'limited'] [1037, 1004, 1049, 2082, 1997, 5013, 2075, 3132]
a & m swift ltd ['a', '&', 'm', 'swift', 'ltd'] [1037, 1004, 1049, 9170, 5183]
metrohouse 57 pepper road hunslet leeds yorkshire ls10 2ru ['metro', '##house', '57', 'pepper', 'road', 'hu', '##ns', '##let', 'leeds', 'yorkshire', 'l', '##s', '##10', '2', '##ru'] [6005, 4580, 5401, 11565, 2346, 15876, 3619, 7485, 7873, 7018, 1048, 2015, 10790, 1016, 6820]
the studio hatherlow house hatherlow romiley united kingdom sk6 3dy ['the', 'studio'

In [None]:
%%time
pre_c_names = [tokenizer.convert_tokens_to_ids(tokenizer.tokenize(x)) for x in c_names]
pre_c_add = [tokenizer.convert_tokens_to_ids(tokenizer.tokenize(x)) for x in c_add]
pre_c_loc = [tokenizer.convert_tokens_to_ids(tokenizer.tokenize(x)) for x in c_loc]
pre_c_serial = [tokenizer.convert_tokens_to_ids(tokenizer.tokenize(x)) for x in c_serial]
pre_c_goods = [tokenizer.convert_tokens_to_ids(tokenizer.tokenize(x)) for x in c_goods]

In [None]:
%%time
# sample 10,000 of each of classes
import random
random.seed(42)
from functools import reduce
import operator

sample_c_names = random.sample(pre_c_names, 10000)
sample_c_add = random.sample(pre_c_add, 10000)
sample_c_loc = random.sample(pre_c_loc, 10000)
sample_c_serial = random.sample(pre_c_serial, 10000)
sample_c_goods = random.sample(pre_c_goods, len(pre_c_goods))

x = []
for i in [sample_c_names, sample_c_add, sample_c_loc, sample_c_serial, sample_c_goods]:
    for j in i:
        x.append(j)

labels = ["CompanyName"]*len(sample_c_names)
labels.extend(["CompanyAdd"]*len(sample_c_add))
labels.extend(["CompanyLoc"]*len(sample_c_loc))
labels.extend(["Serial"]*len(sample_c_serial))
labels.extend(["Goods"]*len(sample_c_goods))
print (len(x), len(labels))

49045 49045
CPU times: user 68.3 ms, sys: 994 µs, total: 69.3 ms
Wall time: 68.6 ms


In [None]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np
# One-hot encode the string labels; the column order of `y` follows
# ohe.categories_[0], which is kept in `classes` for decoding predictions.
ohe = OneHotEncoder()
label_column = np.array(labels).reshape(-1, 1)
y = ohe.fit_transform(label_column).toarray()
classes = list(ohe.categories_[0])
print (classes)

['CompanyAdd', 'CompanyLoc', 'CompanyName', 'Goods', 'Serial']


In [None]:
# Following this : https://stackabuse.com/text-classification-with-bert-tokenizer-and-tf-2-0-in-python/
import tensorflow as tf
from tensorflow.keras import layers

# Pair every id sequence with its one-hot label plus a sort key, shuffle, then
# sort on the key and drop it again.
# NOTE(review): the key is len(x) - the size of the whole corpus - so every row
# gets the same key and the sort leaves the shuffled order unchanged. If the
# per-sequence length len(x[i]) was intended, the batches must be shuffled before
# the take/skip split further down, otherwise the test set would contain only
# the shortest sequences.
dataset = [[tokens, target, len(x)] for tokens, target in zip(x, y)]
random.shuffle(dataset)
dataset.sort(key=lambda row: row[2])
sorted_dataset = [(row[0], row[1]) for row in dataset]

In [None]:
# Wrap the Python list as a tf.data pipeline; both the id sequence and the
# one-hot label are emitted as int32.
processed_dataset = tf.data.Dataset.from_generator(lambda: sorted_dataset, output_types=(tf.int32, tf.int32))

In [None]:
BATCH_SIZE = 256
# Group examples into batches of BATCH_SIZE; [None] lets each component be padded
# to the longest element within its batch.
batched_dataset = processed_dataset.padded_batch(BATCH_SIZE, padded_shapes=([None], [None]))
# Peek at the first batch.
next(iter(batched_dataset))

(<tf.Tensor: shape=(256, 28), dtype=int32, numpy=
 array([[ 1041,  9096, 18098, ...,     0,     0,     0],
        [ 9874,  2100,  3057, ...,     0,     0,     0],
        [ 1054,  3501,  2102, ...,     0,     0,     0],
        ...,
        [ 9779,  5974, 25545, ...,     0,     0,     0],
        [22851,  8973,  8862, ...,     0,     0,     0],
        [ 5170,  3270,  8953, ...,     0,     0,     0]], dtype=int32)>,
 <tf.Tensor: shape=(256, 5), dtype=int32, numpy=
 array([[0, 0, 0, 1, 0],
        [0, 0, 0, 1, 0],
        [0, 0, 0, 0, 1],
        ...,
        [0, 0, 0, 1, 0],
        [0, 0, 1, 0, 0],
        [1, 0, 0, 0, 0]], dtype=int32)>)

In [None]:
# Hold out roughly 10% of the batches for evaluation.
TOTAL_BATCHES = int(len(sorted_dataset) / BATCH_SIZE)
TEST_BATCHES = TOTAL_BATCHES // 10
# Dataset.shuffle returns a new dataset; the earlier call discarded the result,
# so no shuffling took place.
# reshuffle_each_iteration=False (with a fixed seed) keeps one batch order, so
# take() and skip() see the same order on every pass and test batches cannot
# leak into training in later epochs.
# The buffer is TOTAL_BATCHES + 1 so it also covers a final, partially filled batch.
shuffled_batches = batched_dataset.shuffle(TOTAL_BATCHES + 1, seed=42, reshuffle_each_iteration=False)
test_data = shuffled_batches.take(TEST_BATCHES)
train_data = shuffled_batches.skip(TEST_BATCHES)
print (TOTAL_BATCHES*BATCH_SIZE, TEST_BATCHES*BATCH_SIZE)

48896 4864


In [None]:
class TEXT_MODEL(tf.keras.Model):
    """Text classifier: embedding, three parallel 1-D convolutions, dense head.

    Token ids are embedded, passed through three Conv1D branches with kernel
    sizes 2, 3 and 4, each reduced by global max pooling, concatenated and fed
    through a ReLU dense layer, dropout and a softmax output layer.

    Args:
        vocabulary_size: size of the embedding's input vocabulary.
        embedding_dimensions: width of each token embedding.
        cnn_filters: number of filters in each convolution branch.
        dnn_units: units in the hidden dense layer.
        model_output_classes: number of softmax outputs.
        dropout_rate: dropout rate applied after the hidden dense layer.
        training: unused; kept so existing constructor calls keep working.
        name: Keras model name.
    """

    def __init__(self,
                 vocabulary_size,
                 embedding_dimensions=128,
                 cnn_filters=50,
                 dnn_units=512,
                 model_output_classes=5,
                 dropout_rate=0.1,
                 training=False,
                 name="text_model"):
        super().__init__(name=name)

        self.embedding = layers.Embedding(vocabulary_size, embedding_dimensions)
        # Three branches that look at windows of 2, 3 and 4 consecutive tokens.
        # "valid" padding means the input must be at least 4 tokens long.
        self.cnn_layer1 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=2,
                                        padding="valid",
                                        activation="relu")
        self.cnn_layer2 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=3,
                                        padding="valid",
                                        activation="relu")
        self.cnn_layer3 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=4,
                                        padding="valid",
                                        activation="relu")
        # One pooling layer instance is shared by all three branches.
        self.pool = layers.GlobalMaxPool1D()

        self.dense_1 = layers.Dense(units=dnn_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        self.last_dense = layers.Dense(units=model_output_classes, activation="softmax")

    def call(self, inputs, training=None):
        """Run the forward pass.

        Args:
            inputs: batch of token-id sequences.
            training: forwarded to the dropout layer. Defaults to None, as is
                conventional for Keras `call` methods, so the argument may be
                omitted.

        Returns:
            The softmax class probabilities produced by the last dense layer.
        """
        l = self.embedding(inputs)
        l_1 = self.cnn_layer1(l)
        l_1 = self.pool(l_1)
        l_2 = self.cnn_layer2(l)
        l_2 = self.pool(l_2)
        l_3 = self.cnn_layer3(l)
        l_3 = self.pool(l_3)

        concatenated = tf.concat([l_1, l_2, l_3], axis=-1) # (batch_size, 3 * cnn_filters)
        concatenated = self.dense_1(concatenated)
        concatenated = self.dropout(concatenated, training)
        model_output = self.last_dense(concatenated)

        return model_output

In [None]:
# Hyperparameters passed to TEXT_MODEL and to training below.
VOCAB_LENGTH = len(tokenizer.vocab)  # embedding input size = tokenizer vocabulary size
EMB_DIM = 200
CNN_FILTERS = 100
DNN_UNITS = 256
OUTPUT_CLASSES = 5  # one output per entry in `classes`
DROPOUT_RATE = 0.4
NB_EPOCHS = 2

In [None]:
# Instantiate the classifier with the hyperparameters defined above.
text_model = TEXT_MODEL(vocabulary_size=VOCAB_LENGTH,
                        embedding_dimensions=EMB_DIM,
                        cnn_filters=CNN_FILTERS,
                        dnn_units=DNN_UNITS,
                        model_output_classes=OUTPUT_CLASSES,
                        dropout_rate=DROPOUT_RATE)

# Categorical cross-entropy matches the one-hot labels and the softmax output.
text_model.compile(loss="categorical_crossentropy",
                   optimizer="adam",
                   metrics=["categorical_accuracy"])

In [None]:
# Train on the training batches for NB_EPOCHS epochs (no validation data is passed).
text_model.fit(train_data, epochs=NB_EPOCHS)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7fa49bdbb860>

In [None]:
# Evaluate on the held-out batches; `results` holds the loss followed by the
# compiled metric (categorical accuracy).
results = text_model.evaluate(test_data)
print(results)

[0.028882566839456558, 0.9913651347160339]


In [None]:
# Hand-written spot checks: raw input strings and the class each should receive.
strings = ["MARKS AND SPENCERS LTD", "INTEL Corporation LLC", "M&S LTD", "Microsoft Corporation", "XYZ 13423/ILD",
           "ABC/ICL/20891NC", "ICNAO02312", "LONDON", "LONDON, GREAT BRITAIN", "LONDON, ENGLAND",
           "SLOUGH SE12 2XY", "33 TIMBER YARD, LONDON, L1 8XY", "44 CHINA ROAD, KOWLOON, HONG KONG",
           "HARDWOOD TABLE", "PLASTIC BOTTLE", "TOYS", ]
gt = ["CompanyName", "CompanyName", "CompanyName", "CompanyName", 
      "Serial", "Serial", "Serial",  
      "CompanyLoc", "CompanyLoc", "CompanyLoc", 
      "CompanyAdd", "CompanyAdd", "CompanyAdd",
      "Goods", "Goods", "Goods"]
assert len(strings) == len(gt), "every test string needs a ground-truth label"

for x, truth in zip(strings, gt):
    # Apply the same normalisation as the training text. Previously the cleaned
    # string was computed but the raw string was tokenised instead.
    cleaned = clean_word(x)
    tokens = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(cleaned))
    # Zero-pad short inputs to 20 ids; the model's widest convolution needs at
    # least 4 tokens.
    if len(tokens) < 20:
        tokens.extend([0]*(20-len(tokens)))
    preds = text_model(tf.expand_dims(tokens, 0), training=False)
    # Index of the most probable class for this single example.
    best = int(np.argmax(preds[0]))
    print ("String:", x, "Output:", preds[0][best]*100)
    print ("Prediction:", classes[best], "Truth:", truth)

String: MARKS AND SPENCERS LTD Output: tf.Tensor(99.99875, shape=(), dtype=float32)
Prediction: CompanyName Truth: CompanyName
String: INTEL Corporation LLC Output: tf.Tensor(80.18121, shape=(), dtype=float32)
Prediction: CompanyName Truth: CompanyName
String: M&S LTD Output: tf.Tensor(99.99949, shape=(), dtype=float32)
Prediction: CompanyName Truth: CompanyName
String: Microsoft Corporation Output: tf.Tensor(86.59512, shape=(), dtype=float32)
Prediction: CompanyName Truth: CompanyName
String: XYZ 13423/ILD Output: tf.Tensor(99.96125, shape=(), dtype=float32)
Prediction: Serial Truth: Serial
String: ABC/ICL/20891NC Output: tf.Tensor(99.985374, shape=(), dtype=float32)
Prediction: Serial Truth: Serial
String: ICNAO02312 Output: tf.Tensor(99.51053, shape=(), dtype=float32)
Prediction: Serial Truth: Serial
String: LONDON Output: tf.Tensor(99.748405, shape=(), dtype=float32)
Prediction: CompanyLoc Truth: CompanyLoc
String: LONDON, GREAT BRITAIN Output: tf.Tensor(99.838104, shape=(), dtype=