In [1]:
import csv
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
import itertools

In [2]:
MAX_TEXTS = 1000000

In [3]:
texts, tags, brands = [], [], []
with open("data/train_products.csv", 'r') as f:
    reader = csv.DictReader(f, fieldnames=["title","brand","tags"])
    count = 0
    for row in reader:
        #print(row)
        count += 1
        text, tag_set = row['title'], row['tags'].split(' ')[:-1]
        texts.append(text)
        tags.append(tag_set)
        brands.append(row['brand'])
        if count >= MAX_TEXTS:
            break

In [4]:
texts  = np.asarray(texts)
brands = np.asarray(brands)
tags   = np.asarray(tags)

In [5]:
texts.shape[0]

62184

In [5]:
np.random.seed(seed=1)
indices = np.arange(texts.shape[0])
np.random.shuffle(indices)
texts_reordered  = texts[indices].tolist()
brands_reordered = brands[indices].tolist()
tags_reordered   = tags[indices].tolist()

In [6]:
numSentences = len(texts_reordered)
np.random.seed(0)
training_examples = np.random.binomial(1, 0.7, numSentences)

In [7]:
trainSentence = []
testSentence = []


nerLabels_train =[]
nerLabels_test = []


for example in range(numSentences):
    if training_examples[example] == 1:
        trainSentence.append(texts_reordered[example])
        nerLabels_train.append(brands_reordered[example])
    else:
        testSentence.append(texts_reordered[example])
        nerLabels_test.append(brands_reordered[example])

In [52]:
# Use a parameter pair k_start, k_end to look at slices. This helps with quick tests.

k_start = 0
k_end = 500
#k_end = -1

if k_end == -1:
    k_end_train = len(trainSentence)
    k_end_test = len(testSentence)
else:
    k_end_train = k_end
    


trainSentence_k  = trainSentence[k_start:k_end_train]
nerLabels_train_k   = list(set(nerLabels_train[k_start:k_end_train]))[0:len(list(set(nerLabels_train[k_start:k_end_train])))-79]

In [53]:
len(set(nerLabels_train_k))

200

In [54]:
nerLabels_predict = []

In [55]:
i = 0
for sentence in testSentence:
    sentDict = {}
    sentDict["sentence"]       = sentence
    sentDict["label"]          = nerLabels_test[i]
    for label in set(nerLabels_train_k):
        if label in sentence:
            sentDict["pre_label"] = label
            break
    nerLabels_predict.append(sentDict)
    i = i + 1
            

In [56]:
predictionDF = pd.DataFrame(nerLabels_predict)

In [57]:
predictionDF["pre_label"] = predictionDF["pre_label"].fillna(" ")

In [58]:
def tag_brands(brand,title):
    tagging = ''
    brand = brand.split(' ')
    brand_started = False
    not_pass = False
    i = 0
    added_i = 0
    words = title.split(' ')
    for word in title.split(' '):
        if word == brand[0] and not_pass is False:
            tagging += 'B-B '
            brand_started = True
        elif len(brand) > 1 and brand_started:
            j = i
            for b in brand[1:]:
                #print(b,words[j],words,brand)
                if words[j] == b:
                    tagging += 'I-B '
                    added_i = added_i + 1
                else:
                    brand_started = False
                    tagging += 'O '
                    added_i = added_i + 1
                    
                j = j + 1
            brand_started = False
            not_pass = True
        else:
            brand_started = False
            if added_i >= 2:
                added_i = added_i - 1
            else:
                tagging += 'O '
                
        i = i + 1
    #return tagging
    #print("Words",tagging)
    tags = tagging.split(" ")
    tags.pop()
    return tags

In [59]:
predictionDF['tags']           = predictionDF.apply(lambda x: tag_brands(x['label'],x['sentence']), axis=1)
predictionDF['predicted_tags'] = predictionDF.apply(lambda x: tag_brands(x['pre_label'],x['sentence']), axis=1)

In [60]:
predictionDF['tags_length'] = predictionDF['tags'].str.len()
predictionDF['predicted_tags_length'] = predictionDF['predicted_tags'].str.len()

In [63]:
y_true = list(itertools.chain.from_iterable(list(predictionDF['tags'])))
y_pred = list(itertools.chain.from_iterable(list(predictionDF['predicted_tags'])))
target_names = ['0', 'B-B', 'I-B']

In [65]:
print(classification_report(y_true, y_pred, labels = ['O','B-B','I-B'], target_names=target_names))

              precision    recall  f1-score   support

           0       0.89      1.00      0.94     95691
         B-B       0.96      0.48      0.64     18592
         I-B       1.00      0.55      0.71      5531

    accuracy                           0.90    119814
   macro avg       0.95      0.67      0.76    119814
weighted avg       0.90      0.90      0.88    119814



In [None]:
predictionDF[predictionDF['tags_length']!=predictionDF['predicted_tags_length']]