In [2]:
import csv
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
import itertools

In [3]:
# Upper bound on the number of CSV rows consumed by the loading cell below.
MAX_TEXTS = 1000000

In [4]:
#data from product file to variables
# NOTE(review): fieldnames are passed explicitly, so the first line of the file is
# read as a data row; presumably the file has no header line — confirm.
texts, tags, brands = [], [], []
# newline='' hands newline handling to the csv module, as its documentation requires;
# without it, quoted fields that contain line breaks can be split incorrectly.
with open("data/train_products.csv", 'r', newline='') as f:
    reader = csv.DictReader(f, fieldnames=["title","brand","tags"])
    # islice stops after MAX_TEXTS rows without a hand-maintained counter
    # (and reads nothing at all when MAX_TEXTS is 0).
    for row in itertools.islice(reader, MAX_TEXTS):
        # The tags field is split on single spaces and the last piece is discarded.
        # NOTE(review): presumably every tags field ends with a trailing space, making
        # the last piece empty — confirm, otherwise a real tag is being dropped here.
        text, tag_set = row['title'], row['tags'].split(' ')[:-1]
        texts.append(text)
        tags.append(tag_set)
        brands.append(row['brand'])

In [5]:
# Sanity check: number of rows collected by the loading cell.
len(texts)

62184

In [None]:
#turn lists to numpy arrays to randomize order
texts  = np.asarray(texts)
brands = np.asarray(brands)
# Each entry of `tags` is a list whose length varies from row to row. Passing such a
# ragged list of lists to np.asarray raises ValueError on NumPy >= 1.24 (and silently
# becomes a 2-D array when all rows happen to share a length), so build a 1-D object
# array explicitly and store one tag list per slot.
tags_as_objects = np.empty(len(tags), dtype=object)
for row_idx, tag_set in enumerate(tags):
    tags_as_objects[row_idx] = tag_set
tags = tags_as_objects

In [None]:
#randomize order
# A single seeded permutation is applied to all three arrays so that
# title / brand / tags stay aligned row by row after shuffling.
np.random.seed(seed=1)
indices = np.arange(texts.shape[0])
np.random.shuffle(indices)
texts_reordered, brands_reordered, tags_reordered = (
    column[indices].tolist() for column in (texts, brands, tags)
)

In [None]:
#create list with split
# One seeded Bernoulli draw per title: 1 -> training set, 0 -> test set,
# with probability 0.7 of landing in the training set.
numSentences = len(texts_reordered)
np.random.seed(0)
training_examples = np.random.binomial(n=1, p=0.7, size=numSentences)

In [None]:
#divide list of titles and labels into train and test
trainSentence, testSentence = [], []
nerLabels_train, nerLabels_test = [], []

# Walk the split mask, the titles and the brands in lockstep; a flag of 1 routes
# the (title, brand) pair to the training lists, anything else to the test lists.
for is_train, title, brand_label in zip(training_examples, texts_reordered, brands_reordered):
    if is_train == 1:
        trainSentence.append(title)
        nerLabels_train.append(brand_label)
    else:
        testSentence.append(title)
        nerLabels_test.append(brand_label)

In [None]:
# Use a parameter pair k_start, k_end to look at slices.
# k_end == -1 means "use everything from k_start to the end".

k_start = 0
#k_end = 500
k_end = -1

# Number of unique training brands withheld from the lookup list.
# NOTE(review): 79 is carried over unchanged from the original expression; the reason
# for this particular value is not visible in this notebook — confirm.
N_HELD_OUT_BRANDS = 79

if k_end == -1:
    k_end_train = len(trainSentence)
    k_end_test = len(testSentence)
else:
    k_end_train = k_end
    # Previously left undefined on this branch, so any later use raised NameError.
    # NOTE(review): assumes the test slice should end at the same k_end — confirm.
    k_end_test = k_end

trainSentence_k = trainSentence[k_start:k_end_train]

# Sort the unique brands before trimming: set iteration order for strings changes
# between interpreter sessions (hash randomization), so trimming the raw set dropped
# a different group of brands on every kernel restart.
unique_train_brands = sorted(set(nerLabels_train[k_start:k_end_train]))
# Clamp at zero so that fewer than N_HELD_OUT_BRANDS unique brands yields an empty
# list rather than a negative slice end.
n_kept_brands = max(len(unique_train_brands) - N_HELD_OUT_BRANDS, 0)
nerLabels_train_k = unique_train_brands[:n_kept_brands]

In [None]:
#make predictions
# Candidate brands are prepared once, outside the per-title loop:
#  * empty strings are skipped, because '' is a substring of every title and would
#    "match" everything;
#  * longer names are tried first (ties broken alphabetically), so the result does not
#    depend on set iteration order and "X Y" wins over "X" when both occur in a title.
candidate_labels = sorted(
    (label for label in set(nerLabels_train_k) if label),
    key=lambda label: (-len(label), label),
)

nerLabels_predict = []
for sentence, true_label in zip(testSentence, nerLabels_test):
    # "pre_label" is always present (None when nothing matches) so the resulting
    # DataFrame has that column even if no title contains a known brand.
    sentDict = {
        "sentence": sentence,
        "label": true_label,
        "pre_label": None,
    }
    for label in candidate_labels:
        # plain substring test of the brand name against the raw title
        if label in sentence:
            sentDict["pre_label"] = label
            break
    nerLabels_predict.append(sentDict)
            

In [None]:
#predictions to dataframe
# One row per test title; the columns come from the keys of the per-title dicts
# built in the prediction cell ("sentence", "label", "pre_label").
predictionDF = pd.DataFrame(nerLabels_predict)

In [None]:
# Titles without a matched brand have a missing pre_label; replace it with a
# single-space placeholder so tag_brands always receives a string it can split.
predictionDF["pre_label"] = predictionDF["pre_label"].fillna(" ")

In [None]:
#tag brands in order to compare with the other models
def tag_brands(brand, title):
    """Return one tag per space-separated word of ``title``: 'B-B', 'I-B' or 'O'.

    Both arguments are split on single spaces. The first brand word is the
    "head"; the remaining brand words are the "tail".

    * Every title word equal to the head is tagged 'B-B' until the tail has
      been checked once.
    * For a multi-word brand, the first word that follows a head word (and is
      not itself the head) starts the tail check: the tail is compared
      position by position against the title words from there on, giving
      'I-B' where they agree and 'O' where they differ. This check happens at
      most once; every word after it is tagged 'O'.
    * All other words are tagged 'O'.

    Args:
        brand: brand name, possibly several words separated by single spaces.
        title: product title to tag.

    Returns:
        A list of tag strings with exactly one entry per word of ``title``.

    If the title ends before the tail has been fully compared, the comparison
    simply stops at the last title word (this situation used to raise
    IndexError).
    """
    brand_words = brand.split(' ')
    head, tail = brand_words[0], brand_words[1:]
    words = title.split(' ')

    tags = []
    previous_was_head = False   # the word just before this one was tagged 'B-B'
    tail_checked = False        # the one-off tail comparison has already happened
    pos = 0
    while pos < len(words):
        if words[pos] == head and not tail_checked:
            tags.append('B-B')
            previous_was_head = True
            pos += 1
        elif tail and previous_was_head:
            # Slicing clips the window at the end of the title, so a brand that is
            # longer than the remaining words no longer indexes out of range.
            window = words[pos:pos + len(tail)]
            for title_word, brand_word in zip(window, tail):
                tags.append('I-B' if title_word == brand_word else 'O')
            # The whole window has been tagged in one go; continue right after it.
            pos += len(window)
            previous_was_head = False
            tail_checked = True
        else:
            tags.append('O')
            previous_was_head = False
            pos += 1
    return tags

In [None]:
#apply tag transformation to dataframe
# 'tags' is derived from the true brand, 'predicted_tags' from the looked-up brand;
# both are tagged against the same title with the same function.
for tag_column, brand_column in (('tags', 'label'), ('predicted_tags', 'pre_label')):
    predictionDF[tag_column] = predictionDF.apply(
        lambda row: tag_brands(row[brand_column], row['sentence']),
        axis=1,
    )

In [None]:
#compare lengths just to check
predictionDF['tags_length'] = predictionDF['tags'].str.len()
predictionDF['predicted_tags_length'] = predictionDF['predicted_tags'].str.len()

# Actually perform the comparison: the true and predicted tag lists are flattened
# side by side for the report, so a per-title length difference would shift every
# following tag and silently corrupt the scores.
length_mismatches = int((predictionDF['tags_length'] != predictionDF['predicted_tags_length']).sum())
assert length_mismatches == 0, (
    "{} titles have a different number of true and predicted tags".format(length_mismatches)
)

In [None]:
#create lists from dataframe to create classification report
# Flatten the per-title tag lists into one long sequence each.
y_true = list(itertools.chain.from_iterable(predictionDF['tags']))
y_pred = list(itertools.chain.from_iterable(predictionDF['predicted_tags']))
# Display names, in the same order as the labels passed to classification_report.
# The first entry was the digit '0'; the tag produced by tag_brands is the letter 'O'.
target_names = ['O', 'B-B', 'I-B']

In [None]:
# Print scikit-learn's classification report for the three tag classes
# ('O', 'B-B', 'I-B'), computed over the flattened tags of all test titles.
print(classification_report(y_true, y_pred, labels = ['O','B-B','I-B'], target_names=target_names, digits=3))