In [31]:
import os
import pandas as pd
import csv
import spacy
import string
from spacy.tokens import DocBin

In [32]:
tagged_titles = pd.read_csv('./dataset/Train_Tagged_Titles.tsv', sep='\t', on_bad_lines='skip', quoting=csv.QUOTE_NONE, encoding='utf8')

valid_tags = ['Accents', 'Brand', 'Character', 'Character Family', 'Closure', 'Color', 'Country/Region of Manufacture', 'Department', 'Fabric Type', 'Features', 'Handle Drop', 'Handle Style', 'Handle/Strap Material', 'Hardware Material', 'Lining Material', 'MPN', 'Material', 'Measurement, Dimension', 'Model', 'Occasion', 'Pattern', 'Pocket Type', 'Product Line', 'Season', 'Size', 'Strap Drop', 'Style', 'Theme', 'Trim Material', 'Type']

tokens = tagged_titles.groupby('Record Number')['Token'].apply(list).to_dict()
tags = tagged_titles.groupby('Record Number')['Tag'].apply(list).to_dict()

In [33]:
raw_tokenized_data = [[(tokens[i][j].lower(), tags[i][j]) for j in range(0, len(tags[i]))] for i in range(1, len(tags) + 1)]

#Append NaN token to previous value
#ex. append "Vuitton" to "Louis" in "Louis Vuitton"
#special case: append 's without a space in Women's because Women + 's = Women's
for i in range(0, len(raw_tokenized_data)):
    for j in reversed(range(1, len(raw_tokenized_data[i]))):
        if (raw_tokenized_data[i][j][1] != raw_tokenized_data[i][j][1]): #python nan moment
            if (raw_tokenized_data[i][j][0][0] == '\''):   #special case for apostrophe
                raw_tokenized_data[i][j - 1] = (raw_tokenized_data[i][j - 1][0] + raw_tokenized_data[i][j][0], raw_tokenized_data[i][j - 1][1])
            else:
                raw_tokenized_data[i][j - 1] = (raw_tokenized_data[i][j - 1][0] + " " +  raw_tokenized_data[i][j][0], raw_tokenized_data[i][j - 1][1])


In [34]:
#get rid of the NaN values
trimmed_tokenized_data = [[i for i in item if not i[1] != i[1]] for item in raw_tokenized_data]

In [35]:
#reinitialize tokens and tags
tokens = [' '.join([i[0] for i in item]) for item in trimmed_tokenized_data]
tags = [[i[1] for i in item] for item in trimmed_tokenized_data]


In [36]:
#convert from
# [token, token, ...]
# [tag, tag, ...]
# to 
# [[title, [(begin, end, token), (begin, end, token), ...]], [title, [(begin, end, token), (begin, end, token), ...]], ...]
for i in range(0, len(tags)):
    curr = 0
    for j in range(0, len(tags[i])):
        tags[i][j] = (curr, curr + len(trimmed_tokenized_data[i][j][0]), tags[i][j])
        curr += len(trimmed_tokenized_data[i][j][0]) + 1
#this stays
amouranth = [(tokens[i], tags[i]) for i in range(0, len(tokens))]

In [37]:
#add valid tags
nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")
for tag in valid_tags:
    ner.add_label(tag)

In [38]:
amouranth

[('louis vuitton m40096 handbag priscilla multi-color canvas multi-color canvas',
  [(0, 13, 'Brand'),
   (14, 20, 'MPN'),
   (21, 28, 'Type'),
   (29, 38, 'Model'),
   (39, 50, 'Color'),
   (51, 57, 'Fabric Type'),
   (58, 69, 'Color'),
   (70, 76, 'Fabric Type')]),
 ('louis vuitton petit noe drawstring shoulder bag monogram leather m42226 39sd442',
  [(0, 13, 'Brand'),
   (14, 23, 'Model'),
   (24, 34, 'Closure'),
   (35, 43, 'Type'),
   (44, 47, 'Type'),
   (48, 56, 'Pattern'),
   (57, 64, 'Material'),
   (65, 71, 'MPN'),
   (72, 79, 'No Tag')]),
 ('louis vuitton damier azur pochette bosphore shoulder bag n51112 lv auth yt523',
  [(0, 13, 'Brand'),
   (14, 20, 'Product Line'),
   (21, 25, 'Color'),
   (26, 34, 'Product Line'),
   (35, 43, 'Model'),
   (44, 52, 'Type'),
   (53, 56, 'Type'),
   (57, 63, 'MPN'),
   (64, 66, 'Brand'),
   (67, 71, 'No Tag'),
   (72, 77, 'No Tag')]),
 ('gucci bamboo 2way shoulder bag leather brown auth fm1002',
  [(0, 5, 'Brand'),
   (6, 12, 'Product Line

In [39]:
#split data into train and dev
dev = amouranth[::10]
train = amouranth
del train[::10]

In [40]:
#output data
os.makedirs("./trainset", exist_ok=True)

db = DocBin()
for text, annotations in train:
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annotations:
        span = doc.char_span(start, end, label=label)
        if(span is None):
            print("skipping entity")
        else:
            ents.append(span)
    doc.ents = ents
    db.add(doc)

db.to_disk("./trainset/train.spacy")
print("wrote " + str(len(train)) + " lines to ./trainset/train.spacy")

db = DocBin()
for text, annotations in dev:
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annotations:
        span = doc.char_span(start, end, label=label)
        if(span is None):
            print("skipping entity")
        else:
            ents.append(span)
    doc.ents = ents
    db.add(doc)

db.to_disk("./trainset/dev.spacy")
print("wrote " + str(len(dev)) + " lines to ./trainset/dev.spacy")

wrote 4500 lines to ./trainset/train.spacy
wrote 500 lines to ./trainset/dev.spacy
