In [127]:
import os
import pandas as pd
import csv
import spacy
from spacy.tokens import DocBin

In [128]:
# Load the token-level annotations: one row per token, carrying the record it
# belongs to ('Record Number'), the record's 'Title', the 'Token' text and its 'Tag'.
#   * quoting=csv.QUOTE_NONE treats quote characters as ordinary text.
#   * on_bad_lines='skip' skips malformed lines instead of raising.
#   * keep_default_na=False with na_values=[''] makes ONLY empty cells parse as NaN.
#     With pandas' default NA strings, a token spelled like "NA", "null" or "None"
#     would be read as a float NaN and break the string joins in later cells.
#     Empty Tag cells must still become NaN, because later cells use a NaN tag to
#     recognise continuation tokens.
tagged_titles = pd.read_csv(
    './dataset/Train_Tagged_Titles.tsv',
    sep='\t',
    on_bad_lines='skip',
    quoting=csv.QUOTE_NONE,
    encoding='utf8',
    keep_default_na=False,
    na_values=[''],
)
# Work on the first 1000 token rows only.
# NOTE(review): a fixed row cut-off ignores record boundaries, so the last record
# may be cut mid-title - confirm this is acceptable.
tagged_titles = tagged_titles[:1000]
valid_tags = ['Accents', 'Brand', 'Character', 'Character Family', 'Closure', 'Color', 'Country/Region of Manufacture', 'Department', 'Fabric Type', 'Features', 'Handle Drop', 'Handle Style', 'Handle/Strap Material', 'Hardware Material', 'Lining Material', 'MPN', 'Material', 'Measurement, Dimension', 'Model', 'Occasion', 'Pattern', 'Pocket Type', 'Product Line', 'Season', 'Size', 'Strap Drop', 'Style', 'Theme', 'Trim Material', 'Type']

# Map each record number to the list of its tokens / tags (row order is kept
# within each record).
tokens = tagged_titles.groupby('Record Number')['Token'].apply(list).to_dict()
tags = tagged_titles.groupby('Record Number')['Tag'].apply(list).to_dict()

In [129]:
# Display the loaded frame to eyeball the Record Number / Title / Token / Tag columns.
tagged_titles

Unnamed: 0,Record Number,Title,Token,Tag
0,1,LOUIS VUITTON M40096 Handbag Priscilla Multi-c...,LOUIS,Brand
1,1,LOUIS VUITTON M40096 Handbag Priscilla Multi-c...,VUITTON,
2,1,LOUIS VUITTON M40096 Handbag Priscilla Multi-c...,M40096,MPN
3,1,LOUIS VUITTON M40096 Handbag Priscilla Multi-c...,Handbag,Type
4,1,LOUIS VUITTON M40096 Handbag Priscilla Multi-c...,Priscilla,Model
...,...,...,...,...
995,92,New Prada Rosso Red Nylon Toiletry Pouch with ...,Nylon,Material
996,92,New Prada Rosso Red Nylon Toiletry Pouch with ...,Toiletry,No Tag
997,92,New Prada Rosso Red Nylon Toiletry Pouch with ...,Pouch,Type
998,92,New Prada Rosso Red Nylon Toiletry Pouch with ...,with,No Tag


In [130]:
# Pair every token with its tag: one list of (token, tag) tuples per record.
# Iterate over the record numbers that are actually present (ascending) rather
# than assuming they are exactly 1..N, which raised KeyError as soon as an id
# was missing or the numbering did not start at 1.
raw_tokenized_data = [
    list(zip(tokens[record], tags[record]))
    for record in sorted(tokens)
]

# A NaN tag marks a token that continues the previous token's entity,
# e.g. "VUITTON" is appended to "LOUIS" so the Brand becomes "LOUIS VUITTON".
# Walking each record right-to-left lets runs of several NaN-tagged tokens
# accumulate into the nearest preceding entry. The NaN entries themselves are
# left in place here; only the preceding entry's text is extended.
for pairs in raw_tokenized_data:
    for j in reversed(range(1, len(pairs))):
        text, tag = pairs[j]
        if tag != tag:  # NaN is the only value that is not equal to itself
            prev_text, prev_tag = pairs[j - 1]
            pairs[j - 1] = (prev_text + " " + text, prev_tag)


In [131]:
# Drop the NaN-tagged entries from every record. Their text has already been
# appended to the preceding entry, and a NaN tag is recognisable because it is
# not equal to itself.
trimmed_tokenized_data = []
for record_pairs in raw_tokenized_data:
    kept_pairs = [pair for pair in record_pairs if pair[1] == pair[1]]
    trimmed_tokenized_data.append(kept_pairs)


In [132]:
# Rebuild `tokens` and `tags` as per-record lists (until here they were dicts
# keyed by record number):
#   tokens[k] -> the record's text: its merged tokens joined by single spaces
#   tags[k]   -> the record's tag labels, in the same order as those tokens
tokens = []
tags = []
for item in trimmed_tokenized_data:
    tokens.append(' '.join(text for text, _ in item))
    tags.append([label for _, label in item])


In [133]:
# Convert the per-record data from
#   tokens: [title, title, ...]
#   tags:   [[tag, tag, ...], [tag, tag, ...], ...]
# to character-offset annotations
#   [(title, [(begin, end, tag), (begin, end, tag), ...]), (title, [...]), ...]
# Offsets index into the space-joined title, hence the +1 for the separator
# after every token.
# The spans are rebuilt from trimmed_tokenized_data instead of overwriting the
# entries of `tags` in place, so re-running this cell produces the same result
# rather than nesting (begin, end, (begin, end, tag)) tuples.
# NOTE(review): only NaN-tagged tokens were dropped earlier, so tokens tagged
# 'No Tag' still get a span here with the label 'No Tag' - confirm that is intended.
tags = []
for item in trimmed_tokenized_data:
    spans = []
    curr = 0
    for text, label in item:
        spans.append((curr, curr + len(text), label))
        curr += len(text) + 1
    tags.append(spans)

# (title text, spans) pairs; this is the dataset that gets split into train/dev.
amouranth = list(zip(tokens, tags))

In [134]:
# Create a blank English pipeline, attach an NER component to it and register
# every label listed in valid_tags on that component.
nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")
for label_name in valid_tags:
    ner.add_label(label_name)

In [135]:
# Split the examples by alternating position: even positions go to dev,
# odd positions go to train.
# Both halves are new lists, so `amouranth` is left untouched and re-running
# this cell always yields the same split. Previously `train` was an alias of
# `amouranth` and `del train[::2]` shrank the shared list on every run.
dev = amouranth[::2]
train = amouranth[1::2]

In [136]:
#output data
OUTPUT_DIR = "./trainset"


def _write_docbin(examples, path):
    """Serialize annotated examples into a spaCy DocBin file.

    Args:
        examples: list of (text, annotations) pairs, where annotations is a
            list of (start, end, label) character offsets into text.
        path: destination file for the DocBin.

    A span for which doc.char_span returns None is reported on stdout and
    left out of the document's entities; the document itself is still written.
    """
    db = DocBin()
    for text, annotations in examples:
        doc = nlp.make_doc(text)
        ents = []
        for start, end, label in annotations:
            span = doc.char_span(start, end, label=label)
            if span is None:
                print("skipping entity")
            else:
                ents.append(span)
        doc.ents = ents
        db.add(doc)

    db.to_disk(path)
    print("wrote " + str(len(examples)) + " lines to " + path)


os.makedirs(OUTPUT_DIR, exist_ok=True)

# Same serialization for both splits; only the examples and the target file differ.
_write_docbin(train, OUTPUT_DIR + "/train.spacy")
_write_docbin(dev, OUTPUT_DIR + "/dev.spacy")

wrote 46 lines to ./trainset/train.spacy
wrote 46 lines to ./trainset/dev.spacy
