In [None]:
import fileinput
import glob
import os
import re
import string

import fasttext
import nltk
import pandas as pd
from lxml import etree
from tqdm.notebook import tqdm

# Tokenizer applied to every text span in this notebook (NLTK's word_tokenize).
TOKENIZER = nltk.tokenize.word_tokenize

# XML to tokens

In [None]:
# Parse the source document with an explicit parser (previously the parser
# object was created but never handed to etree.parse).
parser = etree.XMLParser()
xml = etree.parse('FILEPATH_TO_XML', parser)

# Replace every qualified tag name by its local name so the XPath queries
# further down can address elements without namespace prefixes.
for elem in xml.iter():
    # Comments and processing instructions have a non-string tag; QName()
    # cannot be applied to them, so leave those nodes untouched.
    if not isinstance(elem.tag, str):
        continue
    elem.tag = etree.QName(elem).localname
etree.cleanup_namespaces(xml)

In [None]:
# Characters removed from every text span before tokenization.
# The double quote and the backslash are escaped explicitly: written
# unescaped, the double quote ended the literal early and was silently left
# out of the set.
# Optionally remove the hyphen here so that hyphenated parts can be merged
# later in the output file (the hyphen is currently not part of this set).
punctuation = "[{`,.?!:;/\\()'\"¬}]"

In [None]:
# Build one record per token: the index of the <l> line it came from, the
# token itself, a label ('O' for plain text, '<tag>-B' / '<tag>-I' for text
# inside a child element) and an optional attribute value.
previous_row = dict()
tokens = []


def _strip_punctuation(text):
    """Return ``text`` with every character that occurs in ``punctuation`` removed."""
    return ''.join(char for char in text if char not in punctuation)


line_elements = xml.xpath('//l')
for i, line in tqdm(enumerate(line_elements),
                    total=len(line_elements)):
    # Direct children of the line: bare text nodes and child elements.
    for element in line.xpath('child::text()|*'):
        if isinstance(element, str):
            # Text outside any child element: label 'O', no attribute.
            label = 'O'
            attribute = ''
            # Drop the '¬' / '¬#' markers (and one following space) first.
            wordstring = re.sub(r"((¬#?) ?)", "", str(element)).lower()
            wordstring = _strip_punctuation(wordstring)
            for token in TOKENIZER(wordstring):
                tokens.append(dict(sentence_id=i,
                                   token=token,
                                   label=label,
                                   attribute=attribute))
                # A plain-text token ends any running element span, so the
                # next element starts again with '-B'.
                previous_row = dict()
        else:
            tag_name = element.xpath('name()')
            text = ''.join(element.xpath('descendant::text()'))

            if tag_name == 'hi':
                # <hi> elements get an empty label. The original check
                # compared the already suffixed label ('hi-B' / 'hi-I')
                # with 'hi' and therefore never matched.
                label = ''
            elif previous_row and previous_row['label'] in (tag_name + '-B',
                                                            tag_name + '-I'):
                # Continues a span of the same tag from the previous token.
                label = tag_name + '-I'
            else:
                label = tag_name + '-B'

            # Reset for every element so a value from an earlier element
            # cannot leak into this one (and the name is always defined).
            attribute = ''
            if tag_name == 'waarneming':
                attribute = ''.join(element.xpath('@waarneming'))

            wordstring = _strip_punctuation(str(text).lower())
            for j, token in enumerate(TOKENIZER(wordstring)):
                # Every token after the first inside a labelled element
                # continues the span.
                if j > 0 and label != '':
                    label = tag_name + '-I'
                row = dict(sentence_id=i,
                           token=token,
                           label=label,
                           attribute=attribute)
                tokens.append(row)
                previous_row = dict(row)

In [None]:
# Turn the per-token records into a table and persist it; the row index is
# written as the first CSV column.
tokenized_text = pd.DataFrame(tokens)

tokenized_text.to_csv('TOKENIZED_TEXT.csv')

In [None]:
# Show the distinct label values that ended up in the token table.
tokenized_text['label'].unique()

# Create file for fastText

In [None]:
# Collect the full running text of the document for fastText training.
# ElementTree-style paths are relative, so search with './/' instead of an
# absolute '//' path.
title = xml.find('.//title')
text = xml.find('.//text')
if title is None or text is None:
    raise ValueError("The XML document needs both a <title> and a <text> element")

# The title text becomes the base name of the output file.
fname = title.text
chronicle = ''.join(text.itertext())
# Drop the '¬' / '¬#' markers (and one following space), then remove all
# characters listed in `punctuation`.
wordstring = re.sub(r"((¬#?) ?)", "", chronicle.lower())
wordstring = ''.join(c for c in wordstring if c not in punctuation)

In [None]:
# Write the tokens as one whitespace-separated line of UTF-8 text.
# Writing str(list) instead would put brackets, quotes and commas into the
# file, which a whitespace-splitting reader would treat as part of the words.
# The trailing newline keeps files apart when they are concatenated later.
with open(str(fname) + '.txt', 'w', encoding='utf-8') as f:
    f.write(' '.join(TOKENIZER(wordstring)) + '\n')

## Merge multiple files into one

In [None]:
# Concatenate all per-document token files into a single training file.
merged_path = 'MERGED_TXT_FILE'

# Sort for a reproducible merge order and never feed a previous output file
# back into itself.
file_list = sorted(
    path for path in glob.glob("FILEPATH_TO_TXT-FILES/*.txt")
    if os.path.abspath(path) != os.path.abspath(merged_path)
)
if not file_list:
    # fileinput.input([]) would silently fall back to sys.argv / stdin.
    raise FileNotFoundError("No .txt files found in FILEPATH_TO_TXT-FILES/")

with open(merged_path, 'w', encoding='utf-8') as file:
    for path in file_list:
        with open(path, encoding='utf-8') as source:
            content = source.read()
        file.write(content)
        # Keep the last word of one file apart from the first word of the next.
        if content and not content.endswith('\n'):
            file.write('\n')

# Train model

In [None]:
# Train an unsupervised fastText model on the merged token file; no
# hyperparameters are overridden here, so the library defaults apply.
model = fasttext.train_unsupervised("MERGED_TXT_FILE")

In [None]:
# Persist the trained model to disk.
model.save_model("MODEL.bin")

# Load model

In [None]:
# Load a fastText model from the binary file on disk.
model = fasttext.load_model("MODEL.bin")

# Add vectors to the token-label file

In [None]:
# Reload the token table. Read the text columns strictly as strings and turn
# off the default NA parsing: otherwise tokens such as 'nan' or 'null' come
# back as float NaN, which cannot be passed to the model as a word.
tokenized_text = pd.read_csv(
    'TOKENIZED_TEXT.csv',
    index_col=0,
    dtype={'token': str, 'label': str, 'attribute': str},
    keep_default_na=False,
)

In [None]:
# Attach the model's word vector to every token row.
all_rows = []

# iterrows() is a generator without a length, so pass the row count
# explicitly; otherwise the progress bar cannot show a total.
for index, row in tqdm(tokenized_text.iterrows(), total=len(tokenized_text)):
    row_dict = dict(row)
    row_dict['vector'] = model.get_word_vector(row_dict['token'])
    all_rows.append(row_dict)

# Build the table once from the collected records.
feature_file = pd.DataFrame(all_rows)

In [None]:
# Write the token/label/vector table as a tab-separated file.
# NOTE(review): the 'vector' column holds whatever model.get_word_vector
# returns, and to_csv writes its string form — confirm that downstream code
# can parse that representation.
feature_file.to_csv('TOKEN-LABEL-VECTOR-FILE.csv', sep='\t')

# Merge token-label-vector files

In [None]:
# Combine all per-document token/label/vector files into one table.
# Sorted so the row order of the merged file is reproducible.
all_feature_files = sorted(glob.glob('FILEPATH_TO_TOKEN-LABEL-VECTOR-FILES/*.csv'))
if not all_feature_files:
    raise FileNotFoundError(
        "No .csv files found in FILEPATH_TO_TOKEN-LABEL-VECTOR-FILES/")

file_list = []
for file in all_feature_files:
    # Keep tokens as literal strings; default NA parsing would turn words
    # such as 'nan' or 'null' into missing values.
    df = pd.read_csv(file, sep='\t', dtype={'token': str}, keep_default_na=False)
    file_list.append(df)

# Index by token and drop the bookkeeping columns. `columns=` is used because
# passing the axis positionally to drop() is no longer accepted by pandas 2.x.
total = (
    pd.concat(file_list, ignore_index=True, sort=False)
    .set_index('token')
    .drop(columns=['Unnamed: 0', 'sentence_id'])
)
total.to_csv('MERGED_TOKEN-LABEL-VECTOR-FILE.csv')