In [None]:
import glob
import nltk
import os
import pandas as pd
import re
import string
import fasttext
import fileinput

from tqdm.notebook import tqdm
from lxml import etree

# Single tokenizer used by every cell below, so the token-label files and the
# embedding corpus are split into words the same way.
TOKENIZER = nltk.tokenize.word_tokenize

# 1. XML to token-label file

In [None]:
# NOTE(review): this parser is created but not passed to `etree.parse(file)` in
# the loop below, which therefore runs with lxml's default parser. Either pass
# it explicitly or drop it.
parser = etree.XMLParser()
# Characters removed from the corpus text before it is tokenized for the
# embedding model. Membership is tested one character at a time, so the
# brackets are ordinary members of the set, not regex syntax.
# The double quote and the backslash are escaped explicitly: an unescaped `""`
# ends the literal early (silently dropping `"` from the set) and `\(` is an
# invalid escape sequence.
punctuation = "[{`,.?!:;/\\()'\"¬}]"

# Every chronicle XML file in the input directory.
all_chronicles = glob.glob(os.path.join('PATH_TO_DIRECTORY_WITH_XML-FILES/*.xml'))

for file in all_chronicles:
    xml = etree.parse(file)

    # Reduce every tag to its local name so the XPath queries below can use
    # bare tag names. Comments and processing instructions have a non-string
    # tag, so they are skipped instead of being handed to QName.
    for elem in xml.iter():
        if isinstance(elem.tag, str):
            elem.tag = etree.QName(elem).localname
    etree.cleanup_namespaces(xml)

    # Identifier cut from a fixed slice of the path.
    # NOTE(review): assumes the input files follow one naming scheme - confirm.
    filename = file[-22:-8]

    # Last row written for an annotated element; emptied again as soon as a
    # plain-text token is seen. Used to decide between a "-B" and an "-I" label.
    previous_row = dict()
    tokens = []

    # Each <l> element is treated as one sentence; its position is the sentence_id.
    line_elements = xml.xpath('//l')
    for i, line in tqdm(enumerate(line_elements),
                        total=len(line_elements)):
        for element in line.xpath('child::text()|*'):
            if isinstance(element, etree._ElementUnicodeResult):
                # Plain text between annotations: every token is outside any entity.
                label = 'O'
                attribute = ''
                # Remove each '¬' (optionally followed by '#') plus one trailing space.
                wordstring = re.sub(r"((¬#?) ?)", "", str(element))
                for token in TOKENIZER(str(wordstring)):
                    tokens.append(dict(sentence_id = i,
                                       filename = filename,
                                       token = token,
                                       label = label,
                                       attribute = attribute
                                       ))
                    # Reset only when the text really produced a token, so text
                    # that yields no tokens does not interrupt a B/I run.
                    previous_row = dict()
            else:
                tag = element.xpath('name()')

                # <hi> is never an entity, so its tokens are always 'O'.
                # Otherwise the element continues the previous entity ("-I")
                # when the last annotated token carried the same tag, and
                # starts a new one ("-B") if not.
                if tag == 'hi':
                    label = 'O'
                elif previous_row and previous_row['label'] in (tag + '-B', tag + '-I'):
                    label = tag + '-I'
                else:
                    label = tag + '-B'

                # Only <waarneming> elements carry an attribute value. Every
                # other element gets an empty one, so a value from an earlier
                # element (or an unset name) can never leak into these rows.
                if tag == 'waarneming':
                    attribute = ''.join(element.xpath('@waarneming'))
                else:
                    attribute = ''

                text = ''.join(element.xpath('descendant::text()'))
                for j, token in enumerate(TOKENIZER(str(text))):
                    # Every token after the first one is inside the entity.
                    if j > 0 and tag != 'hi':
                        label = tag + '-I'
                    row = dict(sentence_id = i,
                               filename = filename,
                               token = token,
                               label = label,
                               attribute = attribute
                               )
                    tokens.append(row)
                    previous_row = row

    # One CSV per chronicle, written next to the XML file.
    tokenized_text = pd.DataFrame(tokens)
    tokenized_text.to_csv(file[:-7] + 'token-label.csv')

# 2. Split into train and test files

In [None]:
# Token-label CSVs matched by the glob pattern below.
all_token_label_files = glob.glob(os.path.join('PATH_TO_DIRECTORY_WITH_TOKEN-LABEL-FILES/*label.csv'))

for file in all_token_label_files:
    df = pd.read_csv(file, sep=',', index_col=0)

    # Rows stay in file order: the trailing 30% (rounded) become the test
    # split, everything before that boundary is training data.
    n_test = round(len(df) * 0.3)
    boundary = len(df) - n_test
    df_train, df_test = df.iloc[:boundary, :], df.iloc[boundary:, :]

    # Written next to the source file as tab-separated "<name>_train.csv" / "<name>_test.csv".
    for suffix, part in (('_train.csv', df_train), ('_test.csv', df_test)):
        part.to_csv(file[:-4] + suffix, sep='\t')

## 2.1 Merge train files

In [None]:
# Per-chronicle training splits (tab-separated).
# NOTE(review): 'all_train.csv' itself matches the '*train.csv' pattern - make
# sure it is not written into this same directory, or a re-run will merge it in.
all_train_files = glob.glob(os.path.join('PATH_TO_DIRECTORY_WITH_TRAIN-FILES/*train.csv'))

file_list = [pd.read_csv(file, sep='\t', index_col=0) for file in all_train_files]

# Stack all chronicles under a fresh 0..n-1 index and drop sentence_id.
# `columns=` is used because pandas 2.x no longer accepts the axis as a
# positional argument to drop().
total_train = pd.concat(file_list, ignore_index=True, sort=False).drop(columns=['sentence_id'])
total_train.to_csv('all_train.csv')

## 2.2 Merge test files

In [None]:
# Per-chronicle test splits (tab-separated).
# NOTE(review): 'all_test.csv' itself matches the '*test.csv' pattern - make
# sure it is not written into this same directory, or a re-run will merge it in.
all_test_files = glob.glob(os.path.join('PATH_TO_DIRECTORY_WITH_TEST-FILES/*test.csv'))

file_list = [pd.read_csv(file, sep='\t', index_col=0) for file in all_test_files]

# Stack all chronicles under a fresh 0..n-1 index and drop sentence_id.
# `columns=` is used because pandas 2.x no longer accepts the axis as a
# positional argument to drop().
total_test = pd.concat(file_list, ignore_index=True, sort=False).drop(columns=['sentence_id'])
total_test.to_csv('all_test.csv')

# 3. Train word embedding model with fastText

## 3.1 Create corpus file

In [None]:
# NOTE(review): `xml` is the tree left behind by the last iteration of the
# XML-to-token loop, so only that one chronicle ends up in the corpus. Loop
# over all files here if the model should be trained on every chronicle.

# './/' is spelled out: a path starting with '/' is rewritten to the same
# search by lxml, but with a FutureWarning.
title = xml.find('.//title')
fname = title.text
text = xml.find('.//text')

# Full text of the chronicle, lower-cased, with each '¬' (optionally followed
# by '#') and one trailing space removed.
chronicle = ''.join(text.itertext())
wordstring = re.sub(r"((¬#?) ?)", "", chronicle.lower())

# Delete every character listed in `punctuation` in a single pass.
wordstring = wordstring.translate(str.maketrans('', '', punctuation))

In [None]:
# Write the corpus as plain space-separated tokens. Writing str(list) would put
# the list's brackets, quotes and commas into the file, and they would be
# learned as part of the words. UTF-8 is set explicitly because the text
# contains non-ASCII characters and the platform default encoding varies.
with open(str(fname) + '.txt', 'w', encoding='utf-8') as f:
    f.write(' '.join(TOKENIZER(wordstring)))

## 3.2 Train model

In [None]:
# Train unsupervised fastText embeddings on the corpus.
# NOTE(review): the placeholder says DIRECTORY, but the corpus cell above writes
# a single '<title>.txt' file - presumably this should point at that file; confirm.
model = fasttext.train_unsupervised("PATH_TO_DIRECTORY_WITH_CORPUS_FILE")

In [None]:
# Store the trained model as 'model.bin' (relative path, so it lands in the
# current working directory).
model.save_model("model.bin")

# 4. Adding vectors to train and test files

## 4.1 Load model

In [None]:
# Load a fastText model from disk; this rebinds `model`.
# NOTE(review): the save cell writes 'model.bin' to the working directory while
# this path points at a placeholder directory - make sure both refer to the same file.
model = fasttext.load_model("PATH_TO_FASTTEXT_MODEL/model.bin")

In [None]:
# Read the merged training tokens back from 'all_train.csv', using the first
# CSV column as the index.
total_train = pd.read_csv('all_train.csv', index_col=0)
# Show the first rows as the cell output.
total_train.head()

In [None]:
# Read the merged test tokens back from 'all_test.csv', using the first CSV
# column as the index.
total_test = pd.read_csv('all_test.csv', index_col=0)
# Show the first rows as the cell output.
total_test.head()

## 4.2 Add vectors to train file

In [None]:
# One dict per token row. Iterating over this list (instead of the iterrows()
# generator) lets tqdm know the total and show real progress, and avoids
# building a Series for every row.
all_rows = total_train.to_dict(orient='records')

# NOTE(review): assumes every token is a string. read_csv's default NA parsing
# turns a literal token such as "NA" or "null" into NaN, and `.lower()` would
# then raise - confirm the data contains none.
for row_dict in tqdm(all_rows):
    # The embedding is looked up for the lower-cased token.
    row_dict['vector'] = model.get_word_vector(row_dict['token'].lower())

total_train_vectors = pd.DataFrame(all_rows)

In [None]:
# Write the training rows together with their vectors to CSV.
# NOTE(review): each `vector` cell holds whatever get_word_vector returned
# (presumably a numpy array); to_csv stores its text representation - verify
# that downstream code can parse it back.
total_train_vectors.to_csv('all_train_vectors.csv')

## 4.3 Add vectors to test file

In [None]:
# One dict per token row. Iterating over this list (instead of the iterrows()
# generator) lets tqdm know the total and show real progress, and avoids
# building a Series for every row.
all_rows = total_test.to_dict(orient='records')

# NOTE(review): assumes every token is a string. read_csv's default NA parsing
# turns a literal token such as "NA" or "null" into NaN, and `.lower()` would
# then raise - confirm the data contains none.
for row_dict in tqdm(all_rows):
    # The embedding is looked up for the lower-cased token.
    row_dict['vector'] = model.get_word_vector(row_dict['token'].lower())

total_test_vectors = pd.DataFrame(all_rows)

In [None]:
# Write the test rows together with their vectors to CSV.
# NOTE(review): each `vector` cell holds whatever get_word_vector returned
# (presumably a numpy array); to_csv stores its text representation - verify
# that downstream code can parse it back.
total_test_vectors.to_csv('all_test_vectors.csv')