In [1]:
import glob
import nltk
import os
import re
import string
import fasttext
import fileinput

from tqdm.notebook import tqdm
from lxml import etree

# Word tokenizer shared by every cell below; called with its default arguments.
TOKENIZER = nltk.word_tokenize

# XML to tokens

In [2]:
# Parse the TEI transcription and strip all XML namespaces so that the XPath
# queries in later cells can address elements by their bare tag name.
parser = etree.XMLParser()
xml = etree.parse('../chronicles/1789_Brug_Wall/1789_Brug_Wall_tei.xml', parser)

# iter() replaces the deprecated getiterator().
for elem in xml.iter():
    # Comments and processing instructions have a non-string tag;
    # etree.QName() raises ValueError on them, so leave them untouched.
    if not isinstance(elem.tag, str):
        continue
    elem.tag = etree.QName(elem).localname
etree.cleanup_namespaces(xml)

In [3]:
# Characters removed from the text before tokenisation.
# The double quote and the backslash are escaped explicitly: the previous
# literal closed and reopened the string at `""`, so `"` was never part of the
# set, and `\(` was an invalid escape sequence.
# Original note: possibly remove the hyphen here so that hyphenated words can
# be merged later in the output file.
punctuation = "[{`,.?!:;/\\()'\"¬}]"

In [4]:
# Flatten every <l> element of the chronicle into one row per token.
# Each row records the index of its <l> element (sentence_id), the token, a
# label and an attribute:
#   * text directly inside <l> gets the label 'O';
#   * text inside a child element gets '<tag>-B' for the first token of an
#     annotation and '<tag>-I' for every token that continues it;
#   * <hi> children get an empty label (the old check compared the already
#     suffixed label with 'hi' and therefore never matched);
#   * <waarneming> children carry their @waarneming value as attribute; every
#     other row has an empty attribute (previously the value of an earlier
#     element could leak into the next one, or the name could be undefined).
previous_row = dict()
tokens = []

# Deletes every character listed in `punctuation` in a single pass.
punctuation_table = str.maketrans('', '', punctuation)

line_elements = xml.xpath('//l')
for i, line in tqdm(enumerate(line_elements),
                    total=len(line_elements)):
    for element in line.xpath('child::text()|*'):
        if isinstance(element, etree._ElementUnicodeResult):
            # Plain text node: not part of any annotation.
            label = 'O'
            attribute = ''
            # Drop the '¬' break marker (optionally followed by '#' and a space).
            wordstring = re.sub(r"((¬#?) ?)", "", str(element)).lower()
            wordstring = wordstring.translate(punctuation_table)
            for token in TOKENIZER(wordstring):
                tokens.append(dict(sentence_id=i,
                                   token=token,
                                   label=label,
                                   attribute=attribute))
                # An 'O' token ends any running annotation. Text that yields
                # no tokens (e.g. whitespace only) deliberately does not.
                previous_row = dict()
        else:
            tag = element.xpath('name()')
            if tag == 'hi':
                label = ''
            elif previous_row and previous_row['label'] in (tag + '-B', tag + '-I'):
                # Directly follows a token of the same annotation type.
                label = tag + '-I'
            else:
                label = tag + '-B'

            if tag == 'waarneming':
                attribute = ''.join(element.xpath('@waarneming'))
            else:
                attribute = ''

            text = ''.join(element.xpath('descendant::text()'))
            wordstring = str(text).lower().translate(punctuation_table)
            for j, token in enumerate(TOKENIZER(wordstring)):
                # Only the first token of an element may keep the '-B' label.
                if j > 0 and label != '':
                    label = tag + '-I'
                row = dict(sentence_id=i,
                           token=token,
                           label=label,
                           attribute=attribute)
                tokens.append(row)
                previous_row = dict(row)

HBox(children=(IntProgress(value=0, max=1291), HTML(value='')))




In [5]:
import pandas as pd

# Turn the collected token rows into a table and store it on disk so the
# feature step further down can reload it.
tokenized_csv_path = '../output/1789_Brug_Wall/tokenized_text.csv'
tokenized_text = pd.DataFrame(data=tokens)
tokenized_text.to_csv(tokenized_csv_path)

In [6]:
# Sanity check: show the distinct labels produced by the tokenisation step.
label_column = tokenized_text['label']
label_column.unique()

array(['O', 'informatiebron-B', 'waarneming-B', 'ontvanger-B',
       'informatiebron-I', 'waarneming-I', 'ontvanger-I'], dtype=object)

# Create file for fastText

In [49]:
# Build the lower-cased, punctuation-free text of the whole chronicle.
# The title text is used as the output file name in the next cell.
# './/tag' is the explicit form of the former '//tag' lookups: lxml rewrites a
# leading '/' to './' itself and emits a FutureWarning about it.
title = xml.find('.//title')
fname = title.text
text = xml.find('.//text')
chronicle = ''.join(text.itertext())
# Drop the '¬' break marker (optionally followed by '#' and a space).
wordstring = re.sub(r"((¬#?) ?)", "", chronicle.lower())
# Remove all punctuation characters in one pass instead of calling
# str.replace on the full text once per punctuation occurrence.
wordstring = wordstring.translate(str.maketrans('', '', punctuation))

In [50]:
# Write the tokens as plain whitespace-separated text, which is the input
# format fastText trains on. str(list) would store the Python list repr
# instead, so every "word" would carry brackets, quotes and commas.
# The trailing newline keeps the last token of this file separate from the
# first token of the next one when the files are concatenated.
with open(str(fname) + '.txt', 'w', encoding='utf-8') as f:
    f.write(' '.join(TOKENIZER(wordstring)) + '\n')

## Merge multiple files into one

In [53]:
# Concatenate the per-chronicle token files into one training corpus.
# The pattern is relative, like the other fastText paths in this file, so the
# cell no longer depends on one user's home directory.
# NOTE(review): assumes '..' is the repository root that holds 'fasttext/' - confirm.
# Sorting makes the merge order reproducible; glob returns files in arbitrary order.
file_list = sorted(glob.glob("../fasttext/xxxx_Brug_Wall/*.txt"))
if not file_list:
    # fileinput falls back to reading stdin when it is given no file names.
    raise FileNotFoundError("no .txt files found in ../fasttext/xxxx_Brug_Wall/")

with open('xxxx_Brug_Wall.txt', 'w') as file, fileinput.input(file_list) as input_lines:
    file.writelines(input_lines)

# Train model

In [56]:
# Train unsupervised fastText word vectors on the merged corpus file.
training_corpus = "../fasttext/xxxx_Brug_Wall/xxxx_Brug_Wall_kopie"
model = fasttext.train_unsupervised(training_corpus)

In [58]:
# Store the trained model next to its training corpus.
trained_model_path = "../fasttext/xxxx_Brug_Wall/xxxx_Brug_Wall.bin"
model.save_model(trained_model_path)

# Load model

In [54]:
# Load a previously saved model; this rebinds `model`, replacing any model
# trained earlier in the session.
pretrained_model_path = "../fasttext/alle_Brug_Wall/alle_Brug_Wall.bin"
model = fasttext.load_model(pretrained_model_path)

# Adding vectors to feature file

In [68]:
# Reload the token table written earlier in this notebook.
# keep_default_na=False keeps every cell as written: with the default NA
# handling, tokens that spell an NA marker (e.g. "nan", "null") are read back
# as float NaN and can no longer be passed to the model as words. As a side
# effect, empty `attribute` cells load as '' instead of NaN.
tokenized_text = pd.read_csv(
    '../output/1789_Brug_Wall/tokenized_text.csv',
    index_col=0,
    keep_default_na=False,
)

In [69]:
# Attach the fastText vector of each token to its row.
all_rows = []

# iterrows() is a generator without a length, so tqdm needs an explicit total
# to show real progress instead of an indeterminate bar.
for index, row in tqdm(tokenized_text.iterrows(), total=len(tokenized_text)):
    row_dict = dict(row)
    row_dict['vector'] = model.get_word_vector(row_dict['token'])
    all_rows.append(row_dict)

# One DataFrame built from all records at once.
feature_file = pd.DataFrame(all_rows)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [70]:
# Persist the token rows together with their vectors as a tab-separated file.
feature_file_path = '../output/1789_Brug_Wall/feature_file.csv'
feature_file.to_csv(feature_file_path, sep='\t')

# Merge feature files

In [71]:
# Merge all tab-separated feature files into one table indexed by token.
concat_path = '../output/alle_Brug_Wall/alle_Brug_Wall_concat.csv'

# The merged file is written into the directory that is globbed here, so it
# must be skipped; otherwise a rerun would read its own earlier output.
# Sorting makes the row order reproducible.
all_feature_files = sorted(
    path for path in glob.glob('../output/alle_Brug_Wall/*.csv')
    if os.path.basename(path) != os.path.basename(concat_path)
)

file_list = [pd.read_csv(path, sep='\t') for path in all_feature_files]

# 'Unnamed: 0' is the index column that read_csv picks up from the saved
# files. drop() takes `columns=` by keyword: the positional axis argument
# used before was removed in pandas 2.0.
total = (
    pd.concat(file_list, ignore_index=True, sort=False)
    .set_index('token')
    .drop(columns=['Unnamed: 0', 'sentence_id'])
)
total.to_csv(concat_path)