In [63]:
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize
import pandas as pd
import pickle
import xml.etree.ElementTree as ET
import re
from tqdm import tqdm_notebook as tqdm

To replicate Rao & Tetreault (2018), we need to vectorize the text data using word embeddings (GloVe) trained on the whole Yahoo Answers corpus. The word vectors have 300 dimensions and the total vocabulary size is 50,000.

Note: "At test time, we replace unknown tokens with the source token that has the highest attention weight."

# prepare data: extract answers from XML file

In [3]:
# Switch the working directory to the local corpus folder and list its contents.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
%cd C:\Users\cramerus\Documents\Thesis\Yahoo Answers Corpus
%ls

C:\Users\cramerus\Documents\Thesis\Yahoo Answers Corpus
 Volume in Laufwerk C: hat keine Bezeichnung.
 Volumeseriennummer: D638-A535

 Verzeichnis von C:\Users\cramerus\Documents\Thesis\Yahoo Answers Corpus

09.04.2019  12:29    <DIR>          .
09.04.2019  12:29    <DIR>          ..
09.04.2019  12:31    12.263.270.226 FullOct2007.xml
16.09.2009  22:27     6.131.634.176 FullOct2007.xml.part1
16.09.2009  22:43     1.770.876.044 FullOct2007.xml.part1.gz
17.09.2009  02:01     6.131.636.050 FullOct2007.xml.part2
17.09.2009  02:18     1.936.135.644 FullOct2007.xml.part2.gz
17.09.2009  23:51             4.195 README.txt
16.09.2009  21:49            19.457 small_sample.xml
16.09.2009  21:49             1.874 WebscopeReadMe.txt
               8 Datei(en), 28.233.577.666 Bytes
               2 Verzeichnis(se), 260.927.369.216 Bytes frei


In [51]:
# Sanity check: stripping the '<br />' tag from a string that consists only of
# that tag leaves the empty string.
'<br />'.replace('<br />', '')

''

In [67]:
# Input XML files: the full corpus dump and the small sample.
file, small_file = 'FullOct2007.xml', 'small_sample.xml'
# Output text files written for the full corpus and for the sample.
new_output, sample_output = 'YahooCorpus.txt', 'SampleCorpus.txt'

def xml_to_txt(input_file, output_file):
    """Extract answers from a Yahoo Answers XML dump into a tokenized text file.

    The XML is streamed with ``ET.iterparse``. For every ``vespaadd`` element
    (one question) the text of each ``answer_item`` is collected, stripped of
    ``<br />`` tags, whitespace-normalized and de-duplicated. If the question's
    ``language`` element is missing/empty or equals ``'en-us'``, each collected
    answer is tokenized with ``word_tokenize`` and written to ``output_file``
    as one line of space-separated tokens.

    Parameters
    ----------
    input_file : str
        Path of the XML file to read.
    output_file : str
        Path of the UTF-8 text file to create (overwritten if it exists).

    Returns
    -------
    None
    """
    with open(output_file, 'w', encoding='utf-8') as f:
        root = None
        # Initialised up front so an element outside any 'vespaadd' cannot hit
        # an unbound name.
        texts = []
        language = None

        for event, elem in tqdm(ET.iterparse(input_file, events=('start', 'end'))):
            if event == 'start':
                if root is None:
                    root = elem  # first 'start' event is always the document root
                if elem.tag == 'vespaadd':  # start new question
                    texts = []
                    language = None
                # Element content is only guaranteed to be complete on 'end';
                # reading elem.text here could pick up a truncated answer.
                continue

            if elem.tag == 'answer_item':
                if elem.text is None:
                    continue
                # Replace the tag with a space (not '') so that words separated
                # only by a line break are not glued together; the whitespace
                # normalization below collapses any doubled spaces.
                text = elem.text.replace('<br />', ' ')
                text = re.sub(r'\s+', ' ', text).strip()
                if text not in texts:
                    texts.append(text)
            elif elem.tag == 'language':
                language = elem.text  # should be None or en-us
            elif elem.tag == 'vespaadd':
                if language is None or language == 'en-us':
                    for text in texts:
                        tokens = word_tokenize(text)
                        # Same line format as before: every token followed by a space.
                        f.write(''.join(token + ' ' for token in tokens))
                        f.write('\n')
                # Free the finished question and detach it from the root;
                # otherwise the root keeps accumulating emptied children and
                # memory grows with the size of the input file.
                elem.clear()
                root.clear()

In [64]:
# Trial run on the small sample file.
xml_to_txt(input_file=small_file, output_file=sample_output)




In [68]:
# Convert the full corpus dump.
xml_to_txt(input_file=file, output_file=new_output)




# Train GloVe model on corpus

In [None]:
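# GloVe reference implementation: https://github.com/stanfordnlp/GloVe
# Example training pipeline (demo.sh): https://github.com/stanfordnlp/GloVe/blob/master/demo.sh
# Source code of the training tools: https://github.com/stanfordnlp/GloVe/tree/master/src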