In [57]:
# Load chunks from all novels from particular authors
import os
import os.path
import re
# Authors whose novels form the sample subset; a file belongs to the sample
# when its path contains "_<author>_".
authors = [
    "Topsoee", "JacobsenJP", "SkramE", "SkramA", "AndersenHC",
    "Bauditz", "Bang", "Kielland", "Schandorph", "Drachman",
    "Hamsun", "HansenJH", "Pontoppidan", "GB",
    "Levison", "Fibiger"
    ]
# os.listdir returns entries in arbitrary order. Chunk positions derived from
# this list are later used as sentence ids, so sort to keep them reproducible
# across kernel restarts and machines.
paths = sorted(
    os.path.join("korpus", filename)
    for filename in os.listdir("korpus")
    if filename.endswith(".txt")
)
sample_paths = [
    path for path in paths
    if any(("_" + author + "_") in path for author in authors)
]
# Number of corpus files, and how many of them belong to the sample authors.
len(paths), len(sample_paths)

(839, 67)

In [61]:
def split(list_a, chunk_size):
    """Lazily yield consecutive slices of ``list_a`` of length ``chunk_size``.

    The last slice is shorter when ``len(list_a)`` is not a multiple of
    ``chunk_size``; an empty input yields nothing.
    """
    offsets = range(0, len(list_a), chunk_size)
    yield from (list_a[offset:offset + chunk_size] for offset in offsets)
        
def chunk(paths, chunk_size=75):
    """Cut each text file into token chunks and keep those mentioning the keyword.

    Every file is read as UTF-8, split on whitespace and cut into consecutive
    chunks of ``chunk_size`` tokens. A chunk is kept when the keyword pattern
    is found anywhere in its space-joined text.

    Args:
        paths: iterable of paths to text files.
        chunk_size: number of tokens per chunk (default 75).

    Returns:
        A pair ``(chunks, filenames)`` of equally long tuples: the kept token
        lists, and for each one the source file name without directory and
        extension. Both are empty tuples when nothing matches.
    """
    keyword = re.compile("[Ss]kj?æbne.?.?")  # compiled once instead of per chunk
    chunks_filenames = []
    for path in paths:
        # File name without directory and extension, whatever the extension length.
        title = os.path.splitext(os.path.basename(path))[0]
        with open(path, encoding="utf-8") as f:
            tokens = f.read().split()
        chunks_filenames.extend(
            (doc, title)
            for doc in split(tokens, chunk_size)
            if keyword.search(" ".join(doc))
        )
    if not chunks_filenames:
        # zip(*[]) yields nothing, so the two-way unpacking below would raise.
        return (), ()
    chunks, filenames = zip(*chunks_filenames)
    return chunks, filenames
    
# Chunk the full corpus first, then the author sample, with the same routine.
(docs, titles), (sample_docs, sample_titles) = map(chunk, (paths, sample_paths))
# Show both chunk counts plus the first sample chunk and the title it came from.
len(docs), len(sample_docs), sample_docs[0], sample_titles[0]

(7845,
 300,
 ['naaer',
  'fra',
  'de',
  'højeste',
  'aandelige',
  'Begsæringer',
  'til',
  'de',
  'sandselig',
  'betagende',
  '.',
  'Men',
  'der',
  'var',
  'tillige',
  'noget',
  'Andet',
  ',',
  'som',
  'herved',
  'kom',
  'frem',
  '.',
  'Det',
  'havde',
  'nemlig',
  'forekommet',
  'ham',
  ',',
  'at',
  'der',
  'dog',
  'manglede',
  'ham',
  'Noget',
  'i',
  'at',
  'vende',
  'hjem',
  'som',
  'Sejerherre',
  ';',
  'men',
  'denne',
  'Mangel',
  'var',
  'der',
  'nu',
  'ikke',
  'mere',
  'efter',
  'dette',
  'Møde',
  ',',
  'og',
  'den',
  'Mulighed',
  'af',
  'at',
  'kunne',
  'beherske',
  'en',
  'Kvindes',
  'Skæbne',
  ',',
  'det',
  'havde',
  'lagt',
  'i',
  'hans',
  'Haand',
  '.',
  'Nu',
  'følte',
  'han'],
 '1875_Topsoee_JasonMedDetGyldneSkind')

In [56]:
# Save as unlabeled test set
# Create the output directories for the "unlabeled" and "sample" datasets
# (-p: no error if they already exist).
!mkdir -p wsd/unlabeled-da wsd/sample-da
from xml.sax.saxutils import escape
import re

def print_dataset(chunks, name):
    """Write ``chunks`` as a WSD dataset: an XML data file plus a key file.

    Two files are written to ``wsd/<name>-da/`` (the directory is created if
    missing): ``<name>-da.data.xml`` and ``<name>-da.gold.key.txt``.

    A chunk becomes a ``<sentence>`` when the keyword pattern occurs in its
    space-joined text. The sentence id carries the chunk's position in
    ``chunks`` (``d000.s<position>``). Inside a sentence, every token that
    starts with the keyword becomes an ``<instance>``; all other tokens become
    ``<wf>`` elements. Instances are numbered per sentence (``t000``,
    ``t001``, ...) so that each instance id is unique even when a chunk
    contains several hits. Consumers should therefore join predictions on the
    instance id rather than on line position.

    The key file holds one line per instance, always with the same sense id:
    this data is unlabeled, so the key is a placeholder.

    Args:
        chunks: sequence of token lists.
        name: dataset name used in the directory and file names.
    """
    out_dir = "wsd/{}-da".format(name)
    os.makedirs(out_dir, exist_ok=True)  # do not depend on a separate mkdir step
    keyword = re.compile("[Ss]kj?æbne.?.?")
    data_path = "{}/{}-da.data.xml".format(out_dir, name)
    key_path = "{}/{}-da.gold.key.txt".format(out_dir, name)
    with open(data_path, "w", encoding="utf-8") as f, open(key_path, "w", encoding="utf-8") as g:
        print('<?xml version="1.0" encoding="UTF-8"?>', file=f)
        # NOTE(review): the corpus name is "test-da" for every dataset; kept as is
        # because it is unclear whether the evaluation code relies on it.
        print('<corpus lang="da" name="test-da" sources="wordnet-danish">', file=f)
        print('   <text id="d000">', file=f)
        for i, doc in enumerate(chunks):
            if not keyword.search(" ".join(doc)):
                continue
            print('      <sentence id="d000.s{:03d}" source="wordnet-danish">'.format(i), file=f)
            instance_no = 0  # running instance number within this sentence
            for token in doc:
                if keyword.match(token):
                    instance_id = 'd000.s{:03d}.t{:03d}'.format(i, instance_no)
                    instance_no += 1
                    print('         <instance id="{}" lemma="skæbne" pos="NOUN">{}</instance>'.format(instance_id, escape(token)), file=f)
                    print('{} bn:00026603n'.format(instance_id), file=g)
                else:
                    print('         <wf lemma="" pos="">{}</wf>'.format(escape(token)), file=f)
            print('      </sentence>', file=f)
        print('    </text>', file=f)
        print('</corpus>', file=f)
# Export the full corpus chunks first, then the author sample, in the same format.
for dataset_chunks, dataset_name in ((docs, "unlabeled"), (sample_docs, "sample")):
    print_dataset(dataset_chunks, dataset_name)

In [None]:
# Run the evaluation script with the "unlabeled" config and then the "sample"
# config, using the bert-base-multilingual-cased checkpoint and the --cpu flag.
# NOTE(review): PYTHONPATH=. and the relative src/ and pretrained_models/ paths
# suggest this must be run from the evaluation code's checkout directory — confirm
# the notebook's working directory.
!PYTHONPATH=. python src/evaluation/evaluate_model.py --config /work/config_da.unlabeled.yaml --checkpoint_path pretrained_models/bert-base-multilingual-cased/model_state_best.th  --cpu
!PYTHONPATH=. python src/evaluation/evaluate_model.py --config /work/config_da.sample.yaml --checkpoint_path pretrained_models/bert-base-multilingual-cased/model_state_best.th  --cpu

In [59]:
# Convert predictions back to readable format
import csv
# Each prediction is joined to its chunk through the sentence number embedded
# in the instance id ("d000.s<N>.t<M>", where N is the chunk's index in docs)
# rather than through line position. Missing, extra or reordered prediction
# lines can then no longer attach a label to the wrong text.
# NOTE(review): assumes the first column of every prediction line is the
# instance id from the data XML — confirm against the evaluation script's output.
with open("xl-wsd-code/pretrained_models/batchnorm_wsd_classifier_bert-base-multilingual-cased/evaluation/unlabeled-da.predictions.txt", encoding="utf-8") as f, open("predictions.csv", "w", newline="", encoding="utf-8") as g:
    csvwriter = csv.writer(g)
    for line in f:
        fields = line.split()
        if not fields:
            continue  # tolerate blank or trailing empty lines
        if len(fields) < 2:
            raise ValueError("prediction line without a label: {!r}".format(line))
        instance_id, prediction = fields[0], fields[1]
        id_parts = instance_id.split(".")
        if len(id_parts) != 3 or not id_parts[1].startswith("s") or not id_parts[1][1:].isdigit():
            raise ValueError("unexpected instance id in predictions file: {!r}".format(instance_id))
        sentence_index = int(id_parts[1][1:])
        # Columns: source title, chunk text, predicted sense id.
        csvwriter.writerow([titles[sentence_index], " ".join(docs[sentence_index]), prediction])

In [31]:
# Load annotated test set
from pandas import read_csv
import os
import os.path

# Collect the annotated rows from every ';'-separated CSV file in "annotations".
# os.listdir returns entries in arbitrary order; sorting keeps the order of
# `data` (and any numbering derived from it) the same on every run.
data = []
for filename in sorted(os.listdir("annotations")):
    if not filename.endswith(".csv"):
        continue
    df = read_csv(os.path.join("annotations", filename), delimiter=";")
    # Keep only rows that actually have a value in the 'match' column.
    data += df[df["match"].notnull()].to_dict(orient="records")
# Show one record as a sanity check of the column layout.
data[1]

{'corpus': 'MeMo corpus v. 0.5',
 'match_position': 22977994.0,
 'left context': 'Enten—Eller , enten Liv eller Død — saa levende var den almindelige Interesse for og Deltagelse i den dømte Forbryders',
 'match': 'Skæbne',
 'right_context': '. Pludselig gjorde Bødlen , søm hidtil havde staaet ubevægelig og med korslagte Arme , . en Bevægelse . Han',
 'Kodning': 0.0,
 'text_gender': nan,
 'corpus_id': 'MEMO_ALL',
 'text_nationality': nan,
 'text_illustrations': 'n',
 'text_subtitle': 'Original Roman fra Nutiden',
 'text_publisher': 'Simonsen & Co.',
 'text_id': '1880_RH_ArbejderBankierOgBaronEllerLykkensOmskiftelser',
 'text_pseudonym': 'R.H.',
 'text_pages': 750.0,
 'text_source': 'KB',
 'text_surname': nan,
 'text_file_id': 130024104401.0,
 'text_year': 1880.0,
 'text_typeface': 'gothic',
 'text_volume': nan,
 'text_price': '5',
 'text_title': 'Arbejder, Bankier og Baron eller Lykkens Omskiftelser',
 'text_firstname': nan}

In [42]:
# Save annotated test set for evaluation
from xml.sax.saxutils import escape

# Write the annotated rows as a WSD test set: one <sentence> per row, made of
# the left-context tokens, the matched word as the single <instance>, and the
# right-context tokens. The key file gets the annotated sense per instance.
# The output directory is created here because no other cell creates it.
os.makedirs("wsd/test-da", exist_ok=True)
with open("wsd/test-da/test-da.data.xml", "w", encoding="utf-8") as f, open("wsd/test-da/test-da.gold.key.txt", "w", encoding="utf-8") as g:
    print('<?xml version="1.0" encoding="UTF-8"?>', file=f)
    print('<corpus lang="da" name="test-da" sources="wordnet-danish">', file=f)
    print('   <text id="d000">', file=f)
    for i, line in enumerate(data):
        # An empty CSV cell is loaded as NaN (a float), which has no .split();
        # treat a missing context as having no tokens.
        left_tokens, right_tokens = (
            line[key].split() if isinstance(line[key], str) else []
            for key in ("left context", "right_context")
        )
        # Coding 0 maps to bn:00026603n, any other value to bn:00019222n.
        # int() raises ValueError on a missing (NaN) coding, so every row
        # must be annotated.
        sense = 'bn:00026603n' if int(line["Kodning"]) == 0 else 'bn:00019222n'
        print('      <sentence id="d000.s{:03d}" source="wordnet-danish">'.format(i), file=f)
        for token in left_tokens:
            print('         <wf lemma="" pos="">{}</wf>'.format(escape(token)), file=f)
        print('         <instance id="d000.s{:03d}.t000" lemma="skæbne" pos="NOUN">{}</instance>'.format(i, escape(line["match"])), file=f)
        print('d000.s{:03d}.t000 {}'.format(i, sense), file=g)
        for token in right_tokens:
            print('         <wf lemma="" pos="">{}</wf>'.format(escape(token)), file=f)
        print('      </sentence>', file=f)
    print('    </text>', file=f)
    print('</corpus>', file=f)

In [None]:
# Run the evaluation script with the "test" config (the annotated test set),
# using the bert-base-multilingual-cased checkpoint and the --cpu flag.
# NOTE(review): relative paths suggest this must be run from the evaluation
# code's checkout directory — confirm the notebook's working directory.
!PYTHONPATH=. python src/evaluation/evaluate_model.py --config /work/config_da.test.yaml --checkpoint_path pretrained_models/bert-base-multilingual-cased/model_state_best.th  --cpu