# Download, pre-process and run feature extraction on the data sets

Joakim: Word n-gram for facsimile data. Divergences: Jensen-Shannon, KL, Jeffreys.

Bea: Add Menota data http://clarino.uib.no/menota/catalogue (mostly Norwegian and Icelandic) and HistCorp https://cl.lingfil.uu.se/histcorp/index.html

In [2]:
from tqdm import tqdm
import time

import numpy as np
import re
from multiprocessing import Pool

# Web
from urllib.request import urlopen, urlretrieve

# File system
import os
import os.path

# For downloading and using external libraries
import sys
sys.path.append('external')
assert os.path.exists('external')

# Data location
assert os.path.exists("data")

# Preprocessing

Arabic and roman numerals for the years 1100–2020 are replaced by a `<NUM>` token by the text pipeline. Instead of making a 'smart' regex for all numerals, we generate every number (arabic and roman) that we want to remove and join them into one long alternation of special cases. This long regex is compiled once to make it faster.

In [3]:
from roman import toRoman

# Build one alternation of literal "year" tokens. Raw strings are used for
# every regex fragment so that the backslashes reach the regex engine intact
# (non-raw '\_' and '\.' are invalid string escapes).
tokens = [r'NUMBER\_TOKEN']
for n in range(1100, 2020+1):                                      # The range of years to remove
    tokens.append("%i" % n)                                        # Adds the arabic number
    r = toRoman(n)                                                 # Adds the roman number
    tokens.append(r"\.*".join(list(r)))                            # Adds zero or more dots between roman 'digits'
# Longest pattern first: the regex engine takes the first alternative that
# matches, so a longer numeral must be tried before a shorter one that only
# matches its prefix (e.g. "MD.III" must not be matched as just "MD").
tokens = sorted(tokens, key=len, reverse=True)
numerals_regexp = "(" + "|".join(tokens) + ")"                     # Converts the list of special cases into a regex string
numerals_regexp = r"\b" + numerals_regexp + r"\b"                  # Adds word limits
numerals_regexp = re.compile(numerals_regexp, flags=re.IGNORECASE) # Compiles the regex to make the list of special cases faster

# Tests
m = numerals_regexp
# Numerals outside the year range and ordinary words must not match
assert len(m.findall("i")) == 0
assert len(m.findall("iv")) == 0
assert len(m.findall("i.v")) == 0
assert len(m.findall("Iv")) == 0
assert len(m.findall("I'm a green dream")) == 0
# Roman years match regardless of case and of dots between the 'digits'
assert len(m.findall("MDIII")) > 0
assert len(m.findall("MDIiI")) > 0
assert len(m.findall("MD.IiI")) > 0
assert len(m.findall("mdiii")) > 0
assert len(m.findall("M.DCCCVI")) > 0
# Arabic years match only as whole words
assert '2000' in m.findall("2000 lkfdjöd fösddlfk lorem ipsum1829 1943")
assert '1943' in m.findall("2000 lkfdjöd fösddlfk lorem ipsum1829 1943")
assert '1829' not in m.findall("2000 lkfdjöd fösddlfk lorem ipsum1829 1943")

In [4]:
from functools import partial
import html
import regex

def replace_html_entities(text):
    """Return ``text`` with HTML character references (``&amp;`` etc.) decoded."""
    decoded = html.unescape(text)
    return decoded

def remove_repeated_space(text):
    """Collapse every run of consecutive space characters into one space."""
    return regex.sub(r" +", " ", text)

def remove_html_tags(text):
    """Delete every ``<...>`` span that opens and closes on the same line."""
    return regex.sub(r"<.*?>", "", text)

def remove_footnotes(text):
    """Delete footnote markers, i.e. digits wrapped in round brackets such as ``(12)``."""
    return regex.sub(r"\(\d+\)", "", text)

def remove_parentheses(text):
    """Delete all opening (``Ps``) and closing (``Pe``) bracket characters."""
    return regex.sub(r"\p{Ps}|\p{Pe}", "", text)

def split_regroup(match, group=1, join_by=" ", prefix=" ", suffix=""):
    """Spread out the characters of one match group.

    The characters of ``match.group(group)`` are joined with ``join_by`` and
    the result is wrapped in ``prefix`` and ``suffix``. Intended for use as a
    replacement callback for a regex ``sub`` call.
    """
    characters = match.group(group)
    spread = join_by.join(characters)
    return prefix + spread + suffix

def split_punctuation_from_word(text):
    """Put a space between a word and punctuation attached to its start or end."""
    # Punctuation run at the start of a word: add a space after it
    leading = r"(?<=\s|^)(\p{p}+)\b"
    # Punctuation run at the end of a word: add a space before it
    trailing = r"\b(\p{p}+)(?=\s|$)"
    text = regex.sub(leading, r"\1 ", text)
    return regex.sub(trailing, r" \1", text)

def split_punctuation(text):
    """Separate free-standing runs of two or more punctuation marks.

    Every character of such a run ends up separated from its neighbours by a
    single space; no extra prefix or suffix is added around the run.
    """
    def spread(match):
        return split_regroup(match, prefix="", suffix="")

    return regex.sub(r"(?<=\s|^)(\p{p}{2,})(?=\s|$)", spread, text)

def substitute_number(text, replace="<NUM>"):
    """Replace every match of the module-level ``numerals_regexp`` with ``replace``."""
    return numerals_regexp.sub(replace, text)

In [5]:
class ProcessPipe(object):
    """A fixed sequence of one-argument callables applied one after another."""

    def __init__(self, *processes):
        # Kept as the tuple of callables, in application order
        self.processes = processes

    def transform(self, input):
        """Feed ``input`` through every process in order and return the result."""
        result = input
        for step in self.processes:
            result = step(result)
        return result

In [6]:
# Steps run before number substitution: decode entities, drop markup,
# footnote markers and brackets, then isolate punctuation
preprocess = [
    replace_html_entities,
    remove_html_tags,
    remove_footnotes,
    remove_parentheses,
    split_punctuation_from_word,
    split_punctuation,
]
# Steps run after number substitution: whitespace clean-up
postproccess = [remove_repeated_space, lambda x: x.strip()]

# The complete text pipeline used for all raw-text data sets
textprocess = ProcessPipe(*(preprocess + [substitute_number] + postproccess))

# Process data sets

## SemEval 2015

[Data description](http://alt.qcri.org/semeval2015/task7/)

In [7]:
# Fetch the two SemEval 2015 task 7 archives. An archive that already exists
# locally is not downloaded again, so the cell can be re-run cheaply.
semeval_archives = [
    ('SemEval2015_1.zip', """http://alt.qcri.org/semeval2015/task7/data/uploads/training08.zip"""),
    ('SemEval2015_2.zip', """http://alt.qcri.org/semeval2015/task7/data/uploads/moretraining.zip"""),
]
# urlretrieve writes straight to the target path and does not create missing
# parent directories, so make sure the download directory exists first.
os.makedirs(os.path.join('data', 'original'), exist_ok=True)
for archive_name, data_url in semeval_archives:
    fn = os.path.join('data', 'original', archive_name)
    if not os.path.exists(fn):
        urlretrieve(data_url, fn)

!unzip -o data/original/SemEval2015_1.zip -d data/original
!unzip -o data/original/SemEval2015_2.zip -d data/original

Archive:  data/original/SemEval2015_1.zip
   creating: data/original/training08/
  inflating: data/original/training08/training08T1.txt  
   creating: data/original/__MACOSX/
   creating: data/original/__MACOSX/training08/
  inflating: data/original/__MACOSX/training08/._training08T1.txt  
  inflating: data/original/training08/training08T2.txt  
Archive:  data/original/SemEval2015_2.zip
   creating: data/original/moreTraining/
  inflating: data/original/moreTraining/training12T2.txt  
  inflating: data/original/moreTraining/training12T1.txt  


In [8]:
!grep '<text ' data/original/training08/training08T1.txt | wc --lines
!grep '<text ' data/original/training08/training08T2.txt | wc --lines
!grep '<text ' data/original/moretraining/training12T1.txt | wc --lines
!grep '<text ' data/original/moretraining/training12T2.txt | wc --lines
# !head -n 10 training08/training08T2.txt
# !grep -v '<' training08/training08T2.txt | head -n 10
# !grep '<text ' training08/training08T2.txt | head -n 10

wc: illegal option -- -
usage: wc [-clmw] [file ...]
wc: illegal option -- -
usage: wc [-clmw] [file ...]
wc: illegal option -- -
usage: wc [-clmw] [file ...]
wc: illegal option -- -
usage: wc [-clmw] [file ...]


In [24]:
def parse(textlines, subtask):
  """Parse the data from one semeval file. Text should be split into lines.

  Args:
    textlines: iterable of already stripped lines of one SemEval file.
    subtask: value stored under the key 'subtask' of every parsed item.

  Returns:
    A tuple ``(data, errors)``. ``data`` is a list of dicts, one per document
    that had a text line, with the keys 'subtask', 'text', 'date' and the tag
    found on the opening ``<text `` line (e.g. 'id'). 'date' is the
    intersection ``(start, end)`` of all spans marked ``yes``; it is
    ``(-inf, inf)`` when no such span was seen and ``(None, None)`` when a
    span could not be read. ``errors`` counts the unreadable spans.
  """
  import re
  import numpy as np
  attribute = re.compile(r'(?P<tag>yes|id|no)=\"(?P<data>[a-zA-Z0-9\-]+)\"')
  span_prefixes = ('<textf', '<textm', '<textc')
  errors = 0
  data = list()
  item = None                  # Document currently being read, if any
  y = [-np.inf, np.inf]        # Running intersection of the 'yes' spans
  for line in tqdm(textlines):
    prefix = line[:6].lower()
    if prefix == '<text ':
      # A new document starts: reset the span and remember the first attribute
      y = [-np.inf, np.inf]
      item = {'subtask': subtask}
      first = attribute.search(line)
      if first is not None:
        item = {first.group('tag'): first.group('data'), 'subtask': subtask}
    elif prefix in span_prefixes:
      # Dating data
      for match in attribute.finditer(line):
        if match.group('tag') != 'yes':
          continue
        s = match.group('data').split("-")
        try:
          start, end = int(s[0]), int(s[1])
        except (ValueError, IndexError):
          # Unreadable span: the document's date becomes unknown
          y = [None, None]
          errors += 1
        else:
          # Once the date is unknown it stays unknown; comparing None with
          # an int would raise a TypeError
          if y[0] is not None:
            y[0] = max(y[0], start)
            y[1] = min(y[1], end)
    elif len(line) > 10 and line[0] != '<':
      # Text line; ignored when it appears before any document was opened
      if item is not None:
        item['text'] = textprocess.transform(line)
    elif prefix == '</text':
      if item is not None:
        item['date'] = tuple(y)
        if 'text' in item:
          data.append(item)

  return data, errors

# Every SemEval file together with the subtask it belongs to. The directory
# name matches the case created by unzip (`moreTraining`), which matters on
# case-sensitive file systems.
semeval_files = [
  ('data/original/training08/training08T1.txt', 1),
  ('data/original/training08/training08T2.txt', 2),
  ('data/original/moreTraining/training12T1.txt', 1),
  ('data/original/moreTraining/training12T2.txt', 2),
]

data = list()
for path, subtask in semeval_files:
  with open(path, 'r') as file:
    textlines = [tl.strip() for tl in file]
  d, e = parse(textlines, subtask=subtask)
  # Items whose date could not be parsed carry (None, None); drop them for
  # every file so the date check below never compares None values
  data.extend(x for x in d if x["date"][0] is not None)
  print("Errors:", e)

for item in data:
  # One token list per "."-separated sentence
  item['tokens'] = [sentence.strip().split() for sentence in item['text'].split(".") if len(sentence) > 0]
  assert item['date'][0] <= item['date'][1]

print("Found %i data items" % len(data))

100%|██████████| 1673/1673 [00:00<00:00, 4153.39it/s]
  2%|▏         | 362/22008 [00:00<00:06, 3582.56it/s]

Errors: 0


100%|██████████| 22008/22008 [00:05<00:00, 4229.88it/s]
100%|██████████| 469/469 [00:00<00:00, 3241.57it/s]
  0%|          | 0/7168 [00:00<?, ?it/s]

Errors: 0
Errors: 0


100%|██████████| 7168/7168 [00:02<00:00, 3500.23it/s]


Errors: 111
Found 4424 data items


In [11]:
# Store the parsed SemEval items under the key "data" in a compressed archive.
# `data` is a list of dicts here, so reading it back needs
# np.load(..., allow_pickle=True).
fn = os.path.join('data', 'SemEval2015.npz')
np.savez_compressed(fn, data=data)

## Colonia

Corpus of Historical Portuguese

__TODO:__
* Verify that all files are found
* Check for duplicates
* Check for empty or suspiciously short books

In [12]:
# This page links to the source material
# This page links to the source material
with urlopen("""http://corporavm.uni-koeln.de/colonia/inventory.html""") as source:
    data = source.read().decode(errors='replace')

base_url = """http://corporavm.uni-koeln.de/colonia/"""
data_path = os.path.join('data', 'original', 'colonia')

# Parse out files from the web page. The dot before "txt" is escaped so that
# only real ".txt" links match, and the flags are combined with "|" as is
# conventional for bit flags.
pattern = r'<a\W+href=\"([\w\/]+\.txt)\">'
files_in_html = re.findall(pattern, data, flags=re.IGNORECASE | re.MULTILINE)
file_urls = [base_url+fn for fn in files_in_html]
# (url, local file name) pairs; the local name is the last URL component
files = [(url, os.path.join(data_path, url.split("/")[-1])) for url in file_urls]
print("Found %i files" % len(files))

# Make data path (no error if it already exists)
os.makedirs(data_path, exist_ok=True)

def _get(data):
    """Simple http downloader"""
    url, local_fn = data
    from urllib.request import urlretrieve
    try:
        urlretrieve(url, local_fn)
    except:
        pass

# Only download files once
dl = [f for f in files if not os.path.exists(f[1])]

# Use at least 5 workers. os.cpu_count() may return None when the number of
# CPUs cannot be determined, which max() cannot compare with an int.
with Pool(processes=max(5, os.cpu_count() or 1)) as pool:
    # The actual downloading; the results are None and only drive the progress bar
    for _ in tqdm(pool.imap_unordered(_get, dl), desc="Downloading colonia", total=len(dl)):
        pass

# Count the missing files
missing = [f for f in files if not os.path.exists(f[1])]
if len(missing) > 0:
    print(", %i missing files" % len(missing))

Found 103 files


Downloading colonia: 100%|██████████| 4/4 [00:00<00:00, 41.41it/s]


, 4 missing files


In [13]:
!head -n 10 data/original/colonia/melo1650.txt
!ls data/original/colonia

﻿<text id="melo1650">
<s>
Pinto	V	pintar
para	PRP	para
os	DET	o
tempos	NOM	tempo
a	PRP	a
imagem	NOM	imagem
de	PRP	de
um	DET	um
abreu1856.txt	  barbosa1691.txt   guerreiro16th.txt  queiroz1878.txt
aires1752.txt	  barreto1915.txt   holanda1548.txt    queiroz1887.txt
alencar1857.txt   barreto1920.txt   lobo1619.txt       queiroz1888.txt
alencar1862.txt   barreto1948.txt   macedo1811.txt     queiroz1900.txt
alencar1865.txt   barros1540.txt    macedo1844.txt     rocha1910.txt
alencar1875.txt   botelho1705.txt   macedo1878.txt     sanches1760.txt
almeida1633.txt   brandao1632.txt   machado1876.txt    silva1733.txt
almeida17th.txt   brochado17th.txt  machado1878.txt    silva1734.txt
almeida1852.txt   caminha1500.txt   machado1881.txt    silva1735.txt
almeida1901.txt   caminha1893.txt   machado1885.txt    silva1736b.txt
almeida1905.txt   caminha1894.txt   machado1891.txt    silva1736.txt
almeida1921.txt   caminha1895.txt   machado1899.txt    silva1737b.txt
alves1870.txt	  caminha1896.txt   mac

The following is some old code for processing Colonia.

In [14]:
colonia = list()
year_pattern = r'[\w\/]+((1[5-9])([0-9]{2}|th))[ab12]?.txt'

def _parse_file(fn):
  import re
  groups = re.match(year_pattern, fn.split("/")[-1]).groups()
  if groups[2].isnumeric():
    years = int(groups[0])
  else:
    c = int(groups[1])*100
    years = tuple([c, c+99])
  # print(years)
  with open(fn, 'r', encoding='utf-8', errors='replace') as f:
    tokens = list()
    pos = list()
    lemmas = list()  
    in_sentence = False
    for n, rawline in enumerate(f.readlines()):
      text = rawline.strip()
      if text.find("<s>") >= 0:
        tokens.append([])
        pos.append([])
        lemmas.append([])
        in_sentence = True
      elif text.find("</s>") >= 0:
        in_sentence = False
      elif re.search(r'<[0-9]+>', text):
        # Ignore weird tokens
        pass
      elif in_sentence:
        try:
          token, part, lemma = text.split()
          if lemma == "@card@":
            token = "<NUM>" # Remove number tokens
          tokens[-1].append(token)
          pos[-1].append(part)
          lemmas[-1].append(lemma)
        except:
          pass
          # print("Error in %s on line %i: %s" % (fn.split("/")[-1], n, text))
  return {'date': years, 'file': fn, 'tokens': tokens, 'pos': pos, 'lemmas': lemmas}

# Parse every file that was actually downloaded; results arrive in
# completion order, not in file order
files_for_parsing = [local_fn for _, local_fn in files if os.path.exists(local_fn)]
with Pool(processes=os.cpu_count()) as pool:
  parsed = pool.imap_unordered(_parse_file, files_for_parsing)
  colonia.extend(tqdm(parsed, desc="Parsing Colonia", total=len(files_for_parsing)))


Parsing Colonia: 100%|██████████| 99/99 [00:03<00:00, 31.61it/s]


In [15]:
# Store the parsed Colonia files (a list of dicts) under the key "data" in a
# compressed archive.
fn = os.path.join('data', 'Colonia.npz')
np.savez_compressed(fn, data=colonia)

## SDHK

In [16]:
# NOTE(review): this fetches a Python module from GitHub and then imports
# (executes) it; consider pinning a commit and checking the file before use.
urlretrieve("https://raw.githubusercontent.com/fredrikwahlberg/harvesters/master/sdhk.py", 
            "external/sdhk.py")
from sdhk import SDHKHarvester

db = SDHKHarvester(os.path.join('data', 'original', 'sdhk.json.gz'))

# Charters that have more than 100 characters of text
ids_with_textcontent = {
    n for n in db.get_good_ids()
    if 'textcontent' in db[n].keys()
    and db[n]['textcontent'] is not None
    and len(db[n]['textcontent']) > 100
}
# Charters dated between 1100 and 1523 (inclusive)
ids_with_year = {
    n for n in db.get_good_ids()
    if 'year' in db[n].keys()
    and db[n]['year'] >= 1100
    and db[n]['year'] <= 1523
}

# Charters whose language field mentions Swedish
ids_with_swedish = {
    n for n in db.get_good_ids()
    if 'language' in db[n].keys() and 'svenska' in str(db[n]['language'])
}
good_ids_swedish = list(ids_with_swedish & ids_with_year & ids_with_textcontent)
print("Found %i charters with Swedish" % len(good_ids_swedish))

# Charters whose language field mentions Latin
ids_with_latin = {
    n for n in db.get_good_ids()
    if 'language' in db[n].keys() and 'latin' in str(db[n]['language'])
}
good_ids_latin = list(ids_with_latin & ids_with_year & ids_with_textcontent)
print("Found %i charters with Latin" % len(good_ids_latin))


Found 3086 charters with Swedish
Found 7572 charters with Latin


### Swedish

In [17]:
data = list()
for i in tqdm(good_ids_swedish):
    # The record object returned by the database is extended in place
    record = db[i]
    data.append(record)
    record['text'] = textprocess.transform(record['textcontent'])
    record['date'] = record['year']
    # One token list per "."-separated sentence of the processed text
    sentences = record['text'].split(".")
    record['tokens'] = [sentence.split() for sentence in sentences if len(sentence) > 0]

fn = os.path.join('data', 'SDHK_Swedish.npz')
np.savez_compressed(fn, data=data)

100%|██████████| 3086/3086 [00:13<00:00, 234.96it/s]


### Latin

In [18]:
data = list()
for i in tqdm(good_ids_latin):
    # The record object returned by the database is extended in place
    record = db[i]
    data.append(record)
    record['text'] = textprocess.transform(record['textcontent'])
    record['date'] = record['year']
    # One token list per "."-separated sentence of the processed text
    sentences = record['text'].split(".")
    record['tokens'] = [sentence.split() for sentence in sentences if len(sentence) > 0]

fn = os.path.join('data', 'SDHK_Latin.npz')
np.savez_compressed(fn, data=data)

100%|██████████| 7572/7572 [00:31<00:00, 238.52it/s]


# St. Clare Archive

In [19]:
import pandas as pd
import zipfile
from collections import Counter

### Diplomatic level

In [20]:
level = "dipl"
data_path = os.path.join('data', "original", "StClare_{}.zip".format(level))
# The archive stays open on purpose: the documents are read from it below
zip_archive = zipfile.ZipFile(data_path, "r")

# Names inside a zip archive always use "/" as separator, so the member name
# is written out directly instead of going through os.path.join
with zip_archive.open("StClare_{}/meta.csv".format(level)) as meta_file:
    velux = pd.read_csv(meta_file, sep="\t", header=0)

In [21]:
# Tally the documents per value of the "language" column of the metadata table
Counter(velux["language"])

Counter({'lat': 361, 'dan': 100, 'swe': 6, 'mlg': 2})

In [22]:
velux_data = list()   # Documents that have a date
undated = list()      # Documents without a "year-min" value
for i, doc in tqdm(velux.iterrows(), total=len(velux.index)):

    # Zip member names always use "/", independent of the operating system
    doc_path = "StClare_{}/{}.txt".format(level, int(doc["text number"]))

    with zip_archive.open(doc_path, "r") as f:
        content = f.read().decode("utf-8").strip()
        content = textprocess.transform(content)

    record = {"id":   doc["text number"],
              "date": (doc["year-min"], doc["year-max"]),
              "text": content,
              "language": doc["language"],
              # The whole document is kept as a single "sentence"
              "tokens": [content.split(), ]
             }

    if pd.isnull(doc["year-min"]):
        undated.append(record)
    else:
        velux_data.append(record)

100%|██████████| 469/469 [00:02<00:00, 224.16it/s]


In [23]:
# Latin documents: dated ones under "data", the rest under "undated"
dest_latin = os.path.join('data', "StClare_{level}_latin.npz".format(level=level))
latin_dated = [doc for doc in velux_data if doc["language"]=="lat"]
latin_undated = [doc for doc in undated if doc["language"]=="lat"]
np.savez_compressed(dest_latin, data=latin_dated, undated=latin_undated)

# Danish documents, stored the same way
dest_danish = os.path.join('data', "StClare_{level}_danish.npz".format(level=level))
danish_dated = [doc for doc in velux_data if doc["language"]=="dan"]
danish_undated = [doc for doc in undated if doc["language"]=="dan"]
np.savez_compressed(dest_danish, data=danish_dated, undated=danish_undated)

#dest_misc = os.path.join(base_path, "velux_{level}_misc.npz".format(level=level))
#np.savez_compressed(dest_misc, 
#                    data=list(filter(lambda x: x["language"] not in ("dan", "lat"), velux_data)),
#                    undated=list(filter(lambda x: x["language"] not in ("dan", "lat"), undated)))


### Facsimile

In [24]:
level = "facs"
data_path = os.path.join('data', "original", "StClare_{}.zip".format(level))
# The archive stays open on purpose: the documents are read from it below
zip_archive = zipfile.ZipFile(data_path, "r")

# Names inside a zip archive always use "/" as separator, so the member name
# is written out directly instead of going through os.path.join
with zip_archive.open("StClare_{}/meta.csv".format(level)) as meta_file:
    velux = pd.read_csv(meta_file, sep="\t", header=0)

In [25]:
velux_data = list()   # Documents that have a date
undated = list()      # Documents without a "year-min" value
for i, doc in tqdm(velux.iterrows(), total=len(velux.index)):

    # Zip member names always use "/", independent of the operating system
    doc_path = "StClare_{}/{}.txt".format(level, int(doc["text number"]))

    with zip_archive.open(doc_path, "r") as f:
        content = f.read().decode("utf-8").strip()
        content = textprocess.transform(content)

    record = {"id":   doc["text number"],
              "date": (doc["year-min"], doc["year-max"]),
              "text": content,
              "language": doc["language"],
              # The whole document is kept as a single "sentence"
              "tokens": [content.split(), ]
             }

    if pd.isnull(doc["year-min"]):
        undated.append(record)
    else:
        velux_data.append(record)

100%|██████████| 469/469 [00:02<00:00, 179.82it/s]


In [26]:
# Latin documents: dated ones under "data", the rest under "undated"
dest_latin = os.path.join('data', "StClare_{level}_latin.npz".format(level=level))
latin_dated = [doc for doc in velux_data if doc["language"]=="lat"]
latin_undated = [doc for doc in undated if doc["language"]=="lat"]
np.savez_compressed(dest_latin, data=latin_dated, undated=latin_undated)

# Danish documents, stored the same way
dest_danish = os.path.join('data', "StClare_{level}_danish.npz".format(level=level))
danish_dated = [doc for doc in velux_data if doc["language"]=="dan"]
danish_undated = [doc for doc in undated if doc["language"]=="dan"]
np.savez_compressed(dest_danish, data=danish_dated, undated=danish_undated)

#dest_misc = os.path.join(base_path, "velux_{level}_misc.npz".format(level=level))
#np.savez_compressed(dest_misc, 
#                    data=list(filter(lambda x: x["language"] not in ("dan", "lat"), velux_data)),
#                    undated=list(filter(lambda x: x["language"] not in ("dan", "lat"), undated)))

### ... and lastly

Some code for loading the created data sets.

In [27]:
def load_dataset(data_source_filename):
    """Load one ``.npz`` data set file into a plain dict.

    Args:
        data_source_filename: path of the ``.npz`` file.

    Returns:
        A dict with one entry per array stored in the file; every array is
        converted with ``tolist()``.
    """
    # basename instead of split("/") so the message is also right on
    # platforms that use another path separator
    print("Loading %s... " % os.path.basename(data_source_filename), end="")
    t = time.time()
    # NOTE(review): allow_pickle=True can run arbitrary code while loading;
    # only use this on files created by this notebook.
    with np.load(data_source_filename, allow_pickle=True) as source_file:
        dataset = {key: source_file[key].tolist() for key in source_file.files}
    print("done (%.1fs)" % (time.time()-t), flush=True)
    return dataset

# All regular files directly inside "data" whose name ends in "npz"
data_source_filenames = [
    os.path.join('data', name)
    for name in os.listdir('data')
    if name.endswith('npz') and os.path.isfile(os.path.join('data', name))
]
data_source_filenames

['data/StClare_facs_danish.npz',
 'data/SDHK_Latin.npz',
 'data/StClare_facs_latin.npz',
 'data/StClare_dipl_danish.npz',
 'data/StClare_dipl_latin.npz',
 'data/velux_facs_danish.npz',
 'data/SDHK_Swedish.npz',
 'data/Colonia.npz',
 'data/SemEval2015.npz',
 'data/velux_dipl_latin.npz',
 'data/velux_facs_latin.npz',
 'data/velux_dipl_danish.npz']

### Verify that the texts look good after preprocessing

In [28]:
import random

# Print one randomly chosen document from every data set as a sanity check
for data_source_filename in data_source_filenames:
    dataset = load_dataset(data_source_filename)
    
    instance = random.choice(dataset["data"])
    try:
        print(instance["text"])
    except KeyError:
        # No raw text stored: rebuild it from the tokens, one sentence per
        # line. The generator has to be joined; printing it directly would
        # only show the generator object.
        print("\n".join(" ".join(sentence) for sentence in instance["tokens"]))
    print()

Loading StClare_facs_danish.npz... done (0.0s)
Alle mendt thꝫͤ bꝛeff ee ell̅ꝛ hø læſ / helſſze vij p̲ moge̅ſ i vgliidſtͮp hꝛitz ffogett i waalbuꝛgꝭ hꝛiitt / Olűff mi i oꝛdeͮp / knd vgle j kaꝛlleby / Ewi̅deliighe mett gd / kngø vij ffoꝛ alle mett Thꝫͤ voꝛtt opne bff / Anno dn̅j <NUM> / Thn̅ løffwdag neſt ffaa ȷomffͮ mae dag natiͭꝭ Tha voꝛ / ſkiickett ffoꝛ oſſ oc ma̅ge da̅ne mend fle paa waalbuꝛgꝭ hꝛii ti̅ng / Eꝛliig oc ffoꝛnw̅ftiige mand / han lock i abbeted / paa ffͮ cꝛeſtenſ vegne j kla / oc haffde tiiſſzͤ effthꝛ̅ ſkꝭᷠͤ <NUM> da̅ne mend mett ſſeeg / ſo woꝛ / han dÿꝛiickſ j aaby veſt / laűn hanſ ibi / niel villomſ ibi p̲ ȷepſ i hoꝛſſzeſtaal / ȷond olſ i toꝛckiilſtͮp / niel enſ i toꝛckiilſtͮp / ȷngoꝛ hanſ i kiiꝛke ſaaby Tiilløff p̲ſ ibi / hilke ffoꝛne <NUM> da̅ne mend ſſo tiil waatagne ȷndhn̅ ti̅nghe / tiil thn̅ aaſynd paa thn̅ ſkoſſ lood ſo liighꝛ ⸌ tiil ⸍ niel teſ gaadt i foꝛᷠͤ toꝛckıılſtͮp / the ſſaade oc ſkdde o hand voꝛ god fo

# Generate folds

This method is deterministic.

In [29]:

for data_source_filename in data_source_filenames:
    dataset = load_dataset(data_source_filename)
    # Get years (average if span)
    years = np.asarray([d['date'] if isinstance(d['date'], int) else int((d['date'][0]+d['date'][1])//2) for d in dataset['data']])
    # Sort by year; new_order holds the original document indices in year order
    new_order = np.argsort(years)
    years = years[new_order]
    # Distribute over folds: walking through the documents in year order, every
    # run of five goes train, train, train, val, test. A single pass replaces
    # the repeated list slicing, which copied the remaining list every step.
    folds = {'train': list(), 'val': list(), 'test': list()}
    keys = ['train', 'train', 'train', 'val', 'test']
    for position, index in enumerate(new_order):
        folds[keys[position % len(keys)]].append(index)
    # Commit to data structure
    dataset['folds'] = folds
    # Verify size
    assert len(dataset['folds']['train'])+len(dataset['folds']['val'])+len(dataset['folds']['test']) == len(years)
    # Verify uniqueness
    all_folds = list()
    all_folds.extend(dataset['folds']['train'])
    all_folds.extend(dataset['folds']['val'])
    all_folds.extend(dataset['folds']['test'])
    assert len(set(all_folds)) == len(years)
    # Save (overwrites the source file with the folds added)
    print(" Saving %s... " % data_source_filename.split("/")[-1], end="")
    t = time.time()
    np.savez_compressed(data_source_filename, **dataset)
    print("done (%.1fs)" % (time.time()-t), flush=True)

Loading StClare_facs_danish.npz... done (0.0s)
 Saving StClare_facs_danish.npz... done (0.1s)
Loading SDHK_Latin.npz... done (0.8s)
 Saving SDHK_Latin.npz... done (2.5s)
Loading StClare_facs_latin.npz... done (0.0s)
 Saving StClare_facs_latin.npz... done (0.1s)
Loading StClare_dipl_danish.npz... done (0.0s)
 Saving StClare_dipl_danish.npz... done (0.0s)
Loading StClare_dipl_latin.npz... done (0.0s)
 Saving StClare_dipl_latin.npz... done (0.1s)
Loading velux_facs_danish.npz... done (0.0s)
 Saving velux_facs_danish.npz... done (0.1s)
Loading SDHK_Swedish.npz... done (0.3s)
 Saving SDHK_Swedish.npz... done (0.9s)
Loading Colonia.npz... done (5.5s)
 Saving Colonia.npz... done (11.9s)
Loading SemEval2015.npz... done (0.1s)
 Saving SemEval2015.npz... done (0.3s)
Loading velux_dipl_latin.npz... done (0.1s)
 Saving velux_dipl_latin.npz... done (0.3s)
Loading velux_facs_latin.npz... done (0.1s)
 Saving velux_facs_latin.npz... done (0.4s)
Loading velux_dipl_danish.npz... done (0.0s)
 Saving velu

# Feature extraction

* Word, POS, and Word+POS n-grams
  * tf-idf vectors
  * BOW vectors
* Character n-grams
  * Frequency vectors
  * BOW vectors


##  Zampieri style word and pos vectors

In [30]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

for data_source_filename in data_source_filenames:
    # Load data set
    dataset = load_dataset(data_source_filename)

    # Get training and validation sets for training the tf-idf.
    # dtype=int replaces np.int, an alias of the builtin that was removed in
    # NumPy 1.24.
    train_val_set = list()
    train_val_set.extend(dataset['folds']['train'])
    train_val_set.extend(dataset['folds']['val'])
    train_val_set = np.asarray(train_val_set, dtype=int)

    # Collect data
    X_word_documents = list()
    X_pos_documents = list()
    X_word_pos_documents = list()
    if 'tokens' in dataset['data'][0]:
        for item in tqdm(dataset['data'], desc=" Preparing word(+pos) data"):
            # Create "token" and "pos" document for the count vectorizer below
            a = [word.lower() for sent in item['tokens'] for word in sent]
            X_word_documents.append(" ".join(a))
            if 'pos' in item:
                b = [word for sent in item['pos'] for word in sent]
                assert len(a)==len(b)
                X_pos_documents.append(" ".join(b))
                # Word and tag glued together form one combined token
                X_word_pos_documents.append(" ".join([x+y for x, y in zip(a, b)]))
        if 'pos' in dataset['data'][0]:
            assert len(X_word_documents)==len(X_pos_documents)
    else:
        print(" Nothing to process")

    # Add feature set dict
    if 'feature_sets' not in dataset:
      dataset['feature_sets'] = dict()

    if len(X_word_documents)>0:
        # (feature name suffix, documents) for every representation available
        document_sets = [('words', X_word_documents)]
        if len(X_pos_documents)>0:
            document_sets.append(('pos', X_pos_documents))
            document_sets.append(('words_pos', X_word_pos_documents))

        print(" Running BOW feature transform... ", end="")
        t = time.time()
        for suffix, documents in document_sets:
            # The vocabulary comes from train+val only; all documents are transformed
            vectoriser = CountVectorizer().fit(np.asarray(documents)[train_val_set])
            X = vectoriser.transform(documents)
            X[X>1] = 1  # Presence/absence instead of counts
            dataset['feature_sets']['bow_' + suffix] = X
        print("done (%.1fs)" % (time.time()-t))

        print(" Running tf-idf feature transform... ", end="")
        t = time.time()
        for suffix, documents in document_sets:
            vectoriser = TfidfVectorizer().fit(np.asarray(documents)[train_val_set])
            X = vectoriser.transform(documents)
            dataset['feature_sets']['tfidf_' + suffix] = X
        print("done (%.1fs)" % (time.time()-t))

    print(" Saving %s... " % data_source_filename.split("/")[-1], end="")
    t = time.time()
    np.savez_compressed(data_source_filename, **dataset)
    print("done (%.1fs)" % (time.time()-t), flush=True)


Loading StClare_facs_danish.npz... done (0.0s)


 Preparing word(+pos) data: 100%|██████████| 98/98 [00:00<00:00, 21672.56it/s]

 Running BOW feature transform... done (0.0s)
 Running tf-idf feature transform... done (0.0s)
 Saving StClare_facs_danish.npz... done (0.0s)
Loading SDHK_Latin.npz... 




done (0.8s)


 Preparing word(+pos) data: 100%|██████████| 7572/7572 [00:00<00:00, 42575.54it/s]

 Running BOW feature transform... 




done (2.8s)
 Running tf-idf feature transform... done (2.9s)
 Saving SDHK_Latin.npz... done (3.4s)
Loading StClare_facs_latin.npz... done (0.0s)


 Preparing word(+pos) data: 100%|██████████| 358/358 [00:00<00:00, 29744.48it/s]

 Running BOW feature transform... 




done (0.1s)
 Running tf-idf feature transform... done (0.1s)
 Saving StClare_facs_latin.npz... done (0.1s)
Loading StClare_dipl_danish.npz... done (0.0s)


 Preparing word(+pos) data: 100%|██████████| 98/98 [00:00<00:00, 8556.95it/s]

 Running BOW feature transform... done (0.0s)
 Running tf-idf feature transform... done (0.0s)
 Saving StClare_dipl_danish.npz... done (0.0s)
Loading StClare_dipl_latin.npz... done (0.0s)



 Preparing word(+pos) data: 100%|██████████| 358/358 [00:00<00:00, 43142.10it/s]


 Running BOW feature transform... done (0.1s)
 Running tf-idf feature transform... done (0.1s)
 Saving StClare_dipl_latin.npz... done (0.1s)
Loading velux_facs_danish.npz... done (0.0s)


 Preparing word(+pos) data: 100%|██████████| 98/98 [00:00<00:00, 21492.38it/s]

 Running BOW feature transform... done (0.0s)
 Running tf-idf feature transform... done (0.0s)
 Saving velux_facs_danish.npz... done (0.1s)





Loading SDHK_Swedish.npz... done (0.3s)


 Preparing word(+pos) data: 100%|██████████| 3086/3086 [00:00<00:00, 39432.81it/s]

 Running BOW feature transform... 




done (1.0s)
 Running tf-idf feature transform... done (1.0s)
 Saving SDHK_Swedish.npz... done (1.3s)
Loading Colonia.npz... done (5.6s)


 Preparing word(+pos) data: 100%|██████████| 99/99 [00:01<00:00, 72.70it/s] 


 Running BOW feature transform... done (14.7s)
 Running tf-idf feature transform... done (15.0s)
 Saving Colonia.npz... done (13.4s)
Loading SemEval2015.npz... done (0.1s)


 Preparing word(+pos) data: 100%|██████████| 3370/3370 [00:00<00:00, 86735.75it/s]


 Running BOW feature transform... done (0.3s)
 Running tf-idf feature transform... done (0.3s)
 Saving SemEval2015.npz... done (0.4s)
Loading velux_dipl_latin.npz... done (0.1s)


 Preparing word(+pos) data: 100%|██████████| 358/358 [00:00<00:00, 28923.45it/s]

 Running BOW feature transform... 




done (0.1s)
 Running tf-idf feature transform... done (0.1s)
 Saving velux_dipl_latin.npz... done (0.3s)
Loading velux_facs_latin.npz... done (0.1s)


 Preparing word(+pos) data: 100%|██████████| 358/358 [00:00<00:00, 29136.72it/s]

 Running BOW feature transform... 




done (0.1s)
 Running tf-idf feature transform... done (0.1s)
 Saving velux_facs_latin.npz... done (0.4s)
Loading velux_dipl_danish.npz... done (0.0s)


 Preparing word(+pos) data: 100%|██████████| 98/98 [00:00<00:00, 22024.42it/s]

 Running BOW feature transform... done (0.0s)
 Running tf-idf feature transform... done (0.0s)
 Saving velux_dipl_danish.npz... done (0.1s)





## n-gram vectors

In [31]:
# NOTE(review): this fetches ngram.py from GitHub on every run and the import
# below executes it; consider pinning a commit and checking the file first.
urlretrieve("https://raw.githubusercontent.com/fredrikwahlberg/5LN721/master/ngram.py", 
            "external/ngram.py")

from ngram import NGramModel
from scipy.sparse import lil_matrix

In [32]:
def _make_ngrams(item):
    """Build word and character n-gram models (orders 1 to 3) for one document.

    Defined once, outside the data set loop, because it does not depend on
    any per-data-set state; it is the function handed to ``Pool.imap``.

    Parameters
    ----------
    item : mapping
        One document record. If it has a ``'tokens'`` entry (an iterable of
        sentences, each an iterable of words) that is used for the word
        n-grams; otherwise ``item['text']`` is split on whitespace. If it has
        a ``'text'`` entry that is used for the character n-grams; otherwise
        the tokens are joined with single spaces.

    Returns
    -------
    dict
        Maps ``"word_ngram_<n>"`` and ``"character_ngram_<n>"`` (n = 1, 2, 3)
        to the corresponding ``NGramModel``.

    Raises
    ------
    KeyError
        If the record has neither ``'tokens'`` nor ``'text'``.
    """
    # Kept as a local import, as in the original cell (presumably so the name
    # is available in the worker processes regardless of start method).
    from ngram import NGramModel

    # Data for word n-grams
    if 'tokens' in item:
        # Flatten the sentences into one lower-cased token stream
        tokens = [word.lower() for sent in item['tokens'] for word in sent]
    else:
        # No tokenization available: improvise from the raw text
        tokens = item['text'].lower().split()

    # Data for character n-grams
    if 'text' in item:
        characters = list(item['text'].lower())
    else:
        # The tokens were lower-cased above, so no second lower() is needed
        characters = list(" ".join(tokens))

    # One model per (unit, order) configuration
    models = dict()
    for n_order in range(1, 3 + 1):
        models["word_ngram_%i" % n_order] = NGramModel(tokens, order=n_order)
        models["character_ngram_%i" % n_order] = NGramModel(characters, order=n_order)
    return models


for data_source_filename in data_source_filenames:
    # Load the data set and make sure there is a place to store the features
    dataset = load_dataset(data_source_filename)
    if 'feature_sets' not in dataset:
        dataset['feature_sets'] = dict()
    documents = dataset['data']
    n_documents = len(documents)

    # Create models for all documents and model configurations, in parallel.
    # pool.imap yields results in input order, so position i in every list
    # below corresponds to documents[i].
    ngram_models = dict()
    with Pool(processes=os.cpu_count()) as pool:
        for document_models in tqdm(pool.imap(_make_ngrams, documents, chunksize=10),
                                    desc=" Creating ngram models", total=n_documents):
            for k, model in document_models.items():
                ngram_models.setdefault(k, list()).append(model)
    # Verify that all the models are there
    for k, models_for_k in ngram_models.items():
        assert len(models_for_k) == n_documents

    # Create the reference models: for each configuration, the union of the
    # models of all documents. A key only exists in ngram_models once at
    # least one model was appended, so models_for_k[0] is always present.
    print(" Creating reference models...", end="")
    t = time.time()
    ngram_reference_model = dict()
    for k, models_for_k in ngram_models.items():
        reference = models_for_k[0].copy()  # copy so the first document's model is not modified
        for model in models_for_k[1:]:
            reference = reference.union_update(model)
        ngram_reference_model[k] = reference
    print("done (%.1fs)" % (time.time()-t), flush=True)

    # Vectorize and make sparse feature matrices (one row per document, one
    # column per entry of the reference model)
    for k in sorted(ngram_reference_model):
        reference = ngram_reference_model[k]
        # np.float was removed in NumPy 1.24; np.float64 is the same dtype
        X = lil_matrix((n_documents, len(reference)), dtype=np.float64)
        for i, model in tqdm(enumerate(ngram_models[k]), desc=" Building matrix for %s" % k, total=n_documents):
            # NOTE(review): codebook() is called once per document; if it is
            # pure it could be hoisted out of this loop - confirm in ngram.py.
            v = model.vectorize(codebook=reference.codebook())
            # np.nonzero returns a tuple with one index array per dimension;
            # v is expected to be 1-D, so unpack the single array and copy
            # only the non-zero entries into row i.
            (nonzero_columns,) = np.nonzero(v)
            X[i, nonzero_columns] = v[nonzero_columns]
        dataset['feature_sets'][k] = X.tocsr()

    # Write the data set, now including the n-gram feature sets, back to the
    # file it was loaded from
    print("Saving %s... " % data_source_filename.split("/")[-1], end="")
    t = time.time()
    np.savez_compressed(data_source_filename, **dataset)
    print("done (%.1fs)" % (time.time()-t), flush=True)

Loading StClare_facs_danish.npz... done (0.0s)


 Creating ngram models: 100%|██████████| 98/98 [00:00<00:00, 850.25it/s]


 Creating reference models...done (0.1s)


 Building matrix for character_ngram_1: 100%|██████████| 98/98 [00:00<00:00, 8630.44it/s]
 Building matrix for character_ngram_2: 100%|██████████| 98/98 [00:00<00:00, 4425.23it/s]
 Building matrix for character_ngram_3: 100%|██████████| 98/98 [00:00<00:00, 1957.03it/s]
 Building matrix for word_ngram_1: 100%|██████████| 98/98 [00:00<00:00, 4482.51it/s]
 Building matrix for word_ngram_2: 100%|██████████| 98/98 [00:00<00:00, 2694.38it/s]
 Building matrix for word_ngram_3: 100%|██████████| 98/98 [00:00<00:00, 2379.09it/s]


Saving StClare_facs_danish.npz... done (0.1s)
Loading SDHK_Latin.npz... done (1.0s)


 Creating ngram models: 100%|██████████| 7572/7572 [00:04<00:00, 1774.06it/s]


 Creating reference models...done (4.4s)


 Building matrix for character_ngram_1: 100%|██████████| 7572/7572 [00:00<00:00, 14092.55it/s]
 Building matrix for character_ngram_2: 100%|██████████| 7572/7572 [00:01<00:00, 6920.26it/s]
 Building matrix for character_ngram_3: 100%|██████████| 7572/7572 [00:03<00:00, 2218.07it/s]
 Building matrix for word_ngram_1: 100%|██████████| 7572/7572 [00:06<00:00, 1199.91it/s]
 Building matrix for word_ngram_2: 100%|██████████| 7572/7572 [00:35<00:00, 210.76it/s]
 Building matrix for word_ngram_3: 100%|██████████| 7572/7572 [01:02<00:00, 120.61it/s]


Saving SDHK_Latin.npz... done (6.8s)
Loading StClare_facs_latin.npz... done (0.0s)


 Creating ngram models: 100%|██████████| 358/358 [00:00<00:00, 1067.58it/s]


 Creating reference models...done (0.6s)


 Building matrix for character_ngram_1: 100%|██████████| 358/358 [00:00<00:00, 11206.60it/s]
 Building matrix for character_ngram_2: 100%|██████████| 358/358 [00:00<00:00, 4532.59it/s]
 Building matrix for character_ngram_3: 100%|██████████| 358/358 [00:00<00:00, 1635.49it/s]
 Building matrix for word_ngram_1: 100%|██████████| 358/358 [00:00<00:00, 3149.31it/s]
 Building matrix for word_ngram_2: 100%|██████████| 358/358 [00:00<00:00, 1719.77it/s]
 Building matrix for word_ngram_3: 100%|██████████| 358/358 [00:00<00:00, 1503.77it/s]


Saving StClare_facs_latin.npz... done (0.4s)
Loading StClare_dipl_danish.npz... done (0.0s)


 Creating ngram models: 100%|██████████| 98/98 [00:00<00:00, 772.67it/s]


 Creating reference models...done (0.1s)


 Building matrix for character_ngram_1: 100%|██████████| 98/98 [00:00<00:00, 11235.87it/s]
 Building matrix for character_ngram_2: 100%|██████████| 98/98 [00:00<00:00, 5357.13it/s]
 Building matrix for character_ngram_3: 100%|██████████| 98/98 [00:00<00:00, 2571.45it/s]
 Building matrix for word_ngram_1: 100%|██████████| 98/98 [00:00<00:00, 4516.25it/s]
 Building matrix for word_ngram_2: 100%|██████████| 98/98 [00:00<00:00, 2670.98it/s]
 Building matrix for word_ngram_3: 100%|██████████| 98/98 [00:00<00:00, 2374.10it/s]


Saving StClare_dipl_danish.npz... done (0.1s)
Loading StClare_dipl_latin.npz... done (0.0s)


 Creating ngram models: 100%|██████████| 358/358 [00:00<00:00, 1482.53it/s]


 Creating reference models...done (0.2s)


 Building matrix for character_ngram_1: 100%|██████████| 358/358 [00:00<00:00, 13631.35it/s]
 Building matrix for character_ngram_2: 100%|██████████| 358/358 [00:00<00:00, 6796.39it/s]
 Building matrix for character_ngram_3: 100%|██████████| 358/358 [00:00<00:00, 2751.68it/s]
 Building matrix for word_ngram_1: 100%|██████████| 358/358 [00:00<00:00, 4690.79it/s]
 Building matrix for word_ngram_2: 100%|██████████| 358/358 [00:00<00:00, 2147.40it/s]
 Building matrix for word_ngram_3: 100%|██████████| 358/358 [00:00<00:00, 1738.07it/s]


Saving StClare_dipl_latin.npz... done (0.3s)
Loading velux_facs_danish.npz... done (0.0s)


 Creating ngram models: 100%|██████████| 98/98 [00:00<00:00, 636.86it/s]


 Creating reference models...done (0.1s)


 Building matrix for character_ngram_1: 100%|██████████| 98/98 [00:00<00:00, 10488.44it/s]
 Building matrix for character_ngram_2: 100%|██████████| 98/98 [00:00<00:00, 4439.09it/s]
 Building matrix for character_ngram_3: 100%|██████████| 98/98 [00:00<00:00, 1827.15it/s]
 Building matrix for word_ngram_1: 100%|██████████| 98/98 [00:00<00:00, 4330.08it/s]
 Building matrix for word_ngram_2: 100%|██████████| 98/98 [00:00<00:00, 2554.93it/s]
 Building matrix for word_ngram_3: 100%|██████████| 98/98 [00:00<00:00, 2266.39it/s]


Saving velux_facs_danish.npz... done (0.1s)
Loading SDHK_Swedish.npz... done (0.4s)


 Creating ngram models: 100%|██████████| 3086/3086 [00:01<00:00, 1743.69it/s]


 Creating reference models...done (1.9s)


 Building matrix for character_ngram_1: 100%|██████████| 3086/3086 [00:00<00:00, 13518.23it/s]
 Building matrix for character_ngram_2: 100%|██████████| 3086/3086 [00:00<00:00, 6211.61it/s]
 Building matrix for character_ngram_3: 100%|██████████| 3086/3086 [00:01<00:00, 2151.39it/s]
 Building matrix for word_ngram_1: 100%|██████████| 3086/3086 [00:01<00:00, 1551.47it/s]
 Building matrix for word_ngram_2: 100%|██████████| 3086/3086 [00:07<00:00, 425.13it/s]
 Building matrix for word_ngram_3: 100%|██████████| 3086/3086 [00:11<00:00, 275.20it/s]


Saving SDHK_Swedish.npz... done (2.8s)
Loading Colonia.npz... done (5.7s)


 Creating ngram models: 100%|██████████| 99/99 [00:14<00:00,  6.66it/s]


 Creating reference models...done (6.1s)


 Building matrix for character_ngram_1: 100%|██████████| 99/99 [00:00<00:00, 11110.89it/s]
 Building matrix for character_ngram_2: 100%|██████████| 99/99 [00:00<00:00, 3115.94it/s]
 Building matrix for character_ngram_3: 100%|██████████| 99/99 [00:00<00:00, 519.51it/s]
 Building matrix for word_ngram_1: 100%|██████████| 99/99 [00:00<00:00, 154.97it/s]
 Building matrix for word_ngram_2: 100%|██████████| 99/99 [00:04<00:00, 22.89it/s]
 Building matrix for word_ngram_3: 100%|██████████| 99/99 [00:07<00:00, 13.00it/s]


Saving Colonia.npz... done (16.3s)
Loading SemEval2015.npz... done (0.3s)


 Creating ngram models: 100%|██████████| 3370/3370 [00:01<00:00, 3129.42it/s]


 Creating reference models...done (1.3s)


 Building matrix for character_ngram_1: 100%|██████████| 3370/3370 [00:00<00:00, 13949.62it/s]
 Building matrix for character_ngram_2: 100%|██████████| 3370/3370 [00:00<00:00, 8080.04it/s]
 Building matrix for character_ngram_3: 100%|██████████| 3370/3370 [00:00<00:00, 4001.09it/s]
 Building matrix for word_ngram_1: 100%|██████████| 3370/3370 [00:00<00:00, 4188.22it/s]
 Building matrix for word_ngram_2: 100%|██████████| 3370/3370 [00:03<00:00, 1089.72it/s]
 Building matrix for word_ngram_3: 100%|██████████| 3370/3370 [00:04<00:00, 714.66it/s]


Saving SemEval2015.npz... done (1.1s)
Loading velux_dipl_latin.npz... done (0.1s)


 Creating ngram models: 100%|██████████| 358/358 [00:00<00:00, 1162.31it/s]


 Creating reference models...done (0.4s)


 Building matrix for character_ngram_1: 100%|██████████| 358/358 [00:00<00:00, 13453.76it/s]
 Building matrix for character_ngram_2: 100%|██████████| 358/358 [00:00<00:00, 6337.89it/s]
 Building matrix for character_ngram_3: 100%|██████████| 358/358 [00:00<00:00, 2330.63it/s]
 Building matrix for word_ngram_1: 100%|██████████| 358/358 [00:00<00:00, 4080.21it/s]
 Building matrix for word_ngram_2: 100%|██████████| 358/358 [00:00<00:00, 1916.44it/s]
 Building matrix for word_ngram_3: 100%|██████████| 358/358 [00:00<00:00, 1441.15it/s]


Saving velux_dipl_latin.npz... done (0.3s)
Loading velux_facs_latin.npz... done (0.1s)


 Creating ngram models: 100%|██████████| 358/358 [00:00<00:00, 1213.65it/s]


 Creating reference models...done (0.3s)


 Building matrix for character_ngram_1: 100%|██████████| 358/358 [00:00<00:00, 8059.73it/s]
 Building matrix for character_ngram_2: 100%|██████████| 358/358 [00:00<00:00, 3587.16it/s]
 Building matrix for character_ngram_3: 100%|██████████| 358/358 [00:00<00:00, 1581.44it/s]
 Building matrix for word_ngram_1: 100%|██████████| 358/358 [00:00<00:00, 3074.80it/s]
 Building matrix for word_ngram_2: 100%|██████████| 358/358 [00:00<00:00, 1675.04it/s]
 Building matrix for word_ngram_3: 100%|██████████| 358/358 [00:00<00:00, 1502.31it/s]


Saving velux_facs_latin.npz... done (0.4s)
Loading velux_dipl_danish.npz... done (0.0s)


 Creating ngram models: 100%|██████████| 98/98 [00:00<00:00, 745.76it/s]


 Creating reference models...done (0.1s)


 Building matrix for character_ngram_1: 100%|██████████| 98/98 [00:00<00:00, 12310.33it/s]
 Building matrix for character_ngram_2: 100%|██████████| 98/98 [00:00<00:00, 5273.35it/s]
 Building matrix for character_ngram_3: 100%|██████████| 98/98 [00:00<00:00, 2342.12it/s]
 Building matrix for word_ngram_1: 100%|██████████| 98/98 [00:00<00:00, 4697.35it/s]
 Building matrix for word_ngram_2: 100%|██████████| 98/98 [00:00<00:00, 2666.16it/s]
 Building matrix for word_ngram_3: 100%|██████████| 98/98 [00:00<00:00, 2273.30it/s]


Saving velux_dipl_danish.npz... done (0.1s)
