In [None]:
import ast
import gzip
import json
import os
import random
import re
from pathlib import Path

try:
    # The ABC aliases were removed from `collections` in Python 3.10.
    from collections.abc import Sequence
except ImportError:  # pragma: no cover - legacy interpreters only
    from collections import Sequence

import pandas as pd

In [None]:
# Directory that is scanned for OpenAddresses `*.geojson.gz` files.
# Set the OPENADDRESSES_DIR environment variable to point somewhere else;
# the default is kept so existing runs behave the same.
address_dir = Path(os.environ.get("OPENADDRESSES_DIR", "/home/dani/dev/openaddresses/"))

In [None]:
class AddressGenerator(Sequence):
    """Read-only sequence of address features that can render address strings.

    Every ``*.geojson.gz`` file in ``openaddresses_dir`` is read as
    line-delimited JSON (one feature object per line). Only features with a
    non-empty ``properties["postcode"]`` are kept in ``self.addresses``.
    """

    # Case-preserving match of the street suffix, so that both "Hauptstraße"
    # and "Berliner Straße" can be abbreviated ("Hauptstr.", "Berliner Str.").
    _STRASSE_PAT = re.compile("([Ss])traße")

    # Property keys used to build an address, in output order.
    _FIELDS = ("street", "number", "unit", "postcode", "city")

    def __init__(self, openaddresses_dir, limit_per_file=100000):
        """Load the features from every archive in ``openaddresses_dir``.

        Args:
            openaddresses_dir: Directory containing ``*.geojson.gz`` files.
            limit_per_file: Maximum number of lines read from each file.
                ``None`` reads every line. Files shorter than the limit are
                read to the end.

        Raises:
            json.JSONDecodeError: If a non-blank line is not valid JSON.
        """
        self.address_dir = Path(openaddresses_dir)
        self.limit_per_file = limit_per_file

        self.addresses = []
        # Sorted so the record order does not depend on directory listing order.
        for address_json in sorted(self.address_dir.glob("*.geojson.gz")):
            # GeoJSON is UTF-8; do not fall back to the locale's encoding.
            with gzip.open(address_json, "rt", encoding="utf-8") as g:
                for line_no, line in enumerate(g):
                    if limit_per_file is not None and line_no >= limit_per_file:
                        break
                    line = line.strip()
                    if not line:
                        continue
                    # json.loads rather than ast.literal_eval: JSON null/true/false
                    # are not Python literals.
                    feature = json.loads(line)
                    # keep only things with post-codes
                    if self._prop(feature, "postcode"):
                        self.addresses.append(feature)

    def __len__(self):
        """Return the number of stored features."""
        return len(self.addresses)

    def __getitem__(self, index):
        """Return the stored feature (or list slice) at ``index``."""
        return self.addresses[index]

    def __iter__(self):
        """Iterate over the stored features in load order."""
        return iter(self.addresses)

    @staticmethod
    def _rand_bool(p=0.5):
        """Return True with probability ``p``."""
        return random.random() < p

    @staticmethod
    def _prop(feature, key):
        """Return ``feature["properties"][key]`` as text, or "" if missing/null."""
        value = (feature.get("properties") or {}).get(key)
        return "" if value is None else str(value)

    def sample(self, real=False):
        """Render one address string with randomised formatting.

        Args:
            real: If true, every field comes from a single stored feature.
                Otherwise street, number, unit, postcode and city are each
                taken from a different randomly drawn feature.

        Returns:
            str: ``"<street> <number>"`` followed by the unit (if any, with an
            optional space), then a newline or a space, then with probability
            0.8 ``"<postcode> <city>"``. The street suffix is abbreviated and
            the whole string is lower-cased with probability 0.5 each. The
            result is stripped of surrounding whitespace.

        Raises:
            IndexError: If ``real`` is true and no features are loaded.
            ValueError: If ``real`` is false and fewer than five features
                are loaded.
        """
        if real:
            sources = [random.choice(self)] * len(self._FIELDS)
        else:
            sources = random.sample(self, k=len(self._FIELDS))

        street, number, unit, postcode, city = (
            self._prop(source, key) for source, key in zip(sources, self._FIELDS)
        )

        if self._rand_bool():
            street = self._STRASSE_PAT.sub(r"\1tr.", street)

        out = street + " " + number

        if unit:
            # The unit is sometimes glued to the number ("12a") and sometimes not.
            if self._rand_bool():
                out += " "
            out += unit

        out += "\n" if self._rand_bool() else " "

        if self._rand_bool(0.8):
            if postcode:
                out += postcode + " "
            out += city

        if self._rand_bool():
            out = out.lower()

        return out.strip()

In [None]:
# Build the generator from the archives in `address_dir`;
# limit_per_file=10000 bounds how many lines are read from each file.
generator = AddressGenerator(address_dir, limit_per_file=10000)

In [None]:
# One synthetic address: with real=False each field is drawn from a different record.
generator.sample(real=False)

In [None]:
%%time
addresses = [generator.sample() for i in range(100000)]

In [None]:
# Number of generated address strings.
len(addresses)

In [None]:
# Show one more sample; `real` defaults to False, so fields come from different records.
generator.sample()

In [None]:
import spacy

# Load the spaCy pipeline package "de_core_news_sm"; it must be installed
# in the kernel's environment for this call to succeed.
nlp = spacy.load("de_core_news_sm")

In [None]:
# Disable the listed components for subsequent `nlp(...)` calls.
# `select_pipes(disable=...)` is the spaCy v3 replacement for the deprecated
# `disable_pipes`; it returns the same kind of object, so the cell output
# is unchanged.
nlp.select_pipes(disable=["tagger", "ner", "attribute_ruler", "parser", "lemmatizer"])

In [None]:
# Display the pipeline's component names to check the effect of the disabling above.
nlp.pipe_names

In [None]:
# Read the first 1000 lines of the line-delimited JSON dataset into a DataFrame.
data = pd.read_json("/home/dani/dev/conny-dev/general-dataset/data/clean/data.jsonl", lines=True, nrows=1000)

In [None]:
# Run the pipeline on element 0 of the first row's `text` field.
# NOTE(review): this is a whole page only if `text` holds a list of pages;
# if `text` is a plain string, `text[0]` is just its first character — confirm the schema.
doc = nlp(data.iloc[0].text[0])

In [None]:
%%time
spans = []
for i, tok in enumerate(doc):
    for j in range(2, 10):
        spans.append(doc[i:i+j])

In [None]:
# Number of candidate spans collected from `doc`.
len(spans)

Collecting all 2–9 token spans (`range(2, 10)`) in a page produces ~3000 samples.

In [None]:
# Inspect one collected span (index 100) as a quick sanity check.
spans[100]