Skip to content

Commit

Permalink
💫 Replace ujson, msgpack and dill/pickle/cloudpickle with srsly (#3003)
Browse files Browse the repository at this point in the history
Remove hacks and wrappers, keep code in sync across our libraries and move spaCy a few steps closer to only depending on packages with binary wheels 🎉

See here: https://github.com/explosion/srsly

    Serialization is hard, especially across Python versions and multiple platforms. After dealing with many subtle bugs over the years (encodings, locales, large files) our libraries like spaCy and Prodigy have steadily grown a number of utility functions to wrap the multiple serialization formats we need to support (especially json, msgpack and pickle). These wrapping functions ended up duplicated across our codebases, so we wanted to put them in one place.

    At the same time, we noticed that having a lot of small dependencies was making maintenance harder, and making installation slower. To solve this, we've made srsly standalone, by including the component packages directly within it. This way we can provide all the serialization utilities we need in a single binary wheel.

    srsly currently includes forks of the following packages:

        ujson
        msgpack
        msgpack-numpy
        cloudpickle



* WIP: replace json/ujson with srsly

* Replace ujson in examples

Use regular json instead of srsly to make code easier to read and follow

* Update requirements

* Fix imports

* Fix typos

* Replace msgpack with srsly

* Fix warning
  • Loading branch information
ines authored and honnibal committed Dec 3, 2018
1 parent 40b57ea commit f378630
Show file tree
Hide file tree
Showing 33 changed files with 130 additions and 238 deletions.
3 changes: 1 addition & 2 deletions CONTRIBUTING.md
Expand Up @@ -292,10 +292,9 @@ for example to show more specific error messages, you can use the `is_config()`
helper function.

```python
from .compat import unicode_, json_dumps, is_config
from .compat import unicode_, is_config

compatible_unicode = unicode_('hello world')
compatible_json = json_dumps({'key': 'value'})
if is_config(windows=True, python2=True):
print("You are using Python 2 on Windows.")
```
Expand Down
6 changes: 3 additions & 3 deletions bin/load_reddit.py
Expand Up @@ -3,7 +3,7 @@

import bz2
import regex as re
import ujson
import srsly
import sys
import random
import datetime
Expand Down Expand Up @@ -44,7 +44,7 @@ def __iter__(self):
line = line.strip()
if not line:
continue
comment = ujson.loads(line)
comment = srsly.json_loads(line)
if self.is_valid(comment):
text = self.strip_tags(comment["body"])
yield {"text": text}
Expand Down Expand Up @@ -75,7 +75,7 @@ def is_valid(self, comment):
def main(path):
reddit = Reddit(path)
for comment in reddit:
print(ujson.dumps(comment))
print(srsly.json_dumps(comment))


if __name__ == "__main__":
Expand Down
6 changes: 3 additions & 3 deletions examples/information_extraction/phrase_matcher.py
Expand Up @@ -45,7 +45,7 @@
from bz2 import BZ2File
import time
import plac
import ujson
import json

from spacy.matcher import PhraseMatcher
import spacy
Expand All @@ -71,7 +71,7 @@ def main(patterns_loc, text_loc, n=10000, lang="en"):

def read_gazetteer(tokenizer, loc, n=-1):
for i, line in enumerate(open(loc)):
data = ujson.loads(line.strip())
data = json.loads(line.strip())
phrase = tokenizer(data["text"])
for w in phrase:
_ = tokenizer.vocab[w.text]
Expand All @@ -82,7 +82,7 @@ def read_gazetteer(tokenizer, loc, n=-1):
def read_text(bz2_loc, n=10000):
with BZ2File(bz2_loc) as file_:
for i, line in enumerate(file_):
data = ujson.loads(line)
data = json.loads(line)
yield data["body"]
if i >= n:
break
Expand Down
2 changes: 1 addition & 1 deletion examples/keras_parikh_entailment/__main__.py
@@ -1,5 +1,5 @@
import numpy as np
import ujson as json
import json
from keras.utils import to_categorical
import plac
import sys
Expand Down
2 changes: 1 addition & 1 deletion examples/notebooks/Decompositional Attention.ipynb
Expand Up @@ -77,7 +77,7 @@
}
],
"source": [
"import ujson as json\n",
"import json\n",
"from keras.utils import to_categorical\n",
"\n",
"LABELS = {'entailment': 0, 'contradiction': 1, 'neutral': 2}\n",
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Expand Up @@ -6,12 +6,12 @@ blis>=0.2.2,<0.3.0
murmurhash>=0.28.0,<1.1.0
cytoolz>=0.9.0,<0.10.0
plac<1.0.0,>=0.9.6
ujson>=1.35
dill>=0.2,<0.3
regex==2018.01.10
requests>=2.13.0,<3.0.0
jsonschema>=2.6.0,<3.0.0
wasabi>=0.0.8,<1.1.0
srsly>=0.0.4,<1.1.0
pathlib==1.0.1; python_version < "3.4"
# Development dependencies
cython>=0.25
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Expand Up @@ -203,12 +203,12 @@ def setup_package():
"thinc==7.0.0.dev4",
"blis>=0.2.2,<0.3.0",
"plac<1.0.0,>=0.9.6",
"ujson>=1.35",
"regex==2018.01.10",
"dill>=0.2,<0.3",
"requests>=2.13.0,<3.0.0",
"jsonschema>=2.6.0,<3.0.0",
"wasabi>=0.0.8,<1.1.0",
"srsly>=0.0.4,<1.1.0",
'pathlib==1.0.1; python_version < "3.4"',
],
setup_requires=["wheel"],
Expand Down
13 changes: 6 additions & 7 deletions spacy/cli/convert.py
Expand Up @@ -4,9 +4,9 @@
import plac
from pathlib import Path
from wasabi import Printer
import srsly

from ..util import write_jsonl, write_json
from ..compat import json_dumps, path2str
from ..compat import path2str
from .converters import conllu2json, conllubio2json, iob2json, conll_ner2json
from .converters import ner_jsonl2json
from ._messages import Messages
Expand Down Expand Up @@ -77,17 +77,16 @@ def convert(
suffix = ".{}".format(file_type)
output_file = Path(output_dir) / Path(input_path.parts[-1]).with_suffix(suffix)
if file_type == "json":
write_json(output_file, data)
srsly.write_json(output_file, data)
elif file_type == "jsonl":
write_jsonl(output_file, data)
srsly.write_jsonl(output_file, data)
msg.good(
Messages.M032.format(name=path2str(output_file)),
Messages.M033.format(n_docs=len(data)),
)
else:
# Print to stdout
if file_type == "json":
print(json_dumps(data))
srsly.write_json("-", data)
elif file_type == "jsonl":
for line in data:
print(json_dumps(line))
srsly.write_jsonl("-", data)
4 changes: 2 additions & 2 deletions spacy/cli/converters/jsonl2json.py
@@ -1,7 +1,7 @@
# coding: utf8
from __future__ import unicode_literals

import ujson
import srsly

from ...util import get_lang_class
from .._messages import Messages
Expand All @@ -11,7 +11,7 @@ def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False):
if lang is None:
raise ValueError(Messages.M054)
json_docs = []
input_tuples = [ujson.loads(line) for line in input_data]
input_tuples = [srsly.json_loads(line) for line in input_data]
nlp = get_lang_class(lang)()
for i, (raw_text, ents) in enumerate(input_tuples):
doc = nlp.make_doc(raw_text)
Expand Down
7 changes: 4 additions & 3 deletions spacy/cli/debug_data.py
Expand Up @@ -5,10 +5,11 @@
from collections import Counter
import plac
import sys
import srsly
from wasabi import Printer, MESSAGES

from ..gold import GoldCorpus, read_json_object
from ..util import load_model, get_lang_class, read_json, read_jsonl
from ..util import load_model, get_lang_class

# from .schemas import get_schema, validate_json
from ._messages import Messages
Expand Down Expand Up @@ -320,11 +321,11 @@ def debug_data(
def _load_file(file_path, msg):
file_name = file_path.parts[-1]
if file_path.suffix == ".json":
data = read_json(file_path)
data = srsly.read_json(file_path)
msg.good("Loaded {}".format(file_name))
return data
elif file_path.suffix == ".jsonl":
data = read_jsonl(file_path)
data = srsly.read_jsonl(file_path)
msg.good("Loaded {}".format(file_name))
return data
msg.fail(
Expand Down
3 changes: 2 additions & 1 deletion spacy/cli/info.py
Expand Up @@ -5,6 +5,7 @@
import platform
from pathlib import Path
from wasabi import Printer
import srsly

from ._messages import Messages
from ..compat import path2str, basestring_, unicode_
Expand Down Expand Up @@ -32,7 +33,7 @@ def info(model=None, markdown=False, silent=False):
meta_path = model_path / "meta.json"
if not meta_path.is_file():
msg.fail(Messages.M020, meta_path, exits=1)
meta = util.read_json(meta_path)
meta = srsly.read_json(meta_path)
if model_path.resolve() != model_path:
meta["link"] = path2str(model_path)
meta["source"] = path2str(model_path.resolve())
Expand Down
5 changes: 3 additions & 2 deletions spacy/cli/init_model.py
Expand Up @@ -11,12 +11,13 @@
import tarfile
import gzip
import zipfile
import srsly
from wasabi import Printer

from ._messages import Messages
from ..vectors import Vectors
from ..errors import Errors, Warnings, user_warning
from ..util import ensure_path, get_lang_class, read_jsonl
from ..util import ensure_path, get_lang_class

try:
import ftfy
Expand Down Expand Up @@ -59,7 +60,7 @@ def init_model(
settings.append("-c")
msg.warn(Messages.M063, Messages.M064)
jsonl_loc = ensure_path(jsonl_loc)
lex_attrs = read_jsonl(jsonl_loc)
lex_attrs = srsly.read_jsonl(jsonl_loc)
else:
clusters_loc = ensure_path(clusters_loc)
freqs_loc = ensure_path(freqs_loc)
Expand Down
7 changes: 4 additions & 3 deletions spacy/cli/package.py
Expand Up @@ -5,9 +5,10 @@
import shutil
from pathlib import Path
from wasabi import Printer, get_raw_input
import srsly

from ._messages import Messages
from ..compat import path2str, json_dumps
from ..compat import path2str
from .. import util
from .. import about

Expand Down Expand Up @@ -40,7 +41,7 @@ def package(input_dir, output_dir, meta_path=None, create_meta=False, force=Fals

meta_path = meta_path or input_path / "meta.json"
if meta_path.is_file():
meta = util.read_json(meta_path)
meta = srsly.read_json(meta_path)
if not create_meta: # only print if user doesn't want to overwrite
msg.good(Messages.M041, meta_path)
else:
Expand All @@ -64,7 +65,7 @@ def package(input_dir, output_dir, meta_path=None, create_meta=False, force=Fals
)
Path.mkdir(package_path, parents=True)
shutil.copytree(path2str(input_path), path2str(package_path / model_name_v))
create_file(main_path / "meta.json", json_dumps(meta))
create_file(main_path / "meta.json", srsly.json_dumps(meta))
create_file(main_path / "setup.py", TEMPLATE_SETUP)
create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
create_file(package_path / "__init__.py", TEMPLATE_INIT)
Expand Down
17 changes: 5 additions & 12 deletions spacy/cli/pretrain.py
Expand Up @@ -5,19 +5,17 @@
import random
import numpy
import time
import ujson
import sys
from collections import Counter
from pathlib import Path
from thinc.v2v import Affine, Maxout
from thinc.api import wrap
from thinc.misc import LayerNorm as LN
from thinc.neural.util import prefer_gpu
from wasabi import Printer
import srsly

from ..tokens import Doc
from ..attrs import ID, HEAD
from ..compat import json_dumps
from .._ml import Tok2Vec, flatten, chain, zero_init, create_default_optimizer
from .. import util

Expand Down Expand Up @@ -72,7 +70,7 @@ def pretrain(
if not output_dir.exists():
output_dir.mkdir()
msg.good("Created output directory")
util.write_json(output_dir / "config.json", config)
srsly.write_json(output_dir / "config.json", config)
msg.good("Saved settings to config.json")

# Load texts from file or stdin
Expand All @@ -81,12 +79,12 @@ def pretrain(
if not texts_loc.exists():
msg.fail("Input text file doesn't exist", texts_loc, exits=1)
with msg.loading("Loading input texts..."):
texts = list(util.read_jsonl(texts_loc))
texts = list(srsly.read_jsonl(texts_loc))
msg.good("Loaded input texts")
random.shuffle(texts)
else: # reading from stdin
msg.text("Reading input text from stdin...")
texts = stream_texts()
texts = srsly.read_jsonl("-")

with msg.loading("Loading model '{}'...".format(vectors_model)):
nlp = util.load_model(vectors_model)
Expand Down Expand Up @@ -130,18 +128,13 @@ def pretrain(
"epoch": epoch,
}
with (output_dir / "log.jsonl").open("a") as file_:
file_.write(json_dumps(log) + "\n")
file_.write(srsly.json_dumps(log) + "\n")
tracker.epoch_loss = 0.0
if texts_loc != "-":
# Reshuffle the texts if texts were loaded from a file
random.shuffle(texts)


def stream_texts():
for line in sys.stdin:
yield ujson.loads(line)


def make_update(model, docs, optimizer, drop=0.0):
"""Perform an update over a single batch of documents.
Expand Down
4 changes: 2 additions & 2 deletions spacy/cli/profile.py
Expand Up @@ -3,7 +3,7 @@

import plac
from pathlib import Path
import ujson
import srsly
import cProfile
import pstats
import sys
Expand Down Expand Up @@ -64,6 +64,6 @@ def _read_inputs(loc, msg):
msg.info("Using data from {}".format(input_path.parts[-1]))
file_ = input_path.open()
for line in file_:
data = ujson.loads(line)
data = srsly.json_loads(line)
text = data["text"]
yield text
4 changes: 2 additions & 2 deletions spacy/cli/schemas/__init__.py
Expand Up @@ -3,9 +3,9 @@

from pathlib import Path
from jsonschema import Draft4Validator
import srsly

from ...errors import Errors
from ...util import read_json


SCHEMAS = {}
Expand All @@ -25,7 +25,7 @@ def get_schema(name):
schema_path = Path(__file__).parent / "{}.json".format(name)
if not schema_path.exists():
raise ValueError(Errors.E104.format(name=name))
schema = read_json(schema_path)
schema = srsly.read_json(schema_path)
# TODO: replace with (stable) Draft6Validator, if available
validator = Draft4Validator(schema)
validator.check_schema(schema)
Expand Down

0 comments on commit f378630

Please sign in to comment.