💫 Replace ujson, msgpack and dill/pickle/cloudpickle with srsly (#3003)

Remove hacks and wrappers, keep code in sync across our libraries and move spaCy a few steps closer to only depending on packages with binary wheels 🎉

See here: https://github.com/explosion/srsly

Serialization is hard, especially across Python versions and multiple platforms. After dealing with many subtle bugs over the years (encodings, locales, large files), our libraries like spaCy and Prodigy have steadily grown a number of utility functions to wrap the multiple serialization formats we need to support (especially json, msgpack and pickle). These wrapping functions ended up duplicated across our codebases, so we wanted to put them in one place.

At the same time, we noticed that having a lot of small dependencies was making maintenance harder, and making installation slower. To solve this, we've made srsly standalone, by including the component packages directly within it. This way we can provide all the serialization utilities we need in a single binary wheel.

srsly currently includes forks of the following packages:

- ujson
- msgpack
- msgpack-numpy
- cloudpickle

* WIP: replace json/ujson with srsly
* Replace ujson in examples

  Use regular json instead of srsly to make code easier to read and follow

* Update requirements
* Fix imports
* Fix typos
* Replace msgpack with srsly
* Fix warning
explosion · Dec 3, 2018 · f378630 · f378630
1 parent 40b57ea
commit f378630
Show file tree

Hide file tree

Showing 33 changed files with 130 additions and 238 deletions.
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -292,10 +292,9 @@ for example to show more specific error messages, you can use the `is_config()`
 helper function.
 
 ```python
-from .compat import unicode_, json_dumps, is_config
+from .compat import unicode_, is_config
 
 compatible_unicode = unicode_('hello world')
-compatible_json = json_dumps({'key': 'value'})
 if is_config(windows=True, python2=True):
     print("You are using Python 2 on Windows.")
 ```

diff --git a/bin/load_reddit.py b/bin/load_reddit.py
@@ -3,7 +3,7 @@
 
 import bz2
 import regex as re
-import ujson
+import srsly
 import sys
 import random
 import datetime
@@ -44,7 +44,7 @@ def __iter__(self):
                     line = line.strip()
                     if not line:
                         continue
-                    comment = ujson.loads(line)
+                    comment = srsly.json_loads(line)
                     if self.is_valid(comment):
                         text = self.strip_tags(comment["body"])
                         yield {"text": text}
@@ -75,7 +75,7 @@ def is_valid(self, comment):
 def main(path):
     reddit = Reddit(path)
     for comment in reddit:
-        print(ujson.dumps(comment))
+        print(srsly.json_dumps(comment))
 
 
 if __name__ == "__main__":

diff --git a/examples/information_extraction/phrase_matcher.py b/examples/information_extraction/phrase_matcher.py
@@ -45,7 +45,7 @@
 from bz2 import BZ2File
 import time
 import plac
-import ujson
+import json
 
 from spacy.matcher import PhraseMatcher
 import spacy
@@ -71,7 +71,7 @@ def main(patterns_loc, text_loc, n=10000, lang="en"):
 
 def read_gazetteer(tokenizer, loc, n=-1):
     for i, line in enumerate(open(loc)):
-        data = ujson.loads(line.strip())
+        data = json.loads(line.strip())
         phrase = tokenizer(data["text"])
         for w in phrase:
             _ = tokenizer.vocab[w.text]
@@ -82,7 +82,7 @@ def read_gazetteer(tokenizer, loc, n=-1):
 def read_text(bz2_loc, n=10000):
     with BZ2File(bz2_loc) as file_:
         for i, line in enumerate(file_):
-            data = ujson.loads(line)
+            data = json.loads(line)
             yield data["body"]
             if i >= n:
                 break

diff --git a/examples/keras_parikh_entailment/__main__.py b/examples/keras_parikh_entailment/__main__.py
@@ -1,5 +1,5 @@
 import numpy as np
-import ujson as json
+import json
 from keras.utils import to_categorical
 import plac
 import sys

diff --git a/examples/notebooks/Decompositional Attention.ipynb b/examples/notebooks/Decompositional Attention.ipynb
@@ -77,7 +77,7 @@
     }
    ],
    "source": [
-    "import ujson as json\n",
+    "import json\n",
     "from keras.utils import to_categorical\n",
     "\n",
     "LABELS = {'entailment': 0, 'contradiction': 1, 'neutral': 2}\n",

diff --git a/requirements.txt b/requirements.txt
@@ -6,12 +6,12 @@ blis>=0.2.2,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 cytoolz>=0.9.0,<0.10.0
 plac<1.0.0,>=0.9.6
-ujson>=1.35
 dill>=0.2,<0.3
 regex==2018.01.10
 requests>=2.13.0,<3.0.0
 jsonschema>=2.6.0,<3.0.0
 wasabi>=0.0.8,<1.1.0
+srsly>=0.0.4,<1.1.0
 pathlib==1.0.1; python_version < "3.4"
 # Development dependencies
 cython>=0.25

diff --git a/setup.py b/setup.py
@@ -203,12 +203,12 @@ def setup_package():
                 "thinc==7.0.0.dev4",
                 "blis>=0.2.2,<0.3.0",
                 "plac<1.0.0,>=0.9.6",
-                "ujson>=1.35",
                 "regex==2018.01.10",
                 "dill>=0.2,<0.3",
                 "requests>=2.13.0,<3.0.0",
                 "jsonschema>=2.6.0,<3.0.0",
                 "wasabi>=0.0.8,<1.1.0",
+                "srsly>=0.0.4,<1.1.0",
                 'pathlib==1.0.1; python_version < "3.4"',
             ],
             setup_requires=["wheel"],

diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py
@@ -4,9 +4,9 @@
 import plac
 from pathlib import Path
 from wasabi import Printer
+import srsly
 
-from ..util import write_jsonl, write_json
-from ..compat import json_dumps, path2str
+from ..compat import path2str
 from .converters import conllu2json, conllubio2json, iob2json, conll_ner2json
 from .converters import ner_jsonl2json
 from ._messages import Messages
@@ -77,17 +77,16 @@ def convert(
         suffix = ".{}".format(file_type)
         output_file = Path(output_dir) / Path(input_path.parts[-1]).with_suffix(suffix)
         if file_type == "json":
-            write_json(output_file, data)
+            srsly.write_json(output_file, data)
         elif file_type == "jsonl":
-            write_jsonl(output_file, data)
+            srsly.write_jsonl(output_file, data)
         msg.good(
             Messages.M032.format(name=path2str(output_file)),
             Messages.M033.format(n_docs=len(data)),
         )
     else:
         # Print to stdout
         if file_type == "json":
-            print(json_dumps(data))
+            srsly.write_json("-", data)
         elif file_type == "jsonl":
-            for line in data:
-                print(json_dumps(line))
+            srsly.write_jsonl("-", data)
diff --git a/spacy/cli/converters/jsonl2json.py b/spacy/cli/converters/jsonl2json.py
@@ -1,7 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-import ujson
+import srsly
 
 from ...util import get_lang_class
 from .._messages import Messages
@@ -11,7 +11,7 @@ def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False):
     if lang is None:
         raise ValueError(Messages.M054)
     json_docs = []
-    input_tuples = [ujson.loads(line) for line in input_data]
+    input_tuples = [srsly.json_loads(line) for line in input_data]
     nlp = get_lang_class(lang)()
     for i, (raw_text, ents) in enumerate(input_tuples):
         doc = nlp.make_doc(raw_text)

diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
@@ -5,10 +5,11 @@
 from collections import Counter
 import plac
 import sys
+import srsly
 from wasabi import Printer, MESSAGES
 
 from ..gold import GoldCorpus, read_json_object
-from ..util import load_model, get_lang_class, read_json, read_jsonl
+from ..util import load_model, get_lang_class
 
 # from .schemas import get_schema, validate_json
 from ._messages import Messages
@@ -320,11 +321,11 @@ def debug_data(
 def _load_file(file_path, msg):
     file_name = file_path.parts[-1]
     if file_path.suffix == ".json":
-        data = read_json(file_path)
+        data = srsly.read_json(file_path)
         msg.good("Loaded {}".format(file_name))
         return data
     elif file_path.suffix == ".jsonl":
-        data = read_jsonl(file_path)
+        data = srsly.read_jsonl(file_path)
         msg.good("Loaded {}".format(file_name))
         return data
     msg.fail(

diff --git a/spacy/cli/info.py b/spacy/cli/info.py
@@ -5,6 +5,7 @@
 import platform
 from pathlib import Path
 from wasabi import Printer
+import srsly
 
 from ._messages import Messages
 from ..compat import path2str, basestring_, unicode_
@@ -32,7 +33,7 @@ def info(model=None, markdown=False, silent=False):
         meta_path = model_path / "meta.json"
         if not meta_path.is_file():
             msg.fail(Messages.M020, meta_path, exits=1)
-        meta = util.read_json(meta_path)
+        meta = srsly.read_json(meta_path)
         if model_path.resolve() != model_path:
             meta["link"] = path2str(model_path)
             meta["source"] = path2str(model_path.resolve())

diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py
@@ -11,12 +11,13 @@
 import tarfile
 import gzip
 import zipfile
+import srsly
 from wasabi import Printer
 
 from ._messages import Messages
 from ..vectors import Vectors
 from ..errors import Errors, Warnings, user_warning
-from ..util import ensure_path, get_lang_class, read_jsonl
+from ..util import ensure_path, get_lang_class
 
 try:
     import ftfy
@@ -59,7 +60,7 @@ def init_model(
                 settings.append("-c")
             msg.warn(Messages.M063, Messages.M064)
         jsonl_loc = ensure_path(jsonl_loc)
-        lex_attrs = read_jsonl(jsonl_loc)
+        lex_attrs = srsly.read_jsonl(jsonl_loc)
     else:
         clusters_loc = ensure_path(clusters_loc)
         freqs_loc = ensure_path(freqs_loc)

diff --git a/spacy/cli/package.py b/spacy/cli/package.py
@@ -5,9 +5,10 @@
 import shutil
 from pathlib import Path
 from wasabi import Printer, get_raw_input
+import srsly
 
 from ._messages import Messages
-from ..compat import path2str, json_dumps
+from ..compat import path2str
 from .. import util
 from .. import about
 
@@ -40,7 +41,7 @@ def package(input_dir, output_dir, meta_path=None, create_meta=False, force=Fals
 
     meta_path = meta_path or input_path / "meta.json"
     if meta_path.is_file():
-        meta = util.read_json(meta_path)
+        meta = srsly.read_json(meta_path)
         if not create_meta:  # only print if user doesn't want to overwrite
             msg.good(Messages.M041, meta_path)
         else:
@@ -64,7 +65,7 @@ def package(input_dir, output_dir, meta_path=None, create_meta=False, force=Fals
             )
     Path.mkdir(package_path, parents=True)
     shutil.copytree(path2str(input_path), path2str(package_path / model_name_v))
-    create_file(main_path / "meta.json", json_dumps(meta))
+    create_file(main_path / "meta.json", srsly.json_dumps(meta))
     create_file(main_path / "setup.py", TEMPLATE_SETUP)
     create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
     create_file(package_path / "__init__.py", TEMPLATE_INIT)

diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py
@@ -5,19 +5,17 @@
 import random
 import numpy
 import time
-import ujson
-import sys
 from collections import Counter
 from pathlib import Path
 from thinc.v2v import Affine, Maxout
 from thinc.api import wrap
 from thinc.misc import LayerNorm as LN
 from thinc.neural.util import prefer_gpu
 from wasabi import Printer
+import srsly
 
 from ..tokens import Doc
 from ..attrs import ID, HEAD
-from ..compat import json_dumps
 from .._ml import Tok2Vec, flatten, chain, zero_init, create_default_optimizer
 from .. import util
 
@@ -72,7 +70,7 @@ def pretrain(
     if not output_dir.exists():
         output_dir.mkdir()
         msg.good("Created output directory")
-    util.write_json(output_dir / "config.json", config)
+    srsly.write_json(output_dir / "config.json", config)
     msg.good("Saved settings to config.json")
 
     # Load texts from file or stdin
@@ -81,12 +79,12 @@ def pretrain(
         if not texts_loc.exists():
             msg.fail("Input text file doesn't exist", texts_loc, exits=1)
         with msg.loading("Loading input texts..."):
-            texts = list(util.read_jsonl(texts_loc))
+            texts = list(srsly.read_jsonl(texts_loc))
         msg.good("Loaded input texts")
         random.shuffle(texts)
     else:  # reading from stdin
         msg.text("Reading input text from stdin...")
-        texts = stream_texts()
+        texts = srsly.read_jsonl("-")
 
     with msg.loading("Loading model '{}'...".format(vectors_model)):
         nlp = util.load_model(vectors_model)
@@ -130,18 +128,13 @@ def pretrain(
                 "epoch": epoch,
             }
             with (output_dir / "log.jsonl").open("a") as file_:
-                file_.write(json_dumps(log) + "\n")
+                file_.write(srsly.json_dumps(log) + "\n")
         tracker.epoch_loss = 0.0
         if texts_loc != "-":
             # Reshuffle the texts if texts were loaded from a file
             random.shuffle(texts)
 
 
-def stream_texts():
-    for line in sys.stdin:
-        yield ujson.loads(line)
-
-
 def make_update(model, docs, optimizer, drop=0.0):
     """Perform an update over a single batch of documents.
 

diff --git a/spacy/cli/profile.py b/spacy/cli/profile.py
@@ -3,7 +3,7 @@
 
 import plac
 from pathlib import Path
-import ujson
+import srsly
 import cProfile
 import pstats
 import sys
@@ -64,6 +64,6 @@ def _read_inputs(loc, msg):
         msg.info("Using data from {}".format(input_path.parts[-1]))
         file_ = input_path.open()
     for line in file_:
-        data = ujson.loads(line)
+        data = srsly.json_loads(line)
         text = data["text"]
         yield text
diff --git a/spacy/cli/schemas/__init__.py b/spacy/cli/schemas/__init__.py
@@ -3,9 +3,9 @@
 
 from pathlib import Path
 from jsonschema import Draft4Validator
+import srsly
 
 from ...errors import Errors
-from ...util import read_json
 
 
 SCHEMAS = {}
@@ -25,7 +25,7 @@ def get_schema(name):
         schema_path = Path(__file__).parent / "{}.json".format(name)
         if not schema_path.exists():
             raise ValueError(Errors.E104.format(name=name))
-        schema = read_json(schema_path)
+        schema = srsly.read_json(schema_path)
         # TODO: replace with (stable) Draft6Validator, if available
         validator = Draft4Validator(schema)
         validator.check_schema(schema)