diff --git a/codext/__common__.py b/codext/__common__.py index 81ca88e..e874bf2 100644 --- a/codext/__common__.py +++ b/codext/__common__.py @@ -1,14 +1,24 @@ # -*- coding: UTF-8 -*- import _codecs import codecs +import os import re import sys -import types from functools import wraps from six import binary_type, string_types, text_type +from types import FunctionType +try: # Python3 + from inspect import getfullargspec +except ImportError: + from inspect import getargspec as getfullargspec +try: + from importlib import reload +except ImportError: + pass -__all__ = ["add", "b", "codecs", "ensure_str", "re", "PY3"] +__all__ = ["add", "b", "clear", "codecs", "ensure_str", "re", "register", + "remove", "reset", "s2i", "PY3"] PY3 = sys.version[0] == "3" @@ -16,25 +26,32 @@ iss = lambda s: isinstance(s, string_types) fix = lambda x, ref: b(x) if isb(ref) else ensure_str(x) if iss(ref) else x +s2i = lambda s: int(codecs.encode(s, "base16"), 16) -def add(ename, encode=None, decode=None, pattern=None, text=True): + +def add(ename, encode=None, decode=None, pattern=None, text=True, + add_to_codecs=False): """ This adds a new codec to the codecs module setting its encode and/or decode functions, eventually dynamically naming the encoding with a pattern and with file handling (if text is True). 
- :param ename: encoding name - :param encode: encoding function or None - :param decode: decoding function or None - :param pattern: pattern for dynamically naming the encoding - :param text: specify whether the codec is a text encoding - """ - if encode and not isinstance(encode, types.FunctionType): - raise ValueError("Bad encode function") - if decode and not isinstance(decode, types.FunctionType): - raise ValueError("Bad decode function") + :param ename: encoding name + :param encode: encoding function or None + :param decode: decoding function or None + :param pattern: pattern for dynamically naming the encoding + :param text: specify whether the codec is a text encoding + :param add_to_codecs: also add the search function to the native registry + NB: this will make the codec available in the + built-in open(...) but will make it impossible + to remove the codec later + """ + if encode and not isinstance(encode, FunctionType): + raise ValueError("Bad 'encode' function") + if decode and not isinstance(decode, FunctionType): + raise ValueError("Bad 'decode' function") if not encode and not decode: - raise ValueError("At least one function must be defined") + raise ValueError("At least one en/decoding function must be defined") # search function for the new encoding def getregentry(encoding): if encoding != ename and not (pattern and re.match(pattern, encoding)): @@ -79,8 +96,13 @@ def decode(self, input, final=False): except AttributeError: return # this occurs when m is None, meaning no match except IndexError: - pass # this occurs while m is not None, but possibly no - # capture group that gives at least 1 group index + # this occurs while m is not None, but possibly no capture group + # that gives at least 1 group index ; in this case, if + # fenc/fdec is a decorated function, execute it with no arg + if fenc and len(getfullargspec(fenc).args) == 1: + fenc = fenc() + if fdec and len(getfullargspec(fdec).args) == 1: + fdec = fdec() if fenc: fenc = 
fix_inout_formats(fenc) if fdec: @@ -107,21 +129,59 @@ class StreamReader(Codec, codecs.StreamReader): streamreader=streamreader, _is_text_encoding=text, ) - codecs.register(getregentry) -codecs.add = add + getregentry.__name__ = re.sub(r"[\s\-]", "_", ename) + register(getregentry, add_to_codecs) + + +def clear(): + """ + Clear codext's local registry of search functions. + """ + global __codecs_registry + __codecs_registry = [] +codecs.clear = clear + + +def remove(encoding): + """ + Remove all search functions matching the input encoding name from codext's + local registry. + + :param encoding: encoding name + """ + tbr = [] + for search in __codecs_registry: + if search(encoding) is not None: + tbr.append(search) + for search in tbr: + __codecs_registry.remove(search) +codecs.remove = remove + + +def reset(): + """ + Reset codext's local registry of search functions. + """ + clear() + for f in os.listdir(os.path.dirname(__file__)): + if not f.endswith(".py") or f.startswith("_"): + continue + reload(__import__(f[:-3], globals(), locals(), [], 1)) +codecs.reset = reset +# conversion functions def b(s): """ Non-crashing bytes conversion function. """ if PY3: try: - return s.encode("latin-1") + return s.encode("utf-8") except: pass try: - return s.encode("utf-8") + return s.encode("latin-1") except: pass return s @@ -159,18 +219,22 @@ def _wrapper(*args, **kwargs): # codecs module hooks -orig_lookup = _codecs.lookup -orig_register = _codecs.register -_ts_codecs_registry = [] -_ts_codecs_registry_hashes = [] +orig_lookup = _codecs.lookup +orig_register = _codecs.register + + +def __add(ename, encode=None, decode=None, pattern=None, text=True, + add_to_codecs=True): + add(ename, encode, decode, pattern, text, add_to_codecs) +__add.__doc__ = add.__doc__ +codecs.add = __add def __decode(obj, encoding='utf-8', errors='strict'): """ Custom decode function relying on the hooked lookup function. 
""" - codecinfo = __lookup(encoding) - return codecinfo.decode(obj, errors)[0] + return __lookup(encoding).decode(obj, errors)[0] codecs.decode = __decode @@ -178,8 +242,7 @@ def __encode(obj, encoding='utf-8', errors='strict'): """ Custom encode function relying on the hooked lookup function. """ - codecinfo = __lookup(encoding) - return codecinfo.encode(obj, errors)[0] + return __lookup(encoding).encode(obj, errors)[0] codecs.encode = __encode @@ -188,7 +251,7 @@ def __lookup(encoding): Hooked lookup function for searching first for codecs in the local registry of this module. """ - for search in _ts_codecs_registry: + for search in __codecs_registry: codecinfo = search(encoding) if codecinfo is not None: return codecinfo @@ -196,14 +259,35 @@ def __lookup(encoding): codecs.lookup = __lookup -def __register(search_function): +def register(search_function, add_to_codecs=False): + """ + Register function for registering new codecs in the local registry of this + module and, if required, in the native codecs registry (for use with the + built-in 'open' function). + + :param search_function: search function for the codecs registry + :param add_to_codecs: also add the search function to the native registry + NB: this will make the codec available in the + built-in open(...) but will make it impossible + to remove the codec later + """ + if search_function not in __codecs_registry: + __codecs_registry.append(search_function) + if add_to_codecs: + orig_register(search_function) + + +def __register(search_function, add_to_codecs=True): """ Hooked register function for registering new codecs in the local registry - of this module. + of this module and in the native codecs registry (for use with the built-in + 'open' function). + + :param search_function: search function for the codecs registry + :param add_to_codecs: also add the search function to the native registry + NB: this will make the codec available in the + built-in open(...) 
but will make it impossible + to remove the codec later """ - h = hash(search_function) - if h not in _ts_codecs_registry_hashes: - _ts_codecs_registry_hashes.append(h) - _ts_codecs_registry.append(search_function) - orig_register(search_function) + register(search_function, add_to_codecs) codecs.register = __register diff --git a/codext/__init__.py b/codext/__init__.py index e9d6ec1..18c9413 100644 --- a/codext/__init__.py +++ b/codext/__init__.py @@ -2,25 +2,21 @@ """Module for enhancing codecs preimport. """ -import os +from .__common__ import * +from .__info__ import __author__, __copyright__, __license__, __version__ -from .__common__ import add, codecs - -__all__ = ["add", "decode", "encode", "lookup", "open", "register"] +__all__ = ["add", "clear", "decode", "encode", "lookup", "open", "register", + "remove", "reset"] decode = codecs.decode encode = codecs.encode lookup = codecs.lookup open = codecs.open -register = codecs.register -for f in os.listdir(os.path.dirname(__file__)): - if not f.endswith(".py") or f == "__init__.py": - continue - __import__(f[:-3], globals(), locals(), [], 1) +reset() def main(): diff --git a/docs/features.md b/docs/features.md index a1e92b0..5735bd1 100644 --- a/docs/features.md +++ b/docs/features.md @@ -1,11 +1,15 @@ -Basically, the `codecs` library, relying on the built-in `_codecs` library, maintains a registry of search functions that maps an input `encoding` variable to the right de/encode function. `codext` hooks the native `codecs` to insert its own registry between the function calls and the native one. +Basically, the `codecs` library provides a series of functions from the built-in `_codecs` library which maintains a registry of search functions (a simple list) that maps encodings to the right de/encode functions by returning a `CodecInfo` object on the first match. -!!! 
note "`codecs` import and the `open` built-in function" +`codext` hooks `codecs`'s functions to insert its own proxy registry between the function calls and the native registry so that new encodings can be added or replace existing ones while using `code[cs|xt].open`. Indeed, as the proxy registry is called first, the first possible match occurs in a custom codec; if no custom codec matches, the native registry is used. + +!!! note "The `open` built-in function" + + Two behaviors are to be considered when using `codext`: - When `codext` is imported, the new encodings are added to its registry but also to the native one. Moreover, hooked functions are bound to the `codext` module but also overwrites the original ones in `codecs`. Consequently: + 1. Encodings added from `codext` are only added to the proxy codecs registry of `codext` and are NOT available using `open(...)` (but are available using `code[cs|xt].open(...)`). + 2. Encodings added from `codecs` are added to the proxy registry AND ALSO to the native registry and are therefore available using `open(...)`. - 1. Once `codext` has been imported, `codecs` can be imported elsewhere in the program and will have the `add` and hooked functions attached, with the new encodings available. - 2. While `codecs.open` will handle the new encodings according to `codext`'s registry first, the native `open` function will rely on the native registry only and therefore handle the encoding search functions of `codext` _after_ these of the native registry has it will rely on non-hooked functions. + This difference keeps encodings added from `codext` removable, while those added from `codecs` are not. This is a consequence of the fact that there is no unregister function in the native `_codecs` library. ----- @@ -18,16 +22,20 @@ New codecs can be added easily using the new function `add`. 
>>> help(codext.add) Help on function add in module codext.__common__: -add(ename, encode=None, decode=None, pattern=None, text=True) +add(ename, encode=None, decode=None, pattern=None, text=True, add_to_codecs=False) This adds a new codec to the codecs module setting its encode and/or decode functions, eventually dynamically naming the encoding with a pattern and with file handling (if text is True). - :param ename: encoding name - :param encode: encoding function or None - :param decode: decoding function or None - :param pattern: pattern for dynamically naming the encoding - :param text: specify whether the codec is a text encoding + :param ename: encoding name + :param encode: encoding function or None + :param decode: decoding function or None + :param pattern: pattern for dynamically naming the encoding + :param text: specify whether the codec is a text encoding + :param add_to_codecs: also add the search function to the native registry + NB: this will make the codec available in the + built-in open(...) but will make it impossible + to remove the codec later ``` @@ -78,6 +86,61 @@ In this second example, we can see that: ----- +## Remove a custom encoding + +New codecs can be removed easily using the new function `remove`, which removes every codec matching the given encoding name from the proxy codecs registry only, NOT from the native one. + +```python +>>> codext.encode("test", "bin") +'01110100011001010111001101110100' +>>> codext.remove("bin") +>>> codext.encode("test", "bin") + +Traceback (most recent call last): + File "", line 1, in + codext.encode("test", "bin") + File "codext/__common__.py", line 245, in __encode + return __lookup(encoding).encode(obj, errors)[0] + File "codext/__common__.py", line 259, in __lookup + codecs.lookup = __lookup +LookupError: unknown encoding: bin +``` + +Trying to remove a codec that is in the native registry has no effect: it remains available and no `LookupError` is raised. 
+ +```python +>>> codext.remove("utf-8") +>>> codext.encode("test", "utf-8") +b'test' +``` + +----- + +## Remove or restore `codext` encodings + +It can be useful while playing with encodings e.g. from Idle to be able to remove or restore `codext`'s encodings. This can be achieved using respectively the new `clear` and `reset` functions. + +```python +>>> codext.clear() +>>> codext.encode("test", "bin") +Traceback (most recent call last): + File "", line 1, in + codext.encode("test", "bin") + File "/mnt/data/Projects/maint/python-codext/codext/__common__.py", line 245, in __encode + return __lookup(encoding).encode(obj, errors)[0] + File "/mnt/data/Projects/maint/python-codext/codext/__common__.py", line 258, in __lookup + return orig_lookup(encoding) +LookupError: unknown encoding: bin +``` + +```python +>>> codext.reset() +>>> codext.encode("test", "bin") +'01110100011001010111001101110100' +``` + +----- + ## Hooked `codecs` functions In order to select the right de/encoding function and avoid any conflict, the native `codecs` library registers search functions (using the `register(search_function)` function), called in order of registration while searching for a codec. 
diff --git a/tests/test_common.py b/tests/test_common.py index d51dcf7..b8080ee 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -5,12 +5,57 @@ """ from unittest import TestCase +import codext from codext.__common__ import * +def dummy_encode(input, errors="strict"): + return input, len(input) + + +def dummy_decode(input, errors="strict"): + return input, len(input) + + +def getregentry(encoding): + if encoding == "dummy3": + return codecs.CodecInfo( + name="dummy3", + encode=dummy_encode, + decode=dummy_decode, + ) + + class TestCommon(TestCase): def test_add_codec(self): - f = lambda: None - self.assertRaises(ValueError, codecs.add, "test") - self.assertRaises(ValueError, codecs.add, "test", "BAD") - self.assertRaises(ValueError, codecs.add, "test", f, "BAD") + self.assertRaises(ValueError, codext.add, "test") + self.assertRaises(ValueError, codext.add, "test", "BAD") + self.assertRaises(ValueError, codext.add, "test", lambda: None, "BAD") + self.assertIsNone(codext.add("dummy", dummy_encode, dummy_decode)) + self.assertEqual(codext.encode("test", "dummy"), "test") + + def test_remove_codec(self): + self.assertIsNone(codext.add("dummy", dummy_encode, dummy_decode)) + self.assertEqual(codext.encode("test", "dummy"), "test") + self.assertIsNone(codext.remove("dummy")) + self.assertRaises(LookupError, codext.encode, "test", "dummy") + # special case, when adding a new codec also to the native codecs + # registry, then it won't be possible to remove it further + self.assertIsNone(codecs.add("dummy2", dummy_encode, dummy_decode)) + self.assertEqual(codecs.encode("test", "dummy2"), "test") + self.assertIsNone(codecs.remove("dummy2")) + self.assertEqual(codecs.encode("test", "dummy2"), "test") + self.assertIsNone(codecs.register(getregentry)) + self.assertEqual(codecs.encode("test", "dummy3"), "test") + self.assertIsNone(codecs.remove("dummy3")) + self.assertEqual(codecs.encode("test", "dummy3"), "test") + + def test_clear_codecs(self): + 
self.assertIsNotNone(codecs.encode("test", "morse")) + self.assertIsNone(codecs.clear()) + self.assertRaises(LookupError, codecs.encode, "test", "morse") + + def test_reset_codecs(self): + self.assertIsNone(codext.reset()) + self.assertIsNotNone(codext.encode("test", "morse")) + self.assertRaises(LookupError, codext.encode, "test", "dummy")