Skip to content

Commit

Permalink
Refactored __common__
Browse files Browse the repository at this point in the history
  • Loading branch information
dhondta committed Feb 2, 2020
1 parent 1fa82c1 commit 9119479
Show file tree
Hide file tree
Showing 4 changed files with 248 additions and 60 deletions.
156 changes: 120 additions & 36 deletions codext/__common__.py
Original file line number Diff line number Diff line change
@@ -1,40 +1,57 @@
# -*- coding: UTF-8 -*-
import _codecs
import codecs
import os
import re
import sys
import types
from functools import wraps
from six import binary_type, string_types, text_type
from types import FunctionType
try: # Python3
from inspect import getfullargspec
except ImportError:
from inspect import getargspec as getfullargspec
try:
from importlib import reload
except ImportError:
pass


__all__ = ["add", "b", "codecs", "ensure_str", "re", "PY3"]
__all__ = ["add", "b", "clear", "codecs", "ensure_str", "re", "register",
"remove", "reset", "s2i", "PY3"]
PY3 = sys.version[0] == "3"


isb = lambda s: isinstance(s, binary_type)
iss = lambda s: isinstance(s, string_types)
fix = lambda x, ref: b(x) if isb(ref) else ensure_str(x) if iss(ref) else x

s2i = lambda s: int(codecs.encode(s, "base16"), 16)

def add(ename, encode=None, decode=None, pattern=None, text=True):

def add(ename, encode=None, decode=None, pattern=None, text=True,
add_to_codecs=False):
"""
This adds a new codec to the codecs module setting its encode and/or decode
functions, eventually dynamically naming the encoding with a pattern and
with file handling (if text is True).
:param ename: encoding name
:param encode: encoding function or None
:param decode: decoding function or None
:param pattern: pattern for dynamically naming the encoding
:param text: specify whether the codec is a text encoding
"""
if encode and not isinstance(encode, types.FunctionType):
raise ValueError("Bad encode function")
if decode and not isinstance(decode, types.FunctionType):
raise ValueError("Bad decode function")
:param ename: encoding name
:param encode: encoding function or None
:param decode: decoding function or None
:param pattern: pattern for dynamically naming the encoding
:param text: specify whether the codec is a text encoding
:param add_to_codecs: also add the search function to the native registry
NB: this will make the codec available in the
built-in open(...) but will make it impossible
to remove the codec later
"""
if encode and not isinstance(encode, FunctionType):
raise ValueError("Bad 'encode' function")
if decode and not isinstance(decode, FunctionType):
raise ValueError("Bad 'decode' function")
if not encode and not decode:
raise ValueError("At least one function must be defined")
raise ValueError("At least one en/decoding function must be defined")
# search function for the new encoding
def getregentry(encoding):
if encoding != ename and not (pattern and re.match(pattern, encoding)):
Expand Down Expand Up @@ -79,8 +96,13 @@ def decode(self, input, final=False):
except AttributeError:
return # this occurs when m is None, meaning no match
except IndexError:
pass # this occurs while m is not None, but possibly no
# capture group that gives at least 1 group index
# this occurs while m is not None, but possibly no capture group
# that gives at least 1 group index ; in this case, if
# fenc/fdec is a decorated function, execute it with no arg
if fenc and len(getfullargspec(fenc).args) == 1:
fenc = fenc()
if fdec and len(getfullargspec(fdec).args) == 1:
fdec = fdec()
if fenc:
fenc = fix_inout_formats(fenc)
if fdec:
Expand All @@ -107,21 +129,59 @@ class StreamReader(Codec, codecs.StreamReader):
streamreader=streamreader,
_is_text_encoding=text,
)
codecs.register(getregentry)
codecs.add = add
getregentry.__name__ = re.sub(r"[\s\-]", "_", ename)
register(getregentry, add_to_codecs)


def clear():
    """
    Empty codext's local registry of search functions.

    The module-level registry name is rebound to a brand new empty list (the
     previous list object itself is left untouched).
    """
    global __codecs_registry
    __codecs_registry = list()
# expose the function through the codecs module as well
codecs.clear = clear


def remove(encoding):
    """
    Drop from codext's local registry every search function that returns a
     codec (that is, anything but None) for the given encoding name.

    :param encoding: encoding name
    """
    # collect the matches first so that the registry is not modified while it
    #  is being iterated over
    matching = [fn for fn in __codecs_registry if fn(encoding) is not None]
    for fn in matching:
        __codecs_registry.remove(fn)
# expose the function through the codecs module as well
codecs.remove = remove


def reset():
    """
    Restore codext's local registry of search functions: the registry is first
     cleared, then every non-underscore-prefixed ".py" module located next to
     this file is imported (relatively to this package) and reloaded.
    """
    clear()
    package_dir = os.path.dirname(__file__)
    for filename in os.listdir(package_dir):
        # only regular modules are reloaded ; files starting with "_" (e.g.
        #  __init__.py or this module) are left alone
        if filename.endswith(".py") and not filename.startswith("_"):
            module = __import__(filename[:-3], globals(), locals(), [], 1)
            reload(module)
# expose the function through the codecs module as well
codecs.reset = reset


# conversion functions
def b(s):
"""
Non-crashing bytes conversion function.
"""
if PY3:
try:
return s.encode("latin-1")
return s.encode("utf-8")
except:
pass
try:
return s.encode("utf-8")
return s.encode("latin-1")
except:
pass
return s
Expand Down Expand Up @@ -159,27 +219,30 @@ def _wrapper(*args, **kwargs):


# codecs module hooks
orig_lookup = _codecs.lookup
orig_register = _codecs.register
_ts_codecs_registry = []
_ts_codecs_registry_hashes = []
orig_lookup = _codecs.lookup
orig_register = _codecs.register


# Variant of add(...) bound to codecs.add ; it only differs by its default
#  value for 'add_to_codecs' (True instead of False).
def __add(ename, encode=None, decode=None, pattern=None, text=True,
          add_to_codecs=True):
    add(ename, encode=encode, decode=decode, pattern=pattern, text=text,
        add_to_codecs=add_to_codecs)
# reuse the documentation of add(...) and attach the hook to codecs
__add.__doc__ = add.__doc__
codecs.add = __add


def __decode(obj, encoding='utf-8', errors='strict'):
"""
Custom decode function relying on the hooked lookup function.
"""
codecinfo = __lookup(encoding)
return codecinfo.decode(obj, errors)[0]
return __lookup(encoding).decode(obj, errors)[0]
codecs.decode = __decode


def __encode(obj, encoding='utf-8', errors='strict'):
"""
Custom encode function relying on the hooked lookup function.
"""
codecinfo = __lookup(encoding)
return codecinfo.encode(obj, errors)[0]
return __lookup(encoding).encode(obj, errors)[0]
codecs.encode = __encode


Expand All @@ -188,22 +251,43 @@ def __lookup(encoding):
Hooked lookup function for searching first for codecs in the local registry
of this module.
"""
for search in _ts_codecs_registry:
for search in __codecs_registry:
codecinfo = search(encoding)
if codecinfo is not None:
return codecinfo
return orig_lookup(encoding)
codecs.lookup = __lookup


def __register(search_function):
def register(search_function, add_to_codecs=False):
    """
    Add a search function to the local registry of this module (unless it is
     already present) and, on demand, also hand it over to the native codecs
     registry (for use with the built-in 'open' function).

    :param search_function: search function for the codecs registry
    :param add_to_codecs:   also add the search function to the native registry
                            NB: this will make the codec available in the
                                built-in open(...) but will make it impossible
                                to remove the codec later
    """
    is_new = search_function not in __codecs_registry
    if is_new:
        __codecs_registry.append(search_function)
    if not add_to_codecs:
        return
    # native registration is attempted regardless of the local registry state
    orig_register(search_function)


def __register(search_function, add_to_codecs=True):
"""
Hooked register function for registering new codecs in the local registry
of this module.
of this module and in the native codecs registry (for use with the built-in
'open' function).
:param search_function: search function for the codecs registry
:param add_to_codecs: also add the search function to the native registry
NB: this will make the codec available in the
built-in open(...) but will make it impossible
to remove the codec later
"""
h = hash(search_function)
if h not in _ts_codecs_registry_hashes:
_ts_codecs_registry_hashes.append(h)
_ts_codecs_registry.append(search_function)
orig_register(search_function)
register(search_function, add_to_codecs)
codecs.register = __register
14 changes: 5 additions & 9 deletions codext/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,25 +2,21 @@
"""Module for enhancing codecs preimport.
"""
import os
from .__common__ import *
from .__info__ import __author__, __copyright__, __license__, __version__

from .__common__ import add, codecs


__all__ = ["add", "decode", "encode", "lookup", "open", "register"]
__all__ = ["add", "clear", "decode", "encode", "lookup", "open", "register",
"remove", "reset"]


decode = codecs.decode
encode = codecs.encode
lookup = codecs.lookup
open = codecs.open
register = codecs.register


for f in os.listdir(os.path.dirname(__file__)):
if not f.endswith(".py") or f == "__init__.py":
continue
__import__(f[:-3], globals(), locals(), [], 1)
reset()


def main():
Expand Down
85 changes: 74 additions & 11 deletions docs/features.md
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
Basically, the `codecs` library, relying on the built-in `_codecs` library, maintains a registry of search functions that maps an input `encoding` variable to the right de/encode function. `codext` hooks the native `codecs` to insert its own registry between the function calls and the native one.
Basically, the `codecs` library provides a series of functions from the built-in `_codecs` library, which maintains a registry of search functions (a simple list) that maps encodings to the right de/encode functions by returning a `CodecInfo` object on the first match.

!!! note "`codecs` import and the `open` built-in function"
`codext` hooks `codecs`'s functions to insert its own proxy registry between the function calls and the native registry so that new encodings can be added or replace existing ones while using `code[cs|xt].open`. Because the proxy registry is searched first, a matching custom codec takes precedence; if no custom codec matches, the native registry is used.

!!! note "The `open` built-in function"

Two behaviors are to be considered when using `codext`:

When `codext` is imported, the new encodings are added to its registry but also to the native one. Moreover, hooked functions are bound to the `codext` module but also overwrites the original ones in `codecs`. Consequently:
1. Encodings added from `codext` are only added to the proxy codecs registry of `codext` and are NOT available using `open(...)` (but they are available using `code[cs|xt].open(...)`).
2. Encodings added from `codecs` are added to the proxy registry AND ALSO to the native registry and are therefore available using `open(...)`.

1. Once `codext` has been imported, `codecs` can be imported elsewhere in the program and will have the `add` and hooked functions attached, with the new encodings available.
2. While `codecs.open` will handle the new encodings according to `codext`'s registry first, the native `open` function will rely on the native registry only and therefore handle the encoding search functions of `codext` _after_ these of the native registry has it will rely on non-hooked functions.
This difference keeps encodings added from `codext` removable, while those added from `codecs` are not. This is a consequence of the fact that there is no unregister function in the native `_codecs` library.

-----

Expand All @@ -18,16 +22,20 @@ New codecs can be added easily using the new function `add`.
>>> help(codext.add)
Help on function add in module codext.__common__:

add(ename, encode=None, decode=None, pattern=None, text=True)
add(ename, encode=None, decode=None, pattern=None, text=True, add_to_codecs=False)
This adds a new codec to the codecs module setting its encode and/or decode
functions, eventually dynamically naming the encoding with a pattern and
with file handling (if text is True).

:param ename: encoding name
:param encode: encoding function or None
:param decode: decoding function or None
:param pattern: pattern for dynamically naming the encoding
:param text: specify whether the codec is a text encoding
:param ename: encoding name
:param encode: encoding function or None
:param decode: decoding function or None
:param pattern: pattern for dynamically naming the encoding
:param text: specify whether the codec is a text encoding
:param add_to_codecs: also add the search function to the native registry
NB: this will make the codec available in the
built-in open(...) but will make it impossible
to remove the codec later

```

Expand Down Expand Up @@ -78,6 +86,61 @@ In this second example, we can see that:

-----

## Remove a custom encoding

New codecs can be removed easily using the new function `remove`, which removes every codec matching the given encoding name from the proxy codecs registry only, and NOT from the native one.

```python
>>> codext.encode("test", "bin")
'01110100011001010111001101110100'
>>> codext.remove("bin")
>>> codext.encode("test", "bin")

Traceback (most recent call last):
File "<pyshell#39>", line 1, in <module>
codext.encode("test", "bin")
File "codext/__common__.py", line 245, in __encode
return __lookup(encoding).encode(obj, errors)[0]
File "codext/__common__.py", line 259, in __lookup
codecs.lookup = __lookup
LookupError: unknown encoding: bin
```

Trying to remove a codec that is in the native registry has no effect on that registry, so using the codec afterwards won't raise a `LookupError`.

```python
>>> codext.remove("utf-8")
>>> codext.encode("test", "utf-8")
b'test'
```

-----

## Remove or restore `codext` encodings

When playing with encodings, e.g. from IDLE, it can be useful to remove or restore `codext`'s encodings. This can be achieved using the new `clear` and `reset` functions, respectively.

```python
>>> codext.clear()
>>> codext.encode("test", "bin")
Traceback (most recent call last):
File "<pyshell#4>", line 1, in <module>
codext.encode("test", "bin")
File "/mnt/data/Projects/maint/python-codext/codext/__common__.py", line 245, in __encode
return __lookup(encoding).encode(obj, errors)[0]
File "/mnt/data/Projects/maint/python-codext/codext/__common__.py", line 258, in __lookup
return orig_lookup(encoding)
LookupError: unknown encoding: bin
```

```python
>>> codext.reset()
>>> codext.encode("test", "bin")
'01110100011001010111001101110100'
```

-----

## Hooked `codecs` functions

In order to select the right de/encoding function and avoid any conflict, the native `codecs` library registers search functions (using the `register(search_function)` function), called in order of registration while searching for a codec.
Expand Down
Loading

0 comments on commit 9119479

Please sign in to comment.