Skip to content

Commit

Permalink
Refactored base encodings
Browse files Browse the repository at this point in the history
  • Loading branch information
dhondta committed Feb 2, 2020
1 parent 9119479 commit 3d9541f
Show file tree
Hide file tree
Showing 12 changed files with 744 additions and 133 deletions.
2 changes: 1 addition & 1 deletion codext/VERSION.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.0.3
1.1.0
160 changes: 160 additions & 0 deletions codext/_base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
# -*- coding: UTF-8 -*-
"""Generic baseN functions.
"""
from math import log
from six import integer_types, string_types
from string import printable
from types import FunctionType

from .__common__ import *


# generic base en/decoding functions
class BaseError(ValueError):
    """Common parent (a ValueError) of the errors of the base-N helpers."""


class BaseDecodeError(BaseError):
    """BaseError subclass meant to signal a base-N decoding failure."""


class BaseEncodeError(BaseError):
    """BaseError subclass meant to signal a base-N encoding failure."""


def _generate_charset(n):
"""
Generate a characters set.
:param n: size of charset
"""
if 1 < n <= 100:
return printable[:n]
elif 100 < n < 256:
return "".join(chr(i) for i in range(n))
raise ValueError("Bad size of character set")


def _get_charset(charset, p=""):
"""
Charaters set selection function. It allows to define charsets in many
different ways.
:param charset: charset object, can be a string (the charset itself), a
function (that chooses the right charset depending on the
input parameter) or a dictionary (either by exact key or by
pattern matching)
:param p: the parameter for choosing the charset
"""
# case 1: charset is a function, so return its result
if isinstance(charset, FunctionType):
return charset(p)
# case 2: charset is a string, so return it
elif isinstance(charset, string_types):
return charset
# case 3: charset is a dict with keys '' and 'inv', typically for a charset
# using lowercase and uppercase characters that can be inverted
elif isinstance(charset, dict) and list(charset.keys()) == ["", "inv"]:
return charset["inv" if re.match(r"[-_]inv(erted)?$", p) else ""]
# case 4: charset is a dict, but not with the specific keys '' and 'inv', so
# consider it as pattern-charset pairs
elif isinstance(charset, dict):
# try to handle [p]arameter as a simple key
try:
return charset[p]
except KeyError:
pass
# or handle [p]arameter as a pattern
default, n = None, None
for pattern, cset in charset.items():
n = len(cset)
if pattern == "":
default = cset
continue
if re.match(pattern, p):
return cset
# special case: the given [p]arameter can be the charset itself if
# it has the right length
p = re.sub(r"^[-_]+", "", p)
if len(p) == n:
return p
# or simply rely on key ''
if default is not None:
return default
raise ValueError("Bad charset descriptor")


def base_encode(input, charset, errors="strict", exc=BaseEncodeError):
    """
    Base-10 to base-N encoding.

    :param input:   value to be encoded ; an integer is used as is, anything
                     else is first converted to an integer with s2i()
    :param charset: base-N characters set (its length gives N)
    :param errors:  errors handling marker (not used by this function)
    :param exc:     exception to be raised in case of error (not used by this
                     function)
    :return:        the encoded string, most significant digit first ; it is
                     empty when the integer value is not positive
    """
    value = s2i(input) if not isinstance(input, integer_types) else input
    radix = len(charset)
    # successive divisions yield the digits from the least significant one
    digits = []
    while value > 0:
        value, remainder = divmod(value, radix)
        digits.append(charset[remainder])
    # put the most significant digit first
    digits.reverse()
    return "".join(digits)


def base_decode(input, charset, errors="strict", exc=BaseDecodeError):
    """
    Base-N to base-10 decoding.

    The input is first converted to an integer using the given charset, then
     this integer is re-encoded with base_encode() using the 256 first code
     points as the charset.

    :param input:   input to be decoded
    :param charset: base-N characters set (its length gives N)
    :param errors:  errors handling marker ; "strict" raises on a character
                     that is not in the charset while "ignore" and "replace"
                     both skip it
    :param exc:     exception to be raised in case of error (a decoding error
                     by default)
    :return:        the decoded string
    :raise exc:        in "strict" mode, when a character is not in the charset
    :raise ValueError: when a bad character is met while the errors handling
                        marker is not supported
    """
    value, radix = 0, len(charset)
    for pos, char in enumerate(input):
        try:
            digit = charset.index(char)
        except ValueError:
            if errors == "strict":
                raise exc("'base' codec can't decode character '{}' in position"
                          " {}".format(char, pos))
            if errors in ("ignore", "replace"):
                # bad characters are simply skipped
                continue
            raise ValueError("Unsupported error handling {}".format(errors))
        value = value * radix + digit
    return base_encode(value, [chr(j) for j in range(256)], errors, exc)


def base(charset, pattern=None, pow2=False,
         encode_template=base_encode, decode_template=base_decode):
    """
    Base-N codec factory.

    It builds an encode and a decode function factories and hands them over to
     add() under the name "base<N>".

    :param charset:         charset selection object (see _get_charset()) or an
                             integer, in which case the charset is generated
                             with _generate_charset()
    :param pattern:         matching pattern for the codec name (first capturing
                             group is used as the parameter for selecting the
                             charset) ; defaults to "base<N>"
    :param pow2:            whether the base codec's N is a power of 2
    :param encode_template: function called as f(input, charset, errors) for
                             encoding
    :param decode_template: function called as f(input, charset, errors) for
                             decoding
    :raise BaseError: when pow2 is set while N is not a power of 2
    """
    is_n = isinstance(charset, int)
    if is_n:
        n = len(_generate_charset(charset))
    else:
        n = len(_get_charset(charset))
    # N is a power of 2 only if its base-2 logarithm is an integer
    exponent = log(n, 2)
    if pow2 and exponent != int(exponent):
        raise BaseError("Bad charset ; {} is not a power of 2".format(n))

    def _select(param):
        # charset to be used for the given codec name parameter
        if is_n:
            return _generate_charset(n)
        return _get_charset(charset, param)

    def encode(param=""):
        alphabet = _select(param)

        def _encode(input, errors="strict"):
            return encode_template(input, alphabet, errors), len(input)
        return _encode

    def decode(param=""):
        alphabet = _select(param)

        def _decode(input, errors="strict"):
            return decode_template(input, alphabet, errors), len(input)
        return _decode

    name = "base{}".format(n)
    add(name, encode, decode, name if pattern is None else pattern)
115 changes: 115 additions & 0 deletions codext/_base2n.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
# -*- coding: UTF-8 -*-
"""BaseN functions with N a power of 2.
"""
from math import ceil, log

from .__common__ import *
from ._base import base, _get_charset, BaseError


# base en/decoding functions for N a power of 2
class Base2NError(BaseError):
    """BaseError subclass dedicated to the base-N codecs with N a power of 2."""


class Base2NDecodeError(BaseError):
    """Default error raised by base2n_decode() (a direct BaseError subclass)."""


class Base2NEncodeError(BaseError):
    """Default `exc` of base2n_encode() (a direct BaseError subclass)."""


def base2n(charset, pattern=None):
    """
    Base-N codec factory for N a power of 2.

    This simply calls base() with the power-of-2 check enabled and with the
     bit-oriented base2n_encode() / base2n_decode() functions as templates.

    :param charset: charset selection function
    :param pattern: matching pattern for the codec name (first capturing group
                     is used as the parameter for selecting the charset)
    """
    base(charset, pattern, pow2=True, encode_template=base2n_encode,
         decode_template=base2n_decode)


def base2n_encode(string, charset, errors="strict", exc=Base2NEncodeError):
    """
    8-bits characters to base-N encoding for N a power of 2.

    :param string:  string to be encoded (items can be characters or integers)
    :param charset: base-N characters set
    :param errors:  errors handling marker (not used by this function)
    :param exc:     exception to be raised in case of error (not used by this
                     function)
    :return:        the encoded string, completed with "=" characters up to a
                     whole number of quanta
    """
    # number of bits carried by one output character
    width = int(log(len(charset), 2))
    # the quantum is the first multiple of this width that is also a multiple
    #  of 8 bits
    quantum = width
    while quantum % 8 != 0:
        quantum += width
    # gather the bits of each input character and map every complete group of
    #  bits to the charset
    pending, symbols = "", []
    for char in string:
        code = char if isinstance(char, int) else ord(char)
        pending += "{:0>8}".format(bin(code)[2:])
        while len(pending) >= width:
            head, pending = pending[:width], pending[width:]
            symbols.append(charset[int(head, 2)])
    # the remaining bits, if any, are completed with zeros on the right
    if len(pending) > 0:
        for start in range(0, len(pending), width):
            tail = pending[start:start + width]
            symbols.append(charset[int(tail.ljust(width, "0"), 2)])
    encoded = "".join(symbols)
    # finally, pad the output up to a multiple of the quantum
    total = len(encoded) * width
    while total % quantum != 0:
        total += width
    return encoded + "=" * int(total / width - len(encoded))


def base2n_decode(string, charset, errors="strict", exc=Base2NDecodeError):
    """
    Base-N to 8-bits characters decoding for N a power of 2.

    :param string:  string to be decoded
    :param charset: base-N characters set
    :param errors:  errors handling marker ; "strict" raises on a bad character
                     or a bad padding, "replace" handles a bad character as
                     zero bits and "ignore" skips it
    :param exc:     exception to be raised in case of error
    :return:        the decoded string ; an empty input gives an empty string
    :raise exc:        in "strict" mode, for a character that is not in the
                        charset or for an incorrect padding
    :raise ValueError: when an error is met while the errors handling marker is
                        not supported
    """
    # an empty input holds no bits at all ; it decodes to an empty string
    #  instead of being reported as a bad padding
    if len(string) == 0:
        return ""
    # find the number of bits for the given character set and the number of
    #  padding characters
    width = int(log(len(charset), 2))
    n_pad = len(string) - len(string.rstrip("="))
    template = "{:0>%d}" % width
    bits, chars = "", []
    # iterate over the characters, mapping them to the character set and
    #  converting the resulting bits to 8-bits characters
    for pos, char in enumerate(string):
        if char == "=":
            bits += "0" * width
        else:
            try:
                index = charset.index(char)
            except ValueError:
                if errors == "strict":
                    raise exc("'base' codec can't decode character '{}' in "
                              "position {}".format(char, pos))
                elif errors == "replace":
                    bits += "0" * width
                elif errors == "ignore":
                    continue
                else:
                    raise ValueError("Unsupported error handling {}"
                                     .format(errors))
            else:
                bits += template.format(bin(index)[2:])
        # only consume a byte when more than 8 bits are available, so that the
        #  last byte is always left for the final check below
        if len(bits) > 8:
            chars.append(chr(int(bits[:8], 2)))
            bits = bits[8:]
    # if the number of remaining bits is not exactly 8 bits, it could mean a
    #  bad padding
    if len(bits) != 8:
        if errors == "strict":
            # use the configurable exception, like for bad characters
            raise exc("Incorrect padding")
        elif errors not in ("replace", "ignore"):
            raise ValueError("Unsupported error handling {}".format(errors))
    # in lenient modes, no bit at all may be left (e.g. when every character
    #  was ignored) ; there is then nothing to convert
    if bits:
        chars.append(chr(int(bits, 2)))
    decoded = "".join(chars)
    # remove the characters that only come from the padding
    n_strip = int(ceil(n_pad * width / 8.0))
    return decoded[:-n_strip] if n_strip > 0 else decoded
Loading

0 comments on commit 3d9541f

Please sign in to comment.