In [1]:
import abc
import math
import os

import bs4
import requests

In [2]:
# Enable IPython's autoreload extension so edits to local modules are picked
# up without restarting the kernel.
%load_ext autoreload
# Mode 1: only modules registered with %aimport are reloaded before each cell runs.
%autoreload 1
# Register the local `encoding` module for automatic reloading.
%aimport encoding

In [3]:
from encoding import RLEncoding, test_encoding

In [5]:
# Page to scrape, and the local file used to avoid re-downloading it.
ISSUE_URL = 'https://github.com/olin/collascii/issues/21'
CACHE_FILE = "cache.html"

if os.path.exists(CACHE_FILE):
    # Cache hit: reuse the saved page so a re-run does not touch the network.
    # The encoding is pinned so the read does not depend on the platform default.
    with open(CACHE_FILE, 'r', encoding='utf-8') as f:
        html = f.read()
else:
    # A timeout keeps a stalled connection from hanging the whole notebook run.
    response = requests.get(ISSUE_URL, timeout=30)
    # Compare by value. `is not 200` tests object identity, which only happens
    # to work because CPython caches small ints (and is a SyntaxWarning on 3.8+).
    if response.status_code != 200:
        raise Exception("Response was not OK")
    html = response.text
    # Write with the same explicit encoding that the cache-hit branch reads with.
    with open(CACHE_FILE, 'w', encoding='utf-8') as f:
        f.write(html)

# Name the parser explicitly: without it bs4 picks whichever parser is
# installed (and warns), so the parse tree could differ between machines.
soup = bs4.BeautifulSoup(html, "html.parser")

In [6]:
# Every <code> element that is a direct child of a <pre> inside a comment body.
blocks = soup.select(".comment-body pre > code")
# Keep each block's text as ASCII-encoded bytes.
works = [block.text.encode('ASCII') for block in blocks]

In [7]:
# Sanity check: report how many works were scraped, then show the first and
# last ones separated by a rule of 80 asterisks.
print(f"caught {len(works)} works")
print(str(works[0], 'ascii'), str(works[-1], 'ascii'), sep=f"\n{'*' * 80}\n")

caught 24 works
         |  ###############  |\                         
         | #               # ||                         
         | #  HELLO WORLD  # ||                         
         | #               # |\                         
         |  ###############  | |                        
         *-------------------* |                        
          \                    \                        
           --                   |  "LONG-LEGGED LOOSEY" 
             \                  |                       
              --                \                       
                \                |                      
                 --              |                      
                   \             \                      
                    --            |                     
                      \           |                     
                       --         \ _____________       
                         \         |             \      
               

In [9]:
# Print the class documentation (docstring examples, MRO, methods) for RLEncoding.
help(RLEncoding)

Help on class RLEncoding in module encoding:

class RLEncoding(Encoding)
 |  RLEncoding(escape: bytes = b'\\')
 |  
 |  Run-Length Encoding
 |  >>> RLEncoding().encode(b'bbbbbb')
 |  b'6b'
 |  >>> RLEncoding().encode(b'a')
 |  b'a'
 |  >>> RLEncoding().encode(b'1')
 |  b'\\1'
 |  >>> RLEncoding().encode(b'\\')
 |  b'\\\\'
 |  >>> b'\\\\' == br'\\'
 |  True
 |  >>> rle = RLEncoding(b'a')
 |  >>> rle.encode(b'1')
 |  b'a1'
 |  >>> rle.encode(b'122333')
 |  b'a12a23a3'
 |  >>> rle.decode(b'a12a23a3')
 |  b'122333'
 |  
 |  Method resolution order:
 |      RLEncoding
 |      Encoding
 |      abc.ABC
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, escape: bytes = b'\\')
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  decode(self, content: bytes)
 |  
 |  encode(self, content: bytes)
 |  
 |  ----------------------------------------------------------------------
 |  Data and other attributes defined here:
 |  
 |  __abstractmetho

In [13]:
# Build an encoder that uses "/" as its escape byte instead of the default
# backslash.
rle = RLEncoding(b"/")
orig = b"123abc"
# This call is not the last expression in the cell, so its return value is
# not displayed.
test_encoding(rle, orig)

# Last expression of the cell: its return value is what the notebook echoes.
test_encoding(rle, b"122333")

(b'/12/23/3', b'122333')

In [14]:
# Reference value: ten literal "b" bytes, printed for comparison with the decode below.
print(b"b"*10)
# Decode a run-length token with a two-digit count; as the cell's last
# expression, the result is echoed.
rle.decode(b"10b")

b'bbbbbbbbbb'


b'bbbbbbbbbb'

In [15]:
# Run each scraped work through test_encoding with the `rle` encoder and keep
# whatever it returns, one entry per work.
results = [test_encoding(rle, work) for work in works]

In [16]:
# Per-work ratio of the first element's length to the second's in each result
# pair (encoded size over decoded size); values below 1 mean the encoded form
# is shorter.
reductions = [len(encoded) / len(decoded) for encoded, decoded in results]

In [17]:
# Summarise the per-work ratios: mean, smallest and largest, one per line.
print("Size reductions:")
for name, func in (
    ("Average", lambda a: sum(a) / len(a)),
    ("Min", min),
    ("Max", max),
):
    print(f"\t{name}: {func(reductions)}")

Size reductions:
	Average: 0.38233358283079594
	Min: 0.13590939373750832
	Max: 0.7749523204068659
