In [102]:
import sys

class Node:
    '''
    A hybrid linked-list / binary-tree node holding a character and the
    number of times that character occurs.

    ``left`` and ``right`` are the tree children and ``next`` is the
    linked-list successor; all three start out as None.
    '''
    def __init__(self, char, freq):
        self.char, self.freq = char, freq
        # Links are filled in later by whoever places the node in a list or tree.
        self.left = self.right = self.next = None

    def __repr__(self):
        # Rendered as "(char, freq)".
        return f"({self.char}, {self.freq})"

class Tree:
    '''
    A binary tree whose leaves carry the symbols of a Huffman code.

    ``root`` is the top node, or None for an empty tree. Nodes are expected
    to expose ``char``, ``left`` and ``right``; a node with a truthy ``char``
    is treated as a leaf.
    '''
    def __init__(self, root):
        self.root = root

    def huffman_code(self):
        '''
        Build the symbol -> bit-string table for this tree.

        Every step to a left child appends '0' to the code and every step
        to a right child appends '1'.

        Returns:
            dict mapping each leaf's ``char`` to its code string. An empty
            tree (``root`` is None) gives an empty dict. A tree that is a
            single leaf maps its symbol to the empty string, because no
            branch is taken to reach it.
        '''
        h_codes = dict()

        def traverse(code, node):
            # An empty tree or a missing child contributes no codes.
            if node is None:
                return

            if node.char:
                h_codes[node.char] = code
                return

            traverse(code + '0', node.left)
            traverse(code + '1', node.right)

        traverse('', self.root)

        return h_codes
        
class Queue:
    '''
    A priority queue kept as a singly linked list ordered by ascending
    ``freq``.

    Queued objects must expose a ``freq`` attribute and a writable ``next``
    attribute. ``head`` is the node with the lowest frequency (None when
    the queue is empty) and ``size`` is the number of queued nodes.
    '''
    def __init__(self):
        self.head = None
        self.size = 0

    def enqueue(self, new_node):
        '''
        Insert ``new_node`` ahead of the first node with a larger frequency,
        or at the end when there is none.

        Nodes with equal frequency keep their insertion order. Passing None
        is a no-op. The node's ``next`` pointer is always overwritten, so a
        node that was queued before can safely be queued again.
        '''
        if new_node is None:
            return

        previous = None
        node = self.head
        # "<=" walks past equal frequencies so that ties stay first-in, first-out.
        while node is not None and node.freq <= new_node.freq:
            previous = node
            node = node.next

        new_node.next = node
        if previous is None:
            # Empty queue, or the new node is smaller than the current head.
            self.head = new_node
        else:
            previous.next = new_node
        self.size += 1

    def dequeue(self):
        '''
        Remove and return the node with the lowest frequency, or None when
        the queue is empty. The returned node is detached (``next`` is None).
        '''
        head = self.head

        if head is None:
            return None

        self.head = head.next
        # Do not hand out a pointer into the queue's internal chain.
        head.next = None
        self.size -= 1
        return head

    def __repr__(self):
        # Comma-separated str() of every queued node, lowest frequency first.
        all_nodes = []
        node = self.head
        while node is not None:
            all_nodes.append(str(node))
            node = node.next

        return ', '.join(all_nodes)

def huffman_encoding(data):
    '''
    Huffman-encode ``data``.

    Args:
        data: the string to encode.

    Returns:
        A tuple ``(encoded_data, tree)``. ``encoded_data`` is a string of
        '0' and '1' characters and ``tree`` is the Tree that
        huffman_decoding needs to reverse the encoding.

        Empty input returns ``('', Tree(None))``. Input made of a single
        distinct symbol is encoded with the one-bit code '0' per occurrence;
        its tree has a symbol-less root with the symbol as the left child.
    '''
    # Count frequency
    counts = dict()
    for char in data:
        counts[char] = counts.get(char, 0) + 1

    # Nothing to encode: there is no tree to build.
    if not counts:
        return '', Tree(None)

    q = Queue()

    # Feed the queue in ascending frequency order.
    for char in sorted(counts, key=lambda x: counts[x]):
        q.enqueue(Node(char, counts[char]))

    # Repeatedly merge the two least frequent nodes until one root remains.
    while q.size > 1:
        left = q.dequeue()
        right = q.dequeue()
        merger = Node(None, left.freq + right.freq)
        merger.left = left
        merger.right = right
        q.enqueue(merger)

    root = q.head
    if root.left is None and root.right is None:
        # Only one distinct symbol: a bare leaf would get the empty code and
        # the encoded string would carry no information. Hang the leaf under
        # a synthetic root so each occurrence costs one '0' bit.
        lone_leaf = root
        root = Node(None, lone_leaf.freq)
        root.left = lone_leaf
        tree = Tree(root)
        h_codes = {lone_leaf.char: '0'}
    else:
        tree = Tree(root)
        h_codes = tree.huffman_code()

    encoded_data = ''.join([h_codes[char] for char in data])

    return encoded_data, tree

def huffman_decoding(data, tree):
    '''
    Decode a bit string produced with the given Huffman tree.

    Args:
        data: iterable of bits; '0' steps to the left child and any other
            value steps to the right child.
        tree: object whose ``root`` is the top node of the Huffman tree.

    Returns:
        The decoded string. Bits left over after the last completed symbol
        are discarded.
    '''
    root = tree.root
    symbols = []
    cursor = root

    for bit in data:
        cursor = cursor.left if bit == '0' else cursor.right

        # Still on an internal node: keep consuming bits.
        if not cursor.char:
            continue

        # Reached a leaf: emit its symbol and restart from the top.
        symbols.append(cursor.char)
        cursor = root

    return ''.join(symbols)



IndentationError: expected an indented block (<ipython-input-102-962882c53159>, line 48)

In [101]:
# Sample text for the round trip; adjacent literals are joined into one string.
a_great_sentence = (
    "Lorem ipsum dolor sit amet, consectetur adipiscing elit, "
    "sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. "
    "Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris "
    "nisi ut aliquip ex ea commodo consequat."
)

# Report the in-memory size and the text before encoding it.
print("The size of the data is: {}\n".format(sys.getsizeof(a_great_sentence)))
print("The content of the data is: {}\n".format(a_great_sentence))

encoded_data, tree = huffman_encoding(a_great_sentence)


The size of the data is: 280

The content of the data is: Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.

{'s': '0000', 'd': '0001', 'i': '001', 'u': '0100', 'n': '0101', 'm': '0110', ',': '011100', 'v': '0111010', 'g': '0111011', 'c': '01111', 't': '1000', 'a': '1001', 'p': '101000', 'q': '101001', 'b': '1010100', '.': '1010101', 'x': '1010110', 'L': '10101110', 'U': '10101111', 'o': '1011', ' ': '110', 'e': '1110', 'r': '11110', 'l': '11111'}


In [98]:
# The bit string is parsed as a base-2 integer so that getsizeof measures the
# packed bits rather than one str character per bit.
print("The size of the encoded data is: {}\n".format(sys.getsizeof(int(encoded_data, 2))))
print("The content of the encoded data is: {}\n".format(encoded_data))

The size of the encoded data is: 152

The content of the encoded data is: 10101110101111110111001101100011010000000010001101100001101111111101111110110000000110001101001011011101000011100110011111011010100001110011111000111010000100111101101001000100110100000100000111100101010111011110111011111001100001110011000001110000111000011011110111000101000000011010110001110100011100110101000101111110110001010101111001000100100010100010110001100100100011011111100110101001011111101110110111010001100001101111111101111110111011001101001011101101011001110100111111001101001010010011010101110101011111000110111001010010110110100100011100110001010100101101100111010111001010011001011001110011010100101000010000110010110110000100011110010000011101110101011011101111001111001100010011000001101101011100100111111111110010110011111011110111111001101010010111111000100001100101001000000111001001000110100111111001101001010000110100011011101010110110111010011100111110110110011010110001101111001111101101010000111010

In [99]:


# Decode with the tree returned by the encoder and check the round trip.
decoded_data = huffman_decoding(encoded_data, tree)
assert decoded_data == a_great_sentence, "decoded text differs from the original"

print ("The size of the decoded data is: {}\n".format(sys.getsizeof(decoded_data)))
# This line prints the decoded text, so the label says "decoded".
print ("The content of the decoded data is: {}\n".format(decoded_data))

1 None 135
0 None 64
1 None 34
0 None 16
1 None 8
1 None 4
1 None 2
0 L 1
1 None 135
0 None 64
1 None 34
1 o 18
1 None 135
1 None 71
1 None 36
1 None 18
0 r 9
1 None 135
1 None 71
1 None 36
0 e 18
0 None 96
1 None 51
1 None 27
0 m 13
1 None 135
1 None 71
0   35
0 None 96
0 None 45
1 i 23
1 None 135
0 None 64
1 None 34
0 None 16
0 None 8
0 p 4
0 None 96
0 None 45
0 None 22
0 s 11
0 None 96
1 None 51
0 None 24
0 u 12
0 None 96
1 None 51
1 None 27
0 m 13
1 None 135
1 None 71
0   35
0 None 96
0 None 45
0 None 22
1 d 11
1 None 135
0 None 64
1 None 34
1 o 18
1 None 135
1 None 71
1 None 36
1 None 18
1 l 9
1 None 135
0 None 64
1 None 34
1 o 18
1 None 135
1 None 71
1 None 36
1 None 18
0 r 9
1 None 135
1 None 71
0   35
0 None 96
0 None 45
0 None 22
0 s 11
0 None 96
0 None 45
1 i 23
1 None 135
0 None 64
0 None 30
0 t 15
1 None 135
1 None 71
0   35
1 None 135
0 None 64
0 None 30
1 a 15
0 None 96
1 None 51
1 None 27
0 m 13
1 None 135
1 None 71
1 None 36
0 e 18
1 None 135
0 None 64
0 None 30
0 t 15
