<a href="https://colab.research.google.com/github/estebanhernandezr/DNA-compression/blob/main/GZIP_DEFLATE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install bitarray

Collecting bitarray
  Downloading bitarray-2.3.5.tar.gz (88 kB)
[?25l[K     |███▊                            | 10 kB 16.8 MB/s eta 0:00:01[K     |███████▍                        | 20 kB 9.7 MB/s eta 0:00:01[K     |███████████▏                    | 30 kB 9.1 MB/s eta 0:00:01[K     |██████████████▉                 | 40 kB 8.3 MB/s eta 0:00:01[K     |██████████████████▌             | 51 kB 5.3 MB/s eta 0:00:01[K     |██████████████████████▎         | 61 kB 5.4 MB/s eta 0:00:01[K     |██████████████████████████      | 71 kB 5.3 MB/s eta 0:00:01[K     |█████████████████████████████▋  | 81 kB 6.0 MB/s eta 0:00:01[K     |████████████████████████████████| 88 kB 3.6 MB/s 
[?25hBuilding wheels for collected packages: bitarray
  Building wheel for bitarray (setup.py) ... [?25l[?25hdone
  Created wheel for bitarray: filename=bitarray-2.3.5-cp37-cp37m-linux_x86_64.whl size=171987 sha256=50602656ffb30bc054366465b77cd81dd1ddf8d1509b77d724959ab8c7ca83f8
  Stored in directory: /ro

In [2]:
from bitarray import *
from heapq import heappush, heappop

In [3]:
class heapNode(object):
    def __init__(self):
        self.left = None
        self.right = None
        self.counts = 0
        self.twelveBitInteger = ""
    def __lt__(self, other):
        return self.counts < other.counts

In [7]:
n = 2000
Ls = 300
pad_symb = 37

def data_from_file(filename: str):
    with open(filename, 'rb') as input_file:
        filedata = input_file.read()
    return filedata

def file_from_bin(filename: str, newBuffer):
    with open(filename, 'wb') as outFile:
        outFile.write(newBuffer)
        return None

def bin_from_file(filename: str):
    filedata = bitarray(endian='big')
    with open(filename, 'rb') as input_file:
        filedata.fromfile(input_file)
    return filedata


def iniPad(symb, padsize, cad):
    pad = bytearray(chr(symb), 'utf-8')
    for i in range(padsize-1):
        pad.append(pad[0])
    return pad + cad

def codeWord(pos, size, char=None):
    #print(pos, size, char)
    cd = "{0:0{width}b}".format(pos, width=len("{0:b}".format(n)))
    cd += "{0:0{width}b}".format(size, width=len("{0:b}".format(Ls)))
    #cd += char
    return cd

def reproducible_extension(search, lookahead):
    pos = -1
    size = 0
    char = ''
    for prefixsize in range(1, min(n-Ls, len(lookahead))):
        prefix = lookahead[:prefixsize]
        p = search.rfind(prefix, 0, (n-Ls)+prefixsize-1)
        if p >= 0:
            pos = p
            size = prefixsize
            char = lookahead[size]
        else:
            break
    return pos, size, char

def get_distances(cad):
    pcad = iniPad(cad[0], n-Ls, cad)
    distances = []
    j: int = 0
    while j < len(pcad)-(n-Ls):
        #print(pcad[j:j+n], "|", pcad[j:j+n-Ls], " --> ", pcad[j+n-Ls:j+n])
        pos, size, char = reproducible_extension(pcad[j:j+n], pcad[j+n-Ls:j+n])
        #print(pos, size)
        if pos >= 0:
          distances.append(pos)
        j += max(1,size)#+1)
    return distances

def bl_count_from_distances(distances):
    counts = {}
    for distance in distances:
        if distance in counts:
            counts[distance]+=1
        else:
            counts[distance] = 1
    return counts

def heap_from_dictionary(counts):
    heap = []
    for key in counts:
        node = heapNode()
        node.twelveBitInteger = key
        node.counts = counts[key]
        heappush(heap, node)
    #for node in heap:
    #    print(node.twelveBitInteger, ":", node.counts)
    return heap

def create_huffman_tree(counts):
    heap = heap_from_dictionary(counts)
    root = None
    while len(heap) > 1:
        smallestNode = heappop(heap)
        secondSmallestNode = heappop(heap)

        newRoot = heapNode()
        newRoot.counts = smallestNode.counts + secondSmallestNode.counts
        newRoot.twelveBitInteger = ''
        newRoot.left = smallestNode
        newRoot.right = secondSmallestNode
        root = newRoot
        heappush(heap, newRoot)
    return root

codeDictionary = {}
def dictionary_from_tree(root, path, codes):
    if root.left is None and root.right is None:
        #print(root.twelveBitInteger, ": ", root.counts)
        codes["{0:0{width}b}".format(root.twelveBitInteger, width=len("{0:b}".format(n)))] = path
        return
    else:
        if root.left is not None:
            dictionary_from_tree(root.left, path+"0", codes)
        if root.right is not None:
            dictionary_from_tree(root.right, path+"1", codes)

treeBuffer = bitarray(endian='big')
def codify_huffman_tree(root, buffer):
    if root.left is None and root.right is None:
        buffer.append(False) # If current node is leaf then False (internal)
        bin_distance = "{0:0{width}b}".format(root.twelveBitInteger, width=len("{0:b}".format(n)))
        for bit in bin_distance:
            if bit == '1':
                buffer.append(True)
            else:
                buffer.append(False)
    else:
        buffer.append(True) # If current node is internal then True
        codify_huffman_tree(root.left, buffer)
        codify_huffman_tree(root.right, buffer)

def decodify_huffman_tree(buffer):
    nextBit = buffer.pop(0)
    if nextBit == False: # if leaf node
        root = heapNode()
        binaryString = ''
        for i in range(len("{0:b}".format(n))):# read next twelve bytes to get full code
            bit = buffer.pop(0)
            if bit == True:
                binaryString+="1"
            else:
                binaryString+="0"
        root.twelveBitInteger = int(binaryString, 2)
    else:
        root = heapNode()
        root.left = decodify_huffman_tree(buffer)
        root.right = decodify_huffman_tree(buffer)
    return root

outputBuffer = bitarray(endian='big')
def codify_cad(cad, buffer, dictionary):
    pad_symb = cad[0]
    pcad = iniPad(cad[0], n-Ls, cad)
    i: int = 0
    while i < len(pcad)-(n-Ls):
        pos, size, char = reproducible_extension(pcad[i:i+n], pcad[i+n-Ls:i+n])
        if pos >= 0 and size > 1:
            outputBuffer.append(True)
            if "{0:0{width}b}".format(pos, width=len("{0:b}".format(n))) in dictionary:
                huffman_code = dictionary["{0:0{width}b}".format(pos, width=len("{0:b}".format(n)))]
                #print("huffman_code:", huffman_code, " size:", '{0:04b}'.format(size))
                bin_code = huffman_code + "{0:0{width}b}".format(size, width=len("{0:b}".format(Ls)))
                #print("bin_code:= ", bin_code)
                #print(type(bin_code))
                for bit in bin_code:
                    if bit == '1':
                        outputBuffer.append(True)
                    else:
                        outputBuffer.append(False)
                i += size#+1
            elif len(dictionary)==0:
                bin_code = codeWord(pos, size)
                for bit in bin_code:
                    if bit == '1':
                        outputBuffer.append(True)
                    else:
                        outputBuffer.append(False)
                i += size#+1
        else:
            outputBuffer.append(False)
            outputBuffer.frombytes(bytes([pcad[i+n-Ls]])) # Literal symbol
            i += 1

def compress(filename: str):
    filedata = data_from_file(filename) #CHECKED
    distances = get_distances(filedata) #CHECKED

    counts = bl_count_from_distances(distances) #CHECKED

    #huffman_tree = create_huffman_tree(counts) #CHECKED - HUFFMAN PART

    #dictionary_from_tree(huffman_tree, '', codeDictionary) #CHECKED - HUFFMAN PART

    #codify_huffman_tree(huffman_tree, treeBuffer) #CHECKED - HUFFMAN PART

    codify_cad(filedata, outputBuffer, codeDictionary) #CHECKED


    newBuffer = treeBuffer + outputBuffer
    newBuffer.fill()

    file_from_bin('compressed_file', newBuffer)
    print("File compressed succesfully")
    return filedata

def decompress(filename, outputFile):
    filedata = bin_from_file(filename)
    codeDictionary = {}
    #root = decodify_huffman_tree(filedata) - HUFFMAN PART
    #dictionary_from_tree(root, '', codeDictionary) - HUFFMAN PART
    outputBuffer = iniPad(pad_symb, n-Ls, bytes())
    k: int = 0
    while len(filedata) >= 9:
        #print(filedata)
        huffman_pair = filedata.pop(0)
        if not huffman_pair:
            byte = filedata[0:8].tobytes()
            outputBuffer += byte
            del filedata[0:8]
            k += 1
        else:
            twelvebitnumber = ''
            for i in range(0, len("{0:b}".format(n))):
                bit = filedata.pop(0)
                if bit == True:
                    twelvebitnumber += '1'
                else:
                    twelvebitnumber += '0'

            curbitsubstring = ''
            stop = False
            while len(codeDictionary) > 0 and stop == False:
                bit = filedata.pop(0)
                if bit == True:
                    curbitsubstring+="1"
                else:
                    curbitsubstring+="0"
                for key in codeDictionary:
                    if codeDictionary[key] == str(curbitsubstring):
                        twelvebitnumber = key
                        stop = True

            bestLengthBinary = ''
            for i in range(0, len("{0:b}".format(Ls))):
                bit = filedata.pop(0)
                if bit == True:
                    bestLengthBinary+='1'
                else:
                    bestLengthBinary+='0'

            bestDistance = int(twelvebitnumber, 2)
            bestLength = int(bestLengthBinary, 2)
            #print(bestDistance, bestLength)
            for i in range(bestLength):
                outputBuffer.append(outputBuffer[k+bestDistance+i])
            k += bestLength

    file_from_bin(outputFile, outputBuffer[n-Ls:])
    print("File decompressed succesfully")
    return outputBuffer[n-Ls:]

In [8]:
filename: str = 'SuffixTree.py'
data = compress(filename)

out_filename: str = 'uncompressed_file'
decompressed = decompress('compressed_file', out_filename)

print(data == decompressed)
print(data)
print(decompressed)

File compressed succesfully
File decompressed succesfully
True
b'#SuffixTree----------------------------------------------------------------\r\nclass STree():\r\n    """Class representing the suffix tree."""\r\n\r\n    def __init__(self, input=\'\'):\r\n        self.root = _SNode()\r\n        self.root.depth = 0\r\n        self.root.idx = 0\r\n        self.root.parent = self.root\r\n        self.root._add_suffix_link(self.root)\r\n\r\n        if not input == \'\':\r\n            self.build(input)\r\n\r\n    def _check_input(self, input):\r\n        """Checks the validity of the input.\r\n        In case of an invalid input throws ValueError.\r\n        """\r\n        if isinstance(input, str):\r\n            return \'st\'\r\n        elif isinstance(input, list):\r\n            if all(isinstance(item, str) for item in input):\r\n                return \'gst\'\r\n\r\n        raise ValueError("String argument should be of type String or a list of strings")\r\n\r\n    def build(self, x):\r