<a href="https://colab.research.google.com/github/estebanhernandezr/DNA-compression/blob/master/gzip.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **PDF2TXT**

In [23]:
pip install PyPDF2



In [24]:
import PyPDF2

In [25]:
def pdf2txt(filename, text='', ini=0, fin=0):
    """Extract the text of a PDF file and append it to ``text``.

    Pages are read in order, starting at page index ``ini`` and stopping
    ``fin`` pages before the end of the document. The accumulated text is
    printed and returned.

    Args:
        filename: Path of the PDF file to read.
        text: Text that the extracted pages are appended to.
        ini: Zero-based index of the first page to extract.
        fin: Number of trailing pages to leave out.

    Returns:
        ``text`` followed by the text extracted from every selected page.
    """
    # PyPDF2 3.0 removed the legacy camelCase API (PdfFileReader, numPages,
    # getPage, extractText). Use the current names when the installed
    # version provides them and fall back to the legacy ones otherwise.
    uses_new_api = hasattr(PyPDF2, 'PdfReader')

    # The reader is handed the open file object, so the file is kept open for
    # the whole extraction and closed afterwards, even if a page fails.
    with open(filename, 'rb') as pdf_file:
        if uses_new_api:
            reader = PyPDF2.PdfReader(pdf_file)
            page_count = len(reader.pages)
        else:
            reader = PyPDF2.PdfFileReader(pdf_file)
            page_count = reader.numPages

        for page_index in range(ini, page_count - fin):
            if uses_new_api:
                text += reader.pages[page_index].extract_text()
            else:
                text += reader.getPage(page_index).extractText()
    print(text)
    return text

# **GZIP**

In [26]:
pip install bitarray



In [27]:
from bitarray import *
from typing import Dict, Sequence, Tuple

In [28]:
# FUNCTIONS

def data_from_file(filename: str) -> bytearray:
    """Return the complete contents of ``filename``, read in binary mode.

    The value comes straight from ``read()`` on a file opened with ``'rb'``.
    """
    with open(filename, 'rb') as source:
        return source.read()

def file_from_bin(filename: str, buffer: bytearray) -> None:
    """Write ``buffer`` to ``filename`` in binary mode, replacing any existing file.

    Args:
        filename: Path of the file to create or overwrite.
        buffer: Bytes-like object handed unchanged to the file's ``write``.
    """
    with open(filename, 'wb') as sink:
        sink.write(buffer)

def bin_from_file(filename: str) -> bytearray:
    """Load ``filename`` into a big-endian ``bitarray`` and return it.

    Note that the returned object is a ``bitarray`` (one item per bit), even
    though the signature is annotated with ``bytearray``.
    """
    with open(filename, 'rb') as source:
        bits: bytearray = bitarray(endian='big')
        bits.fromfile(source)
    return bits

def inipad(symb: int, padsize: int, cad: str) -> bytearray:
    """Return ``cad`` prefixed with ``padsize`` copies of the byte ``symb``.

    Args:
        symb: Value of the padding byte, in ``range(256)``.
        padsize: Number of padding bytes to prepend. Zero or a negative
            value adds no padding.
        cad: Bytes-like payload to append after the padding (despite the
            ``str`` annotation, a text string cannot be concatenated to the
            padding and raises ``TypeError``).

    Returns:
        A new ``bytearray`` holding the padding followed by ``cad``.

    Raises:
        ValueError: If ``symb`` is outside ``range(256)``.
    """
    # Build the pad from the raw byte value. Going through chr()/UTF-8 would
    # turn values >= 128 into multi-byte sequences, and repeating with `*`
    # yields an empty pad for padsize <= 0 instead of a stray single byte.
    return bytearray([symb]) * padsize + cad

In [29]:
# COMPRESSER CLASS

class Compresser:
    """Sliding-window compressor that emits its output as a bit stream.

    The input is scanned with a window of ``n`` bytes: the first ``n - Ls``
    bytes form the search buffer and the last ``Ls`` bytes the look-ahead
    buffer. Each step appends one token to ``_compressed_string``:

    * literal: a ``0`` flag bit followed by the 8 bits of the next byte;
    * match:   a ``1`` flag bit, the match position inside the window and the
      match length, both as fixed-width binary numbers.

    The Huffman stage is currently disabled (``_huffman_dictionary`` stays
    empty), so positions are always written with a fixed width.
    """
    _n: int    # total window size, in bytes
    _Ls: int   # look-ahead buffer size, in bytes
    _symb: int  # byte value used to pre-fill the search buffer
    _compressed_string: bytearray  # actually a bitarray collecting the output bits

    _huffman_dictionary: Dict  # fixed-width position bits -> Huffman code
    _compressed_huffman: bytearray  # actually a bitarray; not written by this class

    def __init__(self, n: int, Ls: int):
        """Create a compressor with window size ``n`` and look-ahead size ``Ls``.

        Raises:
            ValueError: If a size is negative or ``Ls`` is larger than ``n``.
        """
        if (n < 0 or Ls < 0):
            raise ValueError("Negative buffer sizes")
        # A look-ahead larger than the window would give the search buffer a
        # negative size and make the encoder index the input incorrectly.
        if Ls > n:
            raise ValueError("Look-ahead size Ls cannot exceed window size n")

        self._n = n
        self._Ls = Ls
        self._symb = 32
        self._compressed_string = bitarray(endian='big')

        self._huffman_dictionary = {}
        self._compressed_huffman = bitarray(endian='big')

    def _position_bits(self, pos: int) -> str:
        """Return ``pos`` in binary, zero-padded to the bit width of ``n``."""
        return "{0:0{width}b}".format(pos, width=len("{0:b}".format(self._n)))

    def _length_bits(self, size: int) -> str:
        """Return ``size`` in binary, zero-padded to the bit width of ``Ls``."""
        return "{0:0{width}b}".format(size, width=len("{0:b}".format(self._Ls)))

    def codify_word(self, pos: int, size: int, char: chr=None) -> str:
        """Return the fixed-width code word for a match.

        Args:
            pos: Match position inside the window.
            size: Match length.
            char: Unused; kept for interface compatibility.

        Returns:
            The position bits followed by the length bits, as a string of
            ``'0'``/``'1'`` characters.
        """
        return self._position_bits(pos) + self._length_bits(size)

    def rep_extension(self, search: str, lookahead: str) -> Sequence[int]:
        """Find the longest prefix of ``lookahead`` that starts in the search buffer.

        Prefixes of growing length are looked up with ``rfind``, so the
        right-most occurrence is reported. The occurrence must start inside
        the first ``n - Ls`` bytes of ``search`` but may run on into the
        look-ahead part. The prefix length always stays below both
        ``n - Ls`` and ``len(lookahead)``.

        Args:
            search: The current window.
            lookahead: The look-ahead part of the window.

        Returns:
            A tuple ``(pos, size, char)``: the start of the match in
            ``search``, its length and the item of ``lookahead`` that follows
            the match. ``(-1, 0, '')`` is returned when nothing matches.
        """
        search_size: int = self._n - self._Ls
        pos: int = -1
        size: int = 0
        char: chr = ''
        for prefixsize in range(1, min(search_size, len(lookahead))):
            found: int = search.rfind(lookahead[:prefixsize], 0,
                                      search_size + prefixsize - 1)
            if found < 0:
                # A longer prefix cannot match once a shorter one has failed.
                break
            pos = found
            size = prefixsize
            char = lookahead[prefixsize]
        return pos, size, char

    def codify_cad(self, cad: str) -> None:
        """Encode ``cad`` and append the resulting bits to ``_compressed_string``.

        Args:
            cad: Bytes-like input to compress.

        Raises:
            KeyError: If the Huffman dictionary is in use but holds no code
                for a match position.
        """
        n: int = self._n
        search_size: int = n - self._Ls
        dictionary: Dict = self._huffman_dictionary

        # Pre-fill the search buffer so the first input bytes have a window.
        pcad: bytearray = inipad(self._symb, search_size, cad)
        i: int = 0
        while i < len(pcad) - search_size:
            pos, size, _ = self.rep_extension(pcad[i:i + n],
                                              pcad[i + search_size:i + n])
            if pos >= 0 and size > 1:
                if not dictionary:
                    bin_code: str = self.codify_word(pos, size)
                else:
                    position_key: str = self._position_bits(pos)
                    if position_key not in dictionary:
                        # Without this check the code word would be unbound
                        # or silently reused from the previous match.
                        raise KeyError(
                            "No Huffman code for position " + position_key)
                    bin_code = dictionary[position_key] + self._length_bits(size)

                self._compressed_string.append(True)
                for bit in bin_code:
                    self._compressed_string.append(bit == '1')
                i += size
            else:
                # Matches of length 0 or 1 are stored as a literal byte.
                self._compressed_string.append(False)
                self._compressed_string.frombytes(bytes([pcad[i + search_size]]))
                i += 1
        return None

    def compress(self, filename: str) -> None:
        """Read ``filename`` and encode its bytes into ``_compressed_string``."""
        filedata: bytearray = data_from_file(filename) #CHECKED

        # The Huffman stage below is disabled; only the window coding runs.
        #distances = get_distances(filedata) #CHECKED - HUFFMAN PART
        #counts = bl_count_from_distances(distances) #CHECKED - HUFFMAN PART
        #huffman_tree = create_huffman_tree(counts) #CHECKED - HUFFMAN PART
        #dictionary = dictionary_from_tree(huffman_tree, '', codeDictionary) #CHECKED - HUFFMAN PART
        #self._huffman_dictionary = dictionary

        #self.codify_huffmantree(huffman_tree, treeBuffer) #CHECKED - HUFFMAN PART
        self.codify_cad(filedata) #CHECKED

        return None

In [30]:
# DECOMPRESSER CLASS

class Decompresser:
    """Decoder for the bit stream produced by ``Compresser``.

    The stream is a sequence of tokens, each introduced by a flag bit:

    * ``0``: the next 8 bits are one literal byte;
    * ``1``: a fixed-width position and a fixed-width length follow; that many
      bytes are copied from the already decoded output.

    ``n`` and ``Ls`` must equal the values used for compression, because they
    determine the field widths and the size of the initial padding.
    """
    _n: int    # total window size, in bytes
    _Ls: int   # look-ahead buffer size, in bytes
    _symb: int  # byte value used to pre-fill the search buffer
    _decompressed_string: bytearray  # decoded output bytes

    _huffman_dictionary: Dict  # fixed-width position bits -> Huffman code
    _decompressed_huffman: bitarray  # not written by this class

    def __init__(self, n: int, Ls: int):
        """Create a decoder with window size ``n`` and look-ahead size ``Ls``.

        Raises:
            ValueError: If a size is negative or ``Ls`` is larger than ``n``.
        """
        if (n < 0 or Ls < 0):
            raise ValueError("Negative buffer sizes")
        # Same constraint as the compressor: the search buffer (n - Ls bytes)
        # cannot have a negative size.
        if Ls > n:
            raise ValueError("Look-ahead size Ls cannot exceed window size n")

        self._n = n
        self._Ls = Ls
        self._symb = 32
        # The decoded output is a sequence of bytes, so it starts out as an
        # empty bytearray (the type decompress_cad stores here).
        self._decompressed_string = bytearray()

        self._huffman_dictionary = {}
        self._decompressed_huffman = bitarray(endian='big')

    @staticmethod
    def _pop_bits(bits, count: int) -> str:
        """Remove ``count`` bits from the front of ``bits``; return them as '0'/'1' text."""
        return ''.join('1' if bits.pop(0) else '0' for _ in range(count))

    def decompress_cad(self, filedata: bytearray) -> None:
        """Decode ``filedata`` into ``_decompressed_string``.

        Args:
            filedata: ``bitarray`` holding the compressed stream. It is
                consumed from the front while decoding; fewer than 9 trailing
                bits are treated as padding and ignored.

        Raises:
            IndexError: If the stream ends in the middle of a match token.
        """
        n: int = self._n
        Ls: int = self._Ls
        search_size: int = n - Ls
        position_width: int = len("{0:b}".format(n))
        length_width: int = len("{0:b}".format(Ls))

        # Start from the same padding the compressor put in front of the data.
        self._decompressed_string = inipad(self._symb, search_size, bytes())
        output = self._decompressed_string

        k: int = 0  # number of payload bytes decoded so far (window start)
        while len(filedata) >= 9:
            is_match = filedata.pop(0)
            if not is_match:
                output.extend(filedata[0:8].tobytes())
                del filedata[0:8]
                k += 1
                continue

            position = self._pop_bits(filedata, position_width)

            # Huffman-coded positions: read bit by bit until the collected
            # bits equal one of the codes. Skipped while the dictionary is
            # empty, which is the case as long as the Huffman stage is off.
            curbitsubstring = ''
            stop = False
            while len(self._huffman_dictionary) > 0 and not stop:
                curbitsubstring += self._pop_bits(filedata, 1)
                for key in self._huffman_dictionary:
                    if self._huffman_dictionary[key] == str(curbitsubstring):
                        position = key
                        stop = True

            length = self._pop_bits(filedata, length_width)

            bestDistance = int(position, 2)
            bestLength = int(length, 2)
            # The position is relative to the window, which starts at offset
            # k of the padded output. Copy byte by byte so that a match may
            # overlap the bytes it is producing.
            for i in range(bestLength):
                output.append(output[k + bestDistance + i])
            k += bestLength

        # Drop the initial padding again.
        self._decompressed_string = self._decompressed_string[n-Ls:]
        return None

    def decompress(self, filename: str) -> None:
        """Read the compressed file ``filename`` and decode it."""
        filedata: bytearray = bin_from_file(filename)

        #root = decodify_huffman_tree(filedata) - HUFFMAN PART
        #dictionary_from_tree(root, '', codeDictionary) - HUFFMAN PART

        self.decompress_cad(filedata)

# **Test Suite**

In [31]:
# Extract the text of both PDFs into one string: all pages of the first
# document, then the second document starting at page index 8.
filename1 = 'Spanish [Spain].pdf'
filename2 = 'Italian [Italy].pdf'
text1 = pdf2txt(filename1)
text2 = pdf2txt(filename2, text1, ini=8, fin=0)

# Close the file before the next cell reads it back; an unclosed handle may
# leave part of the text in the write buffer instead of on disk.
with open('datos.txt', 'w') as file:
    chars_written = file.write(text2)
chars_written

 Declaraci!n Universal de Derechos Humanos 
Adoptada y proclamada por la As
amblea General en su resoluci!n 217 A (III), de 10 de diciembre de 1948  Pre
"mbulo  Considerando que la libertad, la justicia y la paz en el mundo tienen por base el reconocimiento de la dignidad intr
#nseca y de los derechos iguales e inalienables 
de todos los miembros de la familia humana,  
Considerando que el desconocimiento y el menosprecio de los derechos humanos han originado actos de barbarie ultrajantes para la conciencia de la 
humanidad; y que se ha proclamado, como la aspiraci!n m"s elevada del hombre, el advenimiento de un mundo en que los seres humanos, liberados del temor y de la miseria, disfruten de la libertad de palabra y de la libertad de 
creencias,  Considerando esencial que los derechos humanos sean protegidos por un r$gimen de Derecho, a fin de que el hombre no se vea compelido al supremo recurso de la rebeli!n contra la tiran#a y la opresi!n,  Considerando tambi$n esencial promover el



13203

In [32]:
# Compress 'datos.txt' with a 5000-byte window and a 1000-byte look-ahead.
filename: str = 'datos.txt'

Compresor = Compresser(n=5000, Ls=1000)
Compresor.compress(filename=filename)

# Call bitarray's fill() on the collected bits before writing them out.
Compresor._compressed_string.fill()
file_from_bin(filename='compressed_file', buffer=Compresor._compressed_string)

In [33]:
# Decode 'compressed_file' using the same sizes that were used to compress it.
filename: str = 'compressed_file'

Decompresor = Decompresser(n=5000, Ls=1000)
Decompresor.decompress(filename=filename)

file_from_bin(filename='decompressed_file', buffer=Decompresor._decompressed_string)