<a href="https://colab.research.google.com/github/estebanhernandezr/DNA-compression/blob/main/gzip.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **PDF2TXT**

In [80]:
pip install PyPDF2



In [81]:
import PyPDF2

In [82]:
def pdf2txt(filename, text='', ini=0, fin=0):
    """Append the text of a page range of a PDF to ``text`` and return it.

    Args:
        filename: Path of the PDF file to read.
        text: Existing text; the extracted pages are appended to it.
        ini: Index of the first page to extract (0-based). A negative value
            counts from the end, so ``ini=-2`` starts at the second-to-last
            page. A negative value that reaches past the first page is
            clamped to the first page.
        fin: Number of pages to leave out at the end of the document. Only
            the magnitude is used: ``fin=2`` and ``fin=-2`` both skip the
            last two pages.

    Returns:
        ``text`` followed by the extracted text of every selected page.
    """
    # NOTE(review): PdfFileReader / numPages / getPage / extractText are the
    # legacy PyPDF2 names; confirm they exist in the installed release.
    with open(filename, 'rb') as pdffileobj:
        pdfreader = PyPDF2.PdfFileReader(pdffileobj)
        total = pdfreader.numPages

        start = total + ini if ini < 0 else ini
        # Without the clamp a short document would yield negative page indices.
        start = max(start, 0)
        stop = total - abs(fin)

        # Extract while the file is still open; the reader reads pages lazily.
        pages = [pdfreader.getPage(i).extractText() for i in range(start, stop)]

    return text + ''.join(pages)

# **GZIP**

In [83]:
pip install bitarray



In [84]:
from bitarray import *
from typing import BinaryIO, Dict, Sequence, Tuple

In [85]:
# FUNCTIONS

def data_from_file(filename: str) -> BinaryIO:
    """Return the complete contents of ``filename``, read in binary mode."""
    source = open(filename, 'rb')
    try:
        return source.read()
    finally:
        # Always release the handle, even if the read fails.
        source.close()

def file_from_bin(filename: str, buffer: bytearray) -> None:
    """Write ``buffer`` to ``filename`` in binary mode, replacing its contents."""
    sink = open(filename, 'wb')
    try:
        sink.write(buffer)
    finally:
        # Closing flushes the data to disk.
        sink.close()
    return None

def bin_from_file(filename: str) -> BinaryIO:
    """Load ``filename`` into a big-endian ``bitarray`` and return it."""
    bits = bitarray(endian='big')
    with open(filename, 'rb') as source:
        # fromfile appends the file's bytes to the bitarray.
        bits.fromfile(source)
    return bits

def inipad(symb: int, padsize: int, cad: bytes) -> bytearray:
    """Return ``cad`` prefixed with ``padsize`` copies of the byte ``symb``.

    Args:
        symb: Value of the padding byte (0-255).
        padsize: Number of padding bytes to prepend; values <= 0 add none.
        cad: Bytes-like payload placed after the padding.

    Returns:
        A new ``bytearray`` holding the padding followed by ``cad``.

    Raises:
        ValueError: If ``symb`` is not a valid byte value.
    """
    # Build the pad from the raw byte value so its length is exactly
    # ``padsize``; encoding ``chr(symb)`` would produce two bytes for
    # symb >= 128 and always left at least one byte for padsize == 0.
    pad = bytearray([symb]) * max(padsize, 0)
    return pad + cad

In [86]:
# COMPRESSER CLASS

class Compresser:
    """Sliding-window compressor that writes its output as a bit stream.

    The input is scanned with a window of ``n`` bytes whose last ``Ls`` bytes
    are the lookahead buffer. At each step the encoder emits either

    * a ``0`` flag bit followed by one literal byte (8 bits), or
    * a ``1`` flag bit followed by the match position and the match length,
      each written with a fixed number of bits.

    The resulting bits accumulate in ``_compressed_string``.
    """
    _n: int                          # total window size (search + lookahead)
    _Ls: int                         # lookahead buffer size
    _symb: int                       # byte value used to pre-fill the search buffer
    _compressed_string: "bitarray"   # output bit stream

    _huffman_dictionary: Dict        # position-bits -> Huffman code (empty: disabled)
    _compressed_huffman: "bitarray"

    def __init__(self, n: int, Ls: int):
        """Create a compressor with window size ``n`` and lookahead size ``Ls``.

        Raises:
            ValueError: If either size is negative.
        """
        if n < 0 or Ls < 0:
            raise ValueError("Negative buffer sizes")

        self._n = n
        self._Ls = Ls
        self._symb = 32
        self._compressed_string = bitarray(endian='big')

        self._huffman_dictionary = {}
        self._compressed_huffman = bitarray(endian='big')

    def codify_word(self, pos: int, size: int, char=None) -> str:
        """Return ``pos`` and ``size`` as one fixed-width string of '0'/'1'.

        ``pos`` is written with as many bits as ``n`` needs in binary and
        ``size`` with as many bits as ``Ls`` needs. ``char`` is accepted for
        call compatibility and is not used.
        """
        pos_width: int = len(format(self._n, 'b'))
        size_width: int = len(format(self._Ls, 'b'))
        return f"{pos:0{pos_width}b}" + f"{size:0{size_width}b}"

    def rep_extension(self, search: bytearray, lookahead: bytearray) -> Sequence[int]:
        """Find the longest prefix of ``lookahead`` that starts in the search buffer.

        Args:
            search: The full current window (search buffer plus lookahead).
            lookahead: The lookahead part of that window.

        Returns:
            ``(pos, size, char)``: the start of the match inside ``search``,
            its length, and the byte that follows the matched prefix in
            ``lookahead``. Without a match the result is ``(-1, 0, '')``.
        """
        window: int = self._n - self._Ls
        pos, size, char = -1, 0, ''
        for prefixsize in range(1, min(window, len(lookahead))):
            # The end bound keeps the match start inside the search buffer
            # while letting the match run on into the lookahead.
            found: int = search.rfind(lookahead[:prefixsize], 0, window + prefixsize - 1)
            if found < 0:
                # A longer prefix cannot match if this one does not.
                break
            pos, size, char = found, prefixsize, lookahead[prefixsize]
        return pos, size, char

    def codify_cad(self, cad: bytes) -> None:
        """Compress ``cad`` and append the resulting bits to ``_compressed_string``."""
        n: int = self._n
        Ls: int = self._Ls
        window: int = n - Ls
        dictionary: Dict = self._huffman_dictionary
        out = self._compressed_string

        # The token field widths are constant for the whole stream.
        pos_width: int = len(format(n, 'b'))
        size_width: int = len(format(Ls, 'b'))

        # Pre-fill the search buffer so the first bytes have a window to match.
        pcad: bytearray = inipad(self._symb, window, cad)
        i: int = 0
        while i < len(pcad) - window:
            pos, size, _ = self.rep_extension(pcad[i:i+n], pcad[i+window:i+n])
            if pos >= 0 and size > 1:
                out.append(True)
                pos_bits: str = f"{pos:0{pos_width}b}"
                if pos_bits in dictionary:
                    bin_code: str = dictionary[pos_bits] + f"{size:0{size_width}b}"
                else:
                    # Fall back to the fixed-width code for any position
                    # without a Huffman code, so bin_code is always defined
                    # and never reused from a previous token.
                    # NOTE(review): the Huffman path is not enabled by
                    # compress(); confirm the decoder agrees before using it.
                    bin_code = self.codify_word(pos, size)

                out.extend(bit == '1' for bit in bin_code)
                i += size
            else:
                # Literal token: flag 0 plus the raw byte at the lookahead start.
                out.append(False)
                out.frombytes(bytes([pcad[i + window]]))
                i += 1
        return None

    def compress(self, filename: str) -> None:
        """Read ``filename`` and compress its bytes into ``_compressed_string``."""
        filedata: bytes = data_from_file(filename)

        # Huffman stage (currently disabled):
        #   distances = get_distances(filedata)
        #   counts = bl_count_from_distances(distances)
        #   huffman_tree = create_huffman_tree(counts)
        #   dictionary = dictionary_from_tree(huffman_tree, '', codeDictionary)
        #   self._huffman_dictionary = dictionary
        #   self.codify_huffmantree(huffman_tree, treeBuffer)
        self.codify_cad(filedata)

        return None

In [87]:
# DECOMPRESSER CLASS

class Decompresser:
    """Decoder for the bit stream produced with the same ``n`` and ``Ls``.

    The stream is a sequence of tokens. A token starting with a ``0`` bit
    carries one literal byte (8 bits). A token starting with a ``1`` bit
    carries a match position and a match length, each stored with as many
    bits as ``n`` and ``Ls`` need in binary.
    """
    _n: int                            # total window size (search + lookahead)
    _Ls: int                           # lookahead buffer size
    _symb: int                         # byte value used to pre-fill the search buffer
    # Empty bitarray after __init__; decompress_cad replaces it with the
    # decoded data as a bytearray.
    _decompressed_string: "bitarray"

    _huffman_dictionary: Dict          # position-bits -> Huffman code (empty: disabled)
    _decompressed_huffman: "bitarray"

    def __init__(self, n: int, Ls: int):
        """Create a decoder with window size ``n`` and lookahead size ``Ls``.

        Raises:
            ValueError: If either size is negative.
        """
        if n < 0 or Ls < 0:
            raise ValueError("Negative buffer sizes")

        self._n = n
        self._Ls = Ls
        self._symb = 32
        self._decompressed_string = bitarray(endian='big')

        self._huffman_dictionary = {}
        self._decompressed_huffman = bitarray(endian='big')

    def decompress_cad(self, filedata: "bitarray") -> None:
        """Decode ``filedata`` into ``_decompressed_string``.

        Args:
            filedata: The compressed bit stream. The bits that were decoded
                are removed from its front before the method returns.

        Raises:
            IndexError: If the stream ends in the middle of a match token.
        """
        n: int = self._n
        Ls: int = self._Ls
        window: int = n - Ls
        pos_width: int = len(format(n, 'b'))
        size_width: int = len(format(Ls, 'b'))
        # Inverse table (code -> position bits) so each candidate code is a
        # single lookup instead of a scan over the whole dictionary.
        codes = {code: key for key, code in self._huffman_dictionary.items()}

        # Start from the same padding the encoder used as its initial search
        # buffer; it is stripped again at the end.
        out = inipad(self._symb, window, bytes())
        self._decompressed_string = out

        total: int = len(filedata)
        cursor: int = 0  # index of the next unread bit

        def take(width: int) -> str:
            """Return the next ``width`` bits as a '0'/'1' string and advance."""
            nonlocal cursor
            if cursor + width > total:
                raise IndexError("compressed stream ended inside a token")
            bits = filedata[cursor:cursor + width].to01()
            cursor += width
            return bits

        # Reading through a cursor avoids removing bits from the front of
        # the bitarray one at a time, which shifts the whole buffer each time.
        k: int = 0  # number of bytes decoded so far (current window start)
        try:
            # A token is at least 9 bits long; fewer remaining bits are the
            # zero padding added to reach a whole number of bytes.
            while total - cursor >= 9:
                if take(1) == '0':
                    out += filedata[cursor:cursor + 8].tobytes()
                    cursor += 8
                    k += 1
                    continue

                position = take(pos_width)

                # Only active when a Huffman dictionary is loaded: read bits
                # until they form a known code, then use that code's key as
                # the position.
                if codes:
                    current = ''
                    while True:
                        current += take(1)
                        if current in codes:
                            position = codes[current]
                            break

                length = take(size_width)

                bestDistance = int(position, 2)
                bestLength = int(length, 2)
                # Copy byte by byte so a match may overlap the bytes it is
                # producing.
                for offset in range(bestLength):
                    out.append(out[k + bestDistance + offset])
                k += bestLength
        finally:
            # Keep the caller-visible effect of consuming the decoded bits.
            del filedata[:cursor]

        self._decompressed_string = out[window:]
        return None

    def decompress(self, filename: str):
        """Read the compressed file ``filename`` and decode it."""
        filedata = bin_from_file(filename)

        # Huffman stage (currently disabled):
        #   root = decodify_huffman_tree(filedata)
        #   dictionary_from_tree(root, '', codeDictionary)

        self.decompress_cad(filedata)

# **TEST SUITE**

In [88]:
# Build the sample text: all of the Spanish PDF followed by the last two
# pages of the Italian PDF.
filename1 = 'PDFs/Spanish [Spain].pdf'
filename2 = 'PDFs/Italian [Italy].pdf'
text1 = pdf2txt(filename1)
text2 = pdf2txt(filename2, text1, ini=-2, fin=0)

# Close the file right away: the next cell reads datos.txt back from disk,
# and text still held in the write buffer would be missing from it.
with open('datos'+'.txt', 'w') as file:
    file.write(text2)



14788

In [89]:
# Compress datos.txt using a 5000-byte window whose last 1000 bytes are the
# lookahead buffer.
filename: str = 'datos.txt'

Compresor = Compresser(5000, 1000)
Compresor.compress(filename)

# Pad the bit stream to a whole number of bytes before writing it to disk.
Compresor._compressed_string.fill()
file_from_bin('compressed_file', Compresor._compressed_string)

In [90]:
# Decode the compressed file; the sizes must match the ones used to compress.
filename: str = 'compressed_file'

Decompresor = Decompresser(5000, 1000)
Decompresor.decompress(filename)

# Write the decoded bytes so they can be compared with datos.txt.
file_from_bin('decompressed_file', Decompresor._decompressed_string)

# **MATRIX CONSTRUCTION**

In [91]:
import os

In [92]:
def calculate_delta(txt1, txt2, dicc):
    """Return the difference ``dicc[txt1] - dicc[txt2]``.

    ``dicc`` maps each name to a number (callers store compressed lengths).
    """
    first, second = dicc[txt1], dicc[txt2]
    return first - second

def calculate2(Aa, Ab, Ba, Bb, A, B, dicc):
    """Combine four compressed-length deltas into one symmetric score.

    Each delta is the value stored for an extended text minus the value
    stored for its base text (``A`` or ``B``). The result is
    ``(dAb - dBb) / dBb + (dBa - dAa) / dAa``.
    """
    delta_ab, delta_aa, delta_ba, delta_bb = (
        calculate_delta(extended, base, dicc)
        for extended, base in ((Ab, A), (Aa, A), (Ba, B), (Bb, B))
    )

    excess_on_b = (delta_ab - delta_bb) / delta_bb
    excess_on_a = (delta_ba - delta_aa) / delta_aa
    return excess_on_b + excess_on_a


In [93]:
def relat_entropy(pdf_1, pdf_2, n, Ls):
    """Score how differently two PDFs compress when their texts are mixed.

    For each document the body (everything except the last two pages) and
    the tail (the last two pages) are extracted. Six texts are written under
    ``TXTs/`` and compressed with ``Compresser(n, Ls)``:

    * ``A`` / ``B``: the bodies of ``pdf_1`` / ``pdf_2``
    * ``Aa`` / ``Bb``: each body followed by its own tail
    * ``Ab`` / ``Ba``: each body followed by the other document's tail

    The compressed lengths are combined by ``calculate2``.

    Args:
        pdf_1: Path of the first PDF.
        pdf_2: Path of the second PDF.
        n: Window size passed to ``Compresser``.
        Ls: Lookahead size passed to ``Compresser``.

    Returns:
        The value computed by ``calculate2``; it is also printed.
    """
    body_1 = pdf2txt(pdf_1, fin=-2)
    body_2 = pdf2txt(pdf_2, fin=-2)
    # Extract each tail once and reuse it for both combinations.
    tail_1 = pdf2txt(pdf_1, ini=-2, fin=0)
    tail_2 = pdf2txt(pdf_2, ini=-2, fin=0)

    texts = {
        "TXTs/A.txt": body_1,
        "TXTs/Aa.txt": body_1 + tail_1,
        "TXTs/B.txt": body_2,
        "TXTs/Bb.txt": body_2 + tail_2,
        "TXTs/Ab.txt": body_1 + tail_2,
        "TXTs/Ba.txt": body_2 + tail_1,
    }

    os.makedirs("TXTs", exist_ok=True)

    dictionario = {}
    for txt_path, content in texts.items():
        # The file must be closed before compress() reads it back, otherwise
        # text still held in the write buffer is missing from the input.
        with open(txt_path, 'w') as txt_file:
            txt_file.write(content)

        compresor = Compresser(n, Ls)
        compresor.compress(txt_path)
        dictionario[txt_path] = len(compresor._compressed_string)

    res = calculate2( 'TXTs/Aa.txt', 'TXTs/Ab.txt', 'TXTs/Ba.txt', 'TXTs/Bb.txt', 'TXTs/A.txt', 'TXTs/B.txt', dictionario)
    print(pdf_1, "(",(res),")", pdf_2)
    return res

In [97]:
# Window and lookahead sizes passed to relat_entropy.
search = 2000
ahead = 1000

path = '/content/PDFs/'
# Scan the PDF folder itself; taking dirname() of it walked the parent
# folder while the names were still prefixed with `path`.
dir_path = os.path.realpath(path)

# Names of the PDFs relative to `path`, so path+name is always a valid file.
PDFs = []
for root, dirs, files in os.walk(dir_path):
    for file in files:
        if file.endswith('.pdf'):
            location = os.path.relpath(os.path.join(root, file), dir_path)
            PDFs.append(location.replace("\\", "/"))

# Compare every ordered pair of PDFs, including each file with itself.
for file_1 in PDFs:
    for file_2 in PDFs:
        relat_entropy(path+file_1, path+file_2, search, ahead)

/content/PDFs/English [UK].pdf ( 0.0 ) /content/PDFs/English [UK].pdf




/content/PDFs/English [UK].pdf ( 0.09072326014585716 ) /content/PDFs/Italian [Italy].pdf
/content/PDFs/English [UK].pdf ( 2.40969134249388 ) /content/PDFs/German [Germany].pdf
/content/PDFs/English [UK].pdf ( 0.1449873550099888 ) /content/PDFs/Estonian [Estonia].pdf
/content/PDFs/English [UK].pdf ( 0.08238484350757722 ) /content/PDFs/Spanish [Spain].pdf
/content/PDFs/Italian [Italy].pdf ( 0.09072326014585716 ) /content/PDFs/English [UK].pdf
/content/PDFs/Italian [Italy].pdf ( 0.0 ) /content/PDFs/Italian [Italy].pdf
/content/PDFs/Italian [Italy].pdf ( 2.5638468701142383 ) /content/PDFs/German [Germany].pdf
/content/PDFs/Italian [Italy].pdf ( 0.1315736286261255 ) /content/PDFs/Estonian [Estonia].pdf
/content/PDFs/Italian [Italy].pdf ( 0.0803460847980605 ) /content/PDFs/Spanish [Spain].pdf
/content/PDFs/German [Germany].pdf ( 2.40969134249388 ) /content/PDFs/English [UK].pdf
/content/PDFs/German [Germany].pdf ( 2.5638468701142383 ) /content/PDFs/Italian [Italy].pdf
/content/PDFs/German [G

In [95]:
path = '/content/PDFs/'
# Set this to the folder whose files of the wanted type should be collected.
# The folder itself is scanned; taking dirname() of it walked the parent
# folder while the names were still prefixed with `path`.
dir_path = os.path.realpath(path)

PDFs = []
for root, dirs, files in os.walk(dir_path):
    for file in files:
        if file.endswith('.pdf'):  # the file type we want to collect
            # Store the name relative to `path` so path+name is a valid file.
            location = os.path.relpath(os.path.join(root, file), dir_path)
            location = location.replace("\\", "/")
            PDFs.append(location)

os.makedirs("TXTs", exist_ok=True)

# CREATE THE PAIR TXT FILES: body of PDF i followed by the last two pages of PDF j
# NOTE(review): str(i)+str(j) is ambiguous once there are ten or more PDFs.
TXTs = []
for (i, pdf1) in enumerate(PDFs):
    filename1 = path+pdf1
    # The body does not depend on j, so extract it once per outer iteration.
    text1 = pdf2txt(filename1, fin=-2)
    for (j, pdf2) in enumerate(PDFs):
        filename2 = path+pdf2
        text2 = pdf2txt(filename2, text1, ini=-2, fin=0)

        txt_name = "TXTs/"+str(i)+str(j)+'.txt'
        TXTs.append(txt_name)
        # Close each file before it is compressed below so no buffered text
        # is missing from the compressor's input.
        with open(txt_name, 'w') as file:
            file.write(text2)

# CREATE THE SINGLE TXT FILES: body of PDF i on its own
for (i, pdf1) in enumerate(PDFs):
    filename1 = path+pdf1
    text1 = pdf2txt(filename1, fin=-2)

    txt_name = "TXTs/"+str(i)+'.txt'
    TXTs.append(txt_name)
    with open(txt_name, 'w') as file:
        file.write(text1)

print(TXTs)
# COMPRESS THE TXT FILES and record each compressed length in bits
dictionario = {}
for txt in TXTs:
    Compresor = Compresser(2000, 1000)
    Compresor.compress(txt)

    print(txt)
    print(len(Compresor._compressed_string))
    dictionario[txt]=len(Compresor._compressed_string)

print(dictionario)

# NOTE(review): the indices and labels below are hard-coded and depend on the
# order in which the PDFs were found; confirm they match the files on disk.
res1 = calculate2( 'TXTs/00.txt', 'TXTs/01.txt', 'TXTs/10.txt', 'TXTs/11.txt', 'TXTs/0.txt', 'TXTs/1.txt', dictionario)
res2 = calculate2( 'TXTs/00.txt', 'TXTs/02.txt', 'TXTs/20.txt', 'TXTs/22.txt', 'TXTs/0.txt', 'TXTs/2.txt', dictionario)
res3 = calculate2( 'TXTs/22.txt', 'TXTs/21.txt', 'TXTs/12.txt', 'TXTs/11.txt', 'TXTs/2.txt', 'TXTs/1.txt', dictionario)

print("ITALY-TURKISH")
print(res1)
print("ITALY-SPANISH")
print(res2)
print("TURKISH-SPAIN")
print(res3)



['TXTs/00.txt', 'TXTs/01.txt', 'TXTs/02.txt', 'TXTs/03.txt', 'TXTs/10.txt', 'TXTs/11.txt', 'TXTs/12.txt', 'TXTs/13.txt', 'TXTs/20.txt', 'TXTs/21.txt', 'TXTs/22.txt', 'TXTs/23.txt', 'TXTs/30.txt', 'TXTs/31.txt', 'TXTs/32.txt', 'TXTs/33.txt', 'TXTs/0.txt', 'TXTs/1.txt', 'TXTs/2.txt', 'TXTs/3.txt']
TXTs/00.txt
62784
TXTs/01.txt
62729
TXTs/02.txt
75594
TXTs/03.txt
61891
TXTs/10.txt
77150
TXTs/11.txt
75717
TXTs/12.txt
89221
TXTs/13.txt
75787
TXTs/20.txt
62737
TXTs/21.txt
61952
TXTs/22.txt
73436
TXTs/23.txt
61227
TXTs/30.txt
72862
TXTs/31.txt
71822
TXTs/32.txt
84850
TXTs/33.txt
70727
TXTs/0.txt
47189
TXTs/1.txt
60889
TXTs/2.txt
46044
TXTs/3.txt
56478
{'TXTs/00.txt': 62784, 'TXTs/01.txt': 62729, 'TXTs/02.txt': 75594, 'TXTs/03.txt': 61891, 'TXTs/10.txt': 77150, 'TXTs/11.txt': 75717, 'TXTs/12.txt': 89221, 'TXTs/13.txt': 75787, 'TXTs/20.txt': 62737, 'TXTs/21.txt': 61952, 'TXTs/22.txt': 73436, 'TXTs/23.txt': 61227, 'TXTs/30.txt': 72862, 'TXTs/31.txt': 71822, 'TXTs/32.txt': 84850, 'TXTs/33.txt': 7

# DEBUGGING