In [1]:
import heapq
from collections import Counter

import PyPDF2
import docx
from bs4 import BeautifulSoup
import graphviz 

In [2]:
class Node:
    """A Huffman tree node: a symbol, its frequency, and left/right child links."""

    def __init__(self, char, freq):
        """Store the symbol and its frequency; both children start as None.

        Args:
            char: Value kept as the node's symbol.
            freq: Value kept as the node's frequency; this is what ``<`` compares.
        """
        self.char, self.freq = char, freq
        self.left = self.right = None

    def __lt__(self, other):
        """Compare two nodes by frequency only, which is what lets heapq order them."""
        return self.freq < other.freq


In [3]:
# Read Functions 
def read_pdf(file_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        The results of ``extract_text()`` for each page, joined with no separator.
    """
    with open(file_path, 'rb') as pdf_file:
        # Extraction stays inside the ``with`` block, as in the original, so the
        # file handle is open for the whole time the pages are being read.
        pages = PyPDF2.PdfReader(pdf_file).pages
        return "".join(page.extract_text() for page in pages)


def read_docx(file_path):
    """Return the text of a DOCX document with one paragraph per line.

    Args:
        file_path: Path of the .docx file, passed straight to ``docx.Document``.

    Returns:
        The text of each paragraph in ``doc.paragraphs``, joined with newlines.
        A document without paragraphs yields an empty string.
    """
    doc = docx.Document(file_path)
    # A paragraph's ``text`` does not end with a paragraph break, so plain
    # concatenation fuses the last word of one paragraph with the first word of
    # the next. Joining with "\n" keeps the paragraph boundaries in the output.
    return "\n".join(paragraph.text for paragraph in doc.paragraphs)

def read_html(file_path):
    """Return the text content of an HTML file, as produced by ``get_text()``.

    Args:
        file_path: Path of the HTML file; it is opened as UTF-8 text.

    Returns:
        The string returned by BeautifulSoup's ``get_text()`` for the parsed file.
    """
    with open(file_path, 'r', encoding='utf-8') as html_file:
        # Parse with the built-in 'html.parser' backend and extract the text
        # while the file is still open.
        return BeautifulSoup(html_file, 'html.parser').get_text()

def read_txt(file_path):
    """Read a UTF-8 text file and return its full contents.

    Args:
        file_path: Path of the text file.

    Returns:
        The file contents as a string, unmodified.

    Raises:
        ValueError: If the file is empty or contains only whitespace.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        txt_text = file.read()

    if not txt_text.strip():
        # Raise instead of calling exit(): exit() is an interactive helper, not
        # a reliable way to abort from inside a function (in a notebook it asks
        # the kernel to shut down rather than stopping this call). ValueError
        # matches how read_file reports an unsupported file type.
        raise ValueError("File is empty")

    return txt_text


def read_file(file_path):
    """Read a file with the reader that matches its extension.

    The extension is matched case-insensitively, so ``REPORT.PDF`` and
    ``report.pdf`` are handled the same way.

    Args:
        file_path: Path of the file; must end in .pdf, .docx, .html or .txt.

    Returns:
        Whatever the selected reader (read_pdf, read_docx, read_html or
        read_txt) returns for ``file_path``.

    Raises:
        ValueError: If the extension is not one of the supported types.
    """
    # Only the comparison uses the lower-cased copy; the readers still receive
    # the path exactly as the caller supplied it.
    lowered = file_path.lower()
    if lowered.endswith('.pdf'):
        return read_pdf(file_path)
    if lowered.endswith('.docx'):
        return read_docx(file_path)
    if lowered.endswith('.html'):
        return read_html(file_path)
    if lowered.endswith('.txt'):
        return read_txt(file_path)
    raise ValueError("Unsupported file type. Please provide a file in PDF, DOCX, HTML, or TXT format.")


In [None]:
def build_huffman_tree(frequency):
    """Build a Huffman tree from a symbol-to-frequency mapping.

    The two lowest-frequency nodes are repeatedly popped from a min-heap and
    merged under a new internal node (``char`` None, frequency the sum of the
    two) until a single root remains.

    Args:
        frequency: Mapping of symbol to count, e.g. a ``collections.Counter``.

    Returns:
        The root Node of the tree. A mapping with one symbol yields that single
        leaf. An empty mapping yields None, which generate_codes and
        visualize_huffman_tree both treat as an empty tree.
    """
    heap = [Node(char, freq) for char, freq in frequency.items()]
    if not heap:
        # Without this guard an empty mapping reaches ``heap[0]`` below and
        # fails with a bare IndexError.
        return None
    heapq.heapify(heap)

    while len(heap) > 1:
        # First pop becomes the left child, second pop the right child.
        left = heapq.heappop(heap)
        right = heapq.heappop(heap)
        merged = Node(None, left.freq + right.freq)
        merged.left, merged.right = left, right
        heapq.heappush(heap, merged)

    return heap[0]

def generate_codes(root, prefix='', codebook=None):
    """Walk a Huffman tree and map every symbol to its bit-string code.

    A left edge appends '0' to the code and a right edge appends '1'.

    Args:
        root: Tree node with ``char``, ``left`` and ``right`` attributes, or
            None for an empty (sub)tree.
        prefix: Code accumulated on the path from the tree root to ``root``.
        codebook: Dict to fill in. When omitted, a new dict is created for
            this call.

    Returns:
        The codebook dict mapping each symbol to its code. An empty tree
        yields an empty dict.
    """
    if codebook is None:
        # A fresh dict per top-level call: a ``{}`` default would be shared by
        # every call and leak codes from earlier trees into later codebooks.
        codebook = {}
    if root is None:
        return codebook
    if root.char is not None:
        # A tree made of a single leaf has an empty path; give that symbol the
        # one-bit code '0' so the encoded text is not empty.
        codebook[root.char] = prefix or '0'
    generate_codes(root.left, prefix + '0', codebook)
    generate_codes(root.right, prefix + '1', codebook)
    return codebook

def compress(text):
    """Huffman-encode ``text`` and return the encoding with its tree and codebook.

    Args:
        text: The string to encode.

    Returns:
        A tuple ``(compressed_text, huffman_tree, codebook)`` where
        ``compressed_text`` is the code of every character of ``text`` in
        order, joined with '.' between consecutive codes, ``huffman_tree`` is
        the root returned by build_huffman_tree, and ``codebook`` maps each
        character to its code.

    Raises:
        ValueError: If ``text`` is empty.
    """
    frequency = Counter(text)
    if not frequency:
        # The previous guard compared the Counter with the integer 0, which is
        # never true, so empty input slipped through to the tree builder.
        # Raising (rather than print + exit) lets the caller handle the error.
        raise ValueError("Empty File")
    huffman_tree = build_huffman_tree(frequency)
    # Pass a new dict explicitly so the codebook never contains entries left
    # over from an earlier call.
    codebook = generate_codes(huffman_tree, codebook={})

    compressed_text = '.'.join(codebook[char] for char in text)
    return compressed_text, huffman_tree, codebook

def calculate_compression_ratio(original_text, compressed_text):
    """Return original size over compressed size, in bits, rounded to 2 places.

    Args:
        original_text: The uncompressed string; each character counts as 8 bits.
        compressed_text: String of '0'/'1' code bits. Any other characters
            (such as '.' separators between codes) are ignored.

    Returns:
        ``original bits / compressed bits`` rounded to two decimal places.

    Raises:
        ZeroDivisionError: If ``compressed_text`` contains no '0' or '1'.
    """
    original_size = len(original_text) * 8
    # Count only the code bits: separator characters are a display aid and
    # would otherwise inflate the compressed size and understate the ratio.
    compressed_size = compressed_text.count('0') + compressed_text.count('1')
    if compressed_size == 0:
        raise ZeroDivisionError(
            "compressed text contains no bits; compression ratio is undefined"
        )
    return round(original_size / compressed_size, 2)

In [5]:
def visualize_huffman_tree(root):
    """Build a graphviz Digraph of the tree rooted at ``root``.

    Each tree node becomes a graph node keyed by ``str(id(node))``. A node with
    a truthy ``char`` is labelled ``char:freq``; any other node is labelled
    ``Freq:freq``. Every non-root node gets an edge from its parent.

    Args:
        root: Root tree node (with ``char``, ``freq``, ``left``, ``right``),
            or None, in which case the returned graph is empty.

    Returns:
        The ``graphviz.Digraph`` (format 'png', engine 'dot').
    """
    graph = graphviz.Digraph(format='png', engine='dot')

    # Explicit stack of (node, parent) pairs, visited in pre-order.
    pending = [(root, None)]
    while pending:
        current, parent = pending.pop()
        if current is None:
            continue

        if current.char:
            caption = f'{current.char}:{current.freq}'
        else:
            caption = f'Freq:{current.freq}'
        graph.node(str(id(current)), label=caption)

        if parent:
            graph.edge(str(id(parent)), str(id(current)))

        # Push the right child first so the left subtree is popped, and
        # therefore emitted, before the right one.
        pending.append((current.right, current))
        pending.append((current.left, current))

    return graph

In [None]:
# Input document to compress; read_file picks the reader from the extension.
file_path = 'example6.txt'
text = read_file(file_path)

compressed_text, huffman_tree, codebook = compress(text)

compression_ratio = calculate_compression_ratio(text, compressed_text)
print(f'File: {file_path}')
# Newlines are flattened to spaces so the original text prints on one line.
print("Original Text:", text.replace('\n', ' '))
print("Compressed Text:", compressed_text)
print("Compression Ratio:", compression_ratio)

# Visualize the Huffman tree. Leaving `dot` as the last expression of the cell
# lets Jupyter render the graph inline; `dot.view()` would instead write an
# image file and open it in an external viewer.
dot = visualize_huffman_tree(huffman_tree)
dot

File: example6.txt
Original Text: A wonderful serenity has taken possession of my entire soul, like these sweet mornings of spring which I enjoy with my whole heart. I am alone, and feel the charm of existence in this spot.
Compressed Text: 0011110.111.00110.1001.1011.000010.011.11010.01001.000111.10101.111.1100.011.11010.011.1011.1000.0101.00000.111.0010.11011.1100.111.0101.11011.001110.011.1011.111.101001.1001.1100.1100.011.1100.1100.1000.1001.1011.111.1001.01001.111.01000.00000.111.011.1011.0101.1000.11010.011.111.1100.1001.000111.10101.000110.111.10101.1000.001110.011.111.0101.0010.011.1100.011.111.1100.00110.011.011.0101.111.01000.1001.11010.1011.1000.1011.000101.1100.111.1001.01001.111.1100.101001.11010.1000.1011.000101.111.00110.0010.1000.101000.0010.111.000100.111.011.1011.0000111.1001.00000.111.00110.1000.0101.0010.111.01000.00000.111.00110.0010.1001.10101.011.111.0010.011.11011.11010.0101.0011111.111.000100.111.11011.01000.111.11011.10101.1001.1011.011.000110.111.11011.1011.0

'Digraph.gv.png'