# Modified LZW

- Uses variable-length codewords that grow from 9 to 16 bits as the dictionary fills up

- Can compress a 4.4MB file down to 1.6MB

In [1]:
# This is the name of the file to be compressed; the cells below read it as a global.
# Yes, you can create your own test cases and you should.
# Output names are built from the text before the first "." in this name,
# so "bible.txt" produces "bible.lzw2" (compressed) and "bible.2M" (expanded).

filename = "bible.txt"

# Compressor
---

In [2]:
# keep the function name
def LZW_modified_compress(fname):
    """Compress a text file with LZW using codewords that grow from 9 to 16 bits.

    The dictionary starts with the 256 single characters chr(0)..chr(255) and
    gains one entry per emitted codeword until it holds 2**16 - 1 entries.
    Codewords are written most-significant bit first, packed back to back with
    no separators; if the last byte is only partly filled, it is padded on the
    right with zero bits so the codeword bits keep their order.

    Args:
        fname: Path of the file to compress. It is read in text mode; only
            characters with code points 0-255 are present in the initial
            dictionary, so other characters are not supported.

    Side effects:
        Writes the packed codewords to ``fname.split('.')[0] + ".lzw2"``
        (the text before the first "." in ``fname`` plus ".lzw2").
    """
    # Read file into content
    with open(fname, 'r') as file:
        content = file.read()

    # initialize dictionary
    # gradual growth of codeword size will be dependent on dictSize.
    dictSize = 256
    dictionary = {chr(i): i for i in range(dictSize)}

    codeword = ""
    compressedData = []

    currentWordSize = 9
    maxDictSize = pow(2, 16)
    maxWordSize = pow(2, currentWordSize)

    for character in content:
        newCodeword = codeword + character
        if newCodeword in dictionary:
            # Keep extending the current match.
            codeword = newCodeword
            continue

        # Longest known match ends here: emit it at the current width.
        compressedData.append(format(dictionary[codeword], 'b').zfill(currentWordSize))

        # codewords must not exceed 16 bits for this part, so once we are at 2^16 - 1, stop adding to the dictionary.
        if dictSize < maxDictSize - 1:
            dictionary[newCodeword] = dictSize
            dictSize += 1
            # The code just assigned equals maxWordSize and no longer fits in
            # currentWordSize bits, so every later codeword uses one more bit.
            if dictSize > maxWordSize:
                currentWordSize += 1
                maxWordSize *= 2
        codeword = character

    # Flush the pending match. For an empty input codeword is still "", which
    # is not a dictionary key, so nothing is emitted.
    if codeword in dictionary:
        compressedData.append(format(dictionary[codeword], 'b').zfill(currentWordSize))

    # Pack the codewords into whole bytes. The tail is padded on the RIGHT with
    # zeros: the decoder consumes bits most-significant first, so the leftover
    # codeword bits must stay in front and in their original order. Fewer than
    # 8 padding bits can never form a complete (>= 9 bit) codeword.
    bitStream = "".join(compressedData)
    bitStream += "0" * (-len(bitStream) % 8)
    packed = bytes(int(bitStream[i:i + 8], 2) for i in range(0, len(bitStream), 8))

    # Output the compressed file to "<name before the first dot>.lzw2"
    with open(fname.split('.')[0] + ".lzw2", 'wb') as file:
        file.write(packed)

# keep this line
# Compresses the file named by `filename`; for "bible.txt" the output is "bible.lzw2".
LZW_modified_compress(filename)

# Decompressor
---

In [3]:
# Helper: turn raw bytes into a string of '0'/'1' characters
def bytes_to_binary_string(byte_data):
    """Return the bits of byte_data as text.

    Each byte becomes exactly eight characters, most significant bit first,
    and the per-byte groups are concatenated in input order.
    """
    eightBitGroups = map("{:08b}".format, byte_data)
    return "".join(eightBitGroups)

# keep the function name
def LZW_modified_expand(fname):
    """Expand a ".lzw2" file of 9-to-16-bit LZW codewords back into text.

    Only the text before the first "." in ``fname`` is used as the base name:
    the function reads ``<base>.lzw2`` and writes the decoded text to
    ``<base>.2M`` (text mode).

    The code table is rebuilt while decoding: it starts with the 256
    single-character entries and gains one entry per codeword after the first,
    until it holds 2**16 - 1 entries. Trailing bits that do not fill a whole
    codeword are ignored.
    """
    baseName = fname.split('.')[0]

    # code -> string table, seeded with every single-character string
    nextCode = 256
    table = {value: chr(value) for value in range(nextCode)}

    codeBits = 9
    widenAt = 1 << codeBits   # table size at which codewords gain a bit
    tableCap = 1 << 16

    with open(baseName + ".lzw2", 'rb') as packedFile:
        bitStream = bytes_to_binary_string(packedFile.read())

    pieces = []      # decoded strings, joined once at the end
    previous = ""    # string decoded from the preceding codeword
    pending = ""     # bits collected for the codeword being assembled

    for bit in bitStream:
        pending += bit
        if len(pending) != codeBits:
            continue

        code = int(pending, 2)
        pending = ""

        # A code that is not in the table yet is taken to be the entry about
        # to be created: the previous string plus its own first character.
        if code not in table:
            table[code] = previous + previous[0]

        pieces.append(table[code])

        # From the second codeword on, record previous + first char of the
        # current string, unless the table has reached its cap.
        if previous and nextCode < tableCap - 1:
            table[nextCode] = previous + table[code][0]
            nextCode += 1
            if nextCode >= widenAt:
                codeBits += 1
                widenAt *= 2

        # Looked up again on purpose: the insertion above may have written
        # this very slot.
        previous = table[code]

    # Write the decompressed data to a new file
    with open(baseName + ".2M", 'w') as outFile:
        outFile.write("".join(pieces))
    
# keep this line
# The expander only uses the text before the first "." of its argument, so for
# "bible.txt" this reads "bible.lzw2" and writes "bible.2M".
LZW_modified_expand(filename+".lzw2")

# Check equality of files
---

In [4]:
def compare_files(file_path1, file_path2):
    """Return True when the two files contain exactly the same bytes.

    Both files are opened in binary mode and read completely into memory
    before being compared.
    """
    with open(file_path1, 'rb') as firstHandle, open(file_path2, 'rb') as secondHandle:
        firstBytes, secondBytes = firstHandle.read(), secondHandle.read()

    if firstBytes == secondBytes:
        return True
    return False

# Original input and the round-tripped output written by the expander.
file1_path = filename
file2_path = filename.split(".")[0] + ".2M"

# Report whether compress + expand reproduced the input byte for byte.
print(
    "The content of the files is identical."
    if compare_files(file1_path, file2_path)
    else "The content of the files is different."
)

The content of the files is identical.
