In [None]:
# DO NOT delete this cell. 
# 
# This is the name of the file to be compressed.  
# Yes, you can create your own test cases and you should.

filename = "bible.txt"

In [None]:
# LZW/ Your code 


In [132]:
# you will compress the file named filename, and save the compressed as filename+".lzw"
# keep the function name
def LZW_compress(fname):

    # Read file into content
    with open(fname, 'r') as file:
        content = file.read()

    # initialize dictionary
    dictSize = 256
    dictionary = {chr(i): i for i in range(dictSize)}

    # used to stop growing the dictionary past 12 bits
    bitsPerWord = 12
    maxDictSize = pow(2, bitsPerWord)

    compressedData = []
    codeword = ""

    for character in content:
        newCodeword = codeword + character
        if newCodeword in dictionary:
            codeword = newCodeword
        else:
            compressedData.append(dictionary[codeword])

            # codewords must not exceed 12 bits for this part, so once we are at 2^12 - 1, stop adding to the dictionary.
            if dictSize < maxDictSize:
                dictionary[newCodeword] = dictSize
                dictSize += 1
            codeword = character

    if codeword in dictionary:
        compressedData.append(dictionary[codeword])

    # Output the compressed file to "filename.lzw" 
    with open(fname.split('.')[0] + ".lzw", 'wb') as file:
        for c in compressedData:

            # the binary integer needs to be 12 bits, packed into 16 bits, broken into two bytes
            # So take the original integer and pad with leading 0's until its 16 bits long
            # so A = 65 = 1000001 -> 1000001.zfill(16) -> 0000000001000001
            binaryInt = int(bin(c)[2:].zfill(16), 2)
            # Split the 16 bit binary int into two bytes
            byte1 = (binaryInt >> 8) & 0xFF
            byte2 = binaryInt & 0xFF
            
            # Pack those bytes together and write it
            bytesToWrite = bytes([byte1, byte2])
            file.write(bytesToWrite)

# keep this line
LZW_compress(filename)

In [133]:
# Custom function to convert bytes to a binary string
def bytes_to_binary_string(byte_data):
    return ''.join(format(byte, '08b') for byte in byte_data)

def LZW_expand(fname):

    # Building and initializing the dictionary.
    dictSize = 256
    dictionary = dict([(i, chr(i)) for i in range(dictSize)])

    compressedData = []
    decompressedData = ""
    newCodeword = ""

    # Open the file to read bytes
    with open(fname.split('.')[0] + ".lzw", "rb") as file:

        while True:
            # read in 2 bytes at a time, since one word is packaged using two bytes
            compressed = file.read(2)

            # check EOF
            if len(compressed) == 0:
                break
        
            # Convert the byte data to binary string
            data = bytes_to_binary_string(compressed)

            compressedData.append(int(data, 2))

    # iterating through the codes
    for code in compressedData:
        if not (code in dictionary):
            dictionary[code] = newCodeword + (newCodeword[0])
            
        decompressedData += dictionary[code]
        if not(len(newCodeword) == 0):
            dictionary[dictSize] = newCodeword + (dictionary[code][0])
            dictSize += 1
        newCodeword = dictionary[code]

    # storing the decompressed newCodeword into a file.
    with open(filename.split(".")[0] + ".2", "w") as file:
        for data in decompressedData:
            file.write(data)
  
# keep this line 
LZW_expand(filename +".lzw")

In [134]:
# Check the original and uncompressed files against each other

def compare_files(file_path1, file_path2):
    with open(file_path1, 'rb') as file1, open(file_path2, 'rb') as file2:
        content1 = file1.read()
        content2 = file2.read()

    return content1 == content2

file1_path = filename
file2_path = filename.split(".")[0] + ".2"

if compare_files(file1_path, file2_path):
    print("The content of the files is identical.")
else:
    print("The content of the files is different.")

The content of the files is identical.


In [158]:
# keep the function name
def LZW_modified_compress(fname):

    # Read file into content
    with open(fname, 'r') as file:
        content = file.read()

    # initialize dictionary
    # gradual growth of codeword size will be dependent on dictSize.
    dictSize = 256
    dictionary = {chr(i): i for i in range(dictSize)}

    codeword = ""
    compressedData = []

    currentWordSize = 9
    maxDictSize = pow(2, 16)
    maxWordSize = pow(2, currentWordSize)

    for character in content:
        newCodeword = codeword + character
        if newCodeword in dictionary:
            codeword = newCodeword
        else:
            compressedData.append(bin(dictionary[codeword])[2:].zfill(currentWordSize))
            
            # codewords must not exceed 16 bits for this part, so once we are at 2^16 - 1, stop adding to the dictionary.
            if dictSize < maxDictSize - 1:
                dictionary[newCodeword] = dictSize
                dictSize += 1
                if dictSize > maxWordSize:
                    currentWordSize += 1
                    maxWordSize *= 2
            codeword = character

    if codeword in dictionary:
        compressedData.append(bin(dictionary[codeword])[2:].zfill(currentWordSize))


    bitBuffer = ""
    # Output the compressed file to "filename.lzw" 
    with open(fname.split('.')[0] + ".lzw2", 'wb') as file:
        for c in compressedData:

            for b in c:
                
        
# keep this line    
LZW_modified_compress(filename)

0
0
0
1
1
0
0
0
1
0
0
0
1
1
1
0
1
0
0
0
0
1
1
0
0
0
1
0
0
0
1
0
0
0
0
0
0
0
1
0
0
1
0
0
1
0
0
1
1
0
1
1
1
0
0
0
0
1
0
0
0
0
0
0
0
1
1
1
0
1
0
0
0
0
1
1
0
1
0
0
0
0
0
1
1
0
0
1
0
1
0
0
0
1
0
0
0
0
0
0
0
1
1
0
0
0
1
0
0
0
1
1
0
0
1
0
1
0
0
1
1
0
0
1
1
1
0
0
1
1
0
1
0
0
1
0
0
1
1
0
1
1
1
0
0
0
1
1
0
1
1
1
0
1
0
0
0
0
1
1
1
0
0
0
1
1
0
0
1
1
1
0
0
0
1
0
0
0
0
0
0
0
1
0
0
0
1
1
1
0
0
1
1
0
1
1
1
1
0
0
1
1
0
0
1
0
0
0
0
0
1
0
0
0
0
0
0
0
1
1
0
0
0
1
1
0
0
1
1
1
0
0
1
0
0
0
1
1
0
0
1
0
1
0
0
1
1
0
0
0
0
1
0
0
1
1
1
0
1
0
0
0
0
1
1
0
0
1
0
1
1
0
0
0
1
0
1
1
0
1
0
0
0
0
0
1
1
1
1
0
0
0
0
1
0
0
1
1
0
0
0
0
1
0
0
0
0
0
1
1
0
0
0
0
1
0
0
1
1
1
0
1
1
0
0
0
1
1
0
0
1
0
1
1
0
0
0
0
0
1
0
1
0
0
1
1
0
0
0
0
1
0
0
1
1
0
1
1
1
0
1
0
0
0
1
1
1
1
0
1
0
0
0
0
1
0
0
0
0
0
0
1
0
0
0
0
0
1
0
0
0
1
1
0
1
0
0
0
1
1
1
0
0
1
0
1
0
0
0
0
0
1
1
1
0
0
0
1
0
1
1
1
0
0
0
0
0
0
1
0
1
0
0
0
0
0
0
1
0
1
0
1
0
0
0
0
0
0
0
0
0
0
0
1
1
0
0
1
0
0
0
0
1
0
0
0
0
0
0
0
1
0
0
0
0
0
1
1
0
0
1
0
0
1
1
1
1
0
0
0
0
0
1
1
0
1
0
0
1
0


KeyboardInterrupt: 

In [128]:
# Custom function to convert bytes to a binary string
def bytes_to_binary_string(byte_data):
    return ''.join(format(byte, '08b') for byte in byte_data)

# keep the function name
def LZW_modified_expand(fname):
    
    with open(fname.split('.')[0] + ".lzw2", 'rb') as file:
        compressed_data = file.read()

    # Reconstruct the dictionary used during compression
    dictSize = 256
    dictionary = {bin(i)[2:].zfill(9): chr(i) for i in range(dictSize)}

    currentWordSize = 9
    nextWordSizeLimit = pow(2, currentWordSize)

    uncompressed_data = ""
    codeword = ""
    for byte in compressed_data:
        # Convert byte to binary string
        bits = bin(byte)[2:].zfill(8)
        for bit in bits:
            codeword += bit
            if codeword in dictionary:
                uncompressed_data += dictionary[codeword]
                if dictSize >= nextWordSizeLimit:
                    currentWordSize += 1
                    nextWordSizeLimit *= 2
                dictionary[bin(dictSize)[2:].zfill(currentWordSize)] = dictionary[codeword]
                dictSize += 1
                codeword = ""

    # Write the decompressed data to a new file
    with open(fname.split('.')[0] + ".2M", 'w') as file:
        file.write(uncompressed_data)
    
# keep this line    
LZW_modified_expand(filename+".lzw2")

In [30]:
def compare_files(file_path1, file_path2):
    with open(file_path1, 'rb') as file1, open(file_path2, 'rb') as file2:
        content1 = file1.read()
        content2 = file2.read()

    return content1 == content2

file1_path = filename
file2_path = filename.split(".")[0] + ".2M"

if compare_files(file1_path, file2_path):
    print("The content of the files is identical.")
else:
    print("The content of the files is different.")

The content of the files is different.
