In [2]:
# DO NOT delete this cell. 
# 
# This is the name of the file to be compressed.  
# Yes, you can create your own test cases and you should.

filename = "file.txt"

In [3]:
def LZW_compress(fname):
    print(fname)
    print(fname+".lzw2")
    
    # open and read the file
    with open(fname, 'rb') as f:
        data = f.read()

    # define the dictionary to be build
    entry = 256
    dictionary = {bytes([i]): i for i in range(entry)}

    # compress
    result = []
    w = b""
    for c in data:
        wc = w + bytes([c])
        if wc in dictionary:
            w = wc
        else:
            result.append(dictionary[w])
            if entry < pow(2, 12): # max of 12 bits
                dictionary[wc] = entry
                entry += 1
            w = bytes([c])
    if w:
        result.append(dictionary[w])

    # open and write to file
    with open(fname + ".lzw", 'wb') as f:
        for code in result:
            binary_int = code & 0xFFFF # Convert the LZW code to a 16-bit binary representation
            f.write(binary_int.to_bytes(2, byteorder='big')) # 16 bits = 2 bytes

# keep this line 
LZW_compress(filename)


file.txt
file.txt.lzw2


In [5]:
# you'll expand the file named filename.lzw, and save the decompressed as filename+".2"
# keep the function name
def LZW_expand(fname):
    print(fname)
    print(fname[:-4]+".2")
    
    # open and read the compressed file
    compressed_data = []
    with open(fname, 'rb') as f:
        while True:
            chunk = f.read(2) # 2 bytes for each single codeword
            if not chunk:
                break
            compressed_data.append(int.from_bytes(chunk, 'big'))

    # define the dictionary to be build     
    entry = 256   
    dictionary = {i: bytes([i]) for i in range(entry)}
    
    # decompress
    result = bytearray()
    w = bytearray()
    for code in compressed_data:
        if code in dictionary:
            entry_str = dictionary[code]
        elif code == entry:
            entry_str = w + bytes([w[0]])
        else:
            raise ValueError("Bad compressed code")
        result += entry_str
        if w:
            dictionary[entry] = w + bytes([entry_str[0]])
            entry += 1
        w = entry_str

    # open and write to file
    with open(fname[:-4]+".2", 'wb') as f:
        f.write(result)

# keep this line 
LZW_expand(filename +".lzw")

file.txt.lzw
file.txt.2


In [17]:
def LZW_modified_compress(fname):
    # Read file into content
    with open(fname, 'rb') as file:
        content = file.read()

    # initialize dictionary
    # gradual growth of codeword size will be dependent on dictSize.
    entry = 256
    dictionary = {bytes([i]): i for i in range(entry)}

    w = b""
    compressedData = []

    currentWordSize = 9
    maxDictSize = pow(2, 16)
    maxWordSize = pow(2, currentWordSize)

    for c in content:
        wc = w + bytes([c])
        if wc in dictionary:
            w = wc
        else:
            compressedData.append(dictionary[w])

            # 2^16 - 1 code words
            if entry < maxDictSize - 1:
                dictionary[wc] = entry
                entry += 1
                if entry > maxWordSize:
                    currentWordSize += 1
                    maxWordSize *= 2
            w = bytes([c])
    if w:
        compressedData.append(dictionary[w])

    # Output the compressed file to "filename.lzw" 
    with open(fname + ".lzw2", 'wb') as file:
        for code in compressedData:
            file.write(code.to_bytes((currentWordSize + 7) // 8, 'big'))

# keep this line    
LZW_modified_compress(filename)


In [23]:
def LZW_modified_expand(fname):
    # Reconstruct the dictionary used during compression
    entry = 256
    dictionary = {i: bytes([i]) for i in range(entry)}

    result = b""

    currentWordSize = 9
    MAX_WORD_SIZE = 16
    maxWordSize = pow(2, currentWordSize)
    wc = b""

    with open(fname, 'rb') as file:
        while True:
            # read the data, break if it is the end of the file
            compressedData = file.read(2)  # Read 2 bytes at a time for the compressed data
            if len(compressedData) < 2:
                break
            
            # extract the code
            code = int.from_bytes(compressedData, 'big')  # Convert bytes to integer

            # for a new sequence of characters, create a new entry in the dictionary, and append this to result
            if not (code in dictionary):
                dictionary[code] = wc + bytes([wc[0]])
            result += dictionary[code]

            # Add a new entry, with the first byte of the sequence represented by the current code
            if not (len(wc) == 0) and currentWordSize <= MAX_WORD_SIZE:
                dictionary[entry] = wc + bytes([dictionary[code][0]])
                entry += 1
                # when the word with the current number of bits fills, increase the number of bits by 1
                if entry >= maxWordSize:
                    currentWordSize += 1
                    maxWordSize *= 2
            
            wc = dictionary[code]

    # Write the decompressed data to a new file
    with open(fname[:-4] + "2M", 'wb') as file:
        file.write(result)

# keep this line    
LZW_modified_expand(filename + ".lzw2")


TypeError: cannot convert 'int' object to bytes