# Testing Run Length Encoding


## Imports


In [1]:
from importlib.util import spec_from_loader, module_from_spec
from importlib.machinery import SourceFileLoader
from scipy.io import wavfile
from glob import glob
import numpy as np
import sys, os
import pandas as pd

# Import Encode
spec = spec_from_loader("encode", SourceFileLoader("encode", "../../encode"))
encode = module_from_spec(spec)
spec.loader.exec_module(encode)

# Import Decode
spec = spec_from_loader("decode", SourceFileLoader("decode", "../../decode"))
decode = module_from_spec(spec)
spec.loader.exec_module(decode)

## Function Definitions


In [2]:
UNSIGNED_INTEGER_CUTOFF_VALUE = 65530

In [3]:
def encode_via_rle(rle_l_raw: list):
    """This algorithm will search for contiguous values within the
       array. When the rle_location_count is greater than the value
       65530, the count is reduced by this value in order to
       prevent an overflow of an unsigned 16-bit integer. This allows
       for the data to be stored with 2 bytes when the format of the
       array is a known value in advance of decoding this format. The
       choice of integer 65530 is an arbitrary value less than that of
       the maximum value of an unsigned 16-bit integer (65536). In this
       body of work, the value 65530 is denoted as the
       UNSIGNED_INTEGER_CUTOFF_VALUE.

    Args:
        rle_l_raw (list): This is a list of integer values to be
                          encoded.

    Returns:
        index_array (list): This is the list of run length encoded
                            values.
        rle_locations (list): This is a list of locations of elements
                              that are repeated that are present in the
                              array of indices.
    """
    initial_index = 0
    second_index = 1
    frequency = 0
    index_array = []
    rle_locations = []
    rle_location_count = 0

    while second_index < len(rle_l_raw):
        if rle_l_raw[initial_index] == rle_l_raw[second_index]:
            index_array.append(rle_l_raw[initial_index])
            rle_locations.append(rle_location_count)
            frequency += 1  # This account for the first detected value.

            # continue searching the breadth of the array; increasing
            # the detected frequency of the value. This will break out
            # of the while loop when the first & second indices are not
            # equal. ∴ The first value will not be accounted for because
            # the while loop will be broken where the value would have
            # been incremented.
            while (
                second_index <= len(rle_l_raw)
                and rle_l_raw[initial_index] == rle_l_raw[second_index]
            ):
                frequency += 1
                second_index += 1
            index_array.append(frequency)
            if rle_location_count > UNSIGNED_INTEGER_CUTOFF_VALUE:
                rle_location_count -= UNSIGNED_INTEGER_CUTOFF_VALUE
            rle_location_count += 2
        else:
            index_array.append(rle_l_raw[initial_index])
            if rle_location_count > UNSIGNED_INTEGER_CUTOFF_VALUE:
                rle_location_count -= UNSIGNED_INTEGER_CUTOFF_VALUE
            rle_location_count += 1
        frequency = 0
        initial_index = second_index
        second_index += 1
    if rle_l_raw[-1] != index_array[-2]:
        index_array.append(rle_l_raw[-1])
    return index_array, rle_locations

In [4]:
def identify_index_split(rle_locations, verbose=False):
    """This will identify where the rle_location values reset to a value
       that is below 65530.
    Args:
        rle_locations (list): This is a list of locations of values that
                              are encoded in the index_array.

    Returns:
        split_indices (list): This is a list of indices where the values
                              are reset in the rle_locations_array.
    """
    split_indices = []
    for index, value in enumerate(rle_locations):
        if index == 0:
            prev_value = value
            continue
        if value > prev_value:
            prev_value = value
        else:
            if verbose:
                print(f"index: {index}")
                print(f"prev_value: {prev_value}")
                print(f"value: {value}")
            split_indices.append(index)
            prev_value = value
    return split_indices

In [5]:
# Expand RLE Algorithm
def decode_rle(rle_locations: list, index_array: list):
    """This will expand the index array where the values have been
       compressed by run-length-encoding.

    Args:
        rle_locations (list): This is a list of locations in the
                              index_array where the locations have been
                              run-length-encoded. The subsequent values
                              of these locations are frequencies of the
                              run-length-encoded values.
        index_array (list): This is a list of values which contain
                            either signular values or run-length-encoded
                            values followed by the frequency of the
                            run-length-encoded value.
    """
    reconstructed_array = []
    rle_index = 0
    continue_current_index_past_frequency_value = False

    for current_index in range(0, len(index_array)):
        if current_index > UNSIGNED_INTEGER_CUTOFF_VALUE:
            print("breakpoint")
        if continue_current_index_past_frequency_value:
            continue_current_index_past_frequency_value = False
            continue
        if rle_index <= (len(rle_locations) - 1):
            if current_index > 131060:  # twice the cutoff value of 65530
                rle_location_index = rle_locations[rle_index] + 131060
            elif current_index > 65530 and current_index <= 131060:
                rle_location_index = rle_locations[rle_index] + 65530
            else:  # current_index is below 65530
                rle_location_index = rle_locations[rle_index]
        if current_index != rle_locations[rle_index]:
            # + current_index // UNSIGNED_INTEGER_CUTOFF_VALUE:
            reconstructed_array.append(index_array[current_index])
        else:
            if rle_index <= (len(rle_locations) - 1):
                run_length_expanded_l = [
                    index_array[rle_location_index]
                    for x in range((index_array[rle_location_index + 1]) + 1)
                ]
                reconstructed_array.extend(run_length_expanded_l)
                rle_index += 1
                continue_current_index_past_frequency_value = True
    return reconstructed_array

In [6]:
def format_encoded_rle_to_bytes(index_array: list, rle_locations: list):
    """This is the encoding driver of the run_length_encoded array.

    Args:
        index_array (list): This is the compressed list of values which
                            include run-length-encoded values followed
                            by their frequency.
        rle_locations (list): This is a list of index locations that are
                              run-length-encoded in the index array.
                              The frequency of the run-length-encoded
                              value is defined as the
                              index_array[rle_locations[current_rle_index] + 1]
                              or the subsequent value from the
                              run-length-encoded value in the
                              index_array.

    Returns:
        format_encoded_rle_bytes (bytes): This is the string of bytes of
                                          the formatted
                                          run-length-encoded array.
    """
    format_encoded_rle = [len(rle_locations)]
    format_encoded_rle.extend(rle_locations)
    format_encoded_rle.extend(index_array)
    format_encoded_rle = np.array(format_encoded_rle, dtype=np.uint16)
    format_encoded_rle_bytes = format_encoded_rle.tobytes()
    return format_encoded_rle_bytes

In [7]:
def parse_formatted_rle_bytes(format_encoded_rle_bytes: bytes):
    """This is the encoding driver of the run_length_encoded array.

    Args:
        format_encoded_rle_bytes (bytes): This is the compressed list of
                                          values in the form of a byte
                                          array. The order of the values
                                          is as following: length of the
                                          run-length-encoded locations
                                          array, the run-length-encoded
                                          locations array, and the
                                          index_array.

    Returns:
        rle_locations (list): This is a list of unsigned 16-bit
                              integers.
        index_array (list): This is a list of unsigned 16-bit integers.
                            These values contain run-length-encoded
                            sequences where the run-length-encoded-value
                            is followed by the frequency of the value.
    """
    encoded_rle = np.frombuffer(format_encoded_rle_bytes, dtype=np.uint16)
    len_rle_locations = encoded_rle[0]

    # +1 to skip over the rle_locations length in the line below:
    rle_locations = encoded_rle[1 : len_rle_locations + 1]

    index_array = encoded_rle[(len_rle_locations + 1) :]
    return rle_locations, index_array

## Import Data & Huffman Encode Values


In [8]:
data_dir = "../../data/"
data_file_l = glob(data_dir + "*.wav")
current_file = data_file_l[0]

rate, data = wavfile.read(current_file)

data_bytes = data.tobytes()

compressed_file_path = "../../data/102b47d9-371e-412a-8995-0dc6115ab2bb.wav.brainwire"

encode.huffman_encoding(
    input_data=data_bytes,
    compressed_file_path="../../data/102b47d9-371e-412a-8995-0dc6115ab2bb.wav.brainwire",
)

with open(compressed_file_path, "rb") as fp:
    data_huffman_encoded_bytes = fp.read()
    fp.close()

# type(data_huffman_encoded_bytes)

## Testing Compressing a substring of the larger dataset


In [29]:
data_huffman_encoded_bytes_sub_string = data_huffman_encoded_bytes[:]

rle_l_raw = []
for index, data in enumerate(data_huffman_encoded_bytes_sub_string):
    rle_l_raw.append(data)
rle_l_raw

[128,
 4,
 149,
 135,
 5,
 0,
 0,
 0,
 0,
 0,
 0,
 125,
 148,
 40,
 140,
 2,
 50,
 48,
 148,
 140,
 4,
 48,
 48,
 48,
 48,
 148,
 140,
 2,
 48,
 56,
 148,
 140,
 5,
 48,
 48,
 48,
 49,
 48,
 148,
 140,
 2,
 48,
 101,
 148,
 140,
 8,
 48,
 48,
 48,
 49,
 49,
 48,
 48,
 48,
 148,
 140,
 2,
 53,
 101,
 148,
 140,
 8,
 48,
 48,
 48,
 49,
 49,
 48,
 48,
 49,
 148,
 140,
 2,
 48,
 102,
 148,
 140,
 9,
 48,
 48,
 48,
 49,
 49,
 48,
 49,
 48,
 48,
 148,
 140,
 2,
 54,
 51,
 148,
 140,
 9,
 48,
 48,
 48,
 49,
 49,
 48,
 49,
 48,
 49,
 148,
 140,
 2,
 101,
 51,
 148,
 140,
 9,
 48,
 48,
 48,
 49,
 49,
 48,
 49,
 49,
 48,
 148,
 140,
 2,
 49,
 48,
 148,
 140,
 9,
 48,
 48,
 48,
 49,
 49,
 48,
 49,
 49,
 49,
 148,
 140,
 2,
 102,
 102,
 148,
 140,
 6,
 48,
 48,
 48,
 49,
 49,
 49,
 148,
 140,
 2,
 97,
 48,
 148,
 140,
 4,
 48,
 48,
 49,
 48,
 148,
 140,
 2,
 50,
 49,
 148,
 140,
 5,
 48,
 48,
 49,
 49,
 48,
 148,
 140,
 2,
 97,
 49,
 148,
 140,
 5,
 48,
 48,
 49,
 49,
 49,
 148,
 140,
 2,
 100,
 1

In [30]:
rle_l_raw

[128,
 4,
 149,
 135,
 5,
 0,
 0,
 0,
 0,
 0,
 0,
 125,
 148,
 40,
 140,
 2,
 50,
 48,
 148,
 140,
 4,
 48,
 48,
 48,
 48,
 148,
 140,
 2,
 48,
 56,
 148,
 140,
 5,
 48,
 48,
 48,
 49,
 48,
 148,
 140,
 2,
 48,
 101,
 148,
 140,
 8,
 48,
 48,
 48,
 49,
 49,
 48,
 48,
 48,
 148,
 140,
 2,
 53,
 101,
 148,
 140,
 8,
 48,
 48,
 48,
 49,
 49,
 48,
 48,
 49,
 148,
 140,
 2,
 48,
 102,
 148,
 140,
 9,
 48,
 48,
 48,
 49,
 49,
 48,
 49,
 48,
 48,
 148,
 140,
 2,
 54,
 51,
 148,
 140,
 9,
 48,
 48,
 48,
 49,
 49,
 48,
 49,
 48,
 49,
 148,
 140,
 2,
 101,
 51,
 148,
 140,
 9,
 48,
 48,
 48,
 49,
 49,
 48,
 49,
 49,
 48,
 148,
 140,
 2,
 49,
 48,
 148,
 140,
 9,
 48,
 48,
 48,
 49,
 49,
 48,
 49,
 49,
 49,
 148,
 140,
 2,
 102,
 102,
 148,
 140,
 6,
 48,
 48,
 48,
 49,
 49,
 49,
 148,
 140,
 2,
 97,
 48,
 148,
 140,
 4,
 48,
 48,
 49,
 48,
 148,
 140,
 2,
 50,
 49,
 148,
 140,
 5,
 48,
 48,
 49,
 49,
 48,
 148,
 140,
 2,
 97,
 49,
 148,
 140,
 5,
 48,
 48,
 49,
 49,
 49,
 148,
 140,
 2,
 100,
 1

In [31]:
index_array, rle_locations = encode_via_rle(rle_l_raw)

In [32]:
index_array

[128,
 4,
 149,
 135,
 5,
 0,
 6,
 125,
 148,
 40,
 140,
 2,
 50,
 48,
 148,
 140,
 4,
 48,
 4,
 148,
 140,
 2,
 48,
 56,
 148,
 140,
 5,
 48,
 3,
 49,
 48,
 148,
 140,
 2,
 48,
 101,
 148,
 140,
 8,
 48,
 3,
 49,
 2,
 48,
 3,
 148,
 140,
 2,
 53,
 101,
 148,
 140,
 8,
 48,
 3,
 49,
 2,
 48,
 2,
 49,
 148,
 140,
 2,
 48,
 102,
 148,
 140,
 9,
 48,
 3,
 49,
 2,
 48,
 49,
 48,
 2,
 148,
 140,
 2,
 54,
 51,
 148,
 140,
 9,
 48,
 3,
 49,
 2,
 48,
 49,
 48,
 49,
 148,
 140,
 2,
 101,
 51,
 148,
 140,
 9,
 48,
 3,
 49,
 2,
 48,
 49,
 2,
 48,
 148,
 140,
 2,
 49,
 48,
 148,
 140,
 9,
 48,
 3,
 49,
 2,
 48,
 49,
 3,
 148,
 140,
 2,
 102,
 2,
 148,
 140,
 6,
 48,
 3,
 49,
 3,
 148,
 140,
 2,
 97,
 48,
 148,
 140,
 4,
 48,
 2,
 49,
 48,
 148,
 140,
 2,
 50,
 49,
 148,
 140,
 5,
 48,
 2,
 49,
 2,
 48,
 148,
 140,
 2,
 97,
 49,
 148,
 140,
 5,
 48,
 2,
 49,
 3,
 148,
 140,
 2,
 100,
 102,
 148,
 140,
 4,
 48,
 49,
 48,
 2,
 148,
 140,
 2,
 101,
 49,
 148,
 140,
 5,
 48,
 49,
 48,
 49,
 48,
 148,
 

In [33]:
rle_locations

[5,
 17,
 27,
 39,
 41,
 43,
 53,
 55,
 57,
 68,
 70,
 74,
 84,
 86,
 100,
 102,
 105,
 116,
 118,
 121,
 126,
 131,
 133,
 143,
 155,
 157,
 168,
 170,
 182,
 208,
 219,
 221,
 232,
 236,
 247,
 262,
 265,
 276,
 288,
 290,
 301,
 314,
 325,
 336,
 348,
 362,
 375,
 389,
 404,
 406,
 420,
 422,
 435,
 449,
 455,
 463,
 466,
 479,
 482,
 485,
 498,
 501,
 509,
 517,
 520,
 522,
 535,
 551,
 553,
 566,
 585,
 592,
 605,
 607,
 619,
 633,
 643,
 645,
 655,
 657,
 669,
 671,
 673,
 683,
 696,
 699,
 709,
 711,
 721,
 733,
 735,
 745,
 747,
 758,
 760,
 763,
 773,
 775,
 780,
 790,
 792,
 797,
 808,
 810,
 817,
 827,
 829,
 836,
 848,
 850,
 857,
 859,
 869,
 871,
 888,
 890,
 896,
 906,
 908,
 912,
 922,
 924,
 926,
 937,
 939,
 941,
 943,
 953,
 955,
 957,
 961,
 971,
 973,
 975,
 979,
 990,
 992,
 994,
 998,
 1009,
 1011,
 1013,
 1017,
 1020,
 1030,
 1032,
 1034,
 1038,
 1041,
 1052,
 1054,
 1056,
 1060,
 1073,
 1075,
 1077,
 1081,
 1083,
 1093,
 1095,
 1097,
 1112,
 1114,
 1116,
 1121,

##### Scratch Work for values below 65530


In [34]:
index = 0
rle_index = 0
reconstructed_array = []

while rle_index < len(rle_locations):
    if index > 65530:
        print("breakpoint")

    rle_location = rle_locations[rle_index] + (
        (index // UNSIGNED_INTEGER_CUTOFF_VALUE) * UNSIGNED_INTEGER_CUTOFF_VALUE
    )

    reconstructed_array = index_array[index:(rle_location)]

    rle_index_of_value = rle_location
    rle_index_of_frequency = rle_location + 1

    expanded_rle_value = [
        index_array[rle_index_of_value]
        for frequency in range(index_array[rle_index_of_frequency])
    ]

    reconstructed_array.extend(expanded_rle_value)
    index = rle_index_of_value + 2
    rle_index += 1

reconstructed_array.extend(index_array[index:])

In [35]:
reconstructed_array

[82,
 141,
 248,
 94,
 163,
 104,
 218,
 45,
 214,
 40,
 97,
 105,
 75,
 41,
 114,
 221,
 74,
 123,
 99,
 97,
 4,
 111,
 11,
 74,
 113,
 78,
 61,
 226,
 146,
 131,
 44,
 16,
 40,
 20,
 0,
 118,
 82,
 158,
 211,
 219,
 27,
 8,
 97,
 100,
 59,
 173,
 193,
 240,
 188,
 45,
 44,
 150,
 8,
 34,
 210,
 156,
 54,
 202,
 89,
 75,
 27,
 197,
 56,
 94,
 22,
 230,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 221,
 123,
 115,
 110,
 165,
 61,
 178,
 148,
 246,
 198,
 202,
 0,
 57,
 125,
 58,
 95,
 47,
 151,
 202,
 165,
 240,
 252,
 255,
 149,
 42,
 187,
 174,
 177,
 67,
 11,
 195,
 104,
 219,
 189,
 187,
 221,


In [None]:
reconstructed_array_test = index_array[0 : rle_locations[0]]
reconstructed_array_test

In [30]:
expanded_rle_value = [
    index_array[rle_locations[0]]
    for frequency in range(index_array[rle_locations[0] + 1])
]

In [None]:
expanded_rle_value

In [35]:
reconstructed_array_test.extend(expanded_rle_value)

In [None]:
reconstructed_array_test

In [None]:
index_array

In [None]:
index_array[rle_locations[0] + 2 :]

In [None]:
reconstructed_array = decode_rle(rle_locations, index_array)

In [None]:
sys.getsizeof(reconstructed_array)

In [None]:
sys.getsizeof(reconstructed_array[0])

In [None]:
type(reconstructed_array[0])

In [None]:
reconstructed_array[0]

In [None]:
np.array(rle_l_raw, dtype=np.int16)

In [None]:
# Test for equality in reconstructed array
not_equal = False
for index in range(0, len(rle_l_raw)):
    if rle_l_raw[index] != reconstructed_array[index]:
        not_equal = True
        print(f"Reconstructed Array is unequal at position: {index}")

if not_equal:
    print(f"Arrays are not equivalent.")
else:
    print(f"Arrays are equivalent.")

In [192]:
formatted_encoded_rle_bytes = format_encoded_rle_to_bytes(index_array, rle_locations)

In [193]:
encoded_rle = np.frombuffer(formatted_encoded_rle_bytes, dtype=np.uint16)

In [194]:
rle_locations, index_array = parse_formatted_rle_bytes(formatted_encoded_rle_bytes)

In [195]:
reconstructed_array = decode_rle(rle_locations, index_array)

In [196]:
# Test for equality in reconstructed array
for index in range(0, len(rle_l_raw)):
    if rle_l_raw[index] != reconstructed_array[index]:
        print(f"Reconstructed Array is unequal at position: {index}")

In [None]:
len(formatted_encoded_rle_bytes)

In [None]:
len(data_huffman_encoded_bytes[:12])

In [None]:
formatted_encoded_rle_bytes

In [None]:
data_array = []
for index, data in enumerate(formatted_encoded_rle_bytes):
    data_array.append(data)
    print(f"index: {index}\ndata: {data}\n")
formatted_encoded_rle_bytes_pd = pd.DataFrame(data_array)

In [202]:
formatted_encoded_rle_bytes_pd.columns = ["value"]

In [203]:
formatted_encoded_rle_bytes_pd.index.name = "row"

In [None]:
formatted_encoded_rle_bytes_pd

In [None]:
rle_l_raw

In [None]:
data_huffman_encoded_bytes[:12]

In [98]:
data_array = []
for data in data_huffman_encoded_bytes:
    data_array.append(data)

In [None]:
sys.getsizeof(data_array[0])

In [None]:
max(data_array)

In [101]:
data_array = np.array(data_array, dtype=np.int16)

In [None]:
data_array

In [None]:
data_array[0]

In [None]:
print(f"Ratio of the formatted_encoded_rle_bytes ", end="")
print(f"to data_huffman_encoded_bytes: ", end="")
print(f"{len(formatted_encoded_rle_bytes) / len(data_huffman_encoded_bytes[:12])}")

#### Principal Components Example of the Run-Length-Encoded Problem


In [105]:
# Original file size: 197 KB
# Huffman Encoded Compressed File Size: 130 KB
# RLE Encoded Version of the Huffman Encoded Compressed File is of Size: 272 KB

In [106]:
data_dir = "../../data/"
data_file_l = glob(data_dir + "*.wav")
current_file = data_file_l[0]

In [107]:
rate, data = wavfile.read(current_file)

data_bytes = data.tobytes()

compressed_file_path = "../../data/102b47d9-371e-412a-8995-0dc6115ab2bb.wav.brainwire"

encode.huffman_encoding(
    input_data=data_bytes,
    compressed_file_path="../../data/102b47d9-371e-412a-8995-0dc6115ab2bb.wav.brainwire",
)

with open(compressed_file_path, "rb") as fp:
    data_huffman_encoded_bytes = fp.read()
    fp.close()

In [None]:
sys.getsizeof(data)

In [None]:
sys.getsizeof(data_bytes)

## Attempting RLE Exclusively


In [110]:
data_dir = "../../data/"
data_file_l = glob(data_dir + "*.wav")
current_file = data_file_l[0]

rate, data = wavfile.read(current_file)

data_bytes = data.tobytes()

compressed_file_path = "../../data/102b47d9-371e-412a-8995-0dc6115ab2bb.wav.brainwire"

In [None]:
os.getcwd() + "/" + current_file

In [None]:
data

In [128]:
index_array, rle_locations = encode_via_rle(data)

In [None]:
len(rle_locations)

In [None]:
index_array[:10]

In [None]:
len(data_bytes)

In [None]:
sys.getsizeof(data)

In [134]:
formatted_encoded_rle_bytes = format_encoded_rle_to_bytes(index_array, rle_locations)

In [None]:
sys.getsizeof(data)

In [None]:
sys.getsizeof(data_bytes)

In [None]:
sys.getsizeof(formatted_encoded_rle_bytes)

In [None]:
formatted_encoded_rle_bytes

In [141]:
compressed_file_path = "../../data/102b47d9-371e-412a-8995-0dc6115ab2bb.wav.brainwire"

encode.huffman_encoding(
    input_data=formatted_encoded_rle_bytes,
    compressed_file_path="../../data/102b47d9-371e-412a-8995-0dc6115ab2bb.wav.brainwire",
)

with open(compressed_file_path, "rb") as fp:
    data_huffman_encoded_bytes = fp.read()
    fp.close()

In [None]:
sys.getsizeof(data_huffman_encoded_bytes)