# Testing Run Length Encoding


## Imports


In [5]:
from importlib.util import spec_from_loader, module_from_spec
from importlib.machinery import SourceFileLoader
from scipy.io import wavfile
from glob import glob
import numpy as np
import sys, os
import pandas as pd

# Import Encode
# The encoder lives in a file named "encode" (no ".py" suffix in the path),
# so it is loaded explicitly from its path with a SourceFileLoader and then
# executed to populate the module object.
spec = spec_from_loader("encode", SourceFileLoader("encode", "../../encode"))
encode = module_from_spec(spec)
spec.loader.exec_module(encode)

# Import Decode
# Same procedure for the "decode" file; `spec` is reused for this second load.
spec = spec_from_loader("decode", SourceFileLoader("decode", "../../decode"))
decode = module_from_spec(spec)
spec.loader.exec_module(decode)

## Function Definitions


In [6]:
# Wrap-around threshold for run locations. It matches the default value of
# the UNSIGNED_INTEGER_CUTOFF_VALUE parameter of encode_via_rle and decode_rle
# and is chosen to stay below the largest unsigned 16-bit integer.
UNSIGNED_INTEGER_CUTOFF_VALUE = 65530

In [105]:
def encode_via_rle(
    original_data_list: list,
    use_rle_locations=False,
    UNSIGNED_INTEGER_CUTOFF_VALUE=65530,
):
    """Run-length-encode a sequence of values.

    Two output layouts are supported:

    * ``use_rle_locations=False`` (default): every value is followed by
      its frequency, i.e. ``[value, frequency, value, frequency, ...]``.
      Values that are not repeated get a frequency of 1.
    * ``use_rle_locations=True``: values that are not repeated are stored
      on their own, and only repeated values are followed by their
      frequency. The position of each repeated value inside the
      index_array is recorded in rle_locations_in_index_array.

    When locations are recorded, the running position is reduced by
    UNSIGNED_INTEGER_CUTOFF_VALUE once it exceeds that value so that
    every stored location fits in an unsigned 16-bit integer (maximum
    65535). The default of 65530 is an arbitrary value below that
    maximum. decode_rle undoes the reduction by detecting where the
    stored locations stop increasing.

    Args:
        original_data_list (list): The sequence of integer values to be
            encoded. Any indexable sequence with a length is accepted.
        use_rle_locations (bool): Selects the output layout described
            above.
        UNSIGNED_INTEGER_CUTOFF_VALUE (int): The threshold at which the
            stored run locations wrap around.

    Returns:
        index_array (list): The run-length-encoded values.
        rle_locations_in_index_array (list): The (wrapped) positions in
            index_array of the values that are followed by a frequency.
            Always empty when use_rle_locations is False.
    """
    data_length = len(original_data_list)
    initial_index = 0
    second_index = 1
    index_array = []
    rle_locations_in_index_array = []
    number_of_values_apriori_in_index_array = 0

    while second_index < data_length:
        current_value = original_data_list[initial_index]
        if current_value == original_data_list[second_index]:
            index_array.append(current_value)
            if use_rle_locations:
                rle_locations_in_index_array.append(
                    number_of_values_apriori_in_index_array
                )

            # Start at 1 to account for the value at initial_index; the
            # loop below adds one for every further repetition. The
            # bound must be strict, otherwise a run that reaches the end
            # of the data indexes one element past the end.
            frequency = 1
            while (
                second_index < data_length
                and current_value == original_data_list[second_index]
            ):
                frequency += 1
                second_index += 1
            index_array.append(frequency)

            if use_rle_locations:
                if (
                    number_of_values_apriori_in_index_array
                    > UNSIGNED_INTEGER_CUTOFF_VALUE
                ):
                    number_of_values_apriori_in_index_array -= (
                        UNSIGNED_INTEGER_CUTOFF_VALUE
                    )
                # Skip over the two slots that hold the repeated value
                # and its frequency.
                number_of_values_apriori_in_index_array += 2
        else:
            index_array.append(current_value)
            if use_rle_locations:
                if (
                    number_of_values_apriori_in_index_array
                    > UNSIGNED_INTEGER_CUTOFF_VALUE
                ):
                    number_of_values_apriori_in_index_array -= (
                        UNSIGNED_INTEGER_CUTOFF_VALUE
                    )
                # Skip over the single slot that holds this value.
                number_of_values_apriori_in_index_array += 1
            else:
                index_array.append(1)

        initial_index = second_index
        second_index += 1

    # The main loop only emits the value at initial_index when a successor
    # exists, so a trailing value that is not part of a run is still
    # pending here. If the data ended in a run, initial_index has already
    # moved past the end and nothing is left to emit.
    if initial_index < data_length:
        index_array.append(original_data_list[initial_index])
        if not use_rle_locations:
            index_array.append(1)
    return index_array, rle_locations_in_index_array

In [8]:
def identify_index_split(rle_locations_in_index_array, verbose=False):
    """Find the positions at which the stored run locations stop increasing.

    A location that is not strictly greater than its predecessor marks a
    point where the stored locations were reset to a lower value.

    Args:
        rle_locations_in_index_array (list): The stored locations of the
            run-length-encoded values in the index_array.
        verbose (bool): When True, print the position, the previous
            location and the current location for every reset found.

    Returns:
        split_indices (list): The positions in
            rle_locations_in_index_array at which a reset occurs.
    """
    reset_positions = []
    previous_location = None
    for position, location in enumerate(rle_locations_in_index_array):
        # The first entry has no predecessor, so it can never be a reset.
        is_reset = position != 0 and not (location > previous_location)
        if is_reset:
            if verbose:
                print(f"index: {position}")
                print(f"prev_value: {previous_location}")
                print(f"value: {location}")
            reset_positions.append(position)
        previous_location = location
    return reset_positions

In [106]:
def decode_rle(
    index_array: list,
    rle_locations_in_index_array=None,
    use_rle_locations=False,
    UNSIGNED_INTEGER_CUTOFF_VALUE=65530,
):
    """Expand a run-length-encoded index array back to the original values.

    If use_rle_locations is False, the index_array is expected to have
    the layout ``[value, frequency, value, frequency, ...]`` and
    rle_locations_in_index_array is ignored.

    If use_rle_locations is True, the index_array holds single values
    interleaved with ``value, frequency`` pairs, and
    rle_locations_in_index_array gives the position of each such pair.
    Stored locations that were wrapped by the encoder are mapped back to
    absolute positions by adding UNSIGNED_INTEGER_CUTOFF_VALUE once for
    every reset reported by identify_index_split.

    Args:
        index_array (list): The encoded values.
        rle_locations_in_index_array (list, optional): The stored
            positions in index_array of the values that are followed by
            a frequency. ``None`` is treated as "no runs".
        use_rle_locations (bool): Selects which layout is decoded.
        UNSIGNED_INTEGER_CUTOFF_VALUE (int): The wrap-around threshold
            that was used while encoding.

    Returns:
        reconstructed_array (list): The values as they were before
            run-length-encoding was applied.
    """
    reconstructed_array = []

    if not use_rle_locations:
        # Every even position holds a value and the following position
        # holds how many times it occurs.
        for index in range(0, len(index_array), 2):
            frequency = int(index_array[index + 1])
            reconstructed_array.extend([index_array[index]] * frequency)
        return reconstructed_array

    if rle_locations_in_index_array is None:
        rle_locations_in_index_array = []

    rle_location_split_array = identify_index_split(rle_locations_in_index_array)
    number_of_splits = len(rle_location_split_array)
    splits_passed = 0
    index = 0

    for rle_index, stored_location in enumerate(rle_locations_in_index_array):
        # Each split that has been passed means the encoder subtracted
        # the cutoff value once more from the stored locations.
        if (
            splits_passed < number_of_splits
            and rle_index >= rle_location_split_array[splits_passed]
        ):
            splits_passed += 1

        # Convert to a Python int first so that adding the offset cannot
        # overflow a fixed-width NumPy integer.
        rle_location = int(stored_location) + (
            splits_passed * UNSIGNED_INTEGER_CUTOFF_VALUE
        )

        # Copy the single values that precede this run unchanged.
        reconstructed_array.extend(index_array[index:rle_location])

        frequency = int(index_array[rle_location + 1])
        reconstructed_array.extend([index_array[rle_location]] * frequency)

        # Continue after the value and its frequency.
        index = rle_location + 2

    # Whatever follows the last run consists of single values only.
    reconstructed_array.extend(index_array[index:])
    return reconstructed_array

In [10]:
def format_encoded_rle_to_bytes(index_array: list, rle_locations_in_index_array: list):
    """Serialise the run-length-encoded data to a byte string.

    The output is a flat sequence of unsigned 16-bit integers in this
    order: the number of run locations, the run locations themselves,
    and finally the index_array.

    Args:
        index_array (list): The compressed list of values, in which a
            run-length-encoded value is followed by its frequency.
        rle_locations_in_index_array (list): The positions in
            index_array of the run-length-encoded values. The frequency
            of such a value is stored in the slot directly after it.

    Returns:
        format_encoded_rle_bytes (bytes): The byte representation of the
            formatted run-length-encoded array.
    """
    # NOTE(review): everything is cast to np.uint16, so entries outside
    # 0..65535 (for example negative amplitudes) cannot be represented as
    # is — confirm how the installed NumPy version handles that cast.
    header_and_payload = [
        len(rle_locations_in_index_array),
        *rle_locations_in_index_array,
        *index_array,
    ]
    return np.array(header_and_payload, dtype=np.uint16).tobytes()

In [11]:
def parse_formatted_rle_bytes(format_encoded_rle_bytes: bytes):
    """Split a formatted byte string back into its two arrays.

    This is the inverse of format_encoded_rle_to_bytes.

    Args:
        format_encoded_rle_bytes (bytes): The compressed values as a
            byte string of unsigned 16-bit integers, in this order: the
            number of run locations, the run locations, and the
            index_array.

    Returns:
        rle_locations_in_index_array (numpy.ndarray): The run locations
            as unsigned 16-bit integers.
        index_array (numpy.ndarray): The encoded values as unsigned
            16-bit integers, in which a run-length-encoded value is
            followed by its frequency.
    """
    encoded_rle = np.frombuffer(format_encoded_rle_bytes, dtype=np.uint16)

    # Convert the header to a Python int: adding 1 to a np.uint16 that
    # holds 65535 could wrap around and yield wrong slice bounds.
    len_rle_locations = int(encoded_rle[0])

    # +1 to skip over the header slot that stores the number of locations.
    index_array_start = len_rle_locations + 1

    rle_locations_in_index_array = encoded_rle[1:index_array_start]
    index_array = encoded_rle[index_array_start:]
    return rle_locations_in_index_array, index_array

In [12]:
# Test for equality in reconstructed array
def test_for_equality_in_reconstructed_array(
    original_array: list, reconstructed_array: list
):
    """Report whether the reconstructed array matches the original array.

    The position of every element that differs is printed, followed by
    a final verdict. A difference in length is reported as well and
    counts as a mismatch. This function has no return value.

    Args:
        original_array (list): The original uncompressed representation
            of the data.
        reconstructed_array (list): The uncompressed representation of
            the data after it has been run-length-encoded and decoded
            again.
    """
    not_equal = False

    original_length = len(original_array)
    reconstructed_length = len(reconstructed_array)
    if original_length != reconstructed_length:
        not_equal = True
        print(
            f"Array lengths differ: original has {original_length} values, "
            f"reconstructed has {reconstructed_length} values."
        )

    # zip stops at the shorter array, so a length mismatch cannot cause
    # an out-of-range access while comparing the common prefix.
    for index, (original_value, reconstructed_value) in enumerate(
        zip(original_array, reconstructed_array)
    ):
        if original_value != reconstructed_value:
            not_equal = True
            print(f"Reconstructed Array is unequal at position: {index}")

    if not_equal:
        print("Arrays are not equivalent.")
    else:
        print("Arrays are equivalent.")

In [13]:
def read_file_bytes(file_path: str):
    """Read the whole file at the given path in binary mode.

    Args:
        file_path (str): The path of the file to be read.

    Returns:
        file_bytes (bytes): The complete content of the file.
    """
    # The context manager closes the file when the block is left.
    with open(file_path, "rb") as file_handle:
        return file_handle.read()

In [14]:
def write_file_bytes(file_path: str, data_bytes: bytes):
    """Write the given bytes to the file at the given path.

    The file is opened with mode "wb+", so existing content is replaced.

    Args:
        file_path (str): The path of the file to be written.
        data_bytes (bytes): The bytes to be written to the file.
    """
    # The context manager closes the file when the block is left.
    with open(file_path, "wb+") as file_handle:
        file_handle.write(data_bytes)

In [71]:
def print_output_file_size(output_file_path: str):
    """Print the name and the size in Bytes of the given file.

    Args:
        output_file_path (str): The path to the file whose size is to be
            reported.

    Returns:
        file_size (int): The size of the file in Bytes.
    """
    size_in_bytes = os.path.getsize(output_file_path)
    base_name = os.path.basename(output_file_path)
    print(f'File Name: "{base_name}".')
    print(f"Size: {size_in_bytes} Bytes.")
    return size_in_bytes

## Import Data & Huffman Encode Values


In [16]:
# Collect every WAV file in the data directory and work with the first match.
# NOTE(review): glob() does not promise a particular ordering, so
# data_file_l[0] may differ between machines — confirm it is the recording
# that the hard-coded compressed_file_path used later refers to.
data_dir = "../../data/"
data_file_l = glob(data_dir + "*.wav")
current_file = data_file_l[0]

# Load the sample rate and the samples of the selected recording.
rate, data = wavfile.read(current_file)

In [17]:
len(data)

98699

In [18]:
data

array([-352, -416, -288, ...,  287,  223, -288], dtype=int16)

In [19]:
data_bytes = data.tobytes()

In [20]:
len(data_bytes)

197398

In [21]:
data_bytes

b'\xa0\xfe`\xfe\xe0\xfe\xe0\xfe`\xff`\x02_\x00`\xff\xdf\x00\xdf\x00\xdf\x01\xa0\x04\xe0\x04`\x03\xe0\x02 \x05\xe0\x04`\x05\xe0\x04\xe0\x02\xa0\x02`\x05`\x04`\x05\xe1\x06\xe1\x06\xa1\x07\xa1\x06a\x07!\x07 \x05 \x05\xe0\x05`\x05\xe0\x05\xa1\x06\xa1\x08!\t!\t\xe1\t\xa2\x0c\xa2\n\xa1\t\xe1\x07a\x08\xa1\x08\xe1\x06!\x06\xa1\t!\n\xa1\x07!\x08!\x06\xa0\x05\xe0\x05`\x05`\x05`\x04\xe0\x04\xa1\x06\xe1\x07\xe1\x06!\x06a\x06!\x07 \x05\xdf\x01\xdf\x00\x9f\x01\xdf\x01\xa0\x02\xa0\x04!\x06\xe1\x07\xe0\x05\xa1\x06a\x06\xe0\x04\xe0\x05\xa0\x04\xe0\x04\xe0\x04\xa1\x06\xe1\x08\xa1\x06!\x06\xa1\x07\xa0\x04\xe0\x03\xe0\x02\xa0\x03a\x06\xe1\x06!\x06\xe0\x04\xe0\x05\xe1\x06\xa1\x06!\x06`\x04\xe0\x03 \x04\xa0\x02\xa0\x02_\x00`\xff\x1f\x00`\xff`\x03 \x05\xa0\x03\xe0\x02\xa0\x02\x9f\x00\xe0\xfe\x1f\x00_\x00\xdf\x00\x9f\x01`\x02\xdf\x00\xa0\xfe\x9f\x00 \xff\xe0\xfe\x9f\x00\x9f\x00_\x01`\x02\x9f\x01\x9f\x01\x1f\x01\xe0\x02`\x03 \x05\xe0\x04\xe0\x05\xa0\x04\xe0\x05\xa1\x06\xe0\x04\xa0\x04\xe0\x02\xa0\x03`\x02 \x02

In [22]:
compressed_file_path = "../../data/102b47d9-371e-412a-8995-0dc6115ab2bb.wav.brainwire"

# Huffman-encode the raw sample bytes. The path is passed via the variable
# so that the encoder's output path and the path read back below cannot
# drift apart.
encode.huffman_encoding(
    input_data=data_bytes,
    compressed_file_path=compressed_file_path,
)

# Read the file at compressed_file_path back in as bytes.
data_huffman_encoded_bytes = read_file_bytes(compressed_file_path)

# type(data_huffman_encoded_bytes)

In [72]:
# Original File Size
original_file_size = print_output_file_size(current_file)

File Name: "102b47d9-371e-412a-8995-0dc6115ab2bb.wav".
Size: 197442 Bytes.


In [73]:
# Huffman Encoded File Size
huffman_encoded_file_size = print_output_file_size(compressed_file_path)

File Name: "102b47d9-371e-412a-8995-0dc6115ab2bb.wav.brainwire".
Size: 130459 Bytes.


## Testing Compressing a substring of the larger dataset


In [36]:
# Convert the Byte array to a list of integers:
data_huffman_encoded_bytes_sub_string = data_huffman_encoded_bytes[:]

rle_l_raw = list(data_huffman_encoded_bytes_sub_string)
rle_l_raw

[128,
 4,
 149,
 135,
 5,
 0,
 0,
 0,
 0,
 0,
 0,
 125,
 148,
 40,
 140,
 2,
 50,
 48,
 148,
 140,
 4,
 48,
 48,
 48,
 48,
 148,
 140,
 2,
 48,
 56,
 148,
 140,
 5,
 48,
 48,
 48,
 49,
 48,
 148,
 140,
 2,
 48,
 101,
 148,
 140,
 8,
 48,
 48,
 48,
 49,
 49,
 48,
 48,
 48,
 148,
 140,
 2,
 53,
 101,
 148,
 140,
 8,
 48,
 48,
 48,
 49,
 49,
 48,
 48,
 49,
 148,
 140,
 2,
 48,
 102,
 148,
 140,
 9,
 48,
 48,
 48,
 49,
 49,
 48,
 49,
 48,
 48,
 148,
 140,
 2,
 54,
 51,
 148,
 140,
 9,
 48,
 48,
 48,
 49,
 49,
 48,
 49,
 48,
 49,
 148,
 140,
 2,
 101,
 51,
 148,
 140,
 9,
 48,
 48,
 48,
 49,
 49,
 48,
 49,
 49,
 48,
 148,
 140,
 2,
 49,
 48,
 148,
 140,
 9,
 48,
 48,
 48,
 49,
 49,
 48,
 49,
 49,
 49,
 148,
 140,
 2,
 102,
 102,
 148,
 140,
 6,
 48,
 48,
 48,
 49,
 49,
 49,
 148,
 140,
 2,
 97,
 48,
 148,
 140,
 4,
 48,
 48,
 49,
 48,
 148,
 140,
 2,
 50,
 49,
 148,
 140,
 5,
 48,
 48,
 49,
 49,
 48,
 148,
 140,
 2,
 97,
 49,
 148,
 140,
 5,
 48,
 48,
 49,
 49,
 49,
 148,
 140,
 2,
 100,
 1

In [37]:
len(data_huffman_encoded_bytes)

130459

In [38]:
len(rle_l_raw)

130459

In [59]:
index_array, rle_locations_in_index_array = encode_via_rle(original_data_list=rle_l_raw)

In [60]:
# Convert RLE to Byte String
formatted_encoded_rle_byte_string = format_encoded_rle_to_bytes(
    index_array, rle_locations_in_index_array
)

In [61]:
formatted_encoded_rle_byte_string

b'\xcc\x03\x05\x00\x11\x00\x1b\x00\'\x00)\x00+\x005\x007\x009\x00D\x00F\x00J\x00T\x00V\x00d\x00f\x00i\x00t\x00v\x00y\x00~\x00\x83\x00\x85\x00\x8f\x00\x9b\x00\x9d\x00\xa8\x00\xaa\x00\xb6\x00\xd0\x00\xdb\x00\xdd\x00\xe8\x00\xec\x00\xf7\x00\x06\x01\t\x01\x14\x01 \x01"\x01-\x01:\x01E\x01P\x01\\\x01j\x01w\x01\x85\x01\x94\x01\x96\x01\xa4\x01\xa6\x01\xb3\x01\xc1\x01\xc7\x01\xcf\x01\xd2\x01\xdf\x01\xe2\x01\xe5\x01\xf2\x01\xf5\x01\xfd\x01\x05\x02\x08\x02\n\x02\x17\x02\'\x02)\x026\x02I\x02P\x02]\x02_\x02k\x02y\x02\x83\x02\x85\x02\x8f\x02\x91\x02\x9d\x02\x9f\x02\xa1\x02\xab\x02\xb8\x02\xbb\x02\xc5\x02\xc7\x02\xd1\x02\xdd\x02\xdf\x02\xe9\x02\xeb\x02\xf6\x02\xf8\x02\xfb\x02\x05\x03\x07\x03\x0c\x03\x16\x03\x18\x03\x1d\x03(\x03*\x031\x03;\x03=\x03D\x03P\x03R\x03Y\x03[\x03e\x03g\x03x\x03z\x03\x80\x03\x8a\x03\x8c\x03\x90\x03\x9a\x03\x9c\x03\x9e\x03\xa9\x03\xab\x03\xad\x03\xaf\x03\xb9\x03\xbb\x03\xbd\x03\xc1\x03\xcb\x03\xcd\x03\xcf\x03\xd3\x03\xde\x03\xe0\x03\xe2\x03\xe6\x03\xf1\x03\xf3\x03\xf5\x03\xf9\

In [62]:
sys.getsizeof(formatted_encoded_rle_byte_string)

262507

In [63]:
output_file_path = os.getcwd() + "/data/"
file_name = "formatted_encoded_rle_byte_string.brainwire"
output_file_path += file_name
output_file_path

'/Volumes/T7/Github/Neuralink-Compression-Challenge/analysis/tutorials/data/formatted_encoded_rle_byte_string.brainwire'

In [64]:
write_file_bytes(
    file_path=output_file_path, data_bytes=formatted_encoded_rle_byte_string
)

In [74]:
rle_huffman_encoded_file_size = print_output_file_size(
    output_file_path=output_file_path
)

File Name: "formatted_encoded_rle_byte_string.brainwire".
Size: 262474 Bytes.


##### Comparing File Sizes


In [79]:
original_file_size

197442

In [80]:
huffman_encoded_file_size

130459

In [78]:
rle_huffman_encoded_file_size

262474

In [81]:
"""Summary: 
The huffman encoded file is read as bytes. Each byte is represented
by a single integer. Each integer is run length encoded. This results in
a smaller integer list. However, the original list of amplitudes was
represented by two bytes per int. This means there were approximately
two integers for every original integer that was then run length
encoded. This will never be compressed less than the original amount of
integers. With the addition of an array of indices which define where
values are run-length-encoded, this increases the size of the
run-length-encoding. ∴ the value is approximately greater than twice
the size of the original number of amplitudes. The only solution is to 
run-length-encode the original list of amplitudes rather than 
run-length-encoding the integer representation of the byte 
representation of the original amplitudes.
"""

rle_huffman_encoded_file_size / huffman_encoded_file_size

2.011927118864931

In [82]:
rle_huffman_encoded_file_size / original_file_size

1.3293726765328553

##### Applying Run-Length-Encoding to Original Array of Amplitudes


In [None]:
"""Summary:
Creating an array of indices after run-length-encoding the array of
amplitudes results in an array that is larger than the original file
size. This is due to the fact that the intended advantage is to exploit
the size of a 16-bit integer rather than a 32-bit integer to reduce the
number of values that are populated in the run-length-encoded array.
"""

In [83]:
data_dir = "../../data/"
data_file_l = glob(data_dir + "*.wav")
current_file = data_file_l[0]

rate, data = wavfile.read(current_file)

In [84]:
index_array, rle_locations_in_index_array = encode_via_rle(data)

In [86]:
len(data)

98699

In [91]:
(len(index_array) + len(rle_locations_in_index_array)) / len(data)

1.0615001165158715

In [92]:
formatted_encoded_rle_byte_string = format_encoded_rle_to_bytes(
    index_array, rle_locations_in_index_array
)

In [95]:
# os.path.dirname() returns the directory without a trailing separator, so
# the file name has to be joined on rather than appended; plain string
# concatenation would fuse the directory name and the file name.
file_name = "formatted_rle_byte_string.brainwire"
output_file_path = os.path.join(os.path.dirname(output_file_path), file_name)

In [97]:
write_file_bytes(
    file_path=output_file_path, data_bytes=formatted_encoded_rle_byte_string
)

In [98]:
formatted_rle_byte_string_size = print_output_file_size(
    output_file_path=output_file_path
)

File Name: "dataformatted_rle_byte_string.brainwire".
Size: 209540 Bytes.


In [101]:
formatted_rle_byte_string_size / original_file_size

1.0612736905015143

##### Applying Original Run-Length-Encoding to Original Array of Amplitudes (No Array of Indices)


In [None]:
# Testing the new definition of "encode_via_rle":

##### Continuing Compression & Decompression Testing


In [42]:
encoded_rle = np.frombuffer(formatted_encoded_rle_byte_string, dtype=np.uint16)

In [43]:
encoded_rle

array([972,   5,  17, ..., 101,  46,  28], dtype=uint16)

In [None]:
len(encoded_rle)

In [19]:
rle_locations_in_index_array, index_array = parse_formatted_rle_bytes(
    formatted_encoded_rle_byte_string
)

In [20]:
# decode_rle expects the index_array first and the run locations second;
# keyword arguments make the pairing explicit.
reconstructed_array = decode_rle(
    index_array=index_array,
    rle_locations_in_index_array=rle_locations_in_index_array,
)

In [None]:
# Test for equality in reconstructed array
test_for_equality_in_reconstructed_array(rle_l_raw, reconstructed_array)

In [None]:
len(formatted_encoded_rle_byte_string)

In [None]:
len(data_huffman_encoded_bytes[:12])

In [None]:
formatted_encoded_rle_byte_string

In [None]:
# Dump every byte of the formatted byte string and collect the bytes in a
# DataFrame. The loop variable is deliberately not called `data`, because
# that name holds the waveform samples loaded with wavfile.read.
data_array = []
for index, byte_value in enumerate(formatted_encoded_rle_byte_string):
    data_array.append(byte_value)
    print(f"index: {index}\ndata: {byte_value}\n")
formatted_encoded_rle_bytes_pd = pd.DataFrame(data_array)

In [26]:
formatted_encoded_rle_bytes_pd.columns = ["value"]

In [27]:
formatted_encoded_rle_bytes_pd.index.name = "row"

In [None]:
formatted_encoded_rle_bytes_pd

In [None]:
formatted_encoded_rle_bytes_pd

In [None]:
rle_l_raw

In [None]:
data_huffman_encoded_bytes[:12]

In [32]:
# Iterating over a bytes object yields one integer per byte, so list()
# builds the same list without a loop variable that would overwrite the
# waveform samples stored in `data`.
data_array = list(data_huffman_encoded_bytes)

In [None]:
sys.getsizeof(data_array[0])

In [None]:
max(data_array)

In [35]:
data_array = np.array(data_array, dtype=np.int16)

In [None]:
data_array

In [None]:
data_array[0]

In [None]:
print(f"Ratio of the formatted_encoded_rle_byte_string ", end="")
print(f"to data_huffman_encoded_bytes: ", end="")
print(f"{len(formatted_encoded_rle_byte_string) / len(data_huffman_encoded_bytes)}")

#### Principal Components Example of the Run-Length-Encoded Problem


In [39]:
# Original file size: 197 KB
# Huffman Encoded Compressed File Size: 130 KB
# RLE Encoded Version of the Huffman Encoded Compressed File is of Size: 262 KB

In [40]:
data_dir = "../../data/"
data_file_l = glob(data_dir + "*.wav")
current_file = data_file_l[0]

In [41]:
# Load the selected recording and convert its samples to raw bytes.
rate, data = wavfile.read(current_file)

data_bytes = data.tobytes()

compressed_file_path = "../../data/102b47d9-371e-412a-8995-0dc6115ab2bb.wav.brainwire"

# Huffman-encode the raw sample bytes. The path is passed via the variable
# so that the encoder's output path and the path read back below cannot
# drift apart.
encode.huffman_encoding(
    input_data=data_bytes,
    compressed_file_path=compressed_file_path,
)

# Read the file at compressed_file_path back in as bytes.
data_huffman_encoded_bytes = read_file_bytes(compressed_file_path)

In [None]:
sys.getsizeof(data)

In [None]:
sys.getsizeof(data_bytes)

## Attempting RLE Exclusively


In [66]:
data_dir = "../../data/"
data_file_l = glob(data_dir + "*.wav")
current_file = data_file_l[0]

rate, data = wavfile.read(current_file)

data_bytes = data.tobytes()

compressed_file_path = "../../data/102b47d9-371e-412a-8995-0dc6115ab2bb.wav.brainwire"

In [None]:
os.getcwd() + "/" + current_file

In [None]:
data

In [69]:
index_array, rle_locations_in_index_array = encode_via_rle(data)

In [None]:
len(rle_locations_in_index_array)

In [None]:
index_array[:10]

In [None]:
len(data_bytes)

In [None]:
sys.getsizeof(data)

In [74]:
formatted_encoded_rle_byte_string = format_encoded_rle_to_bytes(
    index_array, rle_locations_in_index_array
)

In [None]:
sys.getsizeof(data)

In [None]:
sys.getsizeof(data_bytes)

In [None]:
sys.getsizeof(formatted_encoded_rle_byte_string)

In [None]:
formatted_encoded_rle_byte_string

In [79]:
compressed_file_path = "../../data/102b47d9-371e-412a-8995-0dc6115ab2bb.wav.brainwire"

# Huffman-encode the run-length-encoded byte string. The path is passed via
# the variable so that the encoder's output path and the path read back
# below cannot drift apart.
encode.huffman_encoding(
    input_data=formatted_encoded_rle_byte_string,
    compressed_file_path=compressed_file_path,
)

# Read the file at compressed_file_path back in as bytes.
data_huffman_encoded_bytes = read_file_bytes(compressed_file_path)

In [None]:
sys.getsizeof(data_huffman_encoded_bytes)

#### Testing the Trivial Example of Conversion: numpy array to bytes to rle to bytes to numpy array.


##### Creating Data


In [107]:
test_array = np.array([-2, -1, 0, 1, 2, 2, 2, 2, 1, 0, 1], dtype=np.int16)

In [None]:
len(test_array)

In [None]:
test_array_bytes = test_array.tobytes()
test_array_bytes

In [129]:
# Iterating over a bytes object yields one integer per byte; each of them
# is stored as an unsigned 16-bit integer.
test_array_bytes_to_int = np.array(list(test_array_bytes), dtype=np.uint16)

In [None]:
test_array_bytes_to_int.tobytes()

In [None]:
sys.getsizeof(test_array_bytes_to_int.tobytes())

In [None]:
len(test_array_bytes)

##### Writing Data to a File


In [91]:
output_file_path = os.getcwd() + "/data/"
file_name = "testfile.brainwire"
output_file_path = output_file_path + file_name

In [92]:
write_file_bytes(file_path=output_file_path, data_bytes=test_array_bytes)

In [None]:
print_output_file_size(output_file_path=output_file_path)

##### RLE on Data


In [99]:
index_array, rle_locations_in_index_array = encode_via_rle(
    original_data_list=test_array_bytes
)

formatted_encoded_rle_byte_string = format_encoded_rle_to_bytes(
    index_array, rle_locations_in_index_array
)

In [None]:
formatted_encoded_rle_byte_string

In [101]:
write_file_bytes(
    file_path=output_file_path, data_bytes=formatted_encoded_rle_byte_string
)

In [None]:
os.path.getsize(output_file_path)