In [None]:
pip install imageio

Collecting imageio
  Downloading imageio-2.36.0-py3-none-any.whl (315 kB)
     -------------------------------------- 315.4/315.4 kB 3.3 MB/s eta 0:00:00
Installing collected packages: imageio
Successfully installed imageio-2.36.0
Note: you may need to restart the kernel to use updated packages.




In [None]:
import os
import array
import numpy as np
import imageio  # Use imageio for saving image
from concurrent.futures import ThreadPoolExecutor # for parallelization
import time
import shutil
import pandas as pd

Converting `.bytes` files to `.png` images

1. Removed the addresses as addresses don't contain meaningful value

2. Processing of unknown '??' values:
* Option A: ignore them (the size of the byte file might be altered)
* Option B: convert them to 00 (the original structure of the byte files is retained, but some images end up as a huge chunk of black). The code below currently uses this option; the "ignore" variant is kept as a commented-out line.

In [None]:
start = time.time()
## THIS IS FOR ALL FILES IN THE TRAIN FOLDER. USES PARALLELIZATION

# Define the directory where the .bytes files are stored and the output directory
train_dir = 'train'  # Folder containing .bytes files
output_dir = 'byte_images'  # Folder to save the converted PNG images

# Create the output directory if it doesn't exist. exist_ok=True avoids the
# check-then-create race and makes the cell safe to re-run.
os.makedirs(output_dir, exist_ok=True)

# Function to clean and convert a single .bytes file to a PNG image
def clean_and_convert_bytes_to_png(file):
    """Convert one hex-dump ``.bytes`` file from ``train_dir`` into a grayscale PNG.

    Every input line is treated as an address token followed by hex byte
    tokens. The address is dropped and unknown ``??`` bytes are replaced by 0.
    The bytes are laid out row by row in an image whose width is
    ``ceil(sqrt(number_of_bytes))``; trailing bytes that do not fill a complete
    row are discarded. The result is written to ``output_dir`` as
    ``<name>.png``.

    Parameters
    ----------
    file : str
        File name (not a path) inside ``train_dir``. Names that do not end in
        ``.bytes`` are skipped.

    Returns
    -------
    None
        The function works through side effects (writes a PNG, prints a
        status line). Files with no byte values are skipped with a message.
    """
    if not file.endswith(".bytes"):  # Only process files with the '.bytes' extension
        return

    # splitext strips only the final extension, so names containing extra
    # dots keep their full stem and cannot collide with each other.
    file_base = os.path.splitext(file)[0]

    # Open the original .bytes file from the /train folder
    byte_values = []
    with open(os.path.join(train_dir, file), "r") as fp:
        for line in fp:
            # Remove the address (the first token) and keep only the bytes.
            # split() without arguments tolerates repeated spaces, tabs and
            # blank lines, none of which produce empty tokens.
            tokens = line.split()[1:]
            # Unknown '??' bytes are replaced with 0 so the original layout is
            # retained (the alternative is to drop them: `if tok != '??'`).
            byte_values.extend(0 if tok == '??' else int(tok, 16) for tok in tokens)

    # Determine the file size in bytes and calculate image dimensions
    ln = len(byte_values)  # Number of byte values
    if ln == 0:
        # An empty file would give width == 0 and a ZeroDivisionError below.
        print(f"Skipped {file}: no byte values found")
        return

    width = int(np.ceil(np.sqrt(ln)))  # Image width is the (rounded-up) square root of the size
    rem = ln % width  # Bytes left over after the last complete row

    # Truncate any extra bytes that don't fit into the shape
    byte_values = byte_values[:ln - rem]

    # Reshape the byte values into a 2D array for image representation
    g = np.reshape(byte_values, (len(byte_values) // width, width))
    g = g.astype(np.uint8)  # Convert the data to uint8 type

    # Save the grayscale image using imageio
    output_image_path = os.path.join(output_dir, f'{file_base}.png')
    imageio.imwrite(output_image_path, g)

    print(f"Converted {file} to {output_image_path}")

# List of files in the /train directory
files = os.listdir(train_dir)

# Use ThreadPoolExecutor to parallelize the cleaning and conversion process.
# Futures are kept (instead of a discarded executor.map iterator) so that an
# exception raised inside a worker is not silently lost.
with ThreadPoolExecutor() as executor:
    futures = {
        executor.submit(clean_and_convert_bytes_to_png, name): name
        for name in files
    }

# Leaving the `with` block waits for every task, so all futures are done here.
failed_files = []
for future, name in futures.items():
    error = future.exception()
    if error is not None:
        failed_files.append(name)
        print(f"Failed to convert {name}: {error!r}")

if failed_files:
    print(f"Conversion finished with {len(failed_files)} failed file(s).")
else:
    print("Conversion of byte files to PNG images completed.")
end = time.time()

print("Time taken: {} m {} s".format(int((end-start)//60), int((end-start)%60)))

Converted 04hSzLv5s2TDYPlcgpHB.bytes to byte_images\04hSzLv5s2TDYPlcgpHB.png
Converted 0aVNj3qFgEZI6Akf4Kuv.bytes to byte_images\0aVNj3qFgEZI6Akf4Kuv.png
Converted 0eN9lyQfwmTVk7C2ZoYp.bytes to byte_images\0eN9lyQfwmTVk7C2ZoYp.png
Converted 0hZEqJ5eMVjU21HAG7Ii.bytes to byte_images\0hZEqJ5eMVjU21HAG7Ii.png
Converted 08BX5Slp2I1FraZWbc6j.bytes to byte_images\08BX5Slp2I1FraZWbc6j.png
Converted 0DbLeKSoxu47wjqVHsi9.bytes to byte_images\0DbLeKSoxu47wjqVHsi9.png
Converted 0GUIi7xAlODwZ4YBenNM.bytes to byte_images\0GUIi7xAlODwZ4YBenNM.png
Converted 0EL7OGZKozbiNCVP61gk.bytes to byte_images\0EL7OGZKozbiNCVP61gk.png
Converted 0gcZkSFr7VnEmLPbTxUe.bytes to byte_images\0gcZkSFr7VnEmLPbTxUe.png
Converted 0ItXlAUOhK8ZYdDf7HW4.bytes to byte_images\0ItXlAUOhK8ZYdDf7HW4.png
Converted 05aiMRw13bYWqZ8OHvjl.bytes to byte_images\05aiMRw13bYWqZ8OHvjl.png
Converted 065EZhxgbLRSHsB87uIF.bytes to byte_images\065EZhxgbLRSHsB87uIF.png
Converted 0gxJ1YmwFUvnOzoM8N53.bytes to byte_images\0gxJ1YmwFUvnOzoM8N53.png

**TODO:** Decide whether the images need to be brought to a fixed size (maybe 256x256?) and, if so, whether to resize, pad, or crop them.

## Train Test Split

In [None]:
start = time.time()

# Define the paths
byte_images_folder = './byte_images'
train_folder = './train_byte_image'
test_folder = './test_byte_image'
train_labels_path = './train_labels.csv'
test_labels_path = './test_labels.csv'

# Create train and test directories if they don't exist
os.makedirs(train_folder, exist_ok=True)
os.makedirs(test_folder, exist_ok=True)

# Load the CSV files into DataFrames
train_labels_df = pd.read_csv(train_labels_path)
test_labels_df = pd.read_csv(test_labels_path)

# Get the list of IDs from train and test CSV files
train_ids = train_labels_df['Id'].tolist()
test_ids = test_labels_df['Id'].tolist()

# Sets give constant-time membership tests; looking each file up in a list
# costs time proportional to the number of labels for every image.
train_id_set = set(train_ids)
test_id_set = set(test_ids)

moved_to_train = 0
moved_to_test = 0
unmatched = 0

# Iterate through the images in the byte_images folder.
# NOTE: files are moved, not copied, so after a successful run this folder only
# contains files that matched neither label file.
for filename in os.listdir(byte_images_folder):
    # Extract the file ID by stripping only the trailing extension
    # (str.replace would also remove '.png' from the middle of a name).
    file_id, extension = os.path.splitext(filename)
    if extension != '.png':
        continue  # Not a converted image; leave it untouched

    # Check if the file ID is in train or test labels (train wins on overlap)
    source_path = os.path.join(byte_images_folder, filename)
    if file_id in train_id_set:
        # Move the file to the train_byte_image folder
        shutil.move(source_path, os.path.join(train_folder, filename))
        moved_to_train += 1
    elif file_id in test_id_set:
        # Move the file to the test_byte_image folder
        shutil.move(source_path, os.path.join(test_folder, filename))
        moved_to_test += 1
    else:
        unmatched += 1

print("Files have been moved to the respective directories.")
print(f"Moved {moved_to_train} to train, {moved_to_test} to test; {unmatched} image(s) matched no label.")

end = time.time()

print("Time taken: {} m {} s".format(int((end-start)//60), int((end-start)%60)))


Files have been moved to the respective directories.
Time taken: 0 m 2 s
