In [1]:
import pickle
import pandas as pd
import numpy as np

In [2]:
# Load the pickled training set. The path is relative, so it is resolved
# against the current working directory of the kernel.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only open files that come from a trusted source.
with open('training-data-bitstrings-1725968884597180500.pkl', 'rb') as f:
    data = pickle.load(f)

# Unpack the data: the tuple-unpacking below requires the pickled object to
# contain exactly two items, the samples (X) and their labels (y).
X, y = data

# Convert y to a DataFrame with named columns for easier handling. The two
# column names require each row of y to carry two values
# (presumably the included/excluded counts -- TODO confirm against the
# code that produced the pickle).
df_y = pd.DataFrame(y, columns=['num_included', 'num_excluded'])

In [3]:
def standardize_dataset(X, max_filters=None):
    """Turn variable-size byte filters into a rectangular array of bit strings.

    Every sample in ``X`` is a sequence of filters, and every filter is a
    bytes-like object (iterating it must yield integers).  Each byte is
    rendered as eight '0'/'1' characters, most significant bit first.  The
    bit strings are then made uniform:

    * each filter is right-padded with '0' up to the longest filter found
      at the same position across all samples;
    * samples with fewer than ``max_filters`` filters receive all-'0'
      filters for the missing positions;
    * samples with more than ``max_filters`` filters are truncated to the
      first ``max_filters`` filters.

    Args:
        X: Iterable of samples, each an iterable of bytes-like filters.
        max_filters: Number of filter positions in the output.  ``None``
            (the default) uses the largest filter count found in ``X``.
            A value larger than any sample's filter count is allowed; the
            extra positions get zero-length filters.

    Returns:
        A tuple ``(X_standardized, max_filters, max_lengths)`` where
        ``X_standardized`` is a NumPy string array with one row per sample
        and one column per filter position, ``max_filters`` is the number
        of positions actually used, and ``max_lengths`` lists the bit
        length of each position.  An empty ``X`` yields an empty array,
        ``max_filters == 0`` (when not supplied) and ``max_lengths == []``.
    """
    def bytes_to_binary(byte_string):
        # '08b' zero-pads every byte to a full 8 bits.
        return ''.join(format(byte, '08b') for byte in byte_string)

    # Convert all filters to binary strings (materialised once so the input
    # is only iterated a single time).
    X_binary = [[bytes_to_binary(filter_bytes) for filter_bytes in sample] for sample in X]

    if max_filters is None:
        # default=0 keeps an empty dataset from raising ValueError.
        max_filters = max((len(sample) for sample in X_binary), default=0)

    # Longest bit string per position.  default=0 covers positions that no
    # sample reaches (caller-supplied max_filters larger than the data),
    # which previously raised "max() arg is an empty sequence".
    max_lengths = [
        max((len(sample[i]) for sample in X_binary if i < len(sample)), default=0)
        for i in range(max_filters)
    ]

    # Pad existing filters to their position's width and synthesise all-zero
    # filters for positions a sample does not have.
    padded_samples = [
        [
            sample[i].ljust(max_lengths[i], '0') if i < len(sample) else '0' * max_lengths[i]
            for i in range(max_filters)
        ]
        for sample in X_binary
    ]

    # dtype=str matches what NumPy infers for non-empty input and stops an
    # empty result from silently becoming a float array.  NumPy stores the
    # strings fixed-width, so every cell is as wide as the longest filter.
    X_standardized = np.array(padded_samples, dtype=str)

    return X_standardized, max_filters, max_lengths

In [4]:
# Standardize the raw filters and report the resulting layout.
X_standardized, max_filters, max_lengths = standardize_dataset(X)

print(f"Max number of filters: {max_filters}")
print(f"Max lengths of filters: {max_lengths}")
print(f"Shape of standardized dataset: {X_standardized.shape}")

# Spot-check: print at most the first three samples, showing the padded bit
# length of every filter and the leading 100 characters of its bit string.
for i, sample in enumerate(X_standardized[:3]):
    print(f"\nSample {i+1}:")
    for j, filter_str in enumerate(sample):
        print(f"  Filter {j+1}: {len(filter_str)} bits")
        print(f"    First 100 bits: {filter_str[:100]}...")

Max number of filters: 4
Max lengths of filters: [12848, 632, 192, 96]
Shape of standardized dataset: (10000, 4)

Sample 1:
  Filter 1: 12848 bits
    First 100 bits: 0000011100000000000000000000000000000000000000000000000000000000110000110001100000001000010100100000...
  Filter 2: 632 bits
    First 100 bits: 0000011100000000000000000000000000000000000000000000000000000000001101101110110000000001000010010001...
  Filter 3: 192 bits
    First 100 bits: 0000011100000000000000000000000000000000000000000000000000000000000001100100110101100001001011010000...
  Filter 4: 96 bits
    First 100 bits: 000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000...

Sample 2:
  Filter 1: 12848 bits
    First 100 bits: 0000011100000000000000000000000000000000000000000000000000000000101010000110100101101111000110010000...
  Filter 2: 632 bits
    First 100 bits: 0000011100000000000000000000000000000000000000000000000000000000100101000111000111010100000010100100

In [5]:
# Regression target: the 'num_included' column as a float32 NumPy array
# (float is usually the preferred dtype for deep learning models).
y_target = df_y['num_included'].values.astype(np.float32)

# Quick sanity check of the target's shape, dtype and leading values.
print("Shape of y_target:", y_target.shape)
print("Data type of y_target:", y_target.dtype)
print("First few values of y_target:", y_target[:5])

Shape of y_target: (10000,)
Data type of y_target: float32
First few values of y_target: [976. 131. 110. 851. 731.]
