In [4]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [9]:
def process_chunk(chunk):
    # Process bitstrings
    X_chunk = np.vstack([np.frombuffer(s.encode('ascii'), dtype=np.uint8) - ord('0') for s in chunk['concatenated_bitstrings']])
    # Process target variable
    y_chunk = chunk['num_included'].values.reshape(-1, 1)
    return X_chunk, y_chunk

# Get total number of rows in the CSV file
total_rows = sum(1 for _ in open('processed_data.csv', 'r')) - 1  # Subtract 1 for header

# Process the CSV file in chunks
chunk_size = 10000  # Adjust this based on your available memory
X_list = []
y_list = []

# Create a tqdm progress bar
with tqdm(total=total_rows, desc="Processing chunks") as pbar:
    for chunk in pd.read_csv('processed_data.csv', chunksize=chunk_size):
        X_chunk, y_chunk = process_chunk(chunk)
        X_list.append(X_chunk)
        y_list.append(y_chunk)
        pbar.update(len(chunk))  # Update progress bar

# Concatenate all chunks
X = np.vstack(X_list)
y = np.vstack(y_list)

Processing chunks: 100%|██████████| 100000/100000 [02:12<00:00, 755.35it/s]


In [10]:
# Normalize target values (num_included) to [0, 1] range
scaler = MinMaxScaler()
y_normalized = scaler.fit_transform(y)

print("X shape:", X.shape)
print("X dtype:", X.dtype)
print("y shape:", y.shape)
print("y dtype:", y.dtype)

# Verify the shape and content
num_rows, num_cols = X.shape
print(f"Number of rows: {num_rows}")
print(f"Number of columns (length of each bitstring): {num_cols}")

# If you need to verify the content of some rows:
print("First row of X:", X[0])
print("Last row of X:", X[-1])
print("First 5 values of y:", y[:5].flatten())
print("First 5 normalized values of y:", y_normalized[:5].flatten())

# Save processed data
np.save('processed_X.npy', X)
np.save('processed_y.npy', y_normalized)
print("Processed data saved to 'processed_X.npy' and 'processed_y.npy'")

X shape: (100000, 133008)
X dtype: uint8
y shape: (100000, 1)
y dtype: float64
Number of rows: 100000
Number of columns (length of each bitstring): 133008
First row of X: [0 0 0 ... 0 0 0]
Last row of X: [0 0 0 ... 0 0 0]
First 5 values of y: [ 757. 3108.  711. 5348. 9204.]
First 5 normalized values of y: [0.07560756 0.31073107 0.0710071  0.53475348 0.92039204]
Processed data saved to 'processed_X.npy' and 'processed_y.npy'


In [11]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# Define the model
def create_model(input_dim):
    model = Sequential([
        Dense(1024, activation='relu', input_shape=(input_dim,)),
        BatchNormalization(),
        Dropout(0.3),
        Dense(512, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
        Dense(256, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
        Dense(128, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
        Dense(64, activation='relu'),
        BatchNormalization(),
        Dense(1)  # Output layer
    ])
    return model

In [13]:
# Create and compile the model
model = create_model(X_train.shape[1])
model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [16]:
# Define callbacks
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True
)

model_checkpoint = ModelCheckpoint(
    'best_model_weights.weights.h5',
    save_best_only=True,
    save_weights_only=True
)

In [17]:
history = model.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=100,
    batch_size=64,
    callbacks=[early_stopping, model_checkpoint],
    verbose=1
)

Epoch 1/100
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m551s[0m 535ms/step - loss: 33463232.0000 - mae: 5010.2769 - val_loss: 31819438.0000 - val_mae: 4848.5581
Epoch 2/100
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m521s[0m 521ms/step - loss: 30648082.0000 - mae: 4771.3525 - val_loss: 16409704.0000 - val_mae: 3296.7056
Epoch 3/100
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m518s[0m 518ms/step - loss: 24550430.0000 - mae: 4313.5923 - val_loss: 52854592.0000 - val_mae: 6633.7822
Epoch 4/100
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m522s[0m 522ms/step - loss: 17055262.0000 - mae: 3681.2800 - val_loss: 22771616.0000 - val_mae: 3940.8162
Epoch 5/100
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m505s[0m 505ms/step - loss: 10799917.0000 - mae: 2984.8501 - val_loss: 14050205.0000 - val_mae: 3067.8875
Epoch 6/100
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m518s[0m 518ms/step - loss: 6232

KeyboardInterrupt: 

In [None]:
# Evaluate the model
test_loss, test_mae = model.evaluate(X_test, y_test, verbose=0)
print(f"Test MAE: {test_mae}")

In [None]:
# Make predictions
y_pred = model.predict(X_test)

In [None]:
# Inverse transform the predictions to get original scale
y_pred_original = scaler.inverse_transform(y_pred)
y_test_original = scaler.inverse_transform(y_test)

In [None]:
# Calculate MAE on the original scale
mae_original = np.mean(np.abs(y_pred_original - y_test_original))
print(f"MAE on original scale: {mae_original}")