In [13]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [14]:
def process_chunk(chunk):
    """Convert one DataFrame chunk into a feature matrix and a target column.

    Every entry of ``chunk['concatenated_bitstrings']`` is ASCII-encoded and
    the character code of ``'0'`` is subtracted from each byte, so the
    characters '0' and '1' map to the integers 0 and 1. The per-row arrays are
    stacked with ``np.vstack``, which needs every bitstring in the chunk to
    have the same length.

    Args:
        chunk: DataFrame with the columns ``concatenated_bitstrings`` (strings)
            and ``num_included``.

    Returns:
        Tuple ``(X_chunk, y_chunk)``: the stacked ``uint8`` bit matrix with one
        row per bitstring, and the ``num_included`` values reshaped to a single
        column of shape ``(n_rows, 1)``.
    """
    zero_code = ord('0')
    bit_rows = []
    for bitstring in chunk['concatenated_bitstrings']:
        # View the ASCII bytes as uint8 without copying, then shift '0' -> 0.
        raw_bytes = np.frombuffer(bitstring.encode('ascii'), dtype=np.uint8)
        bit_rows.append(raw_bytes - zero_code)
    features = np.vstack(bit_rows)
    targets = chunk['num_included'].values.reshape(-1, 1)
    return features, targets

# Location of the preprocessed CSV that holds the bitstrings and targets
csv_path = 'processed_data.csv'

# Count the data rows up front so the progress bar has a total.
# The `with` block closes the file handle once counting is done.
with open(csv_path, 'r') as csv_file:
    total_rows = sum(1 for _ in csv_file) - 1  # Subtract 1 for header

# Process the CSV file in chunks
chunk_size = 10000  # Adjust this based on your available memory
X_list = []
y_list = []

# Create a tqdm progress bar
with tqdm(total=total_rows, desc="Processing chunks") as pbar:
    # Force the bitstring column to be read as text: with dtype inference an
    # all-digit value that fits an integer would be parsed as a number, which
    # drops leading zeros and makes `.encode` in process_chunk fail.
    chunk_reader = pd.read_csv(
        csv_path,
        chunksize=chunk_size,
        dtype={'concatenated_bitstrings': str},
    )
    for chunk in chunk_reader:
        X_chunk, y_chunk = process_chunk(chunk)
        X_list.append(X_chunk)
        y_list.append(y_chunk)
        pbar.update(len(chunk))  # Advance by the number of rows just handled

# Concatenate all chunks into one feature matrix and one target column
X = np.vstack(X_list)
y = np.vstack(y_list)

Processing chunks: 100%|██████████| 10000/10000 [00:01<00:00, 6166.37it/s]


In [15]:
# Rescale the target column (num_included) into the [0, 1] range.
# The fitted scaler is kept so later cells can map values back.
scaler = MinMaxScaler()
y_normalized = scaler.fit_transform(y)

# Report the basic layout of the assembled arrays
for label, value in (
    ("X shape:", X.shape),
    ("X dtype:", X.dtype),
    ("y shape:", y.shape),
    ("y dtype:", y.dtype),
):
    print(label, value)

# One row per sample, one column per bit of the bitstring
num_rows, num_cols = X.shape
print(f"Number of rows: {num_rows}")
print(f"Number of columns (length of each bitstring): {num_cols}")

# Spot-check a few rows and compare raw targets with their scaled values
for label, value in (
    ("First row of X:", X[0]),
    ("Last row of X:", X[-1]),
    ("First 5 values of y:", y[:5].flatten()),
    ("First 5 normalized values of y:", y_normalized[:5].flatten()),
):
    print(label, value)

# Persist the arrays; note that processed_y.npy holds the *normalized* targets
for filename, array in (
    ('processed_X.npy', X),
    ('processed_y.npy', y_normalized),
):
    np.save(filename, array)
print("Processed data saved to 'processed_X.npy' and 'processed_y.npy'")

X shape: (10000, 13456)
X dtype: uint8
y shape: (10000, 1)
y dtype: float64
Number of rows: 10000
Number of columns (length of each bitstring): 13456
First row of X: [0 0 0 ... 0 0 0]
Last row of X: [0 0 0 ... 0 0 0]
First 5 values of y: [774. 967. 980. 296. 279.]
First 5 normalized values of y: [0.77377377 0.96696697 0.97997998 0.2952953  0.27827828]
Processed data saved to 'processed_X.npy' and 'processed_y.npy'


In [16]:
# Split the data into train (80%) and test (20%) sets.
# The targets passed here must be the *normalized* ones: the evaluation cells
# call scaler.inverse_transform on both the predictions and y_test, which is
# only meaningful if the model was trained on (and y_test holds) values in the
# scaler's [0, 1] space. Splitting the raw `y` would train on unscaled targets
# and make the "original scale" MAE meaningless.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_normalized, test_size=0.2, random_state=42
)

In [24]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input

def create_model(input_shape):
    """Build the (uncompiled) feed-forward regression network.

    Args:
        input_shape: Number of input features; used as the single dimension
            of the input layer, i.e. ``Input(shape=(input_shape,))``.

    Returns:
        A ``Sequential`` model made of an input layer, two ReLU hidden layers
        with 64 and 32 units, and one linear output unit.
    """
    hidden_units = (64, 32)
    layer_stack = [Input(shape=(input_shape,))]
    layer_stack.extend(Dense(units, activation='relu') for units in hidden_units)
    # Single output unit with no activation, for scalar regression
    layer_stack.append(Dense(1))
    return Sequential(layer_stack)

In [25]:
# The network takes one input per column of the training matrix
n_features = X_train.shape[1]
model = create_model(n_features)

# Train with mean squared error; also report mean absolute error
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='mse',
    metrics=['mae'],
)

In [26]:
# Training callbacks

# Stop once validation loss has not improved for 10 epochs and roll the model
# back to the weights of the best epoch.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Write weights only (not the full model) to disk, keeping just the best
# checkpoint. No `monitor` is passed, so the callback's default metric applies.
model_checkpoint = ModelCheckpoint(
    'best_model_weights.weights.h5',
    save_weights_only=True,
    save_best_only=True,
)

In [27]:
# Sanity check: report whether the training features or targets contain any
# NaN or infinite entries (prints NaN-check then Inf-check for each array).
for training_array in (X_train, y_train):
    print(np.isnan(training_array).any())
    print(np.isinf(training_array).any())

False
False
False
False


In [28]:
# Show the container type and shape of what is about to be fed to Keras
for training_array in (X_train, y_train):
    print(type(training_array))
    print(training_array.shape)

# NOTE(review): the recorded run of this cell ended with
# "ValueError: object __array__ method not producing an array" even though
# both inputs print as numpy.ndarray. The cause is not visible in this
# notebook; possibly a NumPy/TensorFlow version mismatch in the environment —
# TODO confirm before relying on the cells below.
fit_options = dict(
    validation_split=0.2,  # hold out the last 20% of the training data
    epochs=100,            # upper bound; early stopping may end sooner
    batch_size=64,
    callbacks=[early_stopping, model_checkpoint],
    verbose=1,
)
history = model.fit(X_train, y_train, **fit_options)

<class 'numpy.ndarray'>
(8000, 13456)
<class 'numpy.ndarray'>
(8000, 1)


ValueError: object __array__ method not producing an array

In [None]:
# Score the model on the held-out test split; evaluate() returns the loss
# followed by the compiled metric, in the same scale as y_test.
test_metrics = model.evaluate(X_test, y_test, verbose=0)
test_loss, test_mae = test_metrics
print(f"Test MAE: {test_mae}")

In [None]:
# Run the trained model on the held-out features. The predictions are in the
# same scale as the targets the model was fitted on (y_train).
y_pred = model.predict(X_test)

In [None]:
# Map both the reference targets and the predictions back through the fitted
# MinMaxScaler so they can be compared in the original num_included units.
# NOTE(review): this is only valid if y_test and the model outputs are in the
# scaler's normalized space, i.e. the train/test split was made on
# y_normalized rather than the raw y — verify against the split cell.
y_test_original = scaler.inverse_transform(y_test)
y_pred_original = scaler.inverse_transform(y_pred)

In [None]:
# Mean absolute error between predictions and targets after both have been
# mapped back through the scaler
absolute_errors = np.abs(y_pred_original - y_test_original)
mae_original = absolute_errors.mean()
print(f"MAE on original scale: {mae_original}")