In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Conv1D, GlobalMaxPooling1D, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import matplotlib.pyplot as plt

In [2]:
def create_dataset(X, y, batch_size=32, validation_split=0.2, shuffle_buffer_size=None):
    """Split ``X``/``y`` sequentially and wrap both parts as batched tf.data datasets.

    The leading ``1 - validation_split`` fraction of the samples (in their
    existing order) becomes the training set and the remainder the
    validation set. Only the training dataset is shuffled.

    Args:
        X: Samples; must support ``len()`` and slicing (e.g. a NumPy array).
        y: Targets aligned with ``X`` along the first axis.
        batch_size: Number of samples per batch.
        validation_split: Fraction of samples held out for validation,
            in the range ``[0, 1)``.
        shuffle_buffer_size: Size of the training shuffle buffer. ``None``
            (default) uses the number of training samples, which gives a
            uniform shuffle regardless of dataset size. Pass a smaller
            value to trade shuffle quality for memory.

    Returns:
        ``(train_dataset, val_dataset)`` as batched ``tf.data.Dataset`` objects.

    Raises:
        ValueError: If ``X`` and ``y`` differ in length, if
            ``validation_split`` is outside ``[0, 1)``, or if the split
            leaves no training samples.
    """
    n_samples = len(X)
    if len(y) != n_samples:
        raise ValueError(
            f"X and y must have the same number of samples, got {n_samples} and {len(y)}"
        )
    if not 0 <= validation_split < 1:
        raise ValueError(
            f"validation_split must be in [0, 1), got {validation_split!r}"
        )

    # Calculate split point
    split_point = int(n_samples * (1 - validation_split))
    if split_point == 0:
        raise ValueError(
            f"validation_split={validation_split!r} leaves no training samples "
            f"out of {n_samples}"
        )

    # A buffer smaller than the dataset only shuffles locally, so default to
    # the full training-set size instead of a fixed constant.
    if shuffle_buffer_size is None:
        shuffle_buffer_size = split_point

    # Create training dataset
    train_dataset = (
        tf.data.Dataset.from_tensor_slices((X[:split_point], y[:split_point]))
        .shuffle(buffer_size=shuffle_buffer_size)
        .batch(batch_size)
        .prefetch(tf.data.AUTOTUNE)
    )

    # Create validation dataset (kept in order; shuffling is not needed for evaluation)
    val_dataset = (
        tf.data.Dataset.from_tensor_slices((X[split_point:], y[split_point:]))
        .batch(batch_size)
        .prefetch(tf.data.AUTOTUNE)
    )

    return train_dataset, val_dataset

In [3]:
def process_data(data, normalize=False, return_scaler=False):
    """Convert the raw results frame into a feature matrix and a target column.

    Each entry of ``data['concatenated_bitstrings']`` is a comma-separated
    string of digit characters. The commas are dropped and every remaining
    character becomes one ``uint8`` feature, so all rows must contain the
    same number of digits.

    Args:
        data: DataFrame with the columns ``'concatenated_bitstrings'`` and
            ``'num_included'``.
        normalize: If True, rescale the targets with a ``MinMaxScaler``
            fitted on all rows of ``data``.
        return_scaler: If True, additionally return the fitted scaler
            (``None`` when ``normalize`` is False) so that predictions can
            be mapped back to the original scale.

    Returns:
        ``(X, y)`` where ``X`` has shape ``(n_rows, n_digits)`` and dtype
        ``uint8`` and ``y`` has shape ``(n_rows, 1)``; or
        ``(X, y, scaler)`` when ``return_scaler`` is True.

    Raises:
        ValueError: If there are no rows, a row is missing / not a string,
            a row contains a non-digit character, or rows differ in length.
    """
    ascii_zero, ascii_nine = ord('0'), ord('9')
    column = data['concatenated_bitstrings']

    # Decode straight from the string's bytes into a pre-allocated matrix.
    # This avoids building a Python list of single characters per row and
    # the extra full copy that stacking per-row arrays would make.
    X = None
    for row, text in enumerate(column):
        if not isinstance(text, str):
            raise ValueError(
                f"Row {row}: expected a bitstring, got {text!r}"
            )

        # Removing the commas is equivalent to splitting on them and re-joining.
        codes = np.frombuffer(text.replace(',', '').encode('ascii'), dtype=np.uint8)
        if codes.size and (codes.min() < ascii_zero or codes.max() > ascii_nine):
            raise ValueError(f"Row {row}: bitstring contains non-digit characters")

        if X is None:
            # The first row fixes the feature width for the whole matrix.
            X = np.empty((len(column), codes.size), dtype=np.uint8)
        elif codes.size != X.shape[1]:
            raise ValueError(
                f"Row {row}: bitstring has {codes.size} digits, expected {X.shape[1]}"
            )

        # ASCII code -> digit value, written directly into the output row.
        np.subtract(codes, np.uint8(ascii_zero), out=X[row])

    if X is None:
        raise ValueError("No rows to process")

    # Process target variable
    y = data['num_included'].values.reshape(-1, 1)

    # Normalize target values if requested
    scaler = None
    if normalize:
        scaler = MinMaxScaler()
        y = scaler.fit_transform(y)
        print("Target values normalized to range [0, 1]")

    print("Processed data shapes:")
    print(f"X shape: {X.shape}")
    print(f"y shape: {y.shape}")

    if not normalize:
        print(f"Target value range: {y.min():.2f} to {y.max():.2f}")

    if return_scaler:
        return X, y, scaler
    return X, y

In [4]:
def create_dense_model(input_shape):
    """Build an uncompiled fully connected network with a single linear output.

    The network consists of three hidden blocks (Dense with ReLU ->
    BatchNormalization -> Dropout) of decreasing width, followed by a
    one-unit Dense output layer without activation.

    Args:
        input_shape: Number of features per sample; the model's input has
            shape ``(input_shape,)``.

    Returns:
        The assembled ``Sequential`` model (not yet compiled).
    """
    # (units, dropout rate) of each hidden block, in order.
    hidden_blocks = ((256, 0.3), (128, 0.2), (64, 0.1))

    stack = [Input(shape=(input_shape,))]
    for units, drop_rate in hidden_blocks:
        stack.append(Dense(units, activation='relu'))
        stack.append(BatchNormalization())
        stack.append(Dropout(drop_rate))

    # Output layer
    stack.append(Dense(1))

    return Sequential(stack)

In [5]:
def create_conv1d_model(input_shape):
    """Build an uncompiled 1-D convolutional network with a single linear output.

    Three Conv1D (ReLU, ``padding='same'``) + BatchNormalization blocks are
    followed by global max pooling, one Dense/BatchNormalization/Dropout
    block and a one-unit Dense output layer without activation.

    Args:
        input_shape: Sequence length; the model's input has shape
            ``(input_shape, 1)``, i.e. one channel per position.

    Returns:
        The assembled ``Sequential`` model (not yet compiled).
    """
    # (filters, kernel size) of each convolutional block, in order.
    conv_blocks = ((64, 5), (32, 3), (16, 3))

    stack = [Input(shape=(input_shape, 1))]
    for filters, width in conv_blocks:
        stack.append(Conv1D(filters, kernel_size=width, activation='relu', padding='same'))
        stack.append(BatchNormalization())

    # Collapse the sequence axis, then the dense head and the output layer.
    stack += [
        GlobalMaxPooling1D(),
        Dense(32, activation='relu'),
        BatchNormalization(),
        Dropout(0.2),
        Dense(1),
    ]

    return Sequential(stack)

In [6]:
def setup_training(model, learning_rate=0.001):
    """Compile ``model`` in place and return the callbacks to use while fitting.

    The model is compiled with the Adam optimizer, mean-squared-error loss
    and mean-absolute-error as an additional metric.

    Args:
        model: Keras model to compile.
        learning_rate: Learning rate passed to Adam.

    Returns:
        A two-element list: an ``EarlyStopping`` callback (watches
        ``val_loss``, patience 10, restores the best weights) followed by a
        ``ModelCheckpoint`` callback that keeps only the best model in
        ``best_model.keras``.
    """
    optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])

    stop_early = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    keep_best = ModelCheckpoint('best_model.keras', monitor='val_loss', save_best_only=True)

    return [stop_early, keep_best]

In [7]:
# Load the semicolon-delimited results file
csv_path = './../data/single/single-data-1730732093026691400.csv'
data = pd.read_csv(csv_path, sep=';')

# Show which columns were read
print("Available columns:", list(data.columns))

# Build the feature matrix and the (un-normalized) targets
X, y = process_data(data, normalize=False)

Available columns: ['concatenated_bitstrings', 'num_included', 'num_excluded', 'duration', 'tries']
Processed data shapes:
X shape: (1000, 5769440)
y shape: (1000, 1)
Target value range: 992.00 to 499811.00


Dense Neural Network

In [8]:
def train_model(X, y, batch_size=32, epochs=10):
    """Train the dense network on ``X``/``y`` and return it with its fit history.

    The data is split into training and validation datasets by
    ``create_dataset``, a model from ``create_dense_model`` is compiled by
    ``setup_training``, and the model is then fitted with the callbacks
    that ``setup_training`` returns.

    Args:
        X: 2-D feature matrix; ``X.shape[1]`` is used as the model's input width.
        y: Targets aligned with ``X``.
        batch_size: Batch size for both datasets.
        epochs: Maximum number of training epochs.

    Returns:
        ``(model, history)``: the fitted model and the object returned by
        ``model.fit``.
    """
    # Batched training / validation datasets
    train_ds, val_ds = create_dataset(X, y, batch_size=batch_size)

    # One input unit per feature column
    n_features = X.shape[1]
    network = create_dense_model(n_features)
    fit_callbacks = setup_training(network)

    fit_history = network.fit(
        train_ds,
        validation_data=val_ds,
        epochs=epochs,
        callbacks=fit_callbacks,
        verbose=1,
    )

    return network, fit_history

In [9]:
# Train the dense network with a batch size of 12 (train_model defaults to 32)
batch_size = 12
trained_model, history = train_model(X, y, batch_size)

Epoch 1/10
[1m 2/67[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m4:42:25[0m 261s/step - loss: 94687715328.0000 - mae: 260291.1250

KeyboardInterrupt: 

In [None]:
# Plot training history: loss on the left, MAE on the right
fig, (loss_ax, mae_ax) = plt.subplots(1, 2, figsize=(15, 5))
curves = history.history

# (axes, training key, validation key, training label, validation label, title, y label)
panels = [
    (loss_ax, 'loss', 'val_loss', 'Training Loss', 'Validation Loss', 'Model Loss', 'Loss'),
    (mae_ax, 'mae', 'val_mae', 'Training MAE', 'Validation MAE', 'Model MAE', 'Mean Absolute Error'),
]

for ax, train_key, val_key, train_label, val_label, title, y_label in panels:
    ax.plot(curves[train_key], label=train_label)
    ax.plot(curves[val_key], label=val_label)
    ax.set_title(title)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(y_label)
    ax.legend()
    ax.grid(True)

fig.tight_layout()
plt.show()