In [1]:
# =============================================================================
# IMPORTS AND SETUP
# =============================================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation, PillowWriter
import matplotlib.ticker as mticker

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from keras.models import Sequential
from keras.layers import LSTM, Dense, Input
from keras import optimizers
from keras.callbacks import Callback

# Set random seeds for reproducibility.
# Seeding NumPy alone only fixes the subsampling and label-flipping done with
# np.random below; Keras weight initialisation uses the backend's own RNG.
# keras.utils.set_random_seed seeds Python's `random`, NumPy and the backend.
from keras.utils import set_random_seed

SEED = 42
np.random.seed(SEED)
set_random_seed(SEED)


In [2]:
# =============================================================================
# PEPTIDE ENCODING FUNCTIONS
# =============================================================================

# Standard amino acid alphabet
AA_ALPHABET = 'ARNDCQEGHILKMFPSTWYV'

# Residue letter -> column index. Built once here rather than on every call,
# because the encoder is invoked once per peptide in the dataset.
_AA_TO_INDEX = {aa: i for i, aa in enumerate(AA_ALPHABET)}


def encode_peptide_onehot(sequence):
    """
    Convert a peptide sequence to one-hot encoding.

    Each residue becomes a vector of len(AA_ALPHABET) zeros with a single 1 at
    the residue's position in AA_ALPHABET. Characters that are not in
    AA_ALPHABET (including lowercase letters) are encoded as an all-zero vector.

    Args:
        sequence (str): Peptide sequence using single-letter amino acid codes

    Returns:
        list: One-hot encoded sequence as list of lists, shape (length, 20);
            an empty sequence yields an empty list
    """
    width = len(AA_ALPHABET)
    encoding = []

    for amino_acid in sequence:
        onehot_vector = [0] * width
        column = _AA_TO_INDEX.get(amino_acid)
        if column is not None:
            onehot_vector[column] = 1
        encoding.append(onehot_vector)

    return encoding


In [3]:
# =============================================================================
# CUSTOM EARLY STOPPING CALLBACK
# =============================================================================

class ProportionalEarlyStopping(Callback):
    """
    Early stopping based on train/validation loss ratio.

    Stops training when (train_loss / val_loss) <= ratio for `patience`
    consecutive epochs, i.e. when the training loss has dropped well below the
    validation loss. Any epoch whose ratio is above `ratio` resets the counter.

    Args:
        ratio (float): Threshold for train_loss / val_loss.
        patience (int): Number of consecutive epochs at or below `ratio`
            required before training is stopped.
        verbose (int): If > 0, print a message at the end of training when
            early stopping was triggered.
        restore_best_weights (bool): If True, remember the weights from the
            epoch with the lowest validation loss and load them back into the
            model when early stopping is triggered.
    """

    def __init__(self, ratio=0.9, patience=3, verbose=0, restore_best_weights=True):
        super().__init__()
        self.ratio = ratio
        self.patience = patience
        self.verbose = verbose
        self.restore_best_weights = restore_best_weights
        self.wait = 0
        self.stopped_epoch = 0
        self.best_val_loss = float('inf')
        self.best_weights = None
        self._stopped_early = False

    def on_train_begin(self, logs=None):
        # Reset all per-run state so one instance can be reused across fit() calls
        self.wait = 0
        self.stopped_epoch = 0
        self.best_val_loss = float('inf')
        self.best_weights = None
        self._stopped_early = False

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        train_loss = logs.get('loss')
        val_loss = logs.get('val_loss')

        # Without both losses (e.g. no validation data) the ratio cannot be formed
        if train_loss is None or val_loss is None:
            return

        # Snapshot the weights whenever validation loss improves
        if self.restore_best_weights and val_loss < self.best_val_loss:
            self.best_val_loss = val_loss
            self.best_weights = self.model.get_weights()

        # A validation loss of exactly zero would divide by zero; the training
        # loss cannot be "much lower" than zero, so treat it as healthy.
        if val_loss == 0:
            healthy = True
        else:
            healthy = (train_loss / val_loss) > self.ratio

        if healthy:
            self.wait = 0  # Reset counter
            return

        self.wait += 1
        if self.wait >= self.patience:
            self.stopped_epoch = epoch
            self._stopped_early = True
            self.model.stop_training = True
            if self.restore_best_weights and self.best_weights is not None:
                self.model.set_weights(self.best_weights)

    def on_train_end(self, logs=None):
        # Use an explicit flag: stopped_epoch == 0 is a valid stop (first epoch)
        if self._stopped_early and self.verbose > 0:
            print(f'Early stopping at epoch {self.stopped_epoch + 1}')



In [4]:
# =============================================================================
# MODEL ARCHITECTURE
# =============================================================================

def create_peptide_lstm_classifier(sequence_length=12, lstm_units_1=140,
                                   lstm_units_2=20, learning_rate=0.001):
    """
    Build and compile the two-layer LSTM used for binary peptide classification.

    The network takes one-hot input of shape (sequence_length, 20), passes it
    through two stacked LSTM layers (the first returns the full sequence, the
    second only its final state) and ends in a single sigmoid unit.

    Args:
        sequence_length (int): Number of residues per input peptide
        lstm_units_1 (int): Width of the first LSTM layer
        lstm_units_2 (int): Width of the second LSTM layer
        learning_rate (float): Step size passed to the Adam optimizer

    Returns:
        keras.Model: Sequential model compiled with binary cross-entropy loss
            and an accuracy metric
    """
    n_amino_acids = 20  # width of one one-hot residue vector

    layer_stack = [
        Input(shape=(sequence_length, n_amino_acids)),
        LSTM(lstm_units_1, return_sequences=True, name='lstm_1'),
        LSTM(lstm_units_2, return_sequences=False, name='lstm_2'),
        Dense(1, activation='sigmoid', name='output'),
    ]
    classifier = Sequential(layer_stack)

    classifier.compile(
        loss='binary_crossentropy',
        optimizer=optimizers.Adam(learning_rate=learning_rate),
        metrics=['accuracy'],
    )
    return classifier


In [5]:
# =============================================================================
# DATA LOADING AND PREPROCESSING
# =============================================================================

def load_and_preprocess_data(csv_file, test_size=0.05, random_state=42):
    """
    Load peptide dataset and prepare for training.

    Reads the 'peptide' and 'label' columns, makes a stratified train/test
    split, and one-hot encodes the sequences with encode_peptide_onehot.

    Args:
        csv_file (str): Path to CSV file with 'peptide' and 'label' columns
        test_size (float): Fraction of data to use for testing
        random_state (int): Random seed for reproducible splits

    Returns:
        tuple: (X_train_encoded, X_test_encoded, y_train, y_test), where the
            encoded arrays are float32 with shape (n, sequence_length, 20) and
            the labels are 1-D numpy arrays

    Raises:
        ValueError: If the file has no rows, a peptide is missing, or the
            peptides do not all have the same length
    """
    print("Loading peptide dataset...")

    # Load data - assumes CSV has 'peptide' and 'label' columns
    df = pd.read_csv(csv_file, usecols=["peptide", "label"])
    print(f"Loaded {len(df)} peptide sequences")

    # Fail fast with a clear message: a missing or odd-length peptide would
    # otherwise surface later as an obscure encoding or array-shape error.
    if df.empty:
        raise ValueError(f"{csv_file} contains no rows")
    sequence_lengths = df['peptide'].str.len()
    if sequence_lengths.isna().any():
        raise ValueError("'peptide' column contains missing values")
    if sequence_lengths.nunique() != 1:
        raise ValueError(
            "All peptides must have the same length; found lengths "
            f"{sorted(int(n) for n in sequence_lengths.unique())}"
        )

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        df['peptide'], df['label'],
        test_size=test_size,
        random_state=random_state,
        stratify=df['label']  # Maintain class balance
    )

    print("Encoding peptide sequences...")
    # One-hot encode sequences. float32 halves the memory of the int64 default
    # that np.array would pick for nested lists of Python ints.
    X_train_encoded = np.array(
        [encode_peptide_onehot(seq) for seq in X_train], dtype=np.float32
    )
    X_test_encoded = np.array(
        [encode_peptide_onehot(seq) for seq in X_test], dtype=np.float32
    )

    # Convert labels to numpy arrays
    y_train = np.array(y_train)
    y_test = np.array(y_test)

    print(f"Training set: {len(X_train_encoded)} sequences")
    print(f"Test set: {len(X_test_encoded)} sequences")
    print(f"Sequence length: {X_train_encoded.shape[1]}")

    return X_train_encoded, X_test_encoded, y_train, y_test


In [6]:
# =============================================================================
# NOISE INJECTION
# =============================================================================

def add_label_noise(labels, noise_ratio):
    """
    Add random label noise by flipping a fraction of labels.

    Exactly floor(len(labels) * noise_ratio) distinct positions are chosen
    with np.random.choice (global NumPy RNG) and flipped as 1 - label. The
    input is never modified.

    Args:
        labels (array-like): Original binary (0/1) labels
        noise_ratio (float): Fraction of labels to flip (0.0 to 1.0);
            values <= 0 return an unmodified copy

    Returns:
        np.array: Noisy labels with flipped values

    Raises:
        ValueError: If noise_ratio is greater than 1
    """
    if noise_ratio > 1:
        raise ValueError(f"noise_ratio must be at most 1.0, got {noise_ratio}")

    # asarray lets plain lists through; copy() keeps the caller's data intact
    noisy_labels = np.asarray(labels).copy()
    if noise_ratio <= 0:
        return noisy_labels

    n_labels = len(noisy_labels)
    # The small epsilon guards against float truncation: 100 * 0.29 evaluates
    # to 28.999999999999996, which a bare int() would turn into 28 flips.
    n_flip = min(n_labels, int(n_labels * noise_ratio + 1e-9))

    # Randomly select indices to flip
    flip_indices = np.random.choice(n_labels, size=n_flip, replace=False)

    # Flip selected labels (0->1, 1->0)
    noisy_labels[flip_indices] = 1 - noisy_labels[flip_indices]

    return noisy_labels


In [7]:
# =============================================================================
# TRAINING EXPERIMENTS
# =============================================================================

def run_noise_vs_training_size_experiment(X_train, X_test, y_train, y_test):
    """
    Experiment 1: Effect of label noise at different training set sizes.

    For every (noise level, training size) pair this draws a random subset of
    the training data, flips a fraction of its labels, trains a fresh LSTM
    classifier with ProportionalEarlyStopping, and scores it on the clean test
    set. Results are also written to
    'peptide_noise_vs_training_size_results.csv'.

    Args:
        X_train (np.ndarray): One-hot encoded training sequences
        X_test (np.ndarray): One-hot encoded test sequences
        y_train (np.ndarray): Clean training labels
        y_test (np.ndarray): Clean test labels

    Returns:
        dict: noise level -> {'training_sizes': [...], 'accuracies': [...]},
            with training sizes expressed in thousands

    Raises:
        ValueError: If the largest requested training size exceeds len(X_train)
    """
    # Local import: used to release Keras global state between the many
    # models built in the loops below.
    from keras import backend as keras_backend

    print("\n" + "="*60)
    print("EXPERIMENT 1: Label Noise vs Training Set Size")
    print("="*60)

    # Experimental parameters
    noise_levels = [0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45]
    training_sizes_k = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]  # In thousands

    # Base batch size paired with each noise level (larger batches for higher noise)
    batch_sizes = [16, 32, 64, 128, 256, 256, 512, 512, 1024, 1024]

    learning_rate = 0.001  # Fixed learning rate

    # Fail before any training starts rather than part-way through the sweep,
    # since sampling without replacement cannot exceed the available rows.
    n_available = len(X_train)
    largest_request = max(training_sizes_k) * 1000
    if largest_request > n_available:
        raise ValueError(
            f"Largest training size ({largest_request}) exceeds the "
            f"{n_available} available training sequences"
        )

    # Take the sequence length from the data instead of assuming 12-mers
    sequence_length = X_train.shape[1]

    # Store results
    results = {noise: {'training_sizes': [], 'accuracies': []} for noise in noise_levels}

    print(f"Testing {len(noise_levels)} noise levels x {len(training_sizes_k)} training sizes")
    print("Noise levels:", [f"{int(n*100)}%" for n in noise_levels])

    for noise_ratio, batch_size_base in zip(noise_levels, batch_sizes):
        print(f"\nTesting noise level: {int(noise_ratio*100)}%")

        for train_size_k in training_sizes_k:
            current_train_size = train_size_k * 1000

            # Calculate adaptive batch size based on training size.
            # NOTE(review): the multiplier is the subset's share of X_train in
            # percent, so it exceeds 1 for any subset above 1% of the data and
            # the effective batch size is far larger than batch_size_base -
            # confirm this scaling is intended.
            train_ratio = current_train_size / n_available
            batch_multiplier = train_ratio * 100
            current_batch_size = max(16, int(batch_multiplier * batch_size_base))

            # Sample training data
            sample_indices = np.random.choice(
                n_available, size=current_train_size, replace=False
            )
            X_train_sample = X_train[sample_indices]
            y_train_sample = y_train[sample_indices]

            # Add label noise
            y_train_noisy = add_label_noise(y_train_sample, noise_ratio)

            # Drop state left behind by the previous model so memory use does
            # not grow with every configuration
            keras_backend.clear_session()

            # Train model
            model = create_peptide_lstm_classifier(
                sequence_length=sequence_length, learning_rate=learning_rate
            )

            # Train with early stopping
            model.fit(
                X_train_sample, y_train_noisy,
                batch_size=current_batch_size,
                epochs=50,
                validation_split=0.1,
                verbose=0,
                callbacks=[ProportionalEarlyStopping(ratio=0.90, patience=3)]
            )

            # Evaluate on clean test set
            test_predictions = model.predict(X_test, verbose=0)
            y_pred_binary = (test_predictions.ravel() >= 0.5).astype(int)
            accuracy = accuracy_score(y_test, y_pred_binary)

            # Store results
            results[noise_ratio]['training_sizes'].append(train_size_k)
            results[noise_ratio]['accuracies'].append(accuracy)

            print(f"  {train_size_k}K samples, batch={current_batch_size}: {accuracy:.3f}")

    # Save results
    save_results_to_csv(results, 'training_size', 'peptide_noise_vs_training_size_results.csv')

    return results

def run_noise_vs_batch_size_experiment(X_train, X_test, y_train, y_test):
    """
    Experiment 2: Effect of batch size at different noise levels.

    For each noise level a fixed-size random subset of the training data is
    drawn and corrupted once; a fresh LSTM classifier is then trained on that
    same subset for every batch size and scored on the clean test set.
    Results are also written to 'peptide_noise_vs_batch_size_results.csv'.

    Args:
        X_train (np.ndarray): One-hot encoded training sequences
        X_test (np.ndarray): One-hot encoded test sequences
        y_train (np.ndarray): Clean training labels
        y_test (np.ndarray): Clean test labels

    Returns:
        dict: noise level -> {'batch_sizes': [...], 'accuracies': [...]}

    Raises:
        ValueError: If the fixed training size exceeds len(X_train)
    """
    # Local import: used to release Keras global state between the many
    # models built in the loops below.
    from keras import backend as keras_backend

    print("\n" + "="*60)
    print("EXPERIMENT 2: Label Noise vs Batch Size")
    print("="*60)

    noise_levels = [0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45]
    batch_sizes = [16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192]

    fixed_train_size = 100000  # Fixed training size
    learning_rate = 0.001

    # Fail up front with a clear message instead of inside np.random.choice
    n_available = len(X_train)
    if fixed_train_size > n_available:
        raise ValueError(
            f"Fixed training size ({fixed_train_size}) exceeds the "
            f"{n_available} available training sequences"
        )

    # Take the sequence length from the data instead of assuming 12-mers
    sequence_length = X_train.shape[1]

    results = {noise: {'batch_sizes': [], 'accuracies': []} for noise in noise_levels}

    print(f"Fixed training size: {fixed_train_size:,} samples")
    print(f"Testing batch sizes: {batch_sizes}")

    for noise_ratio in noise_levels:
        print(f"\nTesting noise level: {int(noise_ratio*100)}%")

        # Sample training data once for this noise level
        sample_indices = np.random.choice(
            n_available, size=fixed_train_size, replace=False
        )
        X_train_sample = X_train[sample_indices]
        y_train_sample = y_train[sample_indices]
        y_train_noisy = add_label_noise(y_train_sample, noise_ratio)

        for batch_size in batch_sizes:
            # Drop state left behind by the previous model so memory use does
            # not grow with every configuration
            keras_backend.clear_session()

            # Train model
            model = create_peptide_lstm_classifier(
                sequence_length=sequence_length, learning_rate=learning_rate
            )

            model.fit(
                X_train_sample, y_train_noisy,
                batch_size=batch_size,
                epochs=50,
                validation_split=0.1,
                verbose=0,
                callbacks=[ProportionalEarlyStopping(ratio=0.90, patience=3)]
            )

            # Evaluate on the clean test set
            test_predictions = model.predict(X_test, verbose=0)
            y_pred_binary = (test_predictions.ravel() >= 0.5).astype(int)
            accuracy = accuracy_score(y_test, y_pred_binary)

            results[noise_ratio]['batch_sizes'].append(batch_size)
            results[noise_ratio]['accuracies'].append(accuracy)

            print(f"  Batch size {batch_size}: {accuracy:.3f}")

    save_results_to_csv(results, 'batch_size', 'peptide_noise_vs_batch_size_results.csv')

    return results

def run_noise_vs_learning_rate_experiment(X_train, X_test, y_train, y_test):
    """
    Experiment 3: Effect of learning rate at different noise levels.

    Each noise level is paired with its own batch size. For each pair a
    fixed-size random subset is drawn and corrupted once, then a fresh LSTM
    classifier is trained for every learning rate and scored on the clean
    test set. Results are also written to
    'peptide_noise_vs_learning_rate_results.csv'.

    Args:
        X_train (np.ndarray): One-hot encoded training sequences
        X_test (np.ndarray): One-hot encoded test sequences
        y_train (np.ndarray): Clean training labels
        y_test (np.ndarray): Clean test labels

    Returns:
        dict: noise level -> {'learning_rates': [...], 'accuracies': [...]}

    Raises:
        ValueError: If the fixed training size exceeds len(X_train)
    """
    # Local import: used to release Keras global state between the many
    # models built in the loops below.
    from keras import backend as keras_backend

    print("\n" + "="*60)
    print("EXPERIMENT 3: Label Noise vs Learning Rate")
    print("="*60)

    noise_levels = [0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45]
    # Batch sizes increase with noise level
    batch_sizes = [128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536]
    learning_rates = [0.1, 0.01, 0.001, 0.0001, 0.00001, 0.000001]

    fixed_train_size = 100000

    # Fail up front with a clear message instead of inside np.random.choice
    n_available = len(X_train)
    if fixed_train_size > n_available:
        raise ValueError(
            f"Fixed training size ({fixed_train_size}) exceeds the "
            f"{n_available} available training sequences"
        )

    # Take the sequence length from the data instead of assuming 12-mers
    sequence_length = X_train.shape[1]

    results = {noise: {'learning_rates': [], 'accuracies': []} for noise in noise_levels}

    print("Testing how learning rate affects performance with increasing noise")
    print("(Batch size also increases with noise level)")

    for noise_ratio, batch_size in zip(noise_levels, batch_sizes):
        print(f"\nNoise: {int(noise_ratio*100)}%, Batch size: {batch_size}")

        # Sample and add noise to training data
        sample_indices = np.random.choice(
            n_available, size=fixed_train_size, replace=False
        )
        X_train_sample = X_train[sample_indices]
        y_train_sample = y_train[sample_indices]
        y_train_noisy = add_label_noise(y_train_sample, noise_ratio)

        for learning_rate in learning_rates:
            # Drop state left behind by the previous model so memory use does
            # not grow with every configuration
            keras_backend.clear_session()

            # Train model
            model = create_peptide_lstm_classifier(
                sequence_length=sequence_length, learning_rate=learning_rate
            )

            model.fit(
                X_train_sample, y_train_noisy,
                batch_size=batch_size,
                epochs=50,
                validation_split=0.1,
                verbose=0,
                callbacks=[ProportionalEarlyStopping(ratio=0.90, patience=3)]
            )

            # Evaluate on the clean test set
            test_predictions = model.predict(X_test, verbose=0)
            y_pred_binary = (test_predictions.ravel() >= 0.5).astype(int)
            accuracy = accuracy_score(y_test, y_pred_binary)

            results[noise_ratio]['learning_rates'].append(learning_rate)
            results[noise_ratio]['accuracies'].append(accuracy)

            print(f"  LR {learning_rate:.6f}: {accuracy:.3f}")

    save_results_to_csv(results, 'learning_rate', 'peptide_noise_vs_learning_rate_results.csv')

    return results


In [8]:
# =============================================================================
# RESULTS STORAGE
# =============================================================================

def save_results_to_csv(results, x_column, filename):
    """
    Save experimental results to CSV format.

    Writes one row per (noise level, x value) pair with the columns
    'noise_level', `x_column` and 'accuracy', then prints the filename and the
    row count.

    Args:
        results (dict): Results dictionary from experiments, mapping each
            noise level to a dict holding 'accuracies' plus one list of
            x-axis values under any other key
        x_column (str): Name for the x-axis variable column
        filename (str): Output CSV filename

    Raises:
        ValueError: If an entry has no key other than 'accuracies'
    """
    csv_data = []

    for noise_level, data in results.items():
        # Pick the x-axis list by name rather than by dict position, so the
        # output does not depend on the order the keys were inserted in.
        x_key = next((key for key in data if key != 'accuracies'), None)
        if x_key is None:
            raise ValueError(
                f"Results for noise level {noise_level!r} contain no x-axis values"
            )
        x_values = data[x_key]
        accuracies = data['accuracies']

        for x_val, acc in zip(x_values, accuracies):
            csv_data.append({
                'noise_level': noise_level,
                x_column: x_val,
                'accuracy': acc
            })

    if csv_data:
        df = pd.DataFrame(csv_data)
    else:
        # Keep the header row even when there is nothing to report
        df = pd.DataFrame(columns=['noise_level', x_column, 'accuracy'])

    df.to_csv(filename, index=False)
    print(f"\nResults saved to {filename}")
    print(f"Total rows: {len(df)}")


In [9]:

# =============================================================================
# VISUALIZATION FUNCTIONS
# =============================================================================

def create_animated_plot(results, x_key, x_label, title, subtitle, filename_base):
    """
    Create animated GIF showing results progressively.

    One line per noise level is drawn point by point, with a short pause
    between lines; each legend entry appears when its line starts. The
    animation is written to '<filename_base>.gif' and the final frame to
    '<filename_base>.png'. The x-axis is log2 for 'batch_sizes', log10 for
    'learning_rates', and linear for any other key.

    Args:
        results (dict): Results from experiments, mapping noise level to a
            dict with `x_key` and 'accuracies' lists
        x_key (str): Key for x-axis data in results dict
        x_label (str): Label for x-axis
        title (str): Main plot title
        subtitle (str): Plot subtitle
        filename_base (str): Base filename for outputs

    Raises:
        ValueError: If results is empty or contains no data points
    """
    if not results:
        raise ValueError("results is empty; nothing to plot")

    # Animation parameters
    frames_per_point = 3
    pause_frames_between_lines = 12
    gif_fps = 30
    figsize = (10, 6)

    # Prepare data arrays before touching matplotlib, so bad input does not
    # leave a half-built figure open
    noise_levels = list(results.keys())
    X_data = [np.array(results[noise][x_key]) for noise in noise_levels]
    Y_data = [np.array(results[noise]['accuracies']) for noise in noise_levels]

    # Frame layout: every line gets the same number of frames, sized for the
    # longest series, followed by a pause. Computed once, not per frame.
    lines_count = len(noise_levels)
    points_per_line = max(len(x) for x in X_data)
    if points_per_line == 0:
        raise ValueError("results contain no data points; nothing to plot")
    total_draw_frames = points_per_line * frames_per_point
    frames_per_complete_line = total_draw_frames + pause_frames_between_lines
    total_frames = lines_count * frames_per_complete_line

    # Set up figure
    fig, ax = plt.subplots(figsize=figsize)

    # Colors for different noise levels
    colors = list(plt.get_cmap("tab10").colors) + ["black"]

    # Create line objects
    lines = []
    for i, noise in enumerate(noise_levels):
        # Wrap around the palette so more than len(colors) series still plot
        line, = ax.plot([], [], '-o',
                        color=colors[i % len(colors)], linewidth=2, markersize=6,
                        label=f'Noise = {int(noise * 100)}%')
        lines.append(line)
    # Legend entries start blank and are filled in as each line is drawn
    labels = [""] * len(lines)

    # Set up axes
    ax.set_xlabel(x_label, fontsize=12)
    ax.set_ylabel('Accuracy', fontsize=12)
    ax.set_ylim(0.0, 1.0)

    # Add titles
    ax.text(0.5, 1.06, title, ha="center", va="bottom",
            transform=ax.transAxes, fontsize=14, weight='bold')
    ax.text(0.5, 1.02, subtitle, ha="center", va="bottom",
            transform=ax.transAxes, fontsize=11, style="italic")

    # Add signature
    ax.text(0.95, 0.05, "Fatma Elzahraa Eid @ TheBioMLClinic",
            ha="right", va="bottom", transform=ax.transAxes,
            fontsize=10, color="gray")

    ax.grid(True, linestyle='--', alpha=0.4)

    # Legend
    leg = ax.legend(lines, labels, loc="lower left", frameon=False)
    legend_texts = leg.get_texts()

    plt.tight_layout()

    # Configure x-axis based on data type
    if x_key == 'batch_sizes' or x_key == 'learning_rates':
        # Use log scale
        min_x = min(np.min(x) for x in X_data if len(x) > 0)
        max_x = max(np.max(x) for x in X_data if len(x) > 0)

        if x_key == 'batch_sizes':
            ax.set_xscale("log", base=2)
            # round() before int() so a tick at 2**k cannot be labelled k-1
            # through floating-point error in log2
            ax.xaxis.set_major_formatter(mticker.FuncFormatter(
                lambda val, pos: rf"$2^{{{int(round(np.log2(val)))}}}$"
            ))
        else:  # learning_rates
            ax.set_xscale("log", base=10)
            ax.xaxis.set_major_formatter(mticker.LogFormatterSciNotation())
            # Add reference line at 0.001
            ax.axvline(x=1e-3, color="black", linestyle="--", linewidth=1)
            ax.annotate("Chosen fixed learning rate", xy=(1e-3, 0.2),
                        xytext=(2e-3, 0.9),
                        arrowprops=dict(arrowstyle="->", color="black", lw=1.2),
                        ha="left", va="center", fontsize=10)

        ax.set_xlim(min_x * 0.8, max_x * 1.2)
    else:
        # Linear scale for training sizes
        max_x = max(np.max(x) for x in X_data if len(x) > 0)
        ax.set_xticks(range(0, int(max_x) + 10, 10))
        ax.set_xlim(0, max_x + 5)

    def legend_label(index):
        return f'Noise = {int(noise_levels[index] * 100)}%'

    # Animation functions
    def init():
        for line in lines:
            line.set_data([], [])
        for text in legend_texts:
            text.set_text("")
        return lines + legend_texts

    def animate(frame):
        # Determine which line to draw and how many points
        current_line = min(frame // frames_per_complete_line, lines_count - 1)
        frame_in_line = frame % frames_per_complete_line

        # Draw completed lines
        for i in range(current_line):
            lines[i].set_data(X_data[i], Y_data[i])
            legend_texts[i].set_text(legend_label(i))

        if frame_in_line < total_draw_frames:
            # Draw current line progressively
            points_to_show = min(
                (frame_in_line // frames_per_point) + 1,
                len(X_data[current_line])
            )
            lines[current_line].set_data(
                X_data[current_line][:points_to_show],
                Y_data[current_line][:points_to_show]
            )
        else:
            # Pause frames: hold the finished line
            lines[current_line].set_data(X_data[current_line], Y_data[current_line])
        legend_texts[current_line].set_text(legend_label(current_line))

        return lines + legend_texts

    # Create animation
    anim = FuncAnimation(fig, animate, init_func=init, frames=total_frames,
                         interval=1000//gif_fps, blit=True, repeat=True)

    # Save animation and static plot
    gif_filename = f"{filename_base}.gif"
    png_filename = f"{filename_base}.png"

    writer = PillowWriter(fps=gif_fps)
    anim.save(gif_filename, writer=writer)
    print(f"Animated plot saved as {gif_filename}")

    # Save final frame as static image
    animate(total_frames - 1)
    for artist in lines + legend_texts:
        artist.set_animated(False)
    fig.canvas.draw()
    fig.savefig(png_filename, dpi=300, bbox_inches="tight")
    print(f"Static plot saved as {png_filename}")

    # Close this specific figure, not whichever one happens to be current
    plt.close(fig)

### =============================================================================
### MAIN EXECUTION
### =============================================================================


In [11]:
"""
End-to-end driver for the tutorial.

Loads and encodes the peptide dataset, runs the three label-noise
experiments (training size, batch size, learning rate), and renders one
animated plot per experiment.
"""

_WIDE_RULE = "=" * 80
_STEP_RULE = "-" * 50


def _print_step(heading):
    """Print a step heading framed by dashed rules, preceded by a blank line."""
    print("\n" + _STEP_RULE)
    print(heading)
    print(_STEP_RULE)


# Introduction banner
print("\n".join([
    _WIDE_RULE,
    "PEPTIDE ENGINEERING WITH LABEL NOISE - TUTORIAL",
    _WIDE_RULE,
    "\nThis tutorial demonstrates how ML models can handle massive label noise",
    "in biological data, specifically peptide fitness prediction.",
    "\nKey concepts:",
    "- Label noise robustness through proper hyperparameter tuning",
    "- Batch size effects on noisy training",
    "- Learning rate considerations for corrupted labels",
]))

# Load and preprocess data
_print_step("STEP 1: DATA LOADING AND PREPROCESSING")

csv_filename = "ThebioMLClinicDatasets_12merPeptideFitness_Classification_250K.csv"

X_train, X_test, y_train, y_test = load_and_preprocess_data(csv_filename)

print("Data loaded successfully!")
print(f"Training sequences shape: {X_train.shape}")
print(f"Test sequences shape: {X_test.shape}")

# Run experiments
_print_step("STEP 2: RUNNING EXPERIMENTS")

_experiment_inputs = (X_train, X_test, y_train, y_test)

# Experiment 1: Training size vs noise
results_1 = run_noise_vs_training_size_experiment(*_experiment_inputs)

# Experiment 2: Batch size vs noise
results_2 = run_noise_vs_batch_size_experiment(*_experiment_inputs)

# Experiment 3: Learning rate vs noise
results_3 = run_noise_vs_learning_rate_experiment(*_experiment_inputs)

# Create visualizations
_print_step("STEP 3: CREATING VISUALIZATIONS")

# One row per figure: (results, x_key, x_label, title, subtitle, filename_base)
_plot_specs = [
    (
        results_1, 'training_sizes', 'Training Set Size (K)',
        'Machine learning is still possible with massive biological label noise',
        'Tutorial: 12mer Peptide fitness - Training Size Effect',
        'peptide_noise_vs_training_size',
    ),
    (
        results_2, 'batch_sizes', 'Batch Size',
        'For better performance, increase batch size with increasing noise',
        'Tutorial: 12mer Peptide fitness - Batch Size Effect',
        'peptide_noise_vs_batch_size',
    ),
    (
        results_3, 'learning_rates', 'Learning Rate',
        'Increasing learning rate is not necessarily good with noisy labels',
        'Tutorial: 12mer Peptide fitness - Learning Rate Effect',
        'peptide_noise_vs_learning_rate',
    ),
]
for _spec in _plot_specs:
    create_animated_plot(*_spec)

# Summary
print("\n".join([
    "\n" + _WIDE_RULE,
    "TUTORIAL COMPLETE!",
    _WIDE_RULE,
    "\nKey findings:",
    "1. Models remain effective even with 45% label noise",
    "2. Larger batch sizes help average out noise effects",
    "3. Fixed learning rates work well - increasing with noise not necessary",
    "\nFiles generated:",
    "- 3 CSV files with experimental results",
    "- 3 animated GIFs showing progressive results",
    "- 3 static PNG plots of final results",
    "\nThe biological signal in peptide data provides sufficient structure",
    "for models to learn meaningful patterns despite substantial label corruption.",
    "\nThis demonstrates that 'garbage in, garbage out' may not apply",
    "when the underlying biological signal is strong and structured.",
]))

PEPTIDE ENGINEERING WITH LABEL NOISE - TUTORIAL

This tutorial demonstrates how ML models can handle massive label noise
in biological data, specifically peptide fitness prediction.

Key concepts:
- Label noise robustness through proper hyperparameter tuning
- Batch size effects on noisy training
- Learning rate considerations for corrupted labels

--------------------------------------------------
STEP 1: DATA LOADING AND PREPROCESSING
--------------------------------------------------
Loading peptide dataset...
Loaded 250000 peptide sequences
Encoding peptide sequences...
Training set: 237500 sequences
Test set: 12500 sequences
Sequence length: 12
Data loaded successfully!
Training sequences shape: (237500, 12, 20)
Test sequences shape: (12500, 12, 20)

--------------------------------------------------
STEP 2: RUNNING EXPERIMENTS
--------------------------------------------------

EXPERIMENT 1: Label Noise vs Training Set Size
Testing 10 noise levels x 10 training sizes
Noise levels