In [1]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
import os
import random
import tensorflow as tf
from tensorflow.keras import layers, models, Input, Model
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
import string
from tqdm import tqdm
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, f1_score, accuracy_score
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from helper_functions.preprocessing import replace_black_with_median, remove_salt_and_pepper_noise
from helper_functions.segmentation import segment_captcha_with_projection
from helper_functions.extract_feature import extract_features

# Tunable parameters
TUNABLE_PARAMETERS = {
    "median_filter_kernel_size": 7,
    "gaussian_blur_kernel_size": (3, 3),
    "salt_pepper_kernel_size": 1,
    "adaptive_threshold_block_size": 31,
    "adaptive_threshold_C": 2,
    "projection_threshold": 0.1
}

# Paths
TRAIN_FOLDER_PATH = "../data/train/combine"
TEST_FOLDER_PATH = "../data/test/combine"
MODEL_SAVE_PATH = "captcha_model.h5"
CHAR_MODEL_SAVE_PATH = "char_recognition_model.h5"

# CNN parameters
IMG_HEIGHT = 40
IMG_WIDTH = 30
BATCH_SIZE = 64
EPOCHS = 15
VALIDATION_SPLIT = 0.2

# Possible characters in captchas (adjust if needed)
CHARACTERS = string.ascii_lowercase + string.digits


# Preprocess an image for segmentation
def preprocess_image(image):
    if len(image.shape) == 3:  # If color image
        denoised = replace_black_with_median(image.copy(), TUNABLE_PARAMETERS['median_filter_kernel_size'])
        gray = cv2.cvtColor(denoised, cv2.COLOR_BGR2GRAY)
    else:  # If already grayscale
        denoised = replace_black_with_median(image.copy(), TUNABLE_PARAMETERS['median_filter_kernel_size'])
        gray = denoised
    
    denoised_after_noise_removal = remove_salt_and_pepper_noise(gray, TUNABLE_PARAMETERS['salt_pepper_kernel_size'])
    blurred = cv2.GaussianBlur(denoised_after_noise_removal, TUNABLE_PARAMETERS['gaussian_blur_kernel_size'], 0)
    
    thresh = cv2.adaptiveThreshold(
        blurred,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY_INV,
        TUNABLE_PARAMETERS['adaptive_threshold_block_size'],
        TUNABLE_PARAMETERS['adaptive_threshold_C']
    )
    
    return thresh


# Prepare data for CNN training
def prepare_training_data(folder_path):
    """
    Denoise and tokenize captcha image files in a folder into individual characters using vertical projection.
    Ignores image files where segmentation has failed (num_of_segmented_char =/= actual_num_of_char).

    Parameters:
    folder_path (str): Path to the folder containing CAPTCHA images. Image files in the folder should be of format "captchachars-0" （e.g "abc123-0")

    Returns:
    tuple：
    - X_img：Numpy array of Image of char with type numpy array (40 x 30)
    - feature_list: Dictionary of features extracted from the corresponding image 
    - y: Numpy array of one-hot encoded label of the corresponding image
    """
    all_images = [f for f in os.listdir(folder_path) if f.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff'))]
    
    X_img = []  # Images
    X_features_list = []  # Engineered features
    y = []  # Labels
    
    for filename in tqdm(all_images, desc="Preparing Training Data"):
        image_path = os.path.join(folder_path, filename)
        filename_without_suffix = os.path.splitext(filename)[0]
        correct_characters = filename_without_suffix.split('-')[0]
        
        image = cv2.imread(image_path)
        thresh = preprocess_image(image)
        
        character_boundaries, _, _ = segment_captcha_with_projection(thresh, TUNABLE_PARAMETERS['projection_threshold'])
        
        # Skip if segmentation failed or number of segments doesn't match expected characters
        if len(character_boundaries) != len(correct_characters):
            continue
        
        for i, (start, end) in enumerate(character_boundaries):
            char_image = thresh[:, start:end]
            char_label = correct_characters[i]
            
            # Skip if character is not in our expected set
            if char_label not in CHARACTERS:
                continue
            
            # Extract features and resized image
            features, char_image_resized = extract_features(char_image)
            
            # Add to dataset
            X_img.append(char_image_resized)
            X_features_list.append(features)
            y.append(CHARACTERS.index(char_label))
    
    # Convert to numpy arrays
    X_img = np.array(X_img)
    y = np.array(y)
    
    # Convert features to DataFrame and then to numpy array
    features_df = pd.DataFrame(X_features_list)
    X_features = features_df.values
    feature_names = list(features_df.columns)
    
    # Reshape for CNN input
    X_img = X_img.reshape(X_img.shape[0], IMG_HEIGHT, IMG_WIDTH, 1)
    X_img = X_img / 255.0  # Normalize
    
    # One-hot encode labels
    y_one_hot = to_categorical(y, num_classes=len(CHARACTERS))
    
    # Standardize features
    scaler = StandardScaler()
    X_features_scaled = scaler.fit_transform(X_features)
    
    return X_img, X_features_scaled, y_one_hot, feature_names, scaler

# Build hybrid model that combines CNN for images and features
def build_hybrid_model(feature_count):
    # Image input branch
    img_input = Input(shape=(IMG_HEIGHT, IMG_WIDTH, 1), name='image_input')
    x = layers.Conv2D(32, (3, 3), activation='relu')(img_input)
    x = layers.MaxPooling2D((2, 2))(x)
    x = layers.Conv2D(64, (3, 3), activation='relu')(x)
    x = layers.MaxPooling2D((2, 2))(x)
    x = layers.Conv2D(64, (3, 3), activation='relu')(x)
    x = layers.Flatten()(x)
    x = layers.Dropout(0.5)(x)
    img_features = layers.Dense(128, activation='relu')(x)
    
    # Engineered features input branch
    feature_input = Input(shape=(feature_count,), name='feature_input')
    feature_branch = layers.Dense(64, activation='relu')(feature_input)
    feature_branch = layers.Dropout(0.3)(feature_branch)
    feature_branch = layers.Dense(32, activation='relu')(feature_branch)
    
    # Combine both branches
    combined = layers.concatenate([img_features, feature_branch])
    combined = layers.Dense(128, activation='relu')(combined)
    combined = layers.Dropout(0.5)(combined)
    output = layers.Dense(len(CHARACTERS), activation='softmax')(combined)
    
    # Create model
    model = Model(inputs=[img_input, feature_input], outputs=output)
    
    model.compile(optimizer='adam',
                 loss='categorical_crossentropy',
                 metrics=['accuracy'])
    
    return model

# Evaluate the model on test data
def evaluate_model(model, test_folder_path, scaler):
    all_images = [f for f in os.listdir(test_folder_path) if f.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff'))]
    
    correct_chars = 0
    total_chars = 0
    correct_captchas = 0
    total_captchas = 0
    
    all_true_chars = []
    all_pred_chars = []
    
    # For captcha-level metrics
    all_true_captchas = []
    all_pred_captchas = []
    
    results = []
    
    for filename in tqdm(all_images, desc="Evaluating on Test Data"):
        image_path = os.path.join(test_folder_path, filename)
        filename_without_suffix = os.path.splitext(filename)[0]
        correct_characters = filename_without_suffix.split('-')[0]
        
        image = cv2.imread(image_path)
        thresh = preprocess_image(image)
        
        character_boundaries, _, _ = segment_captcha_with_projection(thresh, TUNABLE_PARAMETERS['projection_threshold'])
        
        if len(character_boundaries) == 0:
            total_captchas += 1
            # Add to captcha-level metrics
            all_true_captchas.append(correct_characters)
            all_pred_captchas.append("")  # Empty prediction
            continue
        
        predicted_chars = []
        
        for i, (start, end) in enumerate(character_boundaries):
            char_image = thresh[:, start:end]
            
            # Extract features and resized image
            features, char_image_resized = extract_features(char_image)
            
            # Prepare image for prediction
            char_image_resized = char_image_resized.reshape(1, IMG_HEIGHT, IMG_WIDTH, 1)
            char_image_resized = char_image_resized / 255.0
            
            # Prepare features for prediction
            features_df = pd.DataFrame([features])
            features_array = features_df.values
            features_scaled = scaler.transform(features_array)
            
            # Predict using hybrid model
            prediction = model.predict([char_image_resized, features_scaled], verbose=0)
            predicted_idx = np.argmax(prediction)
            predicted_char = CHARACTERS[predicted_idx]
            
            predicted_chars.append(predicted_char)
            
            # Add to character-level metrics
            if i < len(correct_characters):
                all_true_chars.append(correct_characters[i])
                all_pred_chars.append(predicted_char)
        
        predicted_text = ''.join(predicted_chars)
        
        # Add to captcha-level metrics
        all_true_captchas.append(correct_characters)
        all_pred_captchas.append(predicted_text)
        
        # Count correct characters
        if len(predicted_chars) == len(correct_characters):
            for i in range(len(correct_characters)):
                if predicted_chars[i] == correct_characters[i]:
                    correct_chars += 1
                total_chars += 1
            
            # Check if entire captcha is correct
            if predicted_text == correct_characters:
                correct_captchas += 1
        else:
            total_chars += len(correct_characters)
        
        total_captchas += 1
        
        # Store results for later analysis
        results.append({
            'filename': filename,
            'true_text': correct_characters,
            'predicted_text': predicted_text,
            'correct': predicted_text == correct_characters,
            'char_count': len(correct_characters),
            'segments_found': len(character_boundaries)
        })
    
    # Calculate metrics for character-level evaluation
    char_accuracy = accuracy_score(all_true_chars, all_pred_chars) if all_true_chars else 0
    char_precision = precision_score(all_true_chars, all_pred_chars, average='weighted', zero_division=0)
    char_recall = recall_score(all_true_chars, all_pred_chars, average='weighted', zero_division=0)
    char_f1 = f1_score(all_true_chars, all_pred_chars, average='weighted', zero_division=0)
    
    # Calculate metrics for captcha-level evaluation
    captcha_accuracy = sum([1 if true == pred else 0 for true, pred in zip(all_true_captchas, all_pred_captchas)]) / len(all_true_captchas)
    
    # For captcha-level precision, recall, and F1, we need to convert to binary classification
    # (correct or incorrect captcha)
    captcha_true_binary = [1 if true == pred else 0 for true, pred in zip(all_true_captchas, all_pred_captchas)]
    captcha_pred_binary = [1 if true == pred else 0 for true, pred in zip(all_true_captchas, all_pred_captchas)]
    
    captcha_precision = precision_score(captcha_true_binary, captcha_pred_binary, zero_division=0)
    captcha_recall = recall_score(captcha_true_binary, captcha_pred_binary, zero_division=0)
    captcha_f1 = f1_score(captcha_true_binary, captcha_pred_binary, zero_division=0)
    
    # Print character-level metrics
    print("\nCharacter-level Metrics:")
    print(f"Accuracy: {char_accuracy:.4f} ({correct_chars}/{total_chars})")
    print(f"Precision: {char_precision:.4f}")
    print(f"Recall: {char_recall:.4f}")
    print(f"F1 Score: {char_f1:.4f}")
    
    # Print captcha-level metrics
    print("\nCaptcha-level Metrics:")
    print(f"Accuracy: {captcha_accuracy:.4f} ({correct_captchas}/{total_captchas})")
    print(f"Precision: {captcha_precision:.4f}")
    print(f"Recall: {captcha_recall:.4f}")
    print(f"F1 Score: {captcha_f1:.4f}")
    
    # Create confusion matrix for character recognition
    char_labels = sorted(set(all_true_chars + all_pred_chars))
    conf_matrix = confusion_matrix(all_true_chars, all_pred_chars, labels=char_labels)
    plt.figure(figsize=(12, 10))
    sns.heatmap(conf_matrix, annot=True, fmt='d', xticklabels=char_labels, yticklabels=char_labels)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Character Recognition Confusion Matrix')
    plt.tight_layout()
    # plt.savefig('confusion_matrix.png')
    plt.close()
    
    # Classification report
    print("\nClassification Report (Character-level):")
    print(classification_report(all_true_chars, all_pred_chars))
    
    # Save results to CSV
    # results_df = pd.DataFrame(results)
    # results_df.to_csv('captcha_recognition_results.csv', index=False)
    
    # Create a metrics summary DataFrame
    metrics_summary = pd.DataFrame({
        'Level': ['Character', 'Captcha'],
        'Accuracy': [char_accuracy, captcha_accuracy],
        'Precision': [char_precision, captcha_precision],
        'Recall': [char_recall, captcha_recall],
        'F1_Score': [char_f1, captcha_f1]
    })
    
    # Save metrics summary to CSV
    # metrics_summary.to_csv('recognition_metrics_summary.csv', index=False)
    
    # Plot metrics comparison
    plt.figure(figsize=(10, 6))
    metrics = ['Accuracy', 'Precision', 'Recall', 'F1_Score']
    char_metrics = [char_accuracy, char_precision, char_recall, char_f1]
    captcha_metrics = [captcha_accuracy, captcha_precision, captcha_recall, captcha_f1]
    
    x = np.arange(len(metrics))
    width = 0.35
    
    plt.bar(x - width/2, char_metrics, width, label='Character Level')
    plt.bar(x + width/2, captcha_metrics, width, label='Captcha Level')
    
    plt.xlabel('Metric')
    plt.ylabel('Score')
    plt.title('Recognition Metrics Comparison')
    plt.xticks(x, metrics)
    plt.ylim(0, 1)
    plt.legend()
    plt.tight_layout()
    # plt.savefig('metrics_comparison.png')
    plt.close()
    
    return {
        'char_accuracy': char_accuracy,
        'char_precision': char_precision,
        'char_recall': char_recall,
        'char_f1': char_f1,
        'captcha_accuracy': captcha_accuracy,
        'captcha_precision': captcha_precision,
        'captcha_recall': captcha_recall,
        'captcha_f1': captcha_f1
    }, results_df

# Display example predictions
def show_example_predictions(model, test_folder_path, scaler, num_examples=5):
    all_images = [f for f in os.listdir(test_folder_path) if f.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff'))]
    selected_images = random.sample(all_images, min(num_examples, len(all_images)))
    
    for filename in selected_images:
        image_path = os.path.join(test_folder_path, filename)
        filename_without_suffix = os.path.splitext(filename)[0]
        correct_characters = filename_without_suffix.split('-')[0]
        
        image = cv2.imread(image_path)
        thresh = preprocess_image(image)
        
        character_boundaries, _, _ = segment_captcha_with_projection(thresh, TUNABLE_PARAMETERS['projection_threshold'])
        
        if len(character_boundaries) == 0:
            print(f"No characters detected in {filename}.")
            continue
        
        predicted_chars = []
        
        plt.figure(figsize=(15, 5))
        plt.subplot(1, 2, 1)
        plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        plt.title(f"Original: {correct_characters}")
        plt.axis('off')
        
        plt.subplot(1, 2, 2)
        plt.imshow(thresh, cmap='gray')
        plt.title("Preprocessed")
        plt.axis('off')
        
        plt.tight_layout()
        plt.show()
        
        plt.figure(figsize=(12, 3))
        for i, (start, end) in enumerate(character_boundaries):
            char_image = thresh[:, start:end]
            
            # Extract features and resized image
            features, char_image_resized = extract_features(char_image)
            
            # Prepare image for prediction
            char_image_input = char_image_resized.reshape(1, IMG_HEIGHT, IMG_WIDTH, 1)
            char_image_input = char_image_input / 255.0
            
            # Prepare features for prediction
            features_df = pd.DataFrame([features])
            features_array = features_df.values
            features_scaled = scaler.transform(features_array)
            
            # Predict using hybrid model
            prediction = model.predict([char_image_input, features_scaled], verbose=0)
            predicted_idx = np.argmax(prediction)
            predicted_char = CHARACTERS[predicted_idx]
            confidence = prediction[0][predicted_idx]
            
            predicted_chars.append(predicted_char)
            
            # Display segment and prediction
            plt.subplot(1, len(character_boundaries), i+1)
            plt.imshow(char_image, cmap='gray')
            
            true_char = correct_characters[i] if i < len(correct_characters) else "?"
            title = f"True: {true_char}\nPred: {predicted_char}\nConf: {confidence:.2f}"
            plt.title(title)
            plt.axis('off')
        
        plt.tight_layout()
        plt.show()
        
        print(f"Image: {filename}")
        print(f"True Text: {correct_characters}")
        print(f"Predicted: {''.join(predicted_chars)}")
        print(f"Correct: {'Yes' if ''.join(predicted_chars) == correct_characters else 'No'}")
        print("-" * 50)

# Main function
def main():
    # Prepare training data
    print("Preparing training data...")
    X_img, X_features, y, feature_names, scaler = prepare_training_data(TRAIN_FOLDER_PATH)
    
    # Print feature information
    print(f"\nExtracted {len(feature_names)} features:")
    for i, feature_name in enumerate(feature_names):
        print(f"  {i+1}. {feature_name}")
    
    # Build and train hybrid model
    print("\nBuilding and training hybrid model...")
    model = build_hybrid_model(X_features.shape[1])
    
    # Display model summary
    model.summary()
    
    # Train the model
    history = model.fit(
        [X_img, X_features], y,
        batch_size=BATCH_SIZE,
        epochs=EPOCHS,
        validation_split=VALIDATION_SPLIT,
        verbose=1
    )
    
    # Save the model
    # model.save(CHAR_MODEL_SAVE_PATH)
    # print(f"Model saved to {CHAR_MODEL_SAVE_PATH}")
    
    # Plot training history
    plt.figure(figsize=(12, 4))
    
    plt.subplot(1, 2, 1)
    plt.plot(history.history['accuracy'], label='Training Accuracy')
    plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
    plt.title('Model Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()
    
    plt.subplot(1, 2, 2)
    plt.plot(history.history['loss'], label='Training Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.title('Model Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    
    plt.tight_layout()
    # plt.savefig('training_history.png')
    plt.close()
    
    # Evaluate the model on test data
    print("\nEvaluating the model on test data...")
    metrics, results_df = evaluate_model(model, TEST_FOLDER_PATH, scaler)
    
    # Show example predictions
    print("\nShowing example predictions...")
    show_example_predictions(model, TEST_FOLDER_PATH, scaler, num_examples=5)
    
    # Analyze segmentation performance
    segmentation_success_rate = (results_df['segments_found'] == results_df['char_count']).mean()
    print(f"\nSegmentation Success Rate: {segmentation_success_rate:.4f}")
    
    # Analyze most common errors
    incorrect_predictions = results_df[results_df['correct'] == False]
    if len(incorrect_predictions) > 0:
        print("\nMost common error cases:")
        for _, row in incorrect_predictions.head(5).iterrows():
            print(f"  Filename: {row['filename']}")
            print(f"  True: {row['true_text']}")
            print(f"  Predicted: {row['predicted_text']}")
            print(f"  Segments found: {row['segments_found']}")
            print("  " + "-" * 30)
    
    # Print overall performance summary
    print("\nOverall Performance Summary:")
    print("Character-level metrics:")
    print(f"  Accuracy:  {metrics['char_accuracy']:.4f}")
    print(f"  Precision: {metrics['char_precision']:.4f}")
    print(f"  Recall:    {metrics['char_recall']:.4f}")
    print(f"  F1 Score:  {metrics['char_f1']:.4f}")
    
    print("\nCaptcha-level metrics:")
    print(f"  Accuracy:  {metrics['captcha_accuracy']:.4f}")
    print(f"  Precision: {metrics['captcha_precision']:.4f}")
    print(f"  Recall:    {metrics['captcha_recall']:.4f}")
    print(f"  F1 Score:  {metrics['captcha_f1']:.4f}")

if __name__ == "__main__":
    main()

Preparing training data...


Preparing Training Data: 100%|█████████████████████████████████████████████████████████| 7437/7437 [01:34<00:00, 78.41it/s]



Extracted 9 features:
  1. aspect_ratio
  2. pixel_density
  3. h_symmetry
  4. v_symmetry
  5. contour_count
  6. h_proj_std
  7. v_proj_std
  8. com_x
  9. com_y

Building and training hybrid model...


Epoch 1/15
[1m478/478[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 36ms/step - accuracy: 0.1887 - loss: 2.9619 - val_accuracy: 0.6487 - val_loss: 1.1960
Epoch 2/15
[1m478/478[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 39ms/step - accuracy: 0.5723 - loss: 1.4611 - val_accuracy: 0.7196 - val_loss: 0.9072
Epoch 3/15
[1m478/478[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 39ms/step - accuracy: 0.6607 - loss: 1.1371 - val_accuracy: 0.7570 - val_loss: 0.7849
Epoch 4/15
[1m478/478[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 37ms/step - accuracy: 0.7089 - loss: 0.9656 - val_accuracy: 0.7763 - val_loss: 0.7133
Epoch 5/15
[1m478/478[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 36ms/step - accuracy: 0.7366 - loss: 0.8765 - val_accuracy: 0.7955 - val_loss: 0.6563
Epoch 6/15
[1m478/478[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 37ms/step - accuracy: 0.7554 - loss: 0.8025 - val_accuracy: 0.8087 - val_loss: 0.6076
Epoch 7/15
[1m4

Evaluating on Test Data: 100%|█████████████████████████████████████████████████████████| 1894/1894 [19:17<00:00,  1.64it/s]



Character-level Metrics:
Accuracy: 0.7851 (8266/11340)
Precision: 0.7926
Recall: 0.7851
F1 Score: 0.7860

Captcha-level Metrics:
Accuracy: 0.4324 (819/1894)
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000

Classification Report (Character-level):
              precision    recall  f1-score   support

           0       0.67      0.67      0.67       343
           1       0.62      0.64      0.63       238
           2       0.89      0.80      0.84       303
           3       0.93      0.85      0.89       319
           4       0.85      0.75      0.80       300
           5       0.78      0.76      0.77       286
           6       0.81      0.87      0.84       287
           7       0.78      0.84      0.81       288
           8       0.92      0.75      0.82       308
           9       0.86      0.78      0.82       308
           a       0.82      0.76      0.79       332
           b       0.78      0.79      0.79       302
           c       0.82      0.86      0.84    

NameError: name 'results_df' is not defined