## ISE: Defect Detection Challenge
#### Description
In this competition, your task is to develop a model that can accurately classify source code snippets as either secure or insecure. With the rise of software vulnerabilities like resource leaks, use-after-free vulnerabilities, and denial-of-service (DoS) attacks, identifying insecure code is crucial for maintaining robust software systems.
Participants will be provided with a dataset containing labeled code snippets. The labels indicate whether the code is secure (0) or insecure (1). Your goal is to create an effective machine learning model that can predict these labels with high accuracy.
#### Key Objectives 
- Analyze code snippets for potential vulnerabilities.
- Develop models to automate the classification of secure and insecure code.
- Ensure the ROC-AUC score exceeds 0.63.

In [1]:
import sklearn as sk
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

import tensorflow as tf
from tensorflow import keras
from keras import layers

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the competition CSV files (paths are relative to the working directory).
sample_submission = pd.read_csv('data/sample_submission.csv')
test_df = pd.read_csv('data/test.csv')
train_df = pd.read_csv('data/train.csv')

# Sanity report: dataset sizes and the training schema.
print("Training data shape:", train_df.shape)
print("Test data shape:", test_df.shape)
print("\nTraining data columns:", list(train_df.columns))

# Peek at the first rows, then count missing values per column.
print(train_df.head(3))
print(train_df.isna().sum())


Training data shape: (20000, 3)
Test data shape: (7000, 2)

Training data columns: ['ID', 'code', 'Label']
   ID                                               code  Label
0   0  int page_check_range(target_ulong start, targe...      0
1   1  static void pxa2xx_lcdc_dma0_redraw_rot0(PXA2x...      0
2   2  void OPPROTO op_POWER_slq (void)\n\n{\n\n    u...      1
ID       0
code     0
Label    0
dtype: int64


## C++ Code Preprocessing Pipeline 
- Basic Text Cleaning 
- C++ Specific Normalization
- Feature Engineering
- Tokenization
- Vectorization using TF-IDF 

In [3]:
import re
import string

# Comments and quoted literals are recognised in ONE left-to-right scan so that
# comment markers inside a string (e.g. "http://host") are never treated as a
# comment, and quote characters inside a comment never start a literal.
_CPP_COMMENT_OR_LITERAL_RE = re.compile(
    r'//[^\n]*'                # single-line comment, up to end of line
    r'|/\*.*?\*/'              # multi-line comment (non-greedy)
    r'|"(?:\\.|[^"\\])*"'      # string literal, honouring backslash escapes
    r"|'(?:\\.|[^'\\])*'",     # character literal, honouring backslash escapes
    re.DOTALL,
)


def clean_cpp_code(code):
    """Strip comments, mask quoted literals and collapse whitespace.

    Args:
        code: Source text of one C/C++ snippet.

    Returns:
        The snippet on a single line where every comment is replaced by a
        space, every string literal by ``"STRING"``, every character literal
        by ``'CHAR'``, and every whitespace run by one space (leading and
        trailing whitespace removed).
    """
    def _placeholder(match):
        lexeme = match.group(0)
        if lexeme[0] == '"':
            return '"STRING"'
        if lexeme[0] == "'":
            return "'CHAR'"
        # A comment becomes a space (not nothing) so that the tokens on either
        # side of it are not glued together, e.g. ``int/*c*/x``.
        return ' '

    code = _CPP_COMMENT_OR_LITERAL_RE.sub(_placeholder, code)

    # Normalize whitespace
    return re.sub(r'\s+', ' ', code).strip()

In [None]:
# Keywords and library functions whose spelling must survive normalization.
# Membership is tested case-insensitively.
_PRESERVED_NAMES = frozenset({
    'int', 'char', 'void', 'if', 'else', 'for', 'while',
    'malloc', 'free', 'strcpy', 'strlen', 'memcpy', 'sizeof',
})

# Placeholder tokens produced by the cleaning/normalization steps; they are
# kept verbatim so that e.g. a masked string literal is not turned into VAR.
_PLACEHOLDER_TOKENS = frozenset({'STRING', 'CHAR', 'NUM', 'HEX', 'VAR', 'FUNC'})


def _normalize_call(match):
    """Return ``FUNC(`` for a generic call; keep preserved names untouched."""
    if match.group(1).lower() in _PRESERVED_NAMES:
        # Leaves ``if (``, ``while (``, ``sizeof(``, ``malloc(`` ... as written;
        # rewriting them to FUNC( would erase exactly the tokens that
        # normalize_identifier is meant to keep.
        return match.group(0)
    return 'FUNC('


def normalize_cpp_code(code):
    """Replace identifiers, numeric literals and call sites with generic tokens.

    Steps, in order:
      1. every identifier goes through ``normalize_identifier``;
      2. hexadecimal literals (``0x..``/``0X..``) become ``HEX``;
      3. decimal integer literals become ``NUM``;
      4. a name directly followed by ``(`` becomes ``FUNC(`` unless the name
         is one of the preserved keywords/functions.

    Args:
        code: Source text (normally the output of ``clean_cpp_code``).

    Returns:
        The normalized source text.
    """
    # Normalize variable names
    code = re.sub(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b',
                  lambda m: normalize_identifier(m.group()), code)

    # Normalize numeric literals
    code = re.sub(r'\b0[xX][0-9a-fA-F]+\b', 'HEX', code)
    code = re.sub(r'\b\d+\b', 'NUM', code)

    # Normalize function calls (preserved names are left alone)
    code = re.sub(r'(\w+)\s*\(', _normalize_call, code)

    return code


def normalize_identifier(name):
    """Map one identifier to its normalized form.

    Args:
        name: A single identifier.

    Returns:
        ``name`` unchanged when it is a placeholder token, a preserved
        keyword/function (case-insensitive) or at most three characters long;
        otherwise the generic token ``'VAR'``.
    """
    if name in _PLACEHOLDER_TOKENS or name.lower() in _PRESERVED_NAMES:
        return name
    if len(name) <= 3:
        return name  # Keep short vars
    return 'VAR'  # Generic variable

In [None]:
def extract_security_features(code):
    """Compute hand-crafted, regex-based features for one code snippet.

    Args:
        code: Source text of the snippet (newlines are used for the
            line-based features, so pass the raw, un-flattened code).

    Returns:
        A dict mapping feature name to a number. Keys are inserted in a fixed
        order (the ``has_*`` flags first, statistics last), which determines
        the column order when the dicts are turned into a DataFrame.

    Notes:
        The ``has_*`` flags are plain case-insensitive substring tests, so for
        example ``has_gets`` is also set by ``fgets``. Most counters below are
        likewise unanchored pattern counts, not parsed facts about the code.
    """
    features = {}
    lowered = code.lower()  # computed once instead of once per function name
    line_count = len(code.split('\n'))

    def count(pattern, flags=0):
        # Number of non-overlapping matches of `pattern` in the snippet.
        return len(re.findall(pattern, code, flags))

    # Enhanced dangerous function calls with more comprehensive list
    dangerous_funcs = ['strcpy', 'strcat', 'gets', 'sprintf', 'scanf',
                      'malloc', 'free', 'memcpy', 'strncpy', 'strncat',
                      'vsprintf', 'vsnprintf', 'sscanf', 'fscanf', 'fgets',
                      'alloca', 'realloc', 'calloc']

    for func in dangerous_funcs:
        features[f'has_{func}'] = int(func in lowered)

    # Memory operations
    features['ptr_operations'] = count(r'\*|\&')
    features['array_access'] = count(r'\[.*?\]')
    features['memory_alloc'] = count(r'malloc|calloc|new|alloca|realloc', re.IGNORECASE)
    features['memory_free'] = count(r'free|delete', re.IGNORECASE)

    # Advanced vulnerability patterns
    features['buffer_overflow_risk'] = count(r'(strcpy|strcat|gets|sprintf)\s*\(', re.IGNORECASE)
    features['format_string_vuln'] = count(r'printf\s*\(\s*[a-zA-Z_]\w*\s*[,)]')
    features['use_after_free_risk'] = detect_use_after_free_pattern(code)
    features['double_free_risk'] = detect_double_free_pattern(code)
    features['memory_leak_risk'] = abs(features['memory_alloc'] - features['memory_free'])

    # Input validation issues
    features['unchecked_input'] = count(r'(scanf|gets|fgets)\s*\(', re.IGNORECASE)
    features['missing_null_check'] = detect_missing_null_checks(code)
    features['array_bounds_risk'] = detect_array_bounds_issues(code)

    # Integer overflow/underflow risks
    features['integer_overflow_risk'] = count(r'(\+\+|\-\-|\+=|\-=|\*=).*?(\[|\*)')
    features['signed_unsigned_mix'] = count(r'(unsigned|signed)\s+\w+.*?(signed|unsigned)', re.IGNORECASE)

    # Control flow complexity (enhanced)
    features['if_statements'] = count(r'\bif\b')
    features['nested_loops'] = detect_nested_complexity(code, r'\b(for|while)\b')
    features['switch_statements'] = count(r'\bswitch\b')
    features['goto_statements'] = count(r'\bgoto\b')
    features['function_calls'] = count(r'\w+\s*\(')

    # Code quality indicators
    features['magic_numbers'] = count(r'\b\d{2,}\b')
    features['long_functions'] = int(line_count > 50)
    features['deep_nesting'] = calculate_max_nesting_depth(code)
    features['cyclomatic_complexity'] = estimate_cyclomatic_complexity(code)

    # String and file operations
    features['string_operations'] = count(r'str(cpy|cat|cmp|len|chr|str)', re.IGNORECASE)
    features['file_operations'] = count(r'(fopen|fclose|fread|fwrite|fprintf)', re.IGNORECASE)

    # Pointer arithmetic and casting
    features['pointer_arithmetic'] = count(r'(\*\s*\w+\s*[\+\-]|\w+\s*[\+\-]\s*\d+\s*\))')
    features['type_casting'] = count(r'\([a-zA-Z_]\w*\s*\*?\s*\)')
    features['void_pointer_usage'] = count(r'void\s*\*', re.IGNORECASE)

    # Security-specific patterns
    features['hardcoded_values'] = detect_hardcoded_credentials(code)
    # `su`/`sudo` are matched as whole words only: an unanchored `su` would
    # fire on every identifier containing "su" (result, sum, sub, ...).
    features['privilege_operations'] = count(r'setuid|setgid|chmod|chown|\b(?:su|sudo)\b', re.IGNORECASE)
    features['system_calls'] = count(r'(system|exec|popen|fork)', re.IGNORECASE)

    # Statistical features
    features['code_length'] = len(code)
    features['line_count'] = line_count
    features['avg_line_length'] = features['code_length'] / max(1, features['line_count'])
    features['char_entropy'] = calculate_entropy(code)
    features['unique_char_ratio'] = len(set(lowered)) / max(1, len(code))

    return features

In [6]:
# Helper functions for advanced vulnerability detection
def detect_use_after_free_pattern(code):
    """Count textual matches of three heuristic use-after-free patterns.

    The patterns are matched case-insensitively with ``.`` spanning newlines:
    a ``free(...)`` followed later by ``*name``, a ``delete name`` followed
    later by ``name[``, and a ``free(...)`` followed later by ``name(``.
    Returns the total number of matches across the three patterns.
    """
    flags = re.DOTALL | re.IGNORECASE
    risky_sequences = (
        r'free\s*\([^)]+\).*?\*\s*\w+',  # free, then a dereference
        r'delete\s+\w+.*?\w+\s*\[',      # delete, then an array access
        r'free\s*\([^)]+\).*?\w+\s*\(',  # free, then a call
    )
    return sum(len(re.findall(sequence, code, flags)) for sequence in risky_sequences)

def detect_double_free_pattern(code):
    """Return 1 if the same simple name is passed to ``free(...)`` twice, else 0.

    Only calls of the form ``free(<word>)`` are considered (case-insensitive).
    """
    freed_names = re.findall(r'free\s*\(\s*(\w+)\s*\)', code, re.IGNORECASE)
    # A set drops repeats, so a shorter set means some name was freed twice.
    has_repeat = len(set(freed_names)) < len(freed_names)
    return 1 if has_repeat else 0

def detect_missing_null_checks(code):
    """Return how many ``*name`` occurrences exceed the number of NULL checks.

    A NULL check is an ``if (<word> == NULL)`` or ``if (<word> != NULL)``
    (case-insensitive). The result is never negative.
    """
    star_uses = re.findall(r'\*\s*\w+', code)
    null_guards = re.findall(r'if\s*\(\s*\w+\s*[!=]=\s*NULL\s*\)', code, re.IGNORECASE)
    unguarded = len(star_uses) - len(null_guards)
    return unguarded if unguarded > 0 else 0

def detect_array_bounds_issues(code):
    """Return how many ``name[...]`` accesses exceed the number of comparisons.

    A comparison is an ``if (...)`` whose parenthesised text contains ``<`` or
    ``>``. The result is never negative.
    """
    subscripts = re.findall(r'\w+\s*\[\s*([^]]+)\s*\]', code)
    comparisons = re.findall(r'if\s*\([^)]*(<|>|<=|>=)[^)]*\)', code)
    unchecked = len(subscripts) - len(comparisons)
    return unchecked if unchecked > 0 else 0

def detect_nested_complexity(code, pattern):
    """Estimate the deepest nesting of lines matching ``pattern``.

    The code is scanned line by line: a line matching ``pattern`` raises a
    running counter, and a line containing ``}`` lowers it (never below
    zero). The highest counter value reached is returned.
    """
    deepest = 0
    open_count = 0

    for text_line in code.split('\n'):
        if re.search(pattern, text_line):
            open_count += 1
            if open_count > deepest:
                deepest = open_count
        # The closing-brace check runs after the match check on the same line.
        if '}' in text_line and open_count > 0:
            open_count -= 1

    return deepest

def calculate_max_nesting_depth(code):
    """Return the deepest level of ``{`` ... ``}`` nesting found in ``code``.

    Unbalanced closing braces are ignored: the running level never drops
    below zero.
    """
    level = 0
    deepest = 0

    for symbol in code:
        if symbol == '{':
            level += 1
            if level > deepest:
                deepest = level
        elif symbol == '}' and level > 0:
            level -= 1

    return deepest

# Keyword decision points are whole words, so they need \b anchors.
_DECISION_KEYWORD_RE = re.compile(
    r'\b(?:if|else|elif|for|while|case|catch)\b', re.IGNORECASE
)
# Operator decision points consist of non-word characters; wrapping them in
# \b would require a word character on both sides and so miss `a && b`.
_DECISION_OPERATOR_RE = re.compile(r'\?|&&|\|\|')


def estimate_cyclomatic_complexity(code):
    """Estimate cyclomatic complexity by counting decision points.

    Args:
        code: Source text of the snippet.

    Returns:
        ``1`` (base complexity) plus the number of occurrences of the keywords
        ``if``, ``else``, ``elif``, ``for``, ``while``, ``case``, ``catch``
        (whole words, case-insensitive) and of the operators ``?``, ``&&``
        and ``||``.
    """
    keyword_hits = len(_DECISION_KEYWORD_RE.findall(code))
    operator_hits = len(_DECISION_OPERATOR_RE.findall(code))
    return 1 + keyword_hits + operator_hits

def detect_hardcoded_credentials(code):
    """Count assignments of quoted literals to credential-like names.

    Three case-insensitive patterns are counted and summed: password-like
    names with a literal of 3+ characters, key/secret/token names with 8+
    characters, and api-key names with 10+ characters. A single assignment
    can be counted by more than one pattern.
    """
    credential_patterns = (
        r'(password|passwd|pwd)\s*=\s*["\'][^"\']{3,}["\']',
        r'(key|secret|token)\s*=\s*["\'][^"\']{8,}["\']',
        r'(api_key|apikey)\s*=\s*["\'][^"\']{10,}["\']',
    )
    return sum(
        len(re.findall(candidate, code, re.IGNORECASE))
        for candidate in credential_patterns
    )

def calculate_entropy(text):
    """Return the Shannon entropy (in bits) of the characters of ``text``.

    Characters are counted case-insensitively (after ``lower()``), while the
    probabilities are taken relative to ``len(text)``. Empty input gives 0.
    """
    if not text:
        return 0

    total_chars = len(text)

    # Frequency table of lower-cased characters, in first-seen order.
    frequency = {}
    for symbol in text.lower():
        if symbol in frequency:
            frequency[symbol] += 1
        else:
            frequency[symbol] = 1

    bits = 0
    for occurrences in frequency.values():
        share = occurrences / total_chars
        if share > 0:
            bits -= share * np.log2(share)

    return bits

In [None]:
def preprocess_cpp_dataset(df):
    """Run cleaning, normalization and feature extraction over a dataset.

    Args:
        df: DataFrame with a ``code`` column and, optionally, a ``Label``
            column. It is not modified.

    Returns:
        A DataFrame with the same index as ``df`` containing the
        ``normalized_code`` column, one column per feature returned by
        ``extract_security_features`` (computed on the raw ``code`` text),
        and ``Label`` when the input has it.
    """
    processed_df = df.copy()

    print(f"Processing {len(df)} code samples...")

    print("Step 1: Cleaning code...")
    processed_df['cleaned_code'] = processed_df['code'].apply(clean_cpp_code)

    print("Step 2: Normalizing code...")
    processed_df['normalized_code'] = processed_df['cleaned_code'].apply(normalize_cpp_code)

    print("Step 3: Extracting enhanced security features...")
    security_features_list = []

    for idx, code in enumerate(processed_df['code']):
        if idx % 5000 == 0:
            print(f"  Processing sample {idx}/{len(processed_df)}")

        security_features_list.append(extract_security_features(code))

    # Reuse the input's index: with a fresh 0..n-1 index the column-wise
    # concat below would misalign rows (and introduce NaNs) whenever `df`
    # was filtered, shuffled or otherwise not indexed 0..n-1.
    security_df = pd.DataFrame(security_features_list, index=processed_df.index)

    result_df = pd.concat([
        processed_df[['normalized_code']],
        security_df,
    ], axis=1)

    if 'Label' in processed_df.columns:
        result_df['Label'] = processed_df['Label']

    print("Preprocessing complete!")
    print(f"Enhanced features created: {len(security_df.columns)} features")
    print(f"Final shape: {result_df.shape}")

    return result_df

In [18]:
# data augmentation
def augment_features(X, y, noise_factor=0.1, augment_ratio=0.5,
                     tfidf_end=2000, random_state=None):
    """Augment a dataset with noisy copies of randomly chosen rows.

    A fraction ``augment_ratio`` of the rows is sampled without replacement,
    copied, and Gaussian noise is added to the columns from ``tfidf_end``
    onwards (i.e. the features that FOLLOW the TF-IDF block; the first
    ``tfidf_end`` columns are copied unchanged). Originals and copies are
    stacked and shuffled together.

    Args:
        X: 2-D NumPy feature array, one row per sample.
        y: 1-D NumPy label array aligned with ``X``.
        noise_factor: Standard deviation of the added noise.
        augment_ratio: Fraction of rows to duplicate; values above 1 make the
            sampling without replacement fail with ``ValueError``.
        tfidf_end: Index of the first column that receives noise. Set it to
            the actual TF-IDF width when that is not 2000.
        random_state: Optional integer seed for reproducible augmentation.
            When ``None`` the global ``np.random`` state is used.

    Returns:
        Tuple ``(X_combined, y_combined)`` with ``len(X) + int(len(X) *
        augment_ratio)`` rows in shuffled order. The inputs are not modified.
    """
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    n_samples = int(len(X) * augment_ratio)
    indices = rng.choice(len(X), n_samples, replace=False)

    # Index-array selection already yields new arrays, so the inputs stay intact.
    X_aug = X[indices]
    y_aug = y[indices]

    # Float noise cannot be added in place to an integer array.
    if not np.issubdtype(X_aug.dtype, np.floating):
        X_aug = X_aug.astype(np.float64)

    noisy_block = X_aug[:, tfidf_end:]
    X_aug[:, tfidf_end:] += rng.normal(0, noise_factor, noisy_block.shape)

    # Combine original and augmented data
    X_combined = np.vstack([X, X_aug])
    y_combined = np.hstack([y, y_aug])

    # Shuffle rows and labels with the same permutation
    shuffle_idx = rng.permutation(len(X_combined))
    return X_combined[shuffle_idx], y_combined[shuffle_idx]


In [None]:
def create_pipeline_features(train_df, test_df):
    """Build the dense train/test feature matrices used by the models.

    Both frames are preprocessed with ``preprocess_cpp_dataset``. A TF-IDF
    vectorizer (word 1- to 3-grams, at most 2000 features) is fitted on the
    normalized training code, and the hand-crafted numeric features are
    standardised with a scaler fitted on the training data. Each matrix is
    the TF-IDF block followed by the scaled numeric block.

    Args:
        train_df: Training DataFrame with ``code`` and ``Label`` columns.
        test_df: Test DataFrame with a ``code`` column.

    Returns:
        Tuple ``(X_train, X_test, y_train)`` of NumPy arrays.
    """
    print("Step 1: Applying enhanced preprocessing...")

    train_processed = preprocess_cpp_dataset(train_df.copy())
    test_processed = preprocess_cpp_dataset(test_df.copy())

    print("Step 2: Creating TF-IDF features...")
    vectorizer = TfidfVectorizer(
        max_features=2000,
        ngram_range=(1, 3),
        min_df=2,
        max_df=0.95,
        sublinear_tf=True,
        # No stop-word filtering: the input is source code, and the English
        # stop-word list would drop tokens such as `if`, `for`, `while` and
        # `else`, which are meaningful control-flow keywords here.
        stop_words=None
    )

    tfidf_train = vectorizer.fit_transform(train_processed['normalized_code'])
    tfidf_test = vectorizer.transform(test_processed['normalized_code'])

    print("Step 3: Using enhanced security features...")

    feature_cols = [col for col in train_processed.columns
                   if col not in ['normalized_code', 'Label']]

    print(f"Total enhanced features: {len(feature_cols)}")
    print("Feature categories:")
    print(f"  - Dangerous function detection: {len([f for f in feature_cols if f.startswith('has_')])}")
    print(f"  - Vulnerability patterns: {len([f for f in feature_cols if 'risk' in f or 'vuln' in f])}")
    print(f"  - Code complexity: {len([f for f in feature_cols if any(x in f for x in ['complexity', 'nesting', 'depth'])])}")
    print(f"  - Security patterns: {len([f for f in feature_cols if any(x in f for x in ['hardcoded', 'privilege', 'system'])])}")

    # The scaler is fitted on training data only and then applied to the test set.
    scaler = StandardScaler()
    train_numerical = scaler.fit_transform(train_processed[feature_cols].fillna(0))
    test_numerical = scaler.transform(test_processed[feature_cols].fillna(0))

    # Column layout: [TF-IDF block | scaled numeric block].
    X_train = np.hstack([
        tfidf_train.toarray(),
        train_numerical
    ])

    X_test = np.hstack([
        tfidf_test.toarray(),
        test_numerical
    ])

    y_train = train_processed['Label'].values

    print(f"Final shapes - Train: {X_train.shape}, Test: {X_test.shape}")
    print(f"TF-IDF features: {tfidf_train.shape[1]}")
    print(f"Enhanced security features: {len(feature_cols)}")
    print(f"Total features: {X_train.shape[1]}")

    return X_train, X_test, y_train

print("Creating enhanced features...")
X_train, X_test, y_train = create_pipeline_features(train_df, test_df)

# Report how many samples fall into each class.
unique, counts = np.unique(y_train, return_counts=True)
print(f"\nClass distribution:")
for label, count in zip(unique, counts):
    print(f"  Class {label}: {count} samples ({count/len(y_train)*100:.1f}%)")


Creating enhanced features...
Step 1: Applying enhanced preprocessing...
Processing 20000 code samples...
Step 1: Cleaning code...
Step 2: Normalizing code...
Step 3: Extracting enhanced security features...
  Processing sample 0/20000
  Processing sample 5000/20000
  Processing sample 10000/20000
  Processing sample 15000/20000
Preprocessing complete!
Enhanced features created: 54 features
Final shape: (20000, 56)
Processing 7000 code samples...
Step 1: Cleaning code...
Step 2: Normalizing code...
Step 3: Extracting enhanced security features...
  Processing sample 0/7000
  Processing sample 5000/7000
Preprocessing complete!
Enhanced features created: 54 features
Final shape: (7000, 55)
Step 2: Creating TF-IDF features...
Step 3: Using enhanced security features...
Total enhanced features: 54
Feature categories:
  - Dangerous function detection: 18
  - Vulnerability patterns: 7
  - Code complexity: 2
  - Security patterns: 3
Final shapes - Train: (20000, 2054), Test: (7000, 2054)
TF-I

## Model Training 

In [None]:
def create_residual_block(x, units, dropout_rate=0.3):
    """Append a two-layer dense residual block to tensor ``x``.

    The transform branch is Dense(relu) -> BatchNorm -> Dropout -> Dense ->
    BatchNorm. Its output is added to a shortcut (``x`` itself, or a Dense
    projection of ``x`` when its last dimension differs from ``units``),
    followed by ReLU and Dropout. Returns the resulting tensor.
    """
    # Transform branch, applied layer by layer in the order listed.
    transform_layers = (
        layers.Dense(units, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(dropout_rate),
        layers.Dense(units),
        layers.BatchNormalization(),
    )
    branch = x
    for layer in transform_layers:
        branch = layer(branch)

    # Shortcut: identity when widths agree, otherwise a linear projection.
    needs_projection = x.shape[-1] != units
    shortcut = layers.Dense(units)(x) if needs_projection else x

    # Merge both paths, then activate and regularise.
    merged = layers.Add()([branch, shortcut])
    merged = layers.Activation('relu')(merged)
    return layers.Dropout(dropout_rate)(merged)

def res_net(X_tr, X_val, y_tr, y_val):
    """Build, train and evaluate the residual dense network.

    The model is trained on ``(X_tr, y_tr)`` with ``(X_val, y_val)`` as
    validation data, using early stopping and learning-rate reduction that
    both monitor ``val_auc``. ROC-AUC is printed for both splits.

    Returns:
        Tuple ``(model, validation_roc_auc)``.
    """
    print("Training Residual Network...")

    # (width, dropout) specifications for each stage of the network.
    stem_spec = ((4096, 0.2), (3072, 0.25), (2048, 0.3))
    residual_spec = ((2048, 0.4), (1024, 0.45), (1024, 0.5),
                     (512, 0.6), (256, 0.55), (128, 0.4))
    head_spec = ((96, 0.35), (64, 0.3), (32, 0.25))

    with tf.device('/GPU:0'):
        input_layer = layers.Input(shape=(X_tr.shape[1],))

        # Stem: plain Dense -> BatchNorm -> Dropout stacks.
        x = input_layer
        for width, rate in stem_spec:
            x = layers.Dense(width, activation='relu')(x)
            x = layers.BatchNormalization()(x)
            x = layers.Dropout(rate)(x)

        # Residual stages of decreasing width.
        for width, rate in residual_spec:
            x = create_residual_block(x, width, rate)

        # Head: narrowing Dense -> BatchNorm -> Dropout stacks.
        for width, rate in head_spec:
            x = layers.Dense(width, activation='relu')(x)
            x = layers.BatchNormalization()(x)
            x = layers.Dropout(rate)(x)

        # Last hidden layer has no batch normalisation.
        x = layers.Dense(16, activation='relu')(x)
        x = layers.Dropout(0.2)(x)

        output = layers.Dense(
            1,
            activation='sigmoid',
            kernel_regularizer=tf.keras.regularizers.l2(0.001),
        )(x)

        nn_model = keras.Model(inputs=input_layer, outputs=output)

    nn_model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=[keras.metrics.AUC(name='auc')],
    )

    # Both callbacks watch validation AUC, which should be maximised.
    training_callbacks = [
        EarlyStopping(monitor='val_auc', patience=10, restore_best_weights=True, mode='max'),
        ReduceLROnPlateau(monitor='val_auc', factor=0.5, patience=5, min_lr=1e-6, mode='max'),
    ]

    nn_model.fit(
        X_tr, y_tr,
        epochs=100,
        batch_size=128,
        validation_data=(X_val, y_val),
        callbacks=training_callbacks,
        verbose=1,
    )

    nn_train_score = roc_auc_score(y_tr, nn_model.predict(X_tr))
    print(f"   ResNet ROC-AUC on train: {nn_train_score:.4f}")

    nn_score = roc_auc_score(y_val, nn_model.predict(X_val))
    print(f"   ResNet ROC-AUC: {nn_score:.4f}")

    return nn_model, nn_score

In [None]:
def train_models(X_train, y_train):
    """Train the candidate networks and return the best one by validation ROC-AUC.

    The data is split 80/20 (stratified, ``random_state=42``); only the
    training part is augmented with ``augment_features``. A sequential dense
    network and the residual network from ``res_net`` are trained, and the
    model with the higher validation ROC-AUC is selected.

    Args:
        X_train: Feature matrix.
        y_train: Labels aligned with ``X_train``.

    Returns:
        Tuple ``(best_model, best_score, best_model_name)``.
    """
    X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)
    # Augment after the split so the validation set contains only original rows.
    X_tr, y_tr = augment_features(X_tr, y_tr)
    models = {}
    results = {}

    # Disabled baseline experiments, kept for reference.
    # NOTE(review): if re-enabled, these score ROC-AUC from `predict` output
    # rather than from probabilities/scores.

    # # 1. Logistic Regression
    # print("Training Logistic Regression...")
    # lr_model = LogisticRegression(max_iter=1000, random_state=42)
    # lr_model.fit(X_tr, y_tr)
    # lr_pred = lr_model.predict(X_val)
    # lr_score = roc_auc_score(y_val, lr_pred)
    # models['Logistic Regression'] = lr_model
    # results['Logistic Regression'] = lr_score
    # print(f"   Logistic Regression ROC-AUC: {lr_score:.4f}")

    # # 2. XGBoost
    # print("Training XGBoost...")
    # xgb_model = xgb.XGBClassifier(random_state=42)
    # xgb_model.fit(X_tr, y_tr)
    # xgb_pred = xgb_model.predict(X_val)
    # xgb_score = roc_auc_score(y_val, xgb_pred)
    # models['XGBoost'] = xgb_model
    # results['XGBoost'] = xgb_score
    # print(f"   XGBoost ROC-AUC: {xgb_score:.4f}")

    # 3. Neural Network
    print("Training Neural Network...")
    with tf.device('/GPU:0'):
        # Dense stack narrowing from 4096 to 64 units, then a sigmoid output.
        nn_model = keras.Sequential([
            layers.Dense(4096, activation='relu', input_shape=(X_tr.shape[1],)),
            layers.BatchNormalization(),
            layers.Dropout(0.6),

            layers.Dense(2048, activation='relu'),
            layers.BatchNormalization(),
            layers.Dropout(0.6),            

            layers.Dense(1024, activation='relu'),
            layers.BatchNormalization(),
            layers.Dropout(0.4),
            
            layers.Dense(512, activation='relu'),
            layers.BatchNormalization(),
            layers.Dropout(0.6),
            
            layers.Dense(512, activation='relu'),
            layers.BatchNormalization(),
            layers.Dropout(0.6),
            
            layers.Dense(256, activation='relu'),
            layers.BatchNormalization(),
            layers.Dropout(0.6),
            
            layers.Dense(128, activation='relu'),
            layers.BatchNormalization(),
            layers.Dropout(0.5),

            layers.Dense(64, activation='relu'),
            layers.Dropout(0.3),
            
            layers.Dense(1, activation='sigmoid')
        ])
    nn_model.compile(
        optimizer=Adam(learning_rate=0.001), 
        loss='binary_crossentropy', 
        metrics=[keras.metrics.AUC(name='auc')]
    )

    # Both callbacks monitor validation AUC (higher is better).
    early_stop = EarlyStopping(monitor='val_auc', patience=10, restore_best_weights=True, mode='max')
    reduce_lr = ReduceLROnPlateau(monitor='val_auc', factor=0.5, patience=5, min_lr=1e-6, mode='max')

    nn_model.fit(
        X_tr, y_tr, 
        epochs=100, 
        batch_size=128, 
        validation_data=(X_val, y_val), 
        callbacks=[early_stop, reduce_lr],
        verbose=1
    )
    # The training score is computed on the augmented training split.
    nn_train_pred = nn_model.predict(X_tr)
    nn_train_score = roc_auc_score(y_tr, nn_train_pred)
    print(f"   Neural Network ROC-AUC on train: {nn_train_score:.4f}")
    
    nn_pred = nn_model.predict(X_val)
    nn_score = roc_auc_score(y_val, nn_pred)
    models['Neural Network'] = nn_model
    results['Neural Network'] = nn_score
    print(f"   Neural Network ROC-AUC: {nn_score:.4f}")

    # 4. Residual network, trained on the same split.
    nn_res_model, nn_res_score = res_net(X_tr, X_val, y_tr, y_val)
    models['ResNet'] = nn_res_model
    results['ResNet'] = nn_res_score

    # Select the model with the highest validation ROC-AUC.
    best_model_name = max(results, key=results.get)
    best_model = models[best_model_name]
    best_score = results[best_model_name]
    
    print(f"\nBest Model: {best_model_name} (ROC-AUC: {best_score:.4f})")
    print(f"Target (>0.63): {'ACHIEVED' if best_score > 0.63 else 'NOT ACHIEVED'}")
    
    return best_model, best_score, best_model_name

best_model, best_score, best_model_name = train_models(X_train, y_train)

Training Neural Network...
Epoch 1/100
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 80ms/step - auc: 0.5139 - loss: 0.7996 - val_auc: 0.5407 - val_loss: 0.6946 - learning_rate: 0.0010
Epoch 2/100
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 79ms/step - auc: 0.5177 - loss: 0.7253 - val_auc: 0.5528 - val_loss: 0.6870 - learning_rate: 0.0010
Epoch 3/100
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 78ms/step - auc: 0.5317 - loss: 0.7007 - val_auc: 0.5607 - val_loss: 0.6802 - learning_rate: 0.0010
Epoch 4/100
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 79ms/step - auc: 0.5548 - loss: 0.6840 - val_auc: 0.5740 - val_loss: 0.6764 - learning_rate: 0.0010
Epoch 5/100
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 78ms/step - auc: 0.5775 - loss: 0.6736 - val_auc: 0.5884 - val_loss: 0.6682 - learning_rate: 0.0010
Epoch 6/100
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m

In [None]:
print("Training final model on full dataset...")
final_model = best_model

print("Generating predictions...")

# A model is handled as a neural network when its type name mentions
# 'Neural Network', or when it offers `predict` but no `predict_proba`.
type_mentions_nn = 'Neural Network' in str(type(best_model))
predict_only = hasattr(best_model, 'predict') and not hasattr(best_model, 'predict_proba')

if type_mentions_nn or predict_only:
    # The already-trained network is used as is; scores are cut at 0.5.
    test_probabilities = final_model.predict(X_test).flatten()
    test_predictions = (test_probabilities > 0.5).astype(int)
else:
    # Other estimators are refitted on the full training data first.
    final_model.fit(X_train, y_train)
    test_predictions = final_model.predict(X_test)

submission = pd.DataFrame({'ID': test_df['ID'], 'Label': test_predictions})

# The output file name embeds the winning model's name.
submission.to_csv(f'data/submission_{best_model_name}.csv', index=False)
print(f"Submission saved! Shape: {submission.shape}")
print(f"Label distribution: {pd.Series(test_predictions).value_counts().sort_index()}")
print(submission.head())

print(f"\nSubmission format check:")
print(f"- Unique labels: {sorted(submission['Label'].unique())}")
print(f"- Should be: [0, 1] only")

Training final model on full dataset...
Generating predictions...
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step
✅ Submission saved! Shape: (7000, 2)
Label distribution: 0    4024
1    2976
Name: count, dtype: int64
   ID  Label
0   0      0
1   1      0
2   2      0
3   3      1
4   4      1

Submission format check:
- Unique labels: [np.int64(0), np.int64(1)]
- Should be: [0, 1] only
