## ISE: Defect Detection Challenge
#### Description
In this competition, your task is to develop a model that can accurately classify source code snippets as either secure or insecure. With the rise of software vulnerabilities like resource leaks, use-after-free vulnerabilities, and denial-of-service (DoS) attacks, identifying insecure code is crucial for maintaining robust software systems.
Participants will be provided with a dataset containing labeled code snippets. The labels indicate whether the code is secure (0) or insecure (1). Your goal is to create an effective machine learning model that can predict these labels with high accuracy.
#### Key Objectives 
- Analyze code snippets for potential vulnerabilities.
- Develop models to automate the classification of secure and insecure code.
- Ensure the ROC score exceeds 0.63.

Import Essentials

In [1]:
import sklearn as sk
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

import tensorflow as tf
from tensorflow import keras
from keras import layers

Data Exploration

In [2]:
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')
sample_submission = pd.read_csv('data/sample_submission.csv')

print("Training data shape:", train_df.shape)
print("Test data shape:", test_df.shape)
print("\nTraining data columns:", train_df.columns.tolist())

print(train_df.head(3))

print(train_df.isnull().sum()) # none num 

Training data shape: (20000, 3)
Test data shape: (7000, 2)

Training data columns: ['ID', 'code', 'Label']
   ID                                               code  Label
0   0  int page_check_range(target_ulong start, targe...      0
1   1  static void pxa2xx_lcdc_dma0_redraw_rot0(PXA2x...      0
2   2  void OPPROTO op_POWER_slq (void)\n\n{\n\n    u...      1
ID       0
code     0
Label    0
dtype: int64


## C++ Code Preprocessing Pipeline 
- Basic Text Cleaning 
- C++ Specific Normalization
- Features Enginerring 
- Tokenization
- Vectorization using TF-IDF 

In [3]:
import re
import string

def clean_cpp_code(code):
    # Remove single-line comments
    code = re.sub(r'//.*', '', code)
    
    # Remove multi-line comments  
    code = re.sub(r'/\*.*?\*/', '', code, flags=re.DOTALL)
    
    # Remove string literals (but keep structure)
    code = re.sub(r'"[^"]*"', '"STRING"', code)
    code = re.sub(r"'[^']*'", "'CHAR'", code)
    
    # Normalize whitespace
    code = re.sub(r'\s+', ' ', code)
    code = code.strip()
    
    return code

In [4]:
def normalize_cpp_code(code):
    # Normalize variable names (preserve patterns but reduce vocabulary)
    code = re.sub(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b', 
                  lambda m: normalize_identifier(m.group()), code)
    
    # Normalize numeric literals
    code = re.sub(r'\b\d+\b', 'NUM', code)
    code = re.sub(r'\b0x[0-9a-fA-F]+\b', 'HEX', code)
    
    # Normalize function calls (keep structure)
    code = re.sub(r'(\w+)\s*\(', r'FUNC(', code)
    
    return code

def normalize_identifier(name):
    # Keep important C++ keywords and functions
    cpp_keywords = {'int', 'char', 'void', 'if', 'else', 'for', 'while', 
                   'malloc', 'free', 'strcpy', 'strlen', 'memcpy', 'sizeof'}
    
    if name.lower() in cpp_keywords:
        return name
    elif len(name) <= 3:
        return name  # Keep short vars
    else:
        return 'VAR'  # Generic variable

In [5]:
def extract_security_features(code):
    features = {}
    
    # Dangerous function calls
    dangerous_funcs = ['strcpy', 'strcat', 'gets', 'sprintf', 'scanf', 
                      'malloc', 'free', 'memcpy', 'strncpy']
    
    for func in dangerous_funcs:
        features[f'has_{func}'] = int(func in code.lower())
    
    # Memory operations
    features['ptr_operations'] = len(re.findall(r'\*|\&', code))
    features['array_access'] = len(re.findall(r'\[.*?\]', code))
    features['memory_alloc'] = len(re.findall(r'malloc|calloc|new', code))
    features['memory_free'] = len(re.findall(r'free|delete', code))
    
    # Control flow complexity
    features['if_statements'] = len(re.findall(r'\bif\b', code))
    features['loops'] = len(re.findall(r'\b(for|while)\b', code))
    features['function_calls'] = len(re.findall(r'\w+\s*\(', code))
    
    # Potential vulnerability patterns
    features['buffer_ops'] = len(re.findall(r'strcpy|strcat|gets|sprintf', code))
    features['unchecked_input'] = len(re.findall(r'scanf|gets', code))
    features['pointer_arithmetic'] = len(re.findall(r'\+\+|\-\-|\+\s*\d|\-\s*\d', code))
    
    return features

In [6]:
def preprocess_cpp_dataset(df):
    """
    Complete preprocessing pipeline for C++ code dataset
    
    Args:
        df: DataFrame with 'code' column and optionally 'Label' column
        
    Returns:
        DataFrame with processed features:
        - normalized_code: cleaned and normalized code text
        - security features: has_*, ptr_operations, etc.
        - Label: original label (if present)
    """
    processed_df = df.copy()
    
    print(f"Processing {len(df)} code samples...")
    
    # Step 1: Clean the code (remove comments, normalize strings)
    print("Step 1: Cleaning code...")
    processed_df['cleaned_code'] = processed_df['code'].apply(clean_cpp_code)
    
    # Step 2: Normalize the code (identifiers, numbers, functions)
    print("Step 2: Normalizing code...")
    processed_df['normalized_code'] = processed_df['cleaned_code'].apply(normalize_cpp_code)
    
    # Step 3: Extract security features
    print("Step 3: Extracting security features...")
    security_features_list = []
    
    for idx, code in enumerate(processed_df['code']):
        if idx % 5000 == 0:
            print(f"  Processing sample {idx}/{len(processed_df)}")
        
        features = extract_security_features(code)
        security_features_list.append(features)
    
    # Convert security features to DataFrame and combine
    security_df = pd.DataFrame(security_features_list)
    
    # Combine all features
    result_df = pd.concat([
        processed_df[['normalized_code']],  # Keep normalized code for TF-IDF
        security_df,  # Add all security features
    ], axis=1)
    
    # Keep Label column if it exists (for training data)
    if 'Label' in processed_df.columns:
        result_df['Label'] = processed_df['Label']
    
    print(f"✅ Preprocessing complete!")
    print(f"Features created: {list(security_df.columns)}")
    print(f"Final shape: {result_df.shape}")
    
    return result_df

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import warnings
warnings.filterwarnings('ignore')


In [8]:
def create_pipeline_features(train_df, test_df):
    """Simple pipeline using your existing preprocessing"""
    
    print("Step 1: Applying your preprocessing...")
    # Use your existing preprocessing function
    train_processed = preprocess_cpp_dataset(train_df.copy())
    test_processed = preprocess_cpp_dataset(test_df.copy())
    
    print("Step 2: Creating TF-IDF features...")
    # TF-IDF on normalized code
    vectorizer = TfidfVectorizer(
        max_features=1800,
        ngram_range=(1, 3),
        min_df=2,
        max_df=0.95,
        sublinear_tf = True
    )
    
    # Create TF-IDF features
    tfidf_train = vectorizer.fit_transform(train_processed['normalized_code'])
    tfidf_test = vectorizer.transform(test_processed['normalized_code'])
    
    print("Step 3: Using your security features...")
    # Get your security features (numerical)
    feature_cols = [col for col in train_processed.columns 
                   if col.startswith('has_') or col in ['ptr_operations', 'array_access', 
                   'memory_alloc', 'memory_free', 'if_statements', 'loops', 
                   'function_calls', 'buffer_ops', 'unchecked_input', 'pointer_arithmetic']]
    
    # Combine TF-IDF + your security features
    X_train = np.hstack([
        tfidf_train.toarray(),
        train_processed[feature_cols].values
    ])
    
    X_test = np.hstack([
        tfidf_test.toarray(),
        test_processed[feature_cols].values
    ])
    
    y_train = train_processed['Label'].values
    
    print(f"Final shapes - Train: {X_train.shape}, Test: {X_test.shape}")
    print(f"TF-IDF features: {tfidf_train.shape[1]}, Security features: {len(feature_cols)}")
    
    return X_train, X_test, y_train

# Create features
X_train, X_test, y_train = create_pipeline_features(train_df, test_df)


Step 1: Applying your preprocessing...
Processing 20000 code samples...
Step 1: Cleaning code...
Step 2: Normalizing code...
Step 3: Extracting security features...
  Processing sample 0/20000
  Processing sample 5000/20000
  Processing sample 10000/20000
  Processing sample 15000/20000
✅ Preprocessing complete!
Features created: ['has_strcpy', 'has_strcat', 'has_gets', 'has_sprintf', 'has_scanf', 'has_malloc', 'has_free', 'has_memcpy', 'has_strncpy', 'ptr_operations', 'array_access', 'memory_alloc', 'memory_free', 'if_statements', 'loops', 'function_calls', 'buffer_ops', 'unchecked_input', 'pointer_arithmetic']
Final shape: (20000, 21)
Processing 7000 code samples...
Step 1: Cleaning code...
Step 2: Normalizing code...
Step 3: Extracting security features...
  Processing sample 0/7000
  Processing sample 5000/7000
✅ Preprocessing complete!
Features created: ['has_strcpy', 'has_strcat', 'has_gets', 'has_sprintf', 'has_scanf', 'has_malloc', 'has_free', 'has_memcpy', 'has_strncpy', 'ptr_

In [4]:
# Add this cell to check:
import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")
print(f"Number of GPUs: {torch.cuda.device_count()}")

PyTorch version: 2.7.1+cpu
CUDA available: False
CUDA version: None
Number of GPUs: 0


In [17]:
def train_models_with_codebert(X_train, y_train, train_df, test_df):
    """Enhanced training with CodeBERT option"""
    
    global train_test_split, roc_auc_score

    # Your existing models first
    X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.2, 
                                                random_state=42, stratify=y_train)
    
    models = {}
    results = {}
    
    print("Training models...")
    
    # # 1. Your existing Logistic Regression
    # print("1. Logistic Regression...")
    # lr = LogisticRegression(random_state=42, max_iter=1000, class_weight='balanced')
    # lr.fit(X_tr, y_tr)
    # lr_pred = lr.predict_proba(X_val)[:, 1]
    # lr_score = roc_auc_score(y_val, lr_pred)
    # models['Logistic Regression'] = lr
    # results['Logistic Regression'] = lr_score
    # print(f"   ROC-AUC: {lr_score:.4f}")
    
    # # 2. Your existing XGBoost
    # print("2. XGBoost...")
    # scale_pos_weight = len(y_tr[y_tr==0]) / len(y_tr[y_tr==1])
    # xgb_model = xgb.XGBClassifier(
    #     n_estimators=1200, max_depth=8, learning_rate=0.1,
    #     subsample=0.8, colsample_bytree=0.8, reg_alpha=0.1, reg_lambda=1.0,
    #     min_child_weight=3, gamma=0.1, random_state=42,
    #     scale_pos_weight=scale_pos_weight, eval_metric='auc'
    # )
    # xgb_model.fit(X_tr, y_tr)
    # xgb_pred = xgb_model.predict_proba(X_val)[:, 1]
    # xgb_score = roc_auc_score(y_val, xgb_pred)
    # models['XGBoost'] = xgb_model
    # results['XGBoost'] = xgb_score
    # print(f"   ROC-AUC: {xgb_score:.4f}")
    
    # 3. CodeBERT (Improved Simple Version)
    print("3. CodeBERT (Improved Simple Version)...")
    try:
        import time
        import torch
        from torch.utils.data import DataLoader
        
        classifier = CodeBERTClassifier(max_length=384)  # Longer sequences
        
        # Use MORE data - this is key!
        subset_size = 15000  # Much more data
        train_texts = train_df['code'].tolist()
        train_labels = train_df['Label'].tolist()
        
        cb_train_texts, cb_val_texts, cb_train_labels, cb_val_labels = train_test_split(
            train_texts, train_labels, test_size=0.2, random_state=42, stratify=train_labels
        )
        
        print(f"   Training on {len(cb_train_texts)} samples...")
        
        # Better preprocessing
        def better_preprocess(code):
            # Keep more structure
            code = code.replace('\n', ' NEWLINE ')
            code = code.replace('\t', ' TAB ')
            return code
        
        # Apply better preprocessing
        cb_train_texts = [better_preprocess(text) for text in cb_train_texts]
        cb_val_texts = [better_preprocess(text) for text in cb_val_texts]
        
        # Tokenize
        train_encodings = classifier.tokenizer(
            cb_train_texts, truncation=True, padding=True, max_length=384, return_tensors="pt"
        )
        val_encodings = classifier.tokenizer(
            cb_val_texts, truncation=True, padding=True, max_length=384, return_tensors="pt"
        )
        
        # Dataset
        class SimpleDataset:
            def __init__(self, encodings, labels):
                self.encodings = encodings
                self.labels = labels
            def __getitem__(self, idx):
                item = {key: val[idx] for key, val in self.encodings.items()}
                item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
                return item
            def __len__(self):
                return len(self.labels)
        
        train_dataset = SimpleDataset(train_encodings, cb_train_labels)
        val_dataset = SimpleDataset(val_encodings, cb_val_labels)
        
        # Better dataloaders
        train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)  # Slightly smaller batch
        val_loader = DataLoader(val_dataset, batch_size=24, shuffle=False)
        
        # Better training setup
        optimizer = torch.optim.AdamW(classifier.model.parameters(), lr=1e-5, weight_decay=0.01)  # Lower LR
        
        print(f"   Training for 4 epochs...")  # More epochs
        best_auc = 0
        
        for epoch in range(4):  # More training
            classifier.model.train()
            epoch_loss = 0
            
            for batch_idx, batch in enumerate(train_loader):
                optimizer.zero_grad()
                
                input_ids = batch['input_ids'].to(classifier.device)
                attention_mask = batch['attention_mask'].to(classifier.device)
                labels = batch['labels'].to(classifier.device)
                
                outputs = classifier.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                loss.backward()
                optimizer.step()
                
                epoch_loss += loss.item()
                
                if batch_idx % 100 == 0:
                    print(f"   Epoch {epoch+1}, Batch {batch_idx}/{len(train_loader)}, Loss: {loss.item():.3f}")
            
            # Validation
            classifier.model.eval()
            val_preds = []
            val_labels_list = []
            
            with torch.no_grad():
                for batch in val_loader:
                    input_ids = batch['input_ids'].to(classifier.device)
                    attention_mask = batch['attention_mask'].to(classifier.device)
                    labels = batch['labels'].to(classifier.device)
                    
                    outputs = classifier.model(input_ids=input_ids, attention_mask=attention_mask)
                    probs = torch.softmax(outputs.logits, dim=-1)[:, 1]
                    
                    val_preds.extend(probs.cpu().numpy())
                    val_labels_list.extend(labels.cpu().numpy())
            
            val_auc = roc_auc_score(val_labels_list, val_preds)
            avg_loss = epoch_loss / len(train_loader)
            print(f"   Epoch {epoch+1} - Val AUC: {val_auc:.4f}, Avg Loss: {avg_loss:.3f}")
            
            if val_auc > best_auc:
                best_auc = val_auc
                print(f"   ✓ New best: {best_auc:.4f}")
        
        models['CodeBERT'] = classifier
        results['CodeBERT'] = best_auc
        print(f"   🎯 Final CodeBERT AUC: {best_auc:.4f}")
        
        if best_auc > 0.63:
            print(f"   ✅ SUCCESS! Beat target of 0.63")
        else:
            print(f"   ❌ Still below 0.63, need: {0.63 - best_auc:.3f} more")

    except Exception as e:
        print(f"   CodeBERT failed: {str(e)}")
        
        # 4. Your existing Neural Network as fallback
        nn_model = keras.Sequential([
            layers.Dense(1024, activation='relu', input_shape=(X_tr.shape[1],)),
            layers.Dropout(0.2),
            layers.Dense(512, activation='relu'),
            layers.Dropout(0.3),
            layers.Dense(64, activation='relu'),
            layers.Dropout(0.3),
            layers.Dense(1, activation='sigmoid')
        ])
        nn_model.compile(
            optimizer=Adam(learning_rate=0.001), 
            loss='binary_crossentropy', 
            metrics=[keras.metrics.AUC(name='auc')]
        )
        
        early_stop = EarlyStopping(monitor='val_auc', patience=10, restore_best_weights=True, mode='max')
        reduce_lr = ReduceLROnPlateau(monitor='val_auc', factor=0.5, patience=5, min_lr=1e-6, mode='max')
        
        nn_model.fit(X_tr, y_tr, epochs=40, batch_size=64, 
                    validation_data=(X_val, y_val), callbacks=[early_stop, reduce_lr])
        
        nn_pred = nn_model.predict(X_val)
        nn_score = roc_auc_score(y_val, nn_pred)
        models['Neural Network'] = nn_model
        results['Neural Network'] = nn_score
        print(f"   Neural Network ROC-AUC: {nn_score:.4f}")
    
    # Find best model
    best_model_name = max(results, key=results.get)
    best_model = models[best_model_name]
    best_score = results[best_model_name]
    
    print(f"\nBest Model: {best_model_name} (ROC-AUC: {best_score:.4f})")
    print(f"Target (>0.63): {'ACHIEVED' if best_score > 0.63 else 'NOT ACHIEVED'}")
    
    return best_model, best_score, best_model_name

# Train models
best_model, best_score, best_model_name = train_models_with_codebert(X_train, y_train, train_df, test_df)


Training models...
3. CodeBERT (Improved Simple Version)...
Using device: cuda


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


   Training on 16000 samples...
   Training for 4 epochs...
   Epoch 1, Batch 0/2000, Loss: 0.691
   Epoch 1, Batch 100/2000, Loss: 0.687
   Epoch 1, Batch 200/2000, Loss: 0.691
   Epoch 1, Batch 300/2000, Loss: 0.670
   Epoch 1, Batch 400/2000, Loss: 0.683
   Epoch 1, Batch 500/2000, Loss: 0.653
   Epoch 1, Batch 600/2000, Loss: 0.654
   Epoch 1, Batch 700/2000, Loss: 0.662
   Epoch 1, Batch 800/2000, Loss: 0.738
   Epoch 1, Batch 900/2000, Loss: 0.720
   Epoch 1, Batch 1000/2000, Loss: 0.507
   Epoch 1, Batch 1100/2000, Loss: 0.759
   Epoch 1, Batch 1200/2000, Loss: 0.630
   Epoch 1, Batch 1300/2000, Loss: 0.535
   Epoch 1, Batch 1400/2000, Loss: 0.668
   Epoch 1, Batch 1500/2000, Loss: 0.626
   Epoch 1, Batch 1600/2000, Loss: 0.602
   Epoch 1, Batch 1700/2000, Loss: 0.665
   Epoch 1, Batch 1800/2000, Loss: 0.574
   Epoch 1, Batch 1900/2000, Loss: 0.480
   Epoch 1 - Val AUC: 0.6123, Avg Loss: 0.673
   ✓ New best: 0.6123
   Epoch 2, Batch 0/2000, Loss: 0.721
   Epoch 2, Batch 100/2000

In [23]:
# Simple CodeBERT Prediction (Batched)
print("Making predictions with CodeBERT...")

# Clear GPU memory first
torch.cuda.empty_cache()

# Get trained model
codebert_model = best_model
codebert_model.model.eval()

# Process in small batches
batch_size = 32
all_predictions = []

with torch.no_grad():
    for i in range(0, len(test_df), batch_size):
        # Get batch
        batch_texts = test_df['code'].iloc[i:i+batch_size].tolist()
        
        def better_preprocess(code):
            code = code.replace('\n', ' NEWLINE ')
            code = code.replace('\t', ' TAB ')
            return code

        batch_texts = [better_preprocess(text) for text in batch_texts]
        
        # Tokenize batch
        batch_inputs = codebert_model.tokenizer(
            batch_texts, 
            truncation=True, 
            padding=True, 
            max_length=384, 
            return_tensors="pt"
        )
        
        # Move to GPU
        batch_inputs = {k: v.to(codebert_model.device) for k, v in batch_inputs.items()}
        
        # Predict
        outputs = codebert_model.model(**batch_inputs)
        probs = torch.softmax(outputs.logits, dim=-1)[:, 1]
        predictions = (probs > 0.5).int().cpu().numpy()
        
        all_predictions.extend(predictions)
        
        # Clear GPU memory
        del batch_inputs, outputs, probs, predictions
        torch.cuda.empty_cache()
        
        # Progress
        if i % 500 == 0:
            print(f"   Processed {i}/{len(test_df)}")

# Save
submission = pd.DataFrame({
    'ID': test_df['ID'],
    'Label': all_predictions
})
submission.to_csv('data/submission.csv', index=False)

print(f"✅ Done! {len(all_predictions)} predictions saved.")

Making predictions with CodeBERT...
   Processed 0/7000
   Processed 4000/7000
✅ Done! 7000 predictions saved.


In [18]:
# Train final model and generate predictions
print("Training final model on full dataset...")
final_model = best_model

# Generate predictions on test set
print("Generating predictions...")

# Get probabilities first, then convert to binary labels
if 'Neural Network' in str(type(best_model)) or hasattr(best_model, 'predict') and not hasattr(best_model, 'predict_proba'):
    # Neural Network case - gives probabilities, convert to labels
    test_probabilities = final_model.predict(X_test).flatten()
    test_predictions = (test_probabilities > 0.5).astype(int)  # Convert to 0 or 1
else:
    # Sklearn models - retrain and get predictions
    final_model.fit(X_train, y_train)
    test_predictions = final_model.predict(X_test)  # Direct binary predictions

# Create submission file with binary labels
submission = pd.DataFrame({
    'ID': test_df['ID'],           
    'Label': test_predictions      
})

# Save submission
submission.to_csv('data/submission.csv', index=False)
print(f"✅ Submission saved! Shape: {submission.shape}")
print(f"Label distribution: {pd.Series(test_predictions).value_counts().sort_index()}")
print(submission.head())
# print(test_df.shape)

# Verify format
print(f"\nSubmission format check:")
print(f"- Unique labels: {sorted(submission['Label'].unique())}")
print(f"- Should be: [0, 1] only")

Training final model on full dataset...
Generating predictions...


AttributeError: 'CodeBERTClassifier' object has no attribute 'fit'

In [13]:
import torch
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    TrainingArguments, 
    Trainer,
    DataCollatorWithPadding
)
from datasets import Dataset
from sklearn.metrics import accuracy_score, roc_auc_score
import numpy as np

class CodeBERTClassifier:
    def __init__(self, model_name="microsoft/codebert-base", max_length=512):
        self.model_name = model_name
        self.max_length = max_length
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        
        print(f"Using device: {self.device}")
        
        # Initialize tokenizer and model with safetensors
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name, 
            num_labels=2,
            problem_type="single_label_classification",
            use_safetensors=True  # Force safetensors format
        )
        self.model.to(self.device)
    
    def preprocess_code(self, code_text):
        """Minimal preprocessing for CodeBERT"""
        code_text = ' '.join(code_text.split())
        if len(code_text) > 2000:
            code_text = code_text[:2000]
        return code_text

def compute_metrics(eval_pred):
    """Compute metrics for evaluation"""
    predictions, labels = eval_pred
    predictions = torch.softmax(torch.tensor(predictions), dim=-1)[:, 1]  # Get positive class probabilities
    
    # ROC-AUC
    roc_auc = roc_auc_score(labels, predictions)
    
    # Accuracy with threshold 0.5
    pred_labels = (predictions > 0.5).astype(int)
    accuracy = accuracy_score(labels, pred_labels)
    
    return {
        'roc_auc': roc_auc,
        'accuracy': accuracy
    }

def train_codebert_model(train_df, test_df):
    """Complete training pipeline for CodeBERT"""
    
    print("🚀 Starting CodeBERT training...")
    
    # Initialize classifier
    classifier = CodeBERTClassifier()
    
    # Prepare training data
    print("📝 Preparing training data...")
    train_texts = train_df['code'].tolist()
    train_labels = train_df['Label'].tolist()
    
    # Create train/validation split
    from sklearn.model_selection import train_test_split
    train_texts_split, val_texts_split, train_labels_split, val_labels_split = train_test_split(
        train_texts, train_labels, test_size=0.2, random_state=42, stratify=train_labels
    )
    
    # Tokenize datasets
    train_dataset = classifier.tokenize_data(train_texts_split, train_labels_split)
    val_dataset = classifier.tokenize_data(val_texts_split, val_labels_split)
    test_dataset = classifier.tokenize_data(test_df['code'].tolist())
    
    # Training arguments
    training_args = TrainingArguments(
        output_dir='./codebert_results',
        num_train_epochs=3,              # Start with 3 epochs
        per_device_train_batch_size=8,   # Adjust based on GPU memory
        per_device_eval_batch_size=16,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=100,
        evaluation_strategy="steps",
        eval_steps=500,
        save_steps=1000,
        load_best_model_at_end=True,
        metric_for_best_model="roc_auc",
        greater_is_better=True,
        report_to=None,  # Disable wandb/tensorboard
        dataloader_num_workers=0,  # Prevent multiprocessing issues
    )
    
    # Data collator
    data_collator = DataCollatorWithPadding(tokenizer=classifier.tokenizer)
    
    # Initialize trainer
    trainer = Trainer(
        model=classifier.model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=classifier.tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    
    # Train model
    print("🏋️ Training model...")
    trainer.train()
    
    # Evaluate on validation set
    print("📊 Evaluating on validation set...")
    eval_results = trainer.evaluate()
    print(f"Validation ROC-AUC: {eval_results['eval_roc_auc']:.4f}")
    print(f"Validation Accuracy: {eval_results['eval_accuracy']:.4f}")
    
    # Generate predictions on test set
    print("🔮 Generating test predictions...")
    test_predictions = trainer.predict(test_dataset)
    test_probs = torch.softmax(torch.tensor(test_predictions.predictions), dim=-1)[:, 1]
    test_labels = (test_probs > 0.5).int().numpy()
    
    return classifier, trainer, test_labels, test_probs.numpy(), eval_results['eval_roc_auc']

# Alternative: Ensemble CodeBERT with your existing models
def ensemble_codebert_with_existing(train_df, test_df, existing_predictions):
    """Combine CodeBERT with your existing models"""
    
    # Train CodeBERT
    classifier, trainer, codebert_test_labels, codebert_test_probs, codebert_score = train_codebert_model(train_df, test_df)
    
    # Simple ensemble: average probabilities
    # Convert your existing predictions to probabilities if needed
    if hasattr(existing_predictions, 'predict_proba'):
        existing_probs = existing_predictions
    else:
        existing_probs = existing_predictions  # Assume already probabilities
    
    # Weighted ensemble (CodeBERT likely better, so higher weight)
    ensemble_probs = 0.7 * codebert_test_probs + 0.3 * existing_probs
    ensemble_labels = (ensemble_probs > 0.5).astype(int)
    
    print(f"CodeBERT ROC-AUC: {codebert_score:.4f}")
    print(f"Ensemble predictions ready!")
    
    return ensemble_labels, ensemble_probs

# Usage example:
"""
# Option 1: Pure CodeBERT (Recommended)
classifier, trainer, test_predictions, test_probabilities, validation_score = train_codebert_model(train_df, test_df)

# Create submission
submission = pd.DataFrame({
    'ID': test_df['ID'],
    'Label': test_predictions
})
submission.to_csv('data/codebert_submission.csv', index=False)
print(f"CodeBERT ROC-AUC: {validation_score:.4f}")

# Option 2: Ensemble with existing models
ensemble_labels, ensemble_probs = ensemble_codebert_with_existing(train_df, test_df, your_existing_probabilities)
"""

'\n# Option 1: Pure CodeBERT (Recommended)\nclassifier, trainer, test_predictions, test_probabilities, validation_score = train_codebert_model(train_df, test_df)\n\n# Create submission\nsubmission = pd.DataFrame({\n    \'ID\': test_df[\'ID\'],\n    \'Label\': test_predictions\n})\nsubmission.to_csv(\'data/codebert_submission.csv\', index=False)\nprint(f"CodeBERT ROC-AUC: {validation_score:.4f}")\n\n# Option 2: Ensemble with existing models\nensemble_labels, ensemble_probs = ensemble_codebert_with_existing(train_df, test_df, your_existing_probabilities)\n'