In [3]:
"""
CodePilot: AI-Powered Test Failure Prediction for CI/CD Pipelines
Research Proposal Implementation - Hypothesis 2

This script implements the supervised ML classifier to predict CI test failures
based on pull request characteristics as described in the research methodology.

Expected Features (from Research Proposal):
- Files changed
- Lines added
- Lines deleted
- Average function complexity
- Code coverage change
- Previous test failure rate
- Contains test changes
- Build duration
- Module type
- label_test_failed (target variable)
"""

import pandas as pd
import json
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import (
    accuracy_score, 
    precision_score, 
    recall_score, 
    f1_score,
    roc_auc_score,
    classification_report,
    confusion_matrix
)
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# ---------------------------------------------------------------------------
# Banner, dataset loading, and target-column sanity check.
# Produces the module-level DataFrame `df` used by every later step.
# ---------------------------------------------------------------------------
print("="*70)
print("CodePilot: Test Failure Prediction Model")
print("Research Proposal Implementation - Hypothesis 2")
print("="*70)

# Load dataset
DATASET_PATH = '../data/synthetic_pr_dataset_v2.csv'
print("\n[1/8] Loading synthetic PR dataset...")
try:
    df = pd.read_csv(DATASET_PATH)
except FileNotFoundError:
    print(f"✗ Error: Dataset not found at '{DATASET_PATH}'")
    # `exit()` is a helper injected by the `site` module for interactive use;
    # raising SystemExit gives the same exit status without relying on it.
    raise SystemExit(1) from None
else:
    print(f"✓ Dataset loaded successfully: {df.shape[0]} PRs, {df.shape[1]} features")

print(f"\nColumn names: {df.columns.tolist()}")

# Check for target variable: nothing downstream can run without the label.
if 'label_test_failed' not in df.columns:
    print("\n✗ Error: Target variable 'label_test_failed' not found!")
    print(f"Available columns: {df.columns.tolist()}")
    raise SystemExit(1)

# Display initial data info and impute missing values.
# Numeric columns are filled with their median; object columns (except
# 'timestamp', which is validated separately later) with their most
# frequent value.
print("\n[2/8] Initial Data Assessment")
print(f"Dataset shape: {df.shape}")
print("\nMissing values per column:")
missing_summary = df.isnull().sum()
has_missing_values = missing_summary.sum() > 0
if not has_missing_values:
    print("✓ No missing values detected")
else:
    print(missing_summary[missing_summary > 0])

# Handle missing values if present
if has_missing_values:
    print("\nHandling missing values...")
    # NOTE(review): the numeric pass also covers 'label_test_failed' if it is
    # numeric and has gaps - confirm that imputing the label is intended.
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        if df[col].isnull().any():
            # Assign the result back: `df[col].fillna(..., inplace=True)` is a
            # chained in-place update, which pandas deprecates and which does
            # not modify `df` when Copy-on-Write is enabled.
            df[col] = df[col].fillna(df[col].median())

    categorical_cols = df.select_dtypes(include=['object']).columns
    for col in categorical_cols:
        if col != 'timestamp' and df[col].isnull().any():
            modes = df[col].mode()
            # mode() is empty when the whole column is missing; there is
            # nothing sensible to fill with, so leave the column untouched.
            if not modes.empty:
                df[col] = df[col].fillna(modes.iloc[0])
    print("✓ Missing values handled")

# Derive temporal features from the timestamp column when one is present.
# Rows whose timestamp cannot be parsed are dropped; the raw column is
# removed afterwards so that only the derived features remain.
if 'timestamp' not in df.columns:
    print("\n[3/8] No timestamp column found - skipping temporal features")
else:
    print("\n[3/8] Processing timestamp features...")
    try:
        # Unparseable values become NaT instead of raising.
        df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')

        n_invalid = df['timestamp'].isnull().sum()
        if n_invalid > 0:
            print(f"Warning: {n_invalid} invalid timestamps - dropping those rows")
            df = df.dropna(subset=['timestamp'])

        # Extract temporal features for CI analysis
        ts = df['timestamp']
        df['hour'] = ts.dt.hour
        df['day_of_week'] = ts.dt.dayofweek
        # dayofweek is 0 (Monday) .. 6 (Sunday), so 5 and 6 are the weekend.
        df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)
        # Hours 9 through 17 inclusive count as business hours.
        df['is_business_hours'] = df['hour'].between(9, 17).astype(int)

        df = df.drop(columns='timestamp')
        print("✓ Temporal features extracted: hour, day_of_week, is_weekend, is_business_hours")
    except Exception as e:
        # Best effort: on any failure continue without the raw column.
        print(f"Warning: Could not process timestamp - {e}")
        if 'timestamp' in df.columns:
            df = df.drop(columns='timestamp')

# One-hot encode every object-dtype column, dropping the first level of
# each to avoid a redundant indicator column.
print("\n[4/8] Encoding categorical variables...")
categorical_cols = list(df.select_dtypes(include=['object']).columns)

if categorical_cols:
    print(f"Categorical columns found: {categorical_cols}")
    df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
    print("✓ Encoded using one-hot encoding")
else:
    print("✓ No categorical columns to encode")

# Split the frame into the feature matrix X and the label vector y, then
# report how the two classes are distributed.
print("\n[5/8] Preparing features and target variable...")
y = df['label_test_failed']
X = df.drop(columns=['label_test_failed'])

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print("\nClass distribution:")
print(y.value_counts())
print("\nClass proportions:")
class_props = y.value_counts(normalize=True)
# .get(..., 0) keeps the report working if one class is absent entirely.
for description, class_label in (("Test Failed (1)", 1), ("Test Passed (0)", 0)):
    print(f"  {description}: {class_props.get(class_label, 0):.2%}")

# Data quality checks: replace infinities with NaN, impute NaN with the
# column median, then discard columns that carry no information.
print("\nData quality checks:")
needs_cleaning = np.isinf(X).any().any() or X.isnull().any().any()
if not needs_cleaning:
    print("✓ No data quality issues detected")
else:
    print("⚠ Found infinite or NaN values - cleaning...")
    without_inf = X.replace([np.inf, -np.inf], np.nan)
    # Medians are computed after the infinities have been turned into NaN.
    X = without_inf.fillna(without_inf.median())
    print("✓ Data cleaned")

# Remove constant features (at most one distinct non-null value)
distinct_counts = X.nunique()
constant_features = distinct_counts[distinct_counts <= 1].index.tolist()
if constant_features:
    print(f"\n⚠ Removing {len(constant_features)} constant features")
    X = X.drop(columns=constant_features)

print(f"\nFinal feature set: {X.shape[1]} features")
print(f"Feature names: {X.columns.tolist()}")

# Split data with stratification (as per research methodology: 70/15/15).
# The test set is carved off first; the validation set is then taken from
# what remains, with both splits stratified on the label.
print("\n[6/8] Splitting data (70% train, 15% validation, 15% test)...")
TEST_FRACTION = 0.15
VAL_FRACTION = 0.15
# The second split operates on the (1 - TEST_FRACTION) remainder, so the
# validation share has to be rescaled: 0.15 / 0.85 = 0.17647...
# Computing it exactly (instead of the rounded 0.176) makes the validation
# set the same share of the data as the test set.
val_fraction_of_remainder = VAL_FRACTION / (1.0 - TEST_FRACTION)

X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=TEST_FRACTION, random_state=42, stratify=y
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=val_fraction_of_remainder, random_state=42, stratify=y_temp
)

print(f"✓ Training set: {X_train.shape[0]} samples ({len(X_train)/len(X):.1%})")
print(f"✓ Validation set: {X_val.shape[0]} samples ({len(X_val)/len(X):.1%})")
print(f"✓ Test set: {X_test.shape[0]} samples ({len(X_test)/len(X):.1%})")

# Load or create model parameters.
# model.json is expected to hold a JSON object whose keys are passed as
# keyword arguments to RandomForestClassifier. If it is missing, empty,
# malformed, or not an object, defaults are used and written back to disk.
print("\n[7/8] Loading model configuration...")
try:
    with open('../model/model.json', 'r') as f:
        content = f.read().strip()
    if not content:
        raise ValueError("model.json is empty")
    model_params = json.loads(content)
    # Valid JSON that is not an object (e.g. a list) cannot be unpacked as
    # keyword arguments; treat it like any other unusable configuration.
    if not isinstance(model_params, dict):
        raise ValueError("model.json must contain a JSON object")
    print("✓ Model parameters loaded from model.json")
except (FileNotFoundError, json.JSONDecodeError, ValueError) as e:
    print("⚠ Could not load model.json - creating default configuration")
    print(f"  Reason: {e}")
    model_params = {
        'n_estimators': 100,
        'max_depth': 10,
        'min_samples_split': 5,
        'min_samples_leaf': 2,
        'random_state': 42,
        'n_jobs': -1,
        'class_weight': 'balanced'
    }

    # Persisting the defaults is best effort; training proceeds either way.
    try:
        os.makedirs('../model', exist_ok=True)
        with open('../model/model.json', 'w') as f:
            json.dump(model_params, f, indent=4)
        print("✓ Created new model.json with default parameters")
    except OSError as save_error:
        print(f"⚠ Could not save model.json: {save_error}")

print("\nModel configuration:")
for key, value in model_params.items():
    print(f"  {key}: {value}")

# Fit the Random Forest on the training split (as per research methodology)
print("\n[8/8] Training Random Forest Classifier...")
model = RandomForestClassifier(**model_params).fit(X_train, y_train)
print("✓ Model training completed")

# 5-fold cross-validation on the training split, scored by F1
# (as per research methodology)
print("\nPerforming 5-fold cross-validation...")
cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='f1')
cv_mean_f1 = cv_scores.mean()
cv_std_f1 = cv_scores.std()
print(f"✓ Cross-validation F1 scores: {cv_scores}")
print(f"✓ Mean CV F1-score: {cv_mean_f1:.4f} (+/- {cv_std_f1:.4f})")

def _report_split(banner, heading, features, target):
    """Print a banner, score the fitted `model` on one split, and report.

    Returns a tuple ``(predictions, positive_class_probabilities, scores)``
    where ``scores`` is ``(accuracy, precision, recall, f1, roc_auc)``.
    """
    print("\n" + "=" * 70)
    print(banner)
    print("=" * 70)
    predictions = model.predict(features)
    # Column 1 of predict_proba is used as the positive-class score.
    probabilities = model.predict_proba(features)[:, 1]

    scores = (
        accuracy_score(target, predictions),
        precision_score(target, predictions),
        recall_score(target, predictions),
        f1_score(target, predictions),
        roc_auc_score(target, probabilities),
    )

    print(f"\n{heading} Metrics:")
    metric_names = ("Accuracy", "Precision", "Recall", "F1-Score", "ROC-AUC")
    for metric_name, score in zip(metric_names, scores):
        # Left-pad "name:" to a common width so the values line up.
        print(f"  {metric_name + ':':<10} {score:.4f}")
    return predictions, probabilities, scores


# Validate on validation set
y_val_pred, y_val_proba, _val_scores = _report_split(
    "VALIDATION SET PERFORMANCE", "Validation", X_val, y_val
)
val_accuracy, val_precision, val_recall, val_f1, val_roc_auc = _val_scores

# Final evaluation on test set (all metrics as per research methodology)
y_test_pred, y_test_proba, _test_scores = _report_split(
    "TEST SET PERFORMANCE (Final Evaluation)", "Test", X_test, y_test
)
test_accuracy, test_precision, test_recall, test_f1, test_roc_auc = _test_scores

# Compare the test-set scores with the thresholds stated for Hypothesis 2.
f1_verdict = '✓ PASS' if test_f1 > 0.80 else '✗ FAIL'
roc_auc_verdict = '✓ PASS' if test_roc_auc > 0.85 else '✗ FAIL'

print("\n" + "=" * 70)
print("HYPOTHESIS 2 VALIDATION")
print("=" * 70)
print("Expected Outcomes (from Research Proposal):")
print(f"  • F1-score > 0.80: {f1_verdict} (Actual: {test_f1:.4f})")
print(f"  • ROC-AUC > 0.85: {roc_auc_verdict} (Actual: {test_roc_auc:.4f})")
# The CV line is informational only: no threshold is applied to it.
print(f"  • Consistent CV results: ✓ (Mean: {cv_scores.mean():.4f}, Std: {cv_scores.std():.4f})")

# Confusion matrix on the test set (rows: actual class, columns: predicted)
print("\n" + "=" * 70)
print("CONFUSION MATRIX")
print("=" * 70)
cm = confusion_matrix(y_test, y_test_pred)
print(f"\n{cm}")
print("\nBreakdown:")
cell_descriptions = (
    ("True Negatives (TN): ", (0, 0), "Correctly predicted test passed"),
    ("False Positives (FP):", (0, 1), "Incorrectly predicted test failed"),
    ("False Negatives (FN):", (1, 0), "Incorrectly predicted test passed"),
    ("True Positives (TP): ", (1, 1), "Correctly predicted test failed"),
)
for cell_label, cell_index, meaning in cell_descriptions:
    print(f"  {cell_label} {cm[cell_index]} - {meaning}")

# Per-class precision / recall / F1 on the test set
print("\n" + "=" * 70)
print("DETAILED CLASSIFICATION REPORT")
print("=" * 70)
report_text = classification_report(
    y_test, y_test_pred, target_names=['Test Passed', 'Test Failed']
)
print(report_text)

# Rank features by the fitted forest's importance scores (for interpretability)
print("\n" + "=" * 70)
print("FEATURE IMPORTANCE ANALYSIS")
print("=" * 70)
print("(Supporting interpretability requirement from Hypothesis 2)")

importance_table = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importances_
})
# Most important feature first.
feature_importance = importance_table.sort_values(by='importance', ascending=False)

print("\nTop 15 Most Important Features for Predicting Test Failures:")
top_features_text = feature_importance.head(15).to_string(index=False)
print(top_features_text)

# Save results summary.
# Collects the test-set metrics, CV statistics, confusion matrix, hypothesis
# verdicts and the ten most important features into one JSON document.
print("\n" + "="*70)
print("SAVING RESULTS")
print("="*70)
results_summary = {
    'model_type': 'RandomForestClassifier',
    # Row count of `df` at this point in the script.
    'dataset_size': len(df),
    'num_features': X.shape[1],
    'test_set_metrics': {
        'accuracy': float(test_accuracy),
        'precision': float(test_precision),
        'recall': float(test_recall),
        'f1_score': float(test_f1),
        'roc_auc': float(test_roc_auc)
    },
    'cross_validation': {
        'mean_f1': float(cv_scores.mean()),
        'std_f1': float(cv_scores.std())
    },
    'confusion_matrix': cm.tolist(),
    'hypothesis_2_validation': {
        # Comparing a NumPy scalar yields numpy.bool_, which the json module
        # cannot serialize; convert to a built-in bool explicitly.
        'f1_threshold_0.80': bool(test_f1 > 0.80),
        'roc_auc_threshold_0.85': bool(test_roc_auc > 0.85)
    },
    'top_10_features': feature_importance.head(10).to_dict('records')
}

try:
    # Serialize before opening the file so a serialization error cannot
    # leave a truncated, invalid JSON file behind.
    serialized_results = json.dumps(results_summary, indent=4)
    os.makedirs('../results', exist_ok=True)
    with open('../results/model_evaluation_results.json', 'w') as f:
        f.write(serialized_results)
    print("✓ Results saved to '../results/model_evaluation_results.json'")
except (OSError, TypeError, ValueError) as e:
    # Saving is best effort: report the problem and let the script finish.
    print(f"⚠ Could not save results: {e}")

# Closing banner and one-line verdict on Hypothesis 2 (both thresholds
# must be exceeded on the test set for the model to "meet" it).
hypothesis_met = test_f1 > 0.80 and test_roc_auc > 0.85
outcome = 'MEETS' if hypothesis_met else 'DOES NOT MEET'

print("\n" + "=" * 70)
print("CODEPILOT TEST FAILURE PREDICTION - COMPLETE")
print("=" * 70)
print(f"\nSummary: The model {outcome} the expected outcomes of Hypothesis 2")
print("Ready for integration into CodePilot CI/CD pipeline")

CodePilot: Test Failure Prediction Model
Research Proposal Implementation - Hypothesis 2

[1/8] Loading synthetic PR dataset...
✓ Dataset loaded successfully: 15000 PRs, 12 features

Column names: ['timestamp', 'developer', 'module_type', 'lines_added', 'lines_deleted', 'files_changed', 'avg_function_complexity', 'code_coverage_change', 'build_duration', 'contains_test_changes', 'previous_failure_rate', 'label_test_failed']

[2/8] Initial Data Assessment
Dataset shape: (15000, 12)

Missing values per column:
✓ No missing values detected

[3/8] Processing timestamp features...
✓ Temporal features extracted: hour, day_of_week, is_weekend, is_business_hours

[4/8] Encoding categorical variables...
Categorical columns found: ['developer', 'module_type']
✓ Encoded using one-hot encoding

[5/8] Preparing features and target variable...
Features shape: (14999, 46)
Target shape: (14999,)

Class distribution:
label_test_failed
0    11447
1     3552
Name: count, dtype: int64

Class proportions:
