# Campus Placement Prediction - Modeling

This notebook implements the complete modeling pipeline for campus placement prediction.

## Objectives:
1. Load and preprocess data
2. Train multiple models (Logistic Regression, Random Forest, XGBoost)
3. Perform hyperparameter tuning with cross-validation
4. Evaluate models with comprehensive metrics
5. Select and save the best model
6. Ensure reproducibility with a fixed random seed

## 1. Setup and Imports

In [None]:
# Import required libraries.
# NOTE: this cell had been collapsed onto a single line beginning with '#', so the
# interpreter treated the whole cell as one comment and nothing was imported.
import os
import sys

# Add the project's src directory (sibling of the notebook's working directory)
# to the import path so the project-local modules below resolve. The membership
# check keeps sys.path from growing every time the cell is re-run.
SRC_DIR = os.path.join(os.path.dirname(os.getcwd()), 'src')
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from data import load_data, RANDOM_SEED
from preprocess import prepare_train_test_split
from train import train_all_models, select_best_model, save_model
from evaluate import (
    evaluate_model,
    print_evaluation_metrics,
    print_classification_report,
    plot_confusion_matrix,
    plot_roc_curve,
    compare_models,
    plot_model_comparison,
    get_feature_importance,
    plot_feature_importance,
)

# Set random seed
np.random.seed(RANDOM_SEED)

# Configure visualization (programmatic equivalent of the `%matplotlib inline` magic)
get_ipython().run_line_magic('matplotlib', 'inline')

# Configure plot style. The style was renamed in newer matplotlib releases; an
# unknown style name raises OSError, so only that error triggers the fallback
# (a bare `except:` would also swallow KeyboardInterrupt and genuine bugs).
try:
    plt.style.use('seaborn-v0_8-darkgrid')
except OSError:
    plt.style.use('seaborn-darkgrid')

print("Setup complete!")
print(f"Random seed: {RANDOM_SEED}")

## 2. Load and Preprocess Data

In [None]:
# Read the dataset. A missing file is reported rather than raised: df is set to
# None, which is the sentinel every later cell checks before doing any work.
try:
    df = load_data()
except FileNotFoundError as missing_file:
    print(missing_file)
    print("Please download the dataset following instructions in data/README.md")
    df = None
else:
    print("Data loaded successfully!")

In [None]:
# Split the loaded frame into train/test partitions (20% held out for testing).
# The returned preprocessor is kept because the feature-importance cell reads
# preprocessor.feature_names. Skipped when the dataset failed to load.
if df is not None:
    (X_train, X_test, y_train, y_test, preprocessor) = prepare_train_test_split(
        df, test_size=0.2, random_seed=RANDOM_SEED
    )
    print("\nData preprocessing complete!")

## 3. Train Models with Cross-Validation and Hyperparameter Tuning

In [None]:
# Fit every candidate model on the training partition using 5-fold CV and the
# shared random seed. Skipped when the dataset failed to load.
if df is not None:
    print(
        "Training models with cross-validation and hyperparameter tuning...",
        "This may take several minutes...\n",
        sep="\n",
    )

    # `models` maps a model name to an info dict; later cells read info['model'].
    models = train_all_models(X_train, y_train, cv=5, random_seed=RANDOM_SEED)

    print("\nAll models trained successfully!")

## 4. Select Best Model

In [None]:
# Pick the winning model from the trained candidates.
# select_best_model returns three values: the model's name, the model object and
# its CV score (the summary cell reports that score as ROC AUC).
# NOTE(review): the ranking criterion lives in the train module - confirm it is
# the cross-validation score.
# Skipped when the dataset failed to load (df is None).
if df is not None:
    best_model_name, best_model, best_cv_score = select_best_model(models)

## 5. Evaluate Models on Test Set

In [None]:
# Score every trained model on the held-out test partition.
# Skipped when the dataset failed to load.
if df is not None:
    # Strip the training metadata: keep only the model object stored under 'model'.
    models_dict = dict((label, entry['model']) for label, entry in models.items())
    results_df = compare_models(models_dict, X_test, y_test)

    print("\nTest Set Performance:", results_df, sep="\n")

In [None]:
# Plot model comparison
# Visualises results_df, the per-model test-set results built by compare_models
# in the preceding cell. Skipped when the dataset failed to load (df is None).
if df is not None:
    plot_model_comparison(results_df)

## 6. Detailed Evaluation of Best Model

In [None]:
# Detailed metrics for best model
# best_metrics stays in the kernel namespace: the summary cell reads its
# 'roc_auc', 'accuracy', 'precision', 'recall' and 'f1' entries.
# Skipped when the dataset failed to load (df is None).
if df is not None:
    best_metrics = evaluate_model(best_model, X_test, y_test, best_model_name)
    print_evaluation_metrics(best_metrics)
    print_classification_report(best_model, X_test, y_test)

In [None]:
# Confusion matrix
# Drawn for the selected best model against the held-out test partition;
# best_model_name is passed along, presumably for the plot title - confirm in
# the evaluate module. Skipped when the dataset failed to load (df is None).
if df is not None:
    plot_confusion_matrix(best_model, X_test, y_test, best_model_name)

In [None]:
# ROC curve
# Drawn for the selected best model against the held-out test partition;
# best_model_name is passed along, presumably for the plot title - confirm in
# the evaluate module. Skipped when the dataset failed to load (df is None).
if df is not None:
    plot_roc_curve(best_model, X_test, y_test, best_model_name)

## 7. Feature Importance Analysis

In [None]:
# Rank the input features by importance for the selected model.
# Skipped when the dataset failed to load.
if df is not None:
    feature_names = preprocessor.feature_names
    importance_df = get_feature_importance(
        best_model,
        feature_names,
        top_n=10,
    )

    # A None result means there is nothing to report.
    # NOTE(review): presumably returned for models that are not tree-based
    # and expose no importances - confirm in the evaluate module.
    if importance_df is not None:
        print("\nTop 10 Most Important Features:", importance_df, sep="\n")
        plot_feature_importance(importance_df, best_model_name)

## 8. Save Best Model

In [None]:
# Persist the selected model under the file name 'best_model.pkl' and report the
# path that save_model hands back. Skipped when the dataset failed to load.
if df is not None:
    model_path = save_model(best_model, 'best_model.pkl')
    print(
        f"\nBest model ({best_model_name}) saved successfully!",
        f"Model path: {model_path}",
        sep="\n",
    )

## 9. Summary

In [None]:
# Final recap of the run: data sizes, the winning model, its CV and test scores,
# the seed and the saved file name. Relies on variables produced by the earlier
# cells (models, best_model_name, best_cv_score, best_metrics, ...).
# Skipped when the dataset failed to load.
if df is not None:
    # One print call; sep="\n" puts each entry on its own line.
    print(
        "=" * 80,
        "MODELING PIPELINE SUMMARY",
        "=" * 80,
        f"\nDataset: {df.shape[0]} samples",
        f"Training set: {X_train.shape[0]} samples",
        f"Test set: {X_test.shape[0]} samples",
        f"\nModels trained: {len(models)}",
        f"Best model: {best_model_name}",
        f"CV Score (ROC AUC): {best_cv_score:.4f}",
        f"Test Score (ROC AUC): {best_metrics['roc_auc']:.4f}",
        "\nTest Metrics:",
        f"  Accuracy:  {best_metrics['accuracy']:.4f}",
        f"  Precision: {best_metrics['precision']:.4f}",
        f"  Recall:    {best_metrics['recall']:.4f}",
        f"  F1 Score:  {best_metrics['f1']:.4f}",
        f"\nRandom seed used: {RANDOM_SEED}",
        "Model saved as: best_model.pkl",
        "=" * 80,
        "Modeling complete! All results are reproducible.",
        "=" * 80,
        sep="\n",
    )