# Prediction Model for Student Risk Assessment

This notebook builds and compares machine learning classifiers that predict each student's risk level, so that high- and medium-risk students can be identified.

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve
from sklearn.metrics import precision_score, recall_score, f1_score
import warnings
warnings.filterwarnings('ignore')

# Plot appearance: matplotlib's stock style combined with seaborn's "husl" colour cycle.
plt.style.use('default')
sns.set_palette(palette="husl")

## 2. Load Engineered Data

In [None]:
# Read the feature-engineered student table, then report its dimensions and column names.
DATA_PATH = '../data/refined_data_for_model/Student_Data_With_Features.csv'
df = pd.read_csv(DATA_PATH)

print(f"Dataset shape: {df.shape}")
print(f"\nDataset columns: {df.columns.tolist()}")

In [None]:
# Preview the first five rows (rendered as the cell's output).
df.head(n=5)

## 3. Data Preparation

In [None]:
# Identify the target variable.
# Candidates are columns whose name contains one of the keywords below
# (case-insensitive substring match).
target_cols = [col for col in df.columns if any(x in col.lower() for x in ['risk', 'target', 'label', 'outcome'])]
print(f"Potential target columns: {target_cols}")

if target_cols:
    # Several columns may match; the first one in column order is used.
    target_col = target_cols[0]
elif 'avg_assessment' in df.columns:
    # No labelled target exists: derive a three-level risk label by cutting the
    # average assessment score into three equal-width bins (lowest scores are
    # labelled high risk). pd.cut returns a categorical column; cast it to plain
    # objects so it behaves like any other string-labelled target.
    df['risk_level'] = pd.cut(df['avg_assessment'],
                              bins=3,
                              labels=['high_risk', 'medium_risk', 'low_risk']).astype(object)
    # The label is a deterministic function of avg_assessment. Leaving that column
    # among the features would let every model simply re-learn the bin edges
    # (target leakage), so it is removed once the label has been derived.
    df = df.drop(columns=['avg_assessment'])
    print("Derived 'risk_level' from 'avg_assessment'; "
          "'avg_assessment' was removed from the data to avoid target leakage.")
    target_col = 'risk_level'
elif any('academic_status' in col.lower() for col in df.columns):
    # Fall back to the first academic-status column as the target.
    academic_col = [col for col in df.columns if 'academic_status' in col.lower()][0]
    target_col = academic_col
else:
    print("No suitable target variable found. Please specify the target column.")
    target_col = None

print(f"Using target variable: {target_col}")

In [None]:
# Show how many rows fall into each target class, first as a table and then as a bar chart.
if target_col and target_col in df.columns:
    class_counts = df[target_col].value_counts()

    print("Target variable distribution:")
    print(class_counts)

    plt.figure(figsize=(8, 5))
    class_counts.plot(kind='bar')
    plt.title(f'Distribution of {target_col}')
    plt.xlabel('Risk Level')
    plt.ylabel('Count')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

In [None]:
# Prepare the feature matrix X, the raw target y and the integer-coded target y_encoded.
if target_col and target_col in df.columns:
    import re

    # Rows with a missing label cannot be used for supervised training and would
    # break label encoding and the stratified split, so they are left out.
    labelled_df = df[df[target_col].notna()]
    n_unlabelled = len(df) - len(labelled_df)
    if n_unlabelled:
        print(f"Excluding {n_unlabelled} rows with a missing '{target_col}' value.")

    def _is_id_column(col_name):
        """Return True when 'id' appears as a separate word in the column name."""
        # Split camelCase ("studentId" -> "student_Id"), then tokenise on any
        # non-alphanumeric character. A plain substring test would also discard
        # real features such as "midterm_score" or "paid_fees".
        spaced = re.sub(r'(?<=[a-z0-9])(?=[A-Z])', '_', str(col_name))
        return 'id' in re.split(r'[^0-9a-z]+', spaced.lower())

    # Remove non-predictive identifier columns (e.g. 'student_id', 'ID', 'course_id').
    drop_cols = [col for col in df.columns if col != target_col and _is_id_column(col)]

    # Feature matrix
    X = labelled_df.drop(columns=[target_col] + drop_cols)

    # Target variable
    y = labelled_df[target_col]

    # Encode every non-numeric target (object, string or categorical dtype) as
    # integer codes; numeric targets are used as they are.
    if pd.api.types.is_numeric_dtype(y):
        y_encoded = y
        le = None
    else:
        le = LabelEncoder()
        y_encoded = le.fit_transform(y)
        print(f"Target encoding: {dict(zip(le.classes_, le.transform(le.classes_)))}")

    print(f"Feature matrix shape: {X.shape}")
    print(f"Target shape: {y.shape}")
    print(f"Features: {list(X.columns)}")
else:
    print("Cannot proceed without a valid target variable.")

## 4. Train-Test Split

In [None]:
# Hold out 20% of the rows as a test set; stratifying on the target keeps the
# class proportions the same in both parts. The fixed seed makes the split repeatable.
if 'X' in locals() and 'y_encoded' in locals():
    split_parts = train_test_split(
        X,
        y_encoded,
        test_size=0.2,
        random_state=42,
        stratify=y_encoded,
    )
    X_train, X_test, y_train, y_test = split_parts

    print(f"Training set shape: {X_train.shape}")
    print(f"Test set shape: {X_test.shape}")

    # Per-class row counts, indexed by the integer class code.
    train_class_counts = np.bincount(y_train)
    test_class_counts = np.bincount(y_test)
    print(f"Training target distribution: {train_class_counts}")
    print(f"Test target distribution: {test_class_counts}")

## 5. Feature Scaling

In [None]:
# Standardise the features. The scaler learns its statistics from the training
# split only and the same transformation is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Features scaled successfully.")

## 6. Model Training and Evaluation

In [None]:
# Candidate classifiers, keyed by display name. Insertion order determines the
# order in which they are trained and reported.
random_seed = 42

models = {}
models['Random Forest'] = RandomForestClassifier(n_estimators=100, random_state=random_seed)
models['Gradient Boosting'] = GradientBoostingClassifier(n_estimators=100, random_state=random_seed)
models['Logistic Regression'] = LogisticRegression(random_state=random_seed, max_iter=1000)
models['SVM'] = SVC(random_state=random_seed, probability=True)

# Filled in by the training loop: metrics/predictions and fitted estimators per model name.
results, trained_models = {}, {}

In [None]:
# Train every candidate model, score it on the held-out test set and record the
# metrics, predictions and class probabilities in `results`.
for name, model in models.items():
    print(f"\n{'='*50}")
    print(f"Training {name}...")
    print(f"{'='*50}")

    # Use scaled data for SVM and Logistic Regression; the tree ensembles are
    # trained on the unscaled features.
    if name in ['SVM', 'Logistic Regression']:
        X_train_model = X_train_scaled
        X_test_model = X_test_scaled
    else:
        X_train_model = X_train
        X_test_model = X_test

    # Train model
    model.fit(X_train_model, y_train)
    trained_models[name] = model

    # Make predictions
    y_pred = model.predict(X_test_model)

    # Class probabilities. For a binary target keep only the second class's
    # column (unchanged behaviour). For three or more classes a single column
    # is meaningless, so the full (n_samples, n_classes) matrix is stored.
    if hasattr(model, 'predict_proba'):
        class_proba = model.predict_proba(X_test_model)
        y_pred_proba = class_proba[:, 1] if class_proba.shape[1] == 2 else class_proba
    else:
        y_pred_proba = None

    # Calculate metrics; 'weighted' averages the per-class scores by class support.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    # Store results
    results[name] = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'predictions': y_pred,
        'probabilities': y_pred_proba
    }

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")

    # Classification report
    print(f"\nClassification Report:")
    print(classification_report(y_test, y_pred))

    # Confusion Matrix (rows = actual classes, columns = predicted classes)
    print(f"\nConfusion Matrix:")
    cm = confusion_matrix(y_test, y_pred)
    print(cm)

## 7. Model Comparison

In [None]:
# Collect the headline metrics of every trained model into one table, one row per model.
comparison_rows = [
    {
        'Model': model_name,
        'Accuracy': scores['accuracy'],
        'Precision': scores['precision'],
        'Recall': scores['recall'],
        'F1-Score': scores['f1_score'],
    }
    for model_name, scores in results.items()
]
comparison_df = pd.DataFrame(
    comparison_rows,
    columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score'],
)

print("Model Comparison:")
print(comparison_df.round(4))

In [None]:
# Draw one bar chart per metric on a 2x2 grid, comparing all models side by side.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score']

# axes.ravel() walks the grid row by row, matching the order of `metrics`.
for ax, metric in zip(axes.ravel(), metrics):
    comparison_df.plot(x='Model', y=metric, kind='bar', ax=ax, color='skyblue')
    ax.set(title=f'{metric} Comparison', xlabel='Models', ylabel=metric)
    ax.legend().remove()
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 8. Feature Importance Analysis

In [None]:
# Report and plot the ten most important features of each fitted tree ensemble.
tree_models = ['Random Forest', 'Gradient Boosting']

for model_name in tree_models:
    if model_name not in trained_models:
        continue

    model = trained_models[model_name]
    importance = model.feature_importances_

    # Pair every feature name with its importance, most important first.
    feature_importance_df = pd.DataFrame(
        {'feature': X.columns, 'importance': importance}
    ).sort_values('importance', ascending=False)
    top_features = feature_importance_df.head(10)

    print(f"\n{model_name} - Top 10 Important Features:")
    print(top_features)

    # Horizontal bars; the y-axis is inverted so the top-ranked feature is drawn first.
    bar_positions = range(len(top_features))
    plt.figure(figsize=(10, 6))
    plt.barh(bar_positions, top_features['importance'])
    plt.yticks(bar_positions, top_features['feature'])
    plt.xlabel('Feature Importance')
    plt.title(f'{model_name} - Feature Importance')
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()

## 9. Best Model Selection and Hyperparameter Tuning

In [None]:
# Pick the model with the highest weighted F1-score on the test set
# (on a tie, the model that was trained first wins).
best_model_name = max(results, key=lambda candidate: results[candidate]['f1_score'])
best_f1_score = results[best_model_name]['f1_score']

print(f"Best performing model: {best_model_name}")
print(f"F1-Score: {best_f1_score:.4f}")

In [None]:
# Hyperparameter tuning for the best model.
# Each entry describes how to build a fresh estimator, which grid to search and
# whether the estimator is trained on the standardised features.
tuning_specs = {
    'Random Forest': {
        'build': lambda: RandomForestClassifier(random_state=42),
        'grid': {
            'n_estimators': [100, 200],
            'max_depth': [10, 20, None],
            'min_samples_split': [2, 5],
        },
        'scaled': False,
    },
    'Gradient Boosting': {
        'build': lambda: GradientBoostingClassifier(random_state=42),
        'grid': {
            'n_estimators': [100, 200],
            'learning_rate': [0.1, 0.2],
            'max_depth': [3, 5],
        },
        'scaled': False,
    },
    'Logistic Regression': {
        'build': lambda: LogisticRegression(random_state=42, max_iter=1000),
        # NOTE(review): 'liblinear' may not be supported for targets with more
        # than two classes in recent scikit-learn releases - confirm.
        'grid': {
            'C': [0.1, 1, 10],
            'solver': ['liblinear', 'lbfgs'],
        },
        'scaled': True,
    },
    'SVM': {
        'build': lambda: SVC(random_state=42, probability=True),
        'grid': {
            'C': [0.1, 1, 10],
            'kernel': ['rbf', 'linear'],
        },
        'scaled': True,
    },
}

# Any name without its own entry falls back to the SVM configuration.
tuning_spec = tuning_specs.get(best_model_name, tuning_specs['SVM'])
param_grid = tuning_spec['grid']
base_model = tuning_spec['build']()
# NOTE(review): X_train_scaled was standardised with statistics from the whole
# training split, so each CV validation fold has influenced the scaling.
X_tune = X_train_scaled if tuning_spec['scaled'] else X_train

# Exhaustive grid search with 5-fold cross-validation, scored by weighted F1, using all cores.
print(f"\nPerforming hyperparameter tuning for {best_model_name}...")
grid_search = GridSearchCV(
    base_model,
    param_grid,
    cv=5,
    scoring='f1_weighted',
    n_jobs=-1,
)
grid_search.fit(X_tune, y_train)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

## 10. Final Model Evaluation

In [None]:
# Score the tuned estimator (refit by the grid search with its best parameters) on the test set.
best_model = grid_search.best_estimator_

# SVM and Logistic Regression were tuned on standardised features, so they are
# evaluated on the standardised test set; the tree ensembles use the raw one.
uses_scaled_features = best_model_name in ['SVM', 'Logistic Regression']
X_test_final = X_test_scaled if uses_scaled_features else X_test

# Final predictions: hard class labels and per-class probabilities.
y_pred_final = best_model.predict(X_test_final)
y_pred_proba_final = best_model.predict_proba(X_test_final)

# Final metrics (precision/recall/F1 are support-weighted averages over classes).
weighted_avg = {'average': 'weighted'}
final_accuracy = accuracy_score(y_test, y_pred_final)
final_precision = precision_score(y_test, y_pred_final, **weighted_avg)
final_recall = recall_score(y_test, y_pred_final, **weighted_avg)
final_f1 = f1_score(y_test, y_pred_final, **weighted_avg)

print(f"\nFinal {best_model_name} Performance:")
final_metrics = [
    ('Accuracy', final_accuracy),
    ('Precision', final_precision),
    ('Recall', final_recall),
    ('F1-Score', final_f1),
]
for metric_label, metric_value in final_metrics:
    print(f"{metric_label}: {metric_value:.4f}")

print("\nFinal Classification Report:")
print(classification_report(y_test, y_pred_final))

## 11. Risk Prediction Analysis

In [None]:
# Cross-tabulate actual against predicted labels using the original class names.
# Only possible when the target was label-encoded (le holds the fitted encoder).
if le is not None:
    # Map the integer codes back to the original labels.
    y_test_labels = le.inverse_transform(y_test)
    y_pred_labels = le.inverse_transform(y_pred_final)

    prediction_summary = pd.DataFrame(dict(Actual=y_test_labels, Predicted=y_pred_labels))

    # margins=True appends the row and column totals.
    actual_vs_predicted = pd.crosstab(
        prediction_summary['Actual'],
        prediction_summary['Predicted'],
        margins=True,
    )
    print("Prediction Summary:")
    print(actual_vs_predicted)

In [None]:
# Plot the confusion matrix of the tuned model as an annotated heatmap
# (rows = actual classes, columns = predicted classes).
plt.figure(figsize=(8, 6))

if le is not None:
    # The encoder maps classes to codes 0..n-1. Passing them explicitly keeps one
    # row/column per known class even if a class is absent from the test
    # labels and predictions, so the matrix always matches the tick labels.
    class_codes = np.arange(len(le.classes_))
    tick_labels = list(le.classes_)
else:
    # Numeric target: use the sorted set of values seen in the labels and predictions.
    class_codes = np.unique(np.concatenate([np.asarray(y_test), np.asarray(y_pred_final)]))
    tick_labels = list(class_codes)

cm = confusion_matrix(y_test, y_pred_final, labels=class_codes)

# Let seaborn place the labels: heatmap cells are centred at i + 0.5, so setting
# ticks at 0..n-1 by hand would put every label on a cell boundary.
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=tick_labels, yticklabels=tick_labels)
plt.title('Confusion Matrix - Final Model')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

## 12. Model Insights and Recommendations

In [None]:
# Print a closing report: the winning model, its tuned parameters, the strongest
# predictors (tree ensembles only) and general follow-up recommendations.
divider = "=" * 60
print("\n" + divider)
print("MODEL SUMMARY AND RECOMMENDATIONS")
print(divider)

print(f"\n1. BEST MODEL: {best_model_name}")
print(f"   Final F1-Score: {final_f1:.4f}")
print(f"   Final Accuracy: {final_accuracy:.4f}")

print("\n2. MODEL PARAMETERS:")
for param, value in grid_search.best_params_.items():
    print(f"   {param}: {value}")

# Only the tree ensembles expose feature_importances_.
if best_model_name in ['Random Forest', 'Gradient Boosting']:
    print("\n3. TOP RISK INDICATORS:")
    importance = best_model.feature_importances_
    feature_importance_df = pd.DataFrame(
        {'feature': X.columns, 'importance': importance}
    ).sort_values('importance', ascending=False)

    top_five = feature_importance_df.head(5)
    ranked = enumerate(zip(top_five['feature'], top_five['importance']), start=1)
    for rank, (feature_name, feature_score) in ranked:
        print(f"   {rank}. {feature_name}: {feature_score:.4f}")

print("\n4. RECOMMENDATIONS:")
recommendations = (
    "Monitor students with high-risk predictions closely",
    "Implement early intervention programs",
    "Focus on top risk indicators for prevention strategies",
    "Regularly retrain model with new data",
)
for recommendation in recommendations:
    print(f"   - {recommendation}")

## Next Steps

1. **Model Deployment**: Prepare the model for production use
2. **Monitoring**: Set up model performance monitoring
3. **Intervention Design**: Create targeted intervention strategies based on risk factors
4. **Data Collection**: Continuously collect new data for model retraining
5. **Feature Engineering**: Explore additional features that might improve prediction accuracy