# Predictive Analytics for Educational Outcomes

This notebook predicts educational outcomes for students in Bangladesh and identifies the key factors that influence student success, where success is defined as a GPA of 3.0 or higher.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.impute import SimpleImputer

# Seed NumPy's global random number generator so that any later step drawing
# from it produces the same numbers on a fresh top-to-bottom run.
np.random.seed(42)

# Load the cleaned student dataset; the path is relative to the notebook's
# working directory.
df = pd.read_csv('../processed_data/cleaned/cleaned_student_data.csv')

## 1. Feature Engineering

In [None]:
def prepare_features(data):
    """Build the modelling table from the raw student records.

    The input frame is left untouched; all work happens on a copy.

    Steps:
      * add a binary ``success`` column (1 when ``gpa`` is at least 3.0),
      * bucket ``attendance_rate`` into four quantile-based categories,
      * one-hot encode ``division``, ``gender``, ``location_type`` and the
        new ``attendance_category`` column.

    Returns the encoded DataFrame.
    """
    quartile_labels = ['Low', 'Medium', 'High', 'Very High']
    columns_to_encode = ['division', 'gender', 'location_type', 'attendance_category']

    frame = data.copy()

    # Target: 1 for students at or above the 3.0 GPA threshold, otherwise 0
    reached_threshold = frame['gpa'] >= 3.0
    frame['success'] = reached_threshold.astype(int)

    # Derived feature: attendance quartile, named from lowest to highest
    attendance_quartile = pd.qcut(frame['attendance_rate'], q=4, labels=quartile_labels)
    frame['attendance_category'] = attendance_quartile

    # Replace each categorical column with its indicator columns
    encoded = pd.get_dummies(frame, columns=columns_to_encode)
    return encoded

# Build the modelling table from the loaded data and list the resulting
# column names so the encoded features can be inspected.
df_model = prepare_features(df)
print("Features created:")
print(df_model.columns.tolist())

## 2. Model Development

In [None]:
def train_predict_model(data, target='success', test_size=0.2, random_state=42):
    """Train and evaluate a logistic-regression and a random-forest classifier.

    Args:
        data: DataFrame holding the target column and the candidate features.
        target: Name of the column to predict.
        test_size: Fraction of rows held out for evaluation.
        random_state: Seed passed to the train/test split and to the random
            forest so that re-running the cell gives the same results.

    Returns:
        dict keyed by model name ('logistic', 'random_forest'). Each value is
        a dict with 'classification_report' (text) and 'confusion_matrix';
        the 'random_forest' entry also carries 'feature_importance', a
        DataFrame of feature names and importances sorted in descending
        order.

    Raises:
        KeyError: if ``target`` is not a column of ``data``.
        ValueError: if no numeric or boolean feature column is left.
    """
    y = data[target]

    # Remove the target itself, then the columns that must never be features:
    # 'gpa' (the success label is derived from it) and the row identifier.
    # Those two are optional, so their absence is not an error.
    X = data.drop(columns=[target]).drop(columns=['gpa', 'student_id'], errors='ignore')

    # The scaler and both estimators need a purely numeric matrix. Keep the
    # numeric and boolean (one-hot) columns and cast them to float; any
    # remaining text/object column is left out instead of crashing the scaler.
    X = X.select_dtypes(include=['number', 'bool']).astype(float)
    if X.shape[1] == 0:
        raise ValueError("No numeric feature columns available for modelling")

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # A column with no observed value in the training split cannot be
    # imputed; drop it from both splits so names and matrix columns stay
    # aligned.
    X_train = X_train.dropna(axis=1, how='all')
    X_test = X_test[X_train.columns]
    feature_names = X_train.columns

    # Fill missing values with medians learned from the training split only,
    # so no information from the held-out rows leaks into the fit.
    imputer = SimpleImputer(strategy='median')
    X_train_filled = imputer.fit_transform(X_train)
    X_test_filled = imputer.transform(X_test)

    # Scale features (statistics again come from the training split only)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_filled)
    X_test_scaled = scaler.transform(X_test_filled)

    # Train models
    models = {
        # A higher iteration cap avoids stopping before convergence
        'logistic': LogisticRegression(max_iter=1000),
        'random_forest': RandomForestClassifier(
            n_estimators=100, random_state=random_state
        ),
    }

    results = {}
    for name, model in models.items():
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)

        results[name] = {
            'classification_report': classification_report(y_test, y_pred),
            'confusion_matrix': confusion_matrix(y_test, y_pred)
        }

        # Feature importance (only the random forest exposes it)
        if name == 'random_forest':
            feature_importance = pd.DataFrame({
                'feature': feature_names,
                'importance': model.feature_importances_
            }).sort_values('importance', ascending=False)
            results[name]['feature_importance'] = feature_importance

    return results

# Fit and evaluate the classifiers on the engineered feature table
model_results = train_predict_model(df_model)

# Print each model's classification report; a feature ranking is printed as
# well for any model whose result includes one.
for model_name, result in model_results.items():
    print(f"\nResults for {model_name}:")
    print("\nClassification Report:")
    print(result['classification_report'])
    
    if 'feature_importance' in result:
        # Only the ten highest-ranked rows are shown
        print("\nTop 10 Important Features:")
        print(result['feature_importance'].head(10))

## 3. Risk Factor Analysis

In [None]:
def analyze_risk_factors(data, model_results):
    """Visualise which features matter most and how they differ by risk group.

    Draws a bar chart of the ten highest-ranked random-forest features, then
    one density plot per top-five feature comparing rows with ``success == 0``
    (labelled High Risk) against rows with ``success == 1`` (Low Risk).
    """
    ranking = model_results['random_forest']['feature_importance']
    top_ten = ranking.head(10)

    # Bar chart of the leading features
    plt.figure(figsize=(12, 6))
    sns.barplot(data=top_ten, x='importance', y='feature')
    plt.title('Top 10 Factors Influencing Academic Success')
    plt.show()

    # Split the rows by outcome; the tuple order fixes the drawing order
    risk_groups = (
        ('High Risk', data[data['success'] == 0]),
        ('Low Risk', data[data['success'] == 1]),
    )

    # One figure per feature, with both groups overlaid
    for column in ranking['feature'].head(5):
        plt.figure(figsize=(10, 6))
        for group_label, subset in risk_groups:
            sns.kdeplot(data=subset, x=column, label=group_label)
        plt.title(f'Distribution of {column} by Risk Level')
        plt.legend()
        plt.show()

# Plot the feature ranking and the per-feature distributions by risk group
analyze_risk_factors(df_model, model_results)

## 4. Intervention Recommendations

In [None]:
def generate_recommendations(data, model_results):
    """Generate targeted intervention recommendations.

    For each intervention area, reports the mean of the related column among
    high-risk rows (``success == 0``) together with a fixed recommendation.

    Args:
        data: DataFrame with a ``success`` column and the columns
            ``attendance_rate``, ``resource_access`` and ``support_services``.
        model_results: Kept so existing callers keep working. It is not
            consulted: the areas below are fixed, so the function no longer
            requires a random-forest feature ranking to be present.

    Returns:
        dict mapping 'attendance', 'resources' and 'support' to a dict with
        'pattern' (mean of the column in the high-risk group; NaN when that
        group is empty) and 'recommendation' (text).

    Raises:
        KeyError: if ``success`` or one of the three columns is missing.
    """
    # Rows of students who did not reach the success threshold
    high_risk = data[data['success'] == 0]

    # area -> (column summarised, recommendation text)
    interventions = {
        'attendance': (
            'attendance_rate',
            'Implement attendance monitoring and support system',
        ),
        'resources': (
            'resource_access',
            'Increase access to educational resources',
        ),
        'support': (
            'support_services',
            'Enhance academic support services',
        ),
    }

    return {
        area: {
            'pattern': high_risk[column].mean(),
            'recommendation': advice,
        }
        for area, (column, advice) in interventions.items()
    }

# Build the recommendations, then print each area with its high-risk-group
# mean (two decimals) and the associated recommendation text.
recommendations = generate_recommendations(df_model, model_results)
print("\nIntervention Recommendations:")
for area, details in recommendations.items():
    print(f"\n{area.title()}:")
    print(f"Pattern: {details['pattern']:.2f}")
    print(f"Recommendation: {details['recommendation']}")

## 5. Future Trends Prediction

In [None]:
def predict_future_trends(data):
    """Plot yearly averages of the outcome metrics and project the next year.

    Does nothing when ``data`` has no ``year`` column. Otherwise averages
    whichever of ``success``, ``enrollment_rate`` and ``dropout_rate`` are
    present per year, plots them, and prints a naive next-year projection:
    the last year's value plus the mean year-over-year change.

    Args:
        data: DataFrame, optionally with a ``year`` column and the metric
            columns named above.

    Returns:
        None. Output is the figure and the printed projection.
    """
    if 'year' not in data.columns:
        return

    # Aggregate only the metrics that exist, so a dataset without
    # enrollment or dropout figures still yields a trend for the rest.
    candidate_metrics = ('success', 'enrollment_rate', 'dropout_rate')
    metrics = [name for name in candidate_metrics if name in data.columns]
    if not metrics:
        return

    # Analyze historical trends (groupby sorts the years in ascending order)
    yearly_trends = data.groupby('year').agg({name: 'mean' for name in metrics})
    if yearly_trends.empty:
        return

    # Plot trends. The axes are passed explicitly: DataFrame.plot() opens a
    # new figure of its own when no axes are given, which would leave a
    # separately created figure empty.
    fig, ax = plt.subplots(figsize=(12, 6))
    yearly_trends.plot(ax=ax)
    ax.set_title('Educational Trends Over Time')
    ax.set_xlabel('Year')
    ax.set_ylabel('Rate')
    ax.legend()
    plt.show()

    # A year-over-year change needs at least two years; with fewer the
    # projection would consist of NaN only.
    if len(yearly_trends) < 2:
        print("\nNot enough yearly data to project next year's metrics.")
        return

    # Simple projection for next year
    projection = yearly_trends.diff().mean() + yearly_trends.iloc[-1]
    print("\nProjected Metrics for Next Year:")
    print(projection)

# Plot and project yearly trends; this has an effect only when the table has
# a 'year' column.
predict_future_trends(df_model)

## 6. Key Insights and Recommendations

### Predictive Factors
1. Most influential factors in academic success:
   - List the top factors from the random-forest feature-importance table (Section 2)
   - Quantify their impact
   - Identify intervention points

### Risk Patterns
1. Common characteristics of at-risk students:
   - Behavioral patterns
   - Environmental factors
   - Resource access issues

### Intervention Strategies
1. Targeted support programs:
   - Early warning systems
   - Resource allocation
   - Support services

### Future Outlook
1. Projected trends:
   - Success rates
   - Risk factors
   - Resource needs

### Implementation Plan
1. Short-term actions:
   - Immediate interventions
   - Resource deployment
   - Monitoring setup

2. Long-term strategy:
   - System improvements
   - Policy recommendations
   - Capacity building