In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import shap
import plotly.express as px


In [None]:
# Read the two student exports and stack them into one frame with a fresh index.
students_dataset, students_dataset_1m = (
    pd.read_csv(csv_path)
    for csv_path in ('students_dataset.csv', 'students_dataset_1m.csv')
)
df = pd.concat([students_dataset, students_dataset_1m], ignore_index=True)

# Missing entries in these two status columns are replaced by an explicit label.
fill_labels = {
    "disability_status": "No Disability",
    "orphan_status": "No Parents",
}
for status_column, fill_label in fill_labels.items():
    df[status_column] = df[status_column].fillna(fill_label)

# Target is `dropout_status`; every other column is kept as a candidate feature.
y = df["dropout_status"]
X = df.drop(columns=["dropout_status"])
sns.set(style="whitegrid")

In [None]:
# Columns fed to the numeric branch of the preprocessor (order is preserved downstream).
numerical_features = [
    'age',
    'attendance_rate',
    'days_absent_last_semester',
    'average_grade',
    'household_size',
    'behavioral_infractions',
    'suspensions',
    'distance_to_school',
    'transportation_time',
    'activities_participation',
    'repetitions_in_class',
    'previous_dropout_count',
]

# Columns fed to the categorical (one-hot) branch of the preprocessor.
categorical_features = [
    'gender',
    'school_category',
    'family_income_bracket',
    'parental_education_level',
    'school_fee_payment_source',
    'current_class',
    'orphan_status',
    'disability_status',
    'transportation_mean',
]

In [None]:
# Histogram (with a KDE overlay) for every numerical feature, laid out on a 4x3 grid.
fig, axes = plt.subplots(4, 3, figsize=(15, 12))
for i, feature in enumerate(numerical_features):
    ax = axes[i // 3, i % 3]
    sns.histplot(data=df, x=feature, kde=True, ax=ax)
    ax.set_title(f"Distribution of {feature}")
    # histplot is called without `stat=`, so the bars show per-bin counts
    # (seaborn's default statistic), not a density; label the axis to match.
    ax.set_ylabel("Count")
    ax.set_xlabel(feature)

plt.tight_layout()
plt.show()

In [None]:
# Count plot for every categorical feature, laid out on a 3x3 grid.
fig, axes = plt.subplots(3, 3, figsize=(15, 12))
for position, feature in enumerate(categorical_features):
    grid_row, grid_col = divmod(position, 3)
    ax = axes[grid_row, grid_col]
    # hue mirrors x so each category gets its own palette colour; the legend is suppressed.
    sns.countplot(
        data=df,
        x=feature,
        hue=feature,
        palette="Set2",
        legend=False,
        ax=ax,
    )
    ax.set(title=f"Distribution of {feature}", xlabel=feature, ylabel="Count")

plt.tight_layout()
plt.show()

In [None]:
# Box plot of every numerical feature split by dropout status, on a 4x3 grid.
fig, axes = plt.subplots(4, 3, figsize=(15, 12))
for position, feature in enumerate(numerical_features):
    grid_row, grid_col = divmod(position, 3)
    ax = axes[grid_row, grid_col]
    sns.boxplot(
        data=df,
        x="dropout_status",
        y=feature,
        hue="dropout_status",
        palette="Set2",
        legend=False,
        ax=ax,
    )
    ax.set(
        title=f"{feature} vs Dropout Status",
        ylabel=feature,
        xlabel="Dropout Status",
    )

plt.tight_layout()
plt.show()

In [None]:
# Numeric branch: mean-impute missing values, then standardise.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
]
numerical_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: impute with the most frequent level, then one-hot encode
# with drop="first".
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(drop="first")),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column list to its branch. Later cells index `transformers_`
# positionally, so "num" must stay first and "cat" second.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# NOTE(review): this fit sees every row of X, including rows that are later
# held out for testing.
X_processed = preprocessor.fit_transform(X)

In [None]:
# Split the RAW rows first so the held-out 20% never influences preprocessing.
# (Splitting an already-fitted matrix leaks test-set means, variances and
# category statistics into the training features.)
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Fit imputers, scaler and encoder on the training rows only, then apply the
# fitted transform to the test rows. This refits the shared `preprocessor`
# object in place, so the preprocessor persisted later is the train-only fit.
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

# Oversample the minority class on the training portion only; the test set
# keeps its original class balance.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print("Class distribution after SMOTE:", pd.Series(y_train_resampled).value_counts())

In [None]:
# Fit the classifier on the SMOTE-balanced training data (`fit` returns the estimator).
logistic_model = LogisticRegression(
    solver='saga',
    C=0.1,
    max_iter=500,
    random_state=42,
).fit(X_train_resampled, y_train_resampled)

# Persist the fitted model and the fitted preprocessor so later cells can reload them.
joblib.dump(value=logistic_model, filename="logistic_regression_dropout_model.pkl")
joblib.dump(value=preprocessor, filename="preprocessor.pkl")

In [None]:
# Score the held-out rows: probability of the second class plus hard labels.
y_pred_proba = logistic_model.predict_proba(X_test)[:, 1]
y_pred = logistic_model.predict(X_test)

# Text metrics
print("Classification Report:\n", classification_report(y_test, y_pred))
roc_auc = roc_auc_score(y_test, y_pred_proba)
print("ROC-AUC Score:", roc_auc)

# ROC curve with the chance diagonal for reference
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr, tpr, label=f"Logistic Regression (AUC = {roc_auc:.2f})")
roc_ax.plot([0, 1], [0, 1], "k--")
roc_ax.set(
    xlabel="False Positive Rate",
    ylabel="True Positive Rate",
    title="ROC Curve",
)
roc_ax.legend()
plt.show()

In [None]:
from tabulate import tabulate

# Column names after preprocessing: numeric names as-is, followed by the
# one-hot names produced by the encoder inside the categorical pipeline.
onehot_step = preprocessor.transformers_[1][1].named_steps['onehot']
encoded_columns = onehot_step.get_feature_names_out(categorical_features)
all_features = numerical_features + list(encoded_columns)

# Importance is taken as the absolute value of each logistic-regression coefficient.
model_coef = logistic_model.coef_[0]
feature_importance = pd.DataFrame(
    {"Feature": all_features, "Importance": np.abs(model_coef)}
)

# Express each importance as a share of the summed importances, then rank.
total_importance = feature_importance["Importance"].sum()
feature_importance["Importance (%)"] = (
    feature_importance["Importance"].div(total_importance).mul(100)
)
feature_importance = feature_importance.sort_values(by="Importance", ascending=False)

# Interactive table of every feature and its percentage share.
importance_table = go.Table(
    header=dict(values=["Feature", "Importance (%)"]),
    cells=dict(values=[feature_importance["Feature"], feature_importance["Importance (%)"]]),
)
fig = go.Figure(data=[importance_table])
fig.update_layout(title="Feature Importance Table", title_x=0.5)
fig.show()

# Static bar chart of the 15 highest-ranked features.
_, importance_ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=feature_importance.head(15),
    x="Importance (%)",
    y="Feature",
    ax=importance_ax,
)
importance_ax.set_title("Top 15 Features Influencing Dropout Probability")
plt.show()

In [None]:
# Build the explainer around the trained model, using the resampled training
# matrix as the background data for the Independent masker.
background_masker = shap.maskers.Independent(X_train_resampled)
explainer = shap.Explainer(logistic_model, background_masker)

# SHAP values for the held-out rows, summarised across all features.
shap_values_test = explainer.shap_values(X_test)
shap.summary_plot(shap_values_test, X_test, feature_names=all_features)

In [None]:
# Persist the masker on its own, and the full explainer that later cells reload.
joblib.dump(value=explainer.masker, filename='explainer_masker.pkl')
joblib.dump(value=explainer, filename='shap_explainer.pkl')

In [None]:
def feature_explanations(feature):
    """Return the canned one-sentence rationale for a processed feature name.

    Names without an entry in the lookup get a generic fallback sentence.
    """
    known_rationales = dict([
        ("attendance_rate", "A lower attendance rate suggests a higher likelihood of dropout."),
        ("behavioral_infractions", "More infractions can indicate disengagement or disciplinary issues."),
        ("activities_participation", "Lower participation in activities can reflect a lack of engagement."),
        ("repetitions_in_class", "More repetitions suggest academic struggles."),
        ("family_income_bracket_Low", "Lower family income can be a risk factor."),
        ("suspensions", "Higher suspensions may correlate with dropout risk."),
        ("average_grade", "Lower grades may indicate academic difficulty."),
        ("school_category_Secondary", "In some cases, secondary level students show higher dropout rates."),
    ])
    if feature in known_rationales:
        return known_rationales[feature]
    return "No specific explanation available."

def analyze_student_dropout_advanced(student_data, model, preprocessor, explainer, top_n_features=10):
    """
    Advanced analysis of dropout risk for a given student: prints the predicted dropout probability
    and risk status, lists the top contributing features, and renders a SHAP force plot, an
    interactive bar chart and an interactive summary table.

    Parameters:
    - student_data (DataFrame): Data for the student to analyze (1-row DataFrame). Only the first
      row's prediction and SHAP values are reported.
    - model: Trained model exposing `predict` and `predict_proba`.
    - preprocessor: Fitted preprocessing ColumnTransformer. Its `transformers_` list is read
      positionally: entry 0 is the numerical branch, entry 1 the categorical pipeline.
    - explainer: SHAP explainer to interpret the model's predictions; it is called on the
      processed row and must expose `expected_value`.
    - top_n_features (int): Number of top contributing features to display.

    Returns:
    - tuple: (dropout_probability, dropout_prediction) for the first row of `student_data`.
    """
    # Transform the student's data using the preprocessing pipeline
    student_processed = preprocessor.transform(student_data)
    # The categorical branch one-hot encodes, so the combined output may come back as a
    # sparse matrix; densify defensively because the DataFrame constructor below needs
    # a regular 2-D array.
    if hasattr(student_processed, "toarray"):
        student_processed = student_processed.toarray()

    # Predict dropout probability and dropout status
    # NOTE(review): column 1 is presumably the "dropout" class -- confirm against model.classes_.
    dropout_probability = model.predict_proba(student_processed)[:, 1][0]
    dropout_prediction = model.predict(student_processed)[0]

    # Extract feature names from the preprocessor after transformation
    numeric_names = preprocessor.transformers_[0][2]
    encoded_names = preprocessor.transformers_[1][1].get_feature_names_out(preprocessor.transformers_[1][2])
    all_features = list(numeric_names) + list(encoded_names)

    # Convert processed student data to DataFrame with feature names
    student_processed_df = pd.DataFrame(student_processed, columns=all_features)

    # Calculate SHAP values for the student data
    shap_values = explainer(student_processed_df)

    # Convert SHAP values to DataFrame with aligned feature names
    shap_values_df = pd.DataFrame(shap_values.values[0], index=all_features, columns=['SHAP Value'])
    shap_values_df['Impact'] = shap_values_df['SHAP Value'].abs()  # Add absolute impact for sorting

    # Sort SHAP values by absolute impact and get top N features
    top_features = shap_values_df.sort_values(by='Impact', ascending=False).head(top_n_features)

    # Display results with detailed descriptions
    print("\n----- Student Dropout Risk Analysis Report -----")
    print(f"Predicted Dropout Probability: {dropout_probability * 100:.8f}%")
    # NOTE(review): assumes the target is encoded so that 1 means "dropout" -- confirm.
    print(f"Predicted Dropout Status: {'High Risk' if dropout_prediction == 1 else 'Low Risk'}")
    print("\nKey Contributing Factors (Positive values increase dropout risk, Negative values decrease it):")
    # List the factors the header announces (previously the header was followed by nothing).
    for feature_name, shap_value in top_features['SHAP Value'].items():
        print(f"  {feature_name}: {shap_value:+.4f}")

    # Visualization 1: SHAP force plot for this individual prediction (non-interactive)
    print("\n--- SHAP Force Plot ---")
    shap.force_plot(explainer.expected_value, shap_values.values[0], feature_names=all_features, matplotlib=True)
    plt.show()  # Displaying this plot in a separate container

    # Visualization 2: Top Contributing Features Bar Plot using Plotly (Interactive)
    top_features_sorted = top_features.sort_values(by="SHAP Value", ascending=True)
    # Use the number of rows actually shown so the title stays truthful when the model
    # has fewer features than top_n_features.
    fig_bar = px.bar(top_features_sorted, x='SHAP Value', y=top_features_sorted.index,
                     labels={'SHAP Value': 'Impact on Prediction', 'index': 'Feature'},
                     title=f"Top {len(top_features)} Features Contributing to Dropout Risk")
    fig_bar.show()  # Displaying this plot in a separate container

    # Visualization 3: Summary Table of Features, SHAP Values, Impact, Importance, and Explanation
    # Prepare data for the summary table
    summary_table = top_features.copy()
    # A SHAP value of exactly zero neither raises nor lowers the prediction.
    summary_table['Impact Direction'] = summary_table['SHAP Value'].apply(
        lambda x: 'Increases' if x > 0 else ('Decreases' if x < 0 else 'No effect')
    )
    summary_table['Feature Explanation'] = summary_table.index.map(lambda x: feature_explanations(x))

    # Calculate the Importance (%) for each feature as its share of the displayed features'
    # total absolute impact; guard the all-zero case so the column is 0 rather than NaN.
    total_impact = summary_table['Impact'].sum()
    if total_impact > 0:
        summary_table['Importance (%)'] = (summary_table['Impact'] / total_impact) * 100
    else:
        summary_table['Importance (%)'] = 0.0

    # Display an interactive table using Plotly's go.Table method
    fig_table = go.Figure(data=[go.Table(
        header=dict(values=["Feature", "SHAP Value", "Impact (%)", "Impact Direction", "Explanation"]),
        cells=dict(values=[summary_table.index, summary_table['SHAP Value'], summary_table['Importance (%)'], summary_table['Impact Direction'], summary_table['Feature Explanation']])
    )])

    fig_table.update_layout(title="Summary Table of Dropout Risk Analysis")
    fig_table.show()

    return dropout_probability, dropout_prediction

In [None]:
import pandas as pd
import numpy as np
import joblib
from sklearn.pipeline import Pipeline

# Reload the persisted artifacts written by the earlier cells.
# joblib.load unpickles its input, so only point it at files you produced yourself.
loaded_model, loaded_preprocessor, loaded_explainer = (
    joblib.load(artifact_path)
    for artifact_path in (
        "logistic_regression_dropout_model.pkl",
        "preprocessor.pkl",
        "shap_explainer.pkl",
    )
)

# A single hand-written student record; each value is wrapped in a one-element
# list so the dict converts to a one-row DataFrame.
new_student_data = dict(
    age=[18],
    gender=["Male"],
    disability_status=["No Disability"],
    school_category=["Secondary"],
    attendance_rate=[100],
    days_absent_last_semester=[0],
    average_grade=[85],
    household_size=[5],
    orphan_status=["Double"],
    family_income_bracket=["Middle"],
    parental_education_level=["Secondary"],
    parental_employment_status=["Full-Time"],
    school_fee_payment_source=["Parents"],
    activities_participation=[0],
    behavioral_infractions=[0],
    suspensions=[0],
    previous_dropout_count=[0],
    distance_to_school=[400],
    transportation_mean=["Foot"],
    transportation_time=[4],
    current_class=["S6"],
    repetitions_in_class=[0],
)

new_student_df = pd.DataFrame.from_dict(new_student_data)

# Run the full report and keep the numeric results.
dropout_probability, dropout_prediction = analyze_student_dropout_advanced(
    student_data=new_student_df,
    model=loaded_model,
    preprocessor=loaded_preprocessor,
    explainer=loaded_explainer,
)

In [None]:
import pandas as pd
import numpy as np
import joblib
from sklearn.pipeline import Pipeline

# Reload the persisted model, preprocessor and SHAP explainer for the interactive flow.
# joblib.load unpickles its input, so only point it at files you produced yourself.
loaded_model, loaded_preprocessor, loaded_explainer = (
    joblib.load(artifact_path)
    for artifact_path in (
        "logistic_regression_dropout_model.pkl",
        "preprocessor.pkl",
        "shap_explainer.pkl",
    )
)

def validate_input(prompt, data_type, min_value=None, max_value=None, choices=None):
    """
    Validates and retrieves user input, ensuring it matches the required data type and constraints.

    The user is re-prompted until an acceptable value is entered.

    Parameters:
    - prompt (str): Text shown to the user by `input`.
    - data_type (callable): Converter applied to the entered text (e.g. int, float, str).
      A ValueError raised by the converter is reported and the user is asked again.
    - min_value: Optional inclusive lower bound.
    - max_value: Optional inclusive upper bound.
    - choices: Optional collection of accepted values.

    Returns:
    - The converted value that satisfied every supplied constraint.
    """
    import math  # stdlib; imported here so the function stays self-contained in its cell

    while True:
        # Surrounding whitespace is never meaningful here; stripping it stops
        # " Male " from being rejected as an invalid choice.
        raw_text = input(prompt).strip()
        try:
            value = data_type(raw_text)
        except ValueError:
            print(f"Invalid input. Please enter a valid {data_type.__name__}.")
            continue

        # float() accepts "nan" and "inf". NaN compares False against every bound, so it
        # would slip through the range checks below; reject non-finite numbers up front.
        if isinstance(value, float) and not math.isfinite(value):
            print("Invalid input. Please enter a finite number.")
            continue

        if min_value is not None and value < min_value:
            print(f"Value must be at least {min_value}.")
            continue
        if max_value is not None and value > max_value:
            print(f"Value must not exceed {max_value}.")
            continue
        if choices and value not in choices:
            print(f"Invalid choice. Available options are: {choices}")
            continue
        return value


def collect_student_data_for_prediction():
    """
    Collects student data interactively from the user to predict dropout probability with validation.

    Each feature is prompted for in turn and validated through `validate_input` using the
    type, bounds or choices declared in `feature_info`.

    Returns:
    - new_student_df (DataFrame): The new student's data in a DataFrame format (one row,
      one column per feature).
    """

    # Dictionary for features and validation rules.
    # The range shown in each prompt must match the enforced "min"/"max": four prompts
    # previously advertised wider ranges (0-10 / 0-20) than the validator accepted.
    # NOTE(review): the enforced bounds were kept as the source of truth -- confirm they
    # match the value ranges seen in the training data.
    feature_info = {
        "age": {"type": int, "min": 5, "max": 25, "prompt": "Age (5-25): "},
        "gender": {"type": str, "choices": ["Male", "Female"], "prompt": "Gender (Male/Female): "},
        "disability_status": {"type": str, "choices": ["No Disability", "Physical", "Learning"], "prompt": "Disability Status (No Disability/Physical/Learning): "},
        "school_category": {"type": str, "choices": ["Primary", "Secondary"], "prompt": "School Category (Primary/Secondary): "},
        "attendance_rate": {"type": float, "min": 0, "max": 100, "prompt": "Attendance Rate (0-100): "},
        "days_absent_last_semester": {"type": int, "min": 0, "max": 40, "prompt": "Days Absent Last Semester: "},
        "average_grade": {"type": float, "min": 0, "max": 100, "prompt": "Average Grade (0-100): "},
        "household_size": {"type": int, "min": 1, "max": 15, "prompt": "Household Size (1-15): "},
        "orphan_status": {"type": str, "choices": ["No Parents", "Single", "Double"], "prompt": "Orphan Status (No Parents/Single/Double): "},
        "family_income_bracket": {"type": str, "choices": ["Low", "Middle", "High"], "prompt": "Family Income Bracket (Low/Middle/High): "},
        "parental_education_level": {"type": str, "choices": ["Not Schooled", "Primary", "Secondary", "Tertiary"], "prompt": "Parental Education Level (Not Schooled/Primary/Secondary/Tertiary): "},
        "parental_employment_status": {"type": str, "choices": ["Unemployed", "Temporary Work", "Full-Time", "Part-Time", "Self-Employed"], "prompt": "Parental Employment Status (Unemployed/Temporary Work/Full-Time/Part-Time/Self-Employed): "},
        "school_fee_payment_source": {"type": str, "choices": ["Parents", "Sponsor", "Other"], "prompt": "School Fee Payment Source (Parents/Sponsor/Other): "},
        "activities_participation": {"type": int, "min": 0, "max": 5, "prompt": "Activities Participation (0-5): "},
        "behavioral_infractions": {"type": int, "min": 0, "max": 10, "prompt": "Behavioral Infractions (0-10): "},
        "suspensions": {"type": int, "min": 0, "max": 5, "prompt": "Suspensions (0-5): "},
        "previous_dropout_count": {"type": int, "min": 0, "max": 5, "prompt": "Previous Dropout Count (0-5): "},
        "distance_to_school": {"type": float, "min": 0, "max": 10000, "prompt": "Distance to School (m): "},
        "transportation_mean": {"type": str, "choices": ["Foot", "Public Transport", "Bicycle", "Car"], "prompt": "Transportation Mean (Foot/Public Transport/Bicycle/Car): "},
        "transportation_time": {"type": int, "min": 0, "max": 120, "prompt": "Transportation Time (minutes): "},
        "current_class": {"type": str, "choices": ["P1", "P2", "P3", "P4", "P5", "P6", "S1", "S2", "S3", "S4", "S5", "S6"], "prompt": "Current Class (e.g., P1, P2, ..., S6): "},
        "repetitions_in_class": {"type": int, "min": 0, "max": 3, "prompt": "Repetitions in Class (0-3): "}
    }

    # Collect data for each feature, in declaration order
    student_data = {}
    for feature, info in feature_info.items():
        print(f"\n--- {feature.replace('_', ' ').title()} ---")
        student_data[feature] = validate_input(
            info["prompt"],
            info["type"],
            min_value=info.get("min"),
            max_value=info.get("max"),
            choices=info.get("choices"),
        )

    # Convert the collected data into a one-row DataFrame
    new_student_df = pd.DataFrame([student_data])
    return new_student_df

In [None]:
# Prompt for one student's details, then produce the full dropout-risk report.
# The call is left as the cell's final expression so the returned tuple is displayed.
new_student_df = collect_student_data_for_prediction()
analyze_student_dropout_advanced(
    student_data=new_student_df,
    model=loaded_model,
    preprocessor=loaded_preprocessor,
    explainer=loaded_explainer,
)