# Student Risk Prediction Model - Week 5 Classification

**Objective**: Build a binary classification model to predict student risk level (high vs medium) by Week 5

## Project Overview
- **Target Variable**: `risk` (binary: high=1, medium=0)
- **Timeline**: Week 5 prediction capability  
- **Dataset**: 282 students with 41+ features (majority categorical)
- **Expected Distribution**: 95 high-risk, 187 medium-risk students
- **Success Metric**: High recall for high-risk students (minimize false negatives)

## Key Resources
- **Input Data**: `data/refined_data_for_model/fully_engineered_student_data.csv`
- **Feature Mappings**: `project_info/info_for_predictive_model/feature_mappings.json`
- **Implementation Guide**: `project_info/info_for_predictive_model/predictive_model_steps.md`

This notebook implements the 11-step process defined in CLAUDE.md for systematic model development.

## Step 1: Objective Definition ✅

### Classification Task Definition
- **Problem Type**: Binary Classification
- **Target Variable**: `risk` with values ['high', 'medium'] 
- **Encoding**: medium=0, high=1
- **Business Goal**: Identify students at high risk by Week 5 to enable early intervention

### Success Criteria
1. **Primary**: High recall for high-risk students (minimize missed high-risk cases)
2. **Secondary**: Overall F1-score and accuracy
3. **Tertiary**: Model interpretability for educational stakeholders

### Dataset Specifications
- **Size**: 282 students, 41+ features
- **Class Distribution**: 95 high-risk (33.7%), 187 medium-risk (66.3%)
- **Feature Types**: Mixed (numeric assessments, categorical demographics, engineered weights)
- **Data Source**: Fully engineered dataset with risk-based feature weights

In [ ]:
# Import required libraries for the prediction model
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import os
import warnings
warnings.filterwarnings('ignore')

# Machine Learning imports
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report, confusion_matrix, accuracy_score, 
    precision_score, recall_score, f1_score, roc_auc_score, roc_curve
)

# XGBoost is optional: assume it is present, and clear the flag if the
# import fails so the notebook can report which path it is on.
XGBOOST_AVAILABLE = True
try:
    import xgboost as xgb
except ImportError:
    XGBOOST_AVAILABLE = False

print(
    "XGBoost available"
    if XGBOOST_AVAILABLE
    else "XGBoost not available - will use alternative models"
)

# Use IPython's display when it can be imported; otherwise define a
# print-based stand-in that accepts the same single argument.
try:
    from IPython.display import display
except ImportError:
    DISPLAY_AVAILABLE = False

    def display(obj):
        """Print ``obj`` as a plain-text substitute for IPython's display."""
        print(obj)
else:
    DISPLAY_AVAILABLE = True

# Select matplotlib's 'default' style and seaborn's "husl" palette.
plt.style.use('default')
sns.set_palette("husl")

# Report the versions of the libraries that expose one.
print("Libraries imported successfully!")
for library_name, library_version in (
    ("Pandas", pd.__version__),
    ("NumPy", np.__version__),
    ("Matplotlib", plt.matplotlib.__version__),
    ("Seaborn", sns.__version__),
):
    print(f"{library_name} version: {library_version}")
print("Scikit-learn imported")

# Target column name, the two risk labels, and their 0/1 encoding.
TARGET_COLUMN = 'risk'
MEDIUM_RISK_LABEL, HIGH_RISK_LABEL = 'medium', 'high'
# medium maps to 0 and high maps to 1; insertion order is medium, then high.
BINARY_ENCODING = dict(zip((MEDIUM_RISK_LABEL, HIGH_RISK_LABEL), (0, 1)))

print(
    "\n📋 PROJECT SETUP:",
    f"Target variable: {TARGET_COLUMN}",
    f"Binary encoding: {BINARY_ENCODING}",
    f"Focus: Minimize false negatives for {HIGH_RISK_LABEL}-risk students",
    sep="\n",
)

In [1]:
# Step 2: Load and validate the fully engineered dataset
banner = "=" * 60
print(banner)
print("STEP 2: DATA OVERVIEW & VALIDATION")
print(banner)

# Primary input file, relative to the notebook's working directory.
data_path = '../data/refined_data_for_model/fully_engineered_student_data.csv'
print(f"Loading data from: {data_path}")

try:
    df = pd.read_csv(data_path)
except FileNotFoundError:
    print("❌ Error: fully_engineered_student_data.csv not found!")

    # List the CSV files that are present to help diagnose a wrong path.
    print("Available files:")
    data_dir = '../data/refined_data_for_model/'
    if not os.path.exists(data_dir):
        print("  - Data directory not found")
    else:
        for file in os.listdir(data_dir):
            if file.endswith('.csv'):
                print(f"  - {file}")

    # Alternative dataset files, tried in the listed order.
    alt_files = [
        '../data/refined_data_for_model/engineered_student_data.csv',
        '../data/refined_data_for_model/Student_At_Risk_Student_Data.csv'
    ]

    df = None
    for alt_file in alt_files:
        try:
            df = pd.read_csv(alt_file)
        except FileNotFoundError:
            continue
        print(f"✅ Using alternative file: {alt_file}")
        break
    else:
        # The loop finished without a successful read: nothing can be loaded.
        print("❌ No suitable data file found. Please check file paths.")
        raise FileNotFoundError("Required data file not found")
else:
    print("✅ Data loaded successfully!")

# Headline size figures for the loaded frame.
n_rows, n_cols = df.shape
print("\n📊 DATASET OVERVIEW:")
print(f"Shape: {n_rows} rows × {n_cols} columns")
memory_mb = df.memory_usage(deep=True).sum() / 1024**2
print(f"Memory usage: {memory_mb:.2f} MB")

# Expected dimensions: an exact row count and a lower bound on columns.
expected_rows = 282
expected_cols_min = 41

rows_flag = '✅' if n_rows == expected_rows else '⚠️'
cols_flag = '✅' if n_cols >= expected_cols_min else '⚠️'
print("\n🔍 VALIDATION CHECKS:")
print(f"Expected rows: {expected_rows}, Actual: {n_rows} {rows_flag}")
print(f"Expected columns: {expected_cols_min}+, Actual: {n_cols} {cols_flag}")

# Confirm the target column exists and contains both expected labels.
if TARGET_COLUMN not in df.columns:
    print(f"Target variable '{TARGET_COLUMN}': ❌ Not found")
    print(f"Available columns: {list(df.columns)}")

    # Suggest columns whose name mentions "risk" (case-insensitive).
    risk_cols = [name for name in df.columns if 'risk' in name.lower()]
    if risk_cols:
        print(f"Possible risk columns: {risk_cols}")
else:
    print(f"Target variable '{TARGET_COLUMN}': ✅ Found")
    target_values = df[TARGET_COLUMN].unique()
    print(f"Target values: {target_values}")

    # Both labels must occur among the observed target values.
    expected_values = [HIGH_RISK_LABEL, MEDIUM_RISK_LABEL]
    absent_labels = [val for val in expected_values if val not in target_values]
    has_expected = not absent_labels
    status_flag = '✅' if has_expected else '⚠️'
    print(f"Expected risk levels {expected_values}: {status_flag}")

# Full column listing for reference.
column_names = list(df.columns)
print("\n📋 COLUMN SUMMARY:")
print(f"Total columns: {len(column_names)}")
print(f"Column names: {column_names}")

STEP 2: DATA OVERVIEW & VALIDATION
Loading data from: ../data/refined_data_for_model/fully_engineered_student_data.csv


NameError: name 'pd' is not defined

In [None]:
# Detailed data analysis and validation
print("\n".join(["=" * 60, "DETAILED DATA ANALYSIS", "=" * 60]))

# Class counts and percentages of the target, when the column is present.
if TARGET_COLUMN in df.columns:
    print("\n🎯 TARGET VARIABLE ANALYSIS:")
    target_series = df[TARGET_COLUMN]
    risk_counts = target_series.value_counts()
    risk_percentages = target_series.value_counts(normalize=True) * 100

    print("Risk Distribution:")
    for risk_level, count in risk_counts.items():
        percentage = risk_percentages[risk_level]
        print(f"  {risk_level}: {count} students ({percentage:.1f}%)")

    # Compare against the expected split only when both labels occur.
    both_labels_present = (
        HIGH_RISK_LABEL in risk_counts and MEDIUM_RISK_LABEL in risk_counts
    )
    if both_labels_present:
        high_count = risk_counts[HIGH_RISK_LABEL]
        medium_count = risk_counts[MEDIUM_RISK_LABEL]

        # A tolerance of ±10 students around each expected count.
        high_flag = '✅' if 85 <= high_count <= 105 else '⚠️'
        medium_flag = '✅' if 177 <= medium_count <= 197 else '⚠️'
        print("\n📊 DISTRIBUTION VALIDATION:")
        print(f"Expected high-risk: ~95, Actual: {high_count} {high_flag}")
        print(f"Expected medium-risk: ~187, Actual: {medium_count} {medium_flag}")

        # Class imbalance ratio; `total` is not read in this cell but is retained.
        total = high_count + medium_count
        if high_count > 0:
            imbalance_ratio = medium_count / high_count
        else:
            imbalance_ratio = 0
        print(f"Class imbalance ratio (medium:high): {imbalance_ratio:.2f}:1")

        print(
            "⚠️  Imbalanced dataset detected - will need class weighting"
            if imbalance_ratio > 1.5
            else "✅ Reasonably balanced dataset"
        )

# Feature type analysis
import re


def _is_id_column(column_name):
    """Return True when ``column_name`` contains ``id`` as a whole word.

    The name is split on non-alphanumeric characters and on lower-to-upper
    camelCase boundaries, so ``student_id``, ``Student ID`` and ``studentId``
    match, while names that merely contain the letters (``midterm_score``,
    ``video_views``, ``paid_fees``) do not.  An unseparated lowercase name
    such as ``studentid`` is not recognised.  Non-string labels are
    converted with ``str`` first.
    """
    words = re.split(r'[^0-9A-Za-z]+|(?<=[a-z0-9])(?=[A-Z])', str(column_name))
    return any(word.lower() == 'id' for word in words)


print(f"\n🔍 FEATURE TYPE ANALYSIS:")

# A plain substring test for "id" would also hit ordinary feature names and
# drop them from the feature lists, so match whole words only.
id_cols = [col for col in df.columns if _is_id_column(col)]

# The target and the ID columns are excluded from both feature lists.
non_feature_cols = set(id_cols) | {TARGET_COLUMN}
numeric_cols = [
    col for col in df.select_dtypes(include=[np.number]).columns
    if col not in non_feature_cols
]
categorical_cols = [
    col for col in df.select_dtypes(include=['object']).columns
    if col not in non_feature_cols
]

print(f"Numeric features: {len(numeric_cols)}")
print(f"Categorical features: {len(categorical_cols)}")
print(f"ID columns: {len(id_cols)} {id_cols}")

# Collect engineered columns in one pass: names ending in '_weighted' and
# names containing 'sentiment' (case-insensitive). A column may be in both.
weighted_cols, sentiment_cols = [], []
for column in df.columns:
    if column.endswith('_weighted'):
        weighted_cols.append(column)
    if 'sentiment' in column.lower():
        sentiment_cols.append(column)

print("\n🔧 ENGINEERED FEATURES:")
for heading, matched_cols in (
    ("Weighted categorical features", weighted_cols),
    ("Sentiment features", sentiment_cols),
):
    print(f"{heading}: {len(matched_cols)}")
    if matched_cols:
        # Show at most three example column names.
        print(f"  Examples: {matched_cols[:3]}")

# Missing data analysis: per-column null counts, keeping only columns with
# at least one null, sorted from most to fewest.
print("\n🔍 MISSING DATA ANALYSIS:")
null_counts = df.isnull().sum()
missing_data = null_counts[null_counts > 0].sort_values(ascending=False)

if missing_data.empty:
    print("✅ No missing data found")
else:
    row_total = len(df)
    print(f"Columns with missing data: {len(missing_data)}")
    print("Top missing data columns:")
    for col, count in missing_data.head(10).items():
        percentage = count / row_total * 100
        print(f"  {col}: {count} missing ({percentage:.1f}%)")

print("\n✅ Step 2 validation completed!")
print(f"Dataset ready for preprocessing: {df.shape}")

In [None]:
# Visualize key data insights
print("\n".join(["=" * 60, "DATA VISUALIZATION", "=" * 60]))

# 2x2 grid; name each panel in row-major order for readability.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
ax_risk, ax_types, ax_missing, ax_size = axes.ravel()

# 1. Risk distribution (only when the target column exists)
if TARGET_COLUMN in df.columns:
    risk_counts = df[TARGET_COLUMN].value_counts()
    ax_risk.pie(
        risk_counts.values,
        labels=risk_counts.index,
        autopct='%1.1f%%',
        colors=['#ff7f7f', '#87ceeb'],
    )
    ax_risk.set_title('Risk Level Distribution')

# 2. Feature types breakdown, using the lists built in the analysis cell
feature_types = {
    'Numeric': len(numeric_cols),
    'Categorical': len(categorical_cols),
    'Weighted': len(weighted_cols),
    'Sentiment': len(sentiment_cols)
}
ax_types.bar(
    list(feature_types),
    list(feature_types.values()),
    color=['skyblue', 'lightgreen', 'gold', 'pink'],
)
ax_types.set_title('Feature Types Overview')
ax_types.set_ylabel('Count')

# 3. Missing data: a placeholder message when nothing is missing,
#    otherwise a horizontal bar per column (first ten entries).
if len(missing_data) == 0:
    ax_missing.text(0.5, 0.5, 'No Missing Data ✅', ha='center', va='center',
                    transform=ax_missing.transAxes, fontsize=16)
    ax_missing.set_title('Missing Data Status')
else:
    top_missing = missing_data.head(10)
    bar_positions = range(len(top_missing))
    ax_missing.barh(bar_positions, top_missing.values)
    ax_missing.set_yticks(bar_positions)
    ax_missing.set_yticklabels(top_missing.index)
    ax_missing.set_title('Top 10 Columns with Missing Data')
    ax_missing.set_xlabel('Missing Count')

# 4. Dataset size: actual dimensions next to the expected ones
size_info = {
    'Total Rows': df.shape[0],
    'Total Columns': df.shape[1],
    'Expected Rows': 282,
    'Expected Cols': 41
}
x_pos = np.arange(len(size_info))
values = list(size_info.values())
ax_size.bar(
    x_pos,
    values,
    color=['lightcoral', 'lightblue', 'lightcoral', 'lightblue'],
    alpha=0.7,
)
ax_size.set_xticks(x_pos)
ax_size.set_xticklabels(list(size_info), rotation=45)
ax_size.set_title('Dataset Size Validation')
ax_size.set_ylabel('Count')

plt.tight_layout()
plt.show()

# Preview: first five rows, restricted to the first ten columns.
print("\n📋 SAMPLE DATA:")
print("First 5 rows of the dataset:")
display_cols = list(df.columns[:10])
print(df.head()[display_cols])

# Count how many columns share each dtype.
print("\n📊 DATA TYPES SUMMARY:")
dtype_summary = df.dtypes.value_counts()
for dtype, count in zip(dtype_summary.index, dtype_summary.values):
    print(f"  {dtype}: {count} columns")

print(
    "\n🎯 Step 2 Complete: Data loaded and validated successfully!",
    "Ready to proceed to Step 3: Data Preprocessing",
    sep="\n",
)