# 1. Setup

## 1.A Summary

## 1.B Libraries Import

In [17]:
# Project-specific utilities (keep only if used elsewhere)
from tools import Tools

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


## 1.C Invoke Classes

In [18]:
tools = Tools()

## 1.D Load Configuration

In [19]:
config = tools.load_toml_file("config.toml")
tools.print_message('success', 'Loaded configuration', format_dict={'number of keys': len(config)})

## 1.E Load the dataset

In [20]:
# Open dataset
# Realinho, V., Martins, M.V., Machado, J. and Baptista, L.M.T., 2021. Predict Students' Dropout and Academic Success. UCI Machine Learning Repository. Available at: https://doi.org/10.24432/C5MC89 [Accessed 31 May 2025].
df_dataset = tools.load_dataset(file_name='dataset_raw.csv')
df_dataset.head()

Unnamed: 0,marital_status,application_mode,application_order,course,daytime_evening_attendance,previous_qualification,previous_qualification_grade,nationality,mothers_qualification,fathers_qualification,...,curricular_units_2nd_sem_credited,curricular_units_2nd_sem_enrolled,curricular_units_2nd_sem_evaluations,curricular_units_2nd_sem_approved,curricular_units_2nd_sem_grade,curricular_units_2nd_sem_without_evaluations,unemployment_rate,inflation_rate,gdp,target
0,1,17,5,171,1,1,122.0,1,19,12,...,0,0,0,0,0.0,0,10.8,1.4,1.74,Dropout
1,1,15,1,9254,1,1,160.0,1,1,3,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,1,1,5,9070,1,1,122.0,1,37,37,...,0,6,0,0,0.0,0,10.8,1.4,1.74,Dropout
3,1,17,2,9773,1,1,122.0,1,38,37,...,0,6,10,5,12.4,0,9.4,-0.8,-3.12,Graduate
4,2,39,1,8014,0,1,100.0,1,37,38,...,0,6,6,6,13.0,0,13.9,-0.3,0.79,Graduate


## 1.F Apply Target Binary Transformation

In [21]:
# Add a new target column with renamed values for one vs rest classification
df_dataset['target_binary'] = df_dataset['target'].map({'Dropout': 0, 'Graduate': 1, 'Enrolled': 1})
df_dataset['target_binary'].value_counts()

target_binary
1    3003
0    1421
Name: count, dtype: int64

## 1.G Data Shape Check

In [22]:
shape = df_dataset.shape
tools.print_message('success', 'Dataset loaded', format_dict={'rows': shape[0], 'columns': shape[1]})

# 2. Feature Selection

## 2.A Summary

### <span style="color: #e74c3c;">**Feature Selection for Logistic Regression**</span>

This analysis prepared the dataset for Logistic Regression by removing problematic features and encoding categorical variables to create stable, interpretable predictors suitable for linear classification.

### <span style="color: #2E86AB;">**1. Feature Removal Strategy**</span>

**Removed 13 features** from original 36:
- **Data leakage**: 5 second semester features (students who withdraw early show zeros)
- **Severely imbalanced**: Nationality, educational special needs, international status (>97% in one category)
- **Zero information**: Daytime/evening attendance, displaced status
- **Weak predictors**: Economic indicators, previous qualification grade (correlation <0.10)

**Result**: 23 meaningful features suitable for linear classification.

### <span style="color: #2E86AB;">**2. High Cardinality Feature Engineering**</span>

**Parents' qualifications and occupations** (150+ total categories) were reduced to **2 binary indicators**:
- `parental_higher_education` - at least one parent with higher education
- `parental_professional_occupation` - at least one parent in professional role

**Benefits**: Captures family background whilst avoiding **coefficient inflation** from too many categories.

### <span style="color: #2E86AB;">**3. Withdrawal Rate Encoding**</span>

**Medium cardinality features** were replaced with their historical withdrawal rates:
- `application_mode` (18 categories) → `application_mode_withdrawal_rate`
- `course` (17 categories) → `course_withdrawal_rate` 
- `previous_qualification` (17 categories) → `previous_qualification_withdrawal_rate`

**Advantage**: Creates continuous predictors where higher values = higher risk, avoiding **sparse matrices** that harm model convergence.

### <span style="color: #2E86AB;">**4. One-Hot Encoding**</span>

**Low cardinality features** (2-8 categories) used **one-hot encoding** with `drop_first=True`:
- Marital status, application order, gender, scholarship holder, tuition fees status, debtor status
- Creates ~15 binary features
- **Drop-first prevents multicollinearity** - avoids perfect correlation between category indicators

### <span style="color: #e74c3c;">**Impact on Logistic Regression**</span>

**Model stability**: Strategic encoding prevents **multicollinearity** and **overfitting** whilst maintaining interpretable coefficients. **Early intervention capability**: Uses only first semester data for timely withdrawal prediction. **Regularisation ready**: 25 well-encoded features work effectively with **L1/L2 regularisation** techniques.

This feature selection balances predictive power with model stability, creating optimal conditions for Logistic Regression deployment in student retention systems.

## 2.B Features to Remove

In [23]:
# Data leakage - 2nd semester features
data_leakage_features = [
    'curricular_units_2nd_sem_credited',
    'curricular_units_2nd_sem_enrolled', 
    'curricular_units_2nd_sem_evaluations',
    'curricular_units_2nd_sem_approved',
    'curricular_units_2nd_sem_grade'
]

# Severely imbalanced features (>97% in one category)
imbalanced_features = [
    'nationality',
    'educational_special_needs',
    'international'
]

# Zero information value features
zero_info_features = [
    'daytime_evening_attendance',
    'displaced'
]

# Weak predictors (correlation < 0.10)
weak_predictors = [
    'unemployment_rate',
    'inflation_rate',
    'gdp',
    'previous_qualification_grade'
]

# Old target column
target_column = ['target']

# Combine all features to drop
features_to_drop = (data_leakage_features + 
                   imbalanced_features + 
                   zero_info_features + 
                   weak_predictors + 
                   target_column)

df_dataset.drop(columns=features_to_drop, inplace=True, errors='ignore')
tools.print_message('success', 'Dropped features', format_dict={'number of features': len(features_to_drop)})

## 2.C Reduce High Cardinality Features

In [24]:
# Check if parental features still exist in dataset
parental_features = ['mothers_qualification', 'fathers_qualification', 'mothers_occupation', 'fathers_occupation']
existing_features = [f for f in parental_features if f in df_dataset.columns]
print(f"Remaining parental features: {existing_features}")

Remaining parental features: ['mothers_qualification', 'fathers_qualification', 'mothers_occupation', 'fathers_occupation']


In [25]:
# To reduce the number of categories in the parental qualification and occupation features, we will group them into broader categories.
def create_parental_higher_ed(df):
    """
    Creates binary indicator for parental higher education.
    Returns 1 if at least one parent has higher education, 0 otherwise.
    """
    higher_ed_codes = [2, 3, 4, 5, 6, 39, 40, 41, 42, 43, 44]
    
    mother_higher_ed = df['mothers_qualification'].isin(higher_ed_codes)
    father_higher_ed = df['fathers_qualification'].isin(higher_ed_codes)
    
    # At least one parent has higher education
    df['parental_higher_education'] = (mother_higher_ed | father_higher_ed).astype(int)
    df = df.drop(columns=['mothers_qualification', 'fathers_qualification'])
    
    return df

# Usage:
df_dataset = create_parental_higher_ed(df_dataset)
df_dataset.parental_higher_education.value_counts()

parental_higher_education
0    3616
1     808
Name: count, dtype: int64

In [26]:
def create_parental_professional_occupation(df):
    """
    Creates binary indicator for parental professional occupation.
    Returns 1 if at least one parent has professional/managerial role, 0 otherwise.
    """
    professional_codes = [1, 2, 3, 101, 102, 112, 114, 121, 122, 123, 124, 
                          131, 132, 134, 135]
    
    mother_professional = df['mothers_occupation'].isin(professional_codes)
    father_professional = df['fathers_occupation'].isin(professional_codes)
    
    # At least one parent has professional occupation
    df['parental_professional_occupation'] = (mother_professional | father_professional).astype(int)
    df = df.drop(columns=['mothers_occupation', 'fathers_occupation'])
    
    return df

# Usage:
df_dataset = create_parental_professional_occupation(df_dataset)
df_dataset.parental_professional_occupation.value_counts()

parental_professional_occupation
0    3270
1    1154
Name: count, dtype: int64

In [27]:
print(f"Dataset shape after parental feature engineering: {df_dataset.shape}")
print(f"Remaining features: {df_dataset.columns.tolist()}")

Dataset shape after parental feature engineering: (4424, 21)
Remaining features: ['marital_status', 'application_mode', 'application_order', 'course', 'previous_qualification', 'admission_grade', 'debtor', 'tuition_fees_up_to_date', 'gender', 'scholarship_holder', 'age_at_enrollment', 'curricular_units_1st_sem_credited', 'curricular_units_1st_sem_enrolled', 'curricular_units_1st_sem_evaluations', 'curricular_units_1st_sem_approved', 'curricular_units_1st_sem_grade', 'curricular_units_1st_sem_without_evaluations', 'curricular_units_2nd_sem_without_evaluations', 'target_binary', 'parental_higher_education', 'parental_professional_occupation']


In [28]:
def encode_categorical_withdrawal_rate(df, cat_col, target_col='target_binary'):
    """
    Replace categorical column with withdrawal rate encoding.
    
    Parameters:
    df: pandas DataFrame
    cat_col: name of categorical column to encode
    target_col: name of target column where 0=withdrawn
    
    Returns:
    pandas DataFrame with categorical column replaced by withdrawal_rate
    """
    import pandas as pd
    
    df_encoded = df.copy()
    
    # Calculate withdrawal rate for each category
    withdrawal_rates = (df[target_col] == 0).groupby(df[cat_col]).mean()
    
    # Create new withdrawal rate column
    new_col_name = f'{cat_col.lower().replace(" ", "_")}_withdrawal_rate'
    df_encoded[new_col_name] = df[cat_col].map(withdrawal_rates)
    
    # Remove original column
    df_encoded = df_encoded.drop(columns=[cat_col])
    
    return df_encoded

# Usage:
df_dataset = encode_categorical_withdrawal_rate(df_dataset, 'application_mode')
df_dataset = encode_categorical_withdrawal_rate(df_dataset, 'course')
df_dataset = encode_categorical_withdrawal_rate(df_dataset, 'previous_qualification')
df_dataset.describe()

Unnamed: 0,marital_status,application_order,admission_grade,debtor,tuition_fees_up_to_date,gender,scholarship_holder,age_at_enrollment,curricular_units_1st_sem_credited,curricular_units_1st_sem_enrolled,...,curricular_units_1st_sem_approved,curricular_units_1st_sem_grade,curricular_units_1st_sem_without_evaluations,curricular_units_2nd_sem_without_evaluations,target_binary,parental_higher_education,parental_professional_occupation,application_mode_withdrawal_rate,course_withdrawal_rate,previous_qualification_withdrawal_rate
count,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,...,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0
mean,1.178571,1.727848,126.978119,0.113698,0.880651,0.351718,0.248418,23.265145,0.709991,6.27057,...,4.7066,10.640822,0.137658,0.150316,0.678797,0.18264,0.26085,0.321203,0.321203,0.321203
std,0.605747,1.313793,14.482001,0.31748,0.324235,0.47756,0.432144,7.587816,2.360507,2.480178,...,3.094238,4.843663,0.69088,0.753774,0.466991,0.386415,0.439148,0.140266,0.121256,0.09994
min,1.0,0.0,95.0,0.0,0.0,0.0,0.0,17.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.154047,0.166667
25%,1.0,1.0,117.9,0.0,1.0,0.0,0.0,19.0,0.0,5.0,...,3.0,11.0,0.0,0.0,0.0,0.0,0.0,0.201991,0.183099,0.290019
50%,1.0,1.0,126.1,0.0,1.0,0.0,0.0,20.0,0.0,6.0,...,5.0,12.285714,0.0,0.0,1.0,0.0,0.0,0.293578,0.330233,0.290019
75%,1.0,2.0,134.8,0.0,1.0,1.0,0.0,25.0,0.0,7.0,...,6.0,13.4,0.0,0.0,1.0,0.0,1.0,0.36859,0.381395,0.290019
max,6.0,9.0,190.0,1.0,1.0,1.0,1.0,70.0,20.0,26.0,...,26.0,18.875,12.0,12.0,1.0,1.0,1.0,1.0,0.666667,1.0


In [29]:
def one_hot_encode_features(df, categorical_columns):
    """
    One-hot encode specified categorical columns.
    
    Parameters:
    df: pandas DataFrame
    categorical_columns: list of column names to encode
    
    Returns:
    pandas DataFrame with categorical columns one-hot encoded
    """
    import pandas as pd
    
    # Check which features actually exist in the dataset
    existing_categorical = [col for col in categorical_columns if col in df.columns]
    missing_categorical = [col for col in categorical_columns if col not in df.columns]
    
    if missing_categorical:
        print(f"Warning: These columns not found in dataset: {missing_categorical}")
    
    print(f"Categorical features to encode: {existing_categorical}")
    
    # One-hot encode with drop_first=True to avoid multicollinearity
    df_encoded = pd.get_dummies(df, 
                               columns=existing_categorical, 
                               drop_first=True, 
                               dtype=int)
    
    print(f"Shape before encoding: {df.shape}")
    print(f"Shape after encoding: {df_encoded.shape}")
    print(f"Features added: {df_encoded.shape[1] - df.shape[1]}")
    
    # Show new encoded column names
    new_columns = [col for col in df_encoded.columns if any(cat in col for cat in existing_categorical)]
    print(f"New encoded features: {new_columns}")
    
    return df_encoded

# Usage:
remaining_categorical = [
    'marital_status',
    'application_order',  
    'gender',
    'scholarship_holder',
    'tuition_fees_up_to_date',
    'debtor'
]

df_dataset = one_hot_encode_features(df_dataset, remaining_categorical)

Categorical features to encode: ['marital_status', 'application_order', 'gender', 'scholarship_holder', 'tuition_fees_up_to_date', 'debtor']
Shape before encoding: (4424, 21)
Shape after encoding: (4424, 31)
Features added: 10
New encoded features: ['marital_status_2', 'marital_status_3', 'marital_status_4', 'marital_status_5', 'marital_status_6', 'application_order_1', 'application_order_2', 'application_order_3', 'application_order_4', 'application_order_5', 'application_order_6', 'application_order_9', 'gender_1', 'scholarship_holder_1', 'tuition_fees_up_to_date_1', 'debtor_1']


# 3. Processing Pipeline

## 3.A Summary

### <span style="color: #e74c3c;">**Logistic Regression Processing Pipeline Summary**</span>

This section implemented a modular processing pipeline for logistic regression, separating train/test splitting and feature scaling into distinct functions to prevent data leakage and ensure proper workflow.

### <span style="color: #2E86AB;">**1. Two-Function Design**</span>

**Modular approach** improved maintainability:
- `create_train_test_split_logistic()` - handles stratified 80/20 data splitting
- `scale_features_logistic()` - applies configurable feature scaling

**Benefits**: **Single responsibility principle** ensures each function performs one task, **reusable components** work with different configurations, **flexible parameters** enable easy adjustments.

### <span style="color: #2E86AB;">**2. Critical Workflow Order**</span>

**Split first, scale second** prevents **data leakage**. **Data leakage** occurs when test set information influences preprocessing, creating artificially optimistic performance estimates.

**Correct sequence:**
1. Split data (training/test)
2. Fit scaler on training data only  
3. Apply same scaler to test data

**Why this matters**: Scaling before splitting would use test set statistics, compromising model evaluation reliability.

### <span style="color: #2E86AB;">**3. Results Achieved**</span>

**Perfect stratification**: Both training (3,539 samples) and test (885 samples) maintain identical 67.9%/32.1% class distributions.

**Configurable scaling** via `scaler_type` parameter enables easy comparison testing between `standard`, `minmax`, and `none` options. **StandardScaler preferred for logistic regression** because linear models like logistic regression initialise weights to 0 or small random values, making standardised features (mean=0, std=1) easier to learn. StandardScaler is particularly important when using regularisation, as it ensures fair penalty application across all features, preventing features with larger scales from dominating the model coefficients.

### <span style="color: #e74c3c;">**Pipeline Outcome**</span>

The pipeline produces **deployment-ready data** (X_train_scaled, X_test_scaled, y_train, y_test, scaler) whilst preventing data leakage and ensuring reliable model evaluation. **Modular design** enables easy debugging and configuration adjustments for optimal logistic regression performance.

## 3.B Train Test Split

In [32]:
def create_train_test_split_logistic(X_features, df_dataset, logistic_config):
    """
    Create stratified train/test split for logistic regression.
    
    Parameters:
    X_features: Feature matrix (unscaled)
    df_dataset: DataFrame containing target variable
    logistic_config: Logistic regression configuration dictionary
    
    Returns:
    tuple: X_train, X_test, y_train, y_test
    """
    
    # Define target variable y
    y = df_dataset['target_binary']

    # Get required parameters with defaults
    train_size = logistic_config.get('train_size', 0.8)
    use_stratify = logistic_config.get('stratify', True)

    # Validate parameter values
    if not 0 < train_size < 1:
        raise ValueError(f'train_size must be between 0 and 1, got {train_size}')

    if not isinstance(use_stratify, bool):
        raise ValueError(f'stratify must be true or false, got {use_stratify}')

    # Set stratify parameter
    stratify_param = y if use_stratify else None

    # Perform train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X_features,
        y,
        train_size=train_size,
        random_state=42,
        stratify=stratify_param
    )

    # Print split information
    print(f'Training set: {len(X_train)} samples ({train_size*100:.0f}%)')
    print(f'Test set: {len(X_test)} samples ({(1-train_size)*100:.0f}%)')

    if use_stratify:
        print(f'Training class distribution: {y_train.value_counts(normalize=True).round(3).to_dict()}')
        print(f'Test class distribution: {y_test.value_counts(normalize=True).round(3).to_dict()}')

    return X_train, X_test, y_train, y_test

logistic_config = config.get('logistic_regression_model')
X_features = df_dataset.drop('target_binary', axis=1)

# Split the data
X_train, X_test, y_train, y_test = create_train_test_split_logistic(
    X_features, df_dataset, logistic_config
)


Training set: 3539 samples (80%)
Test set: 885 samples (20%)
Training class distribution: {1: 0.679, 0: 0.321}
Test class distribution: {1: 0.679, 0: 0.321}


In [33]:
def scale_features_logistic(X_train, X_test, logistic_config):
    """
    Scale features using specified scaling method.
    
    Parameters:
    X_train: Training feature matrix
    X_test: Test feature matrix
    logistic_config: Logistic regression configuration dictionary
    
    Returns:
    tuple: X_train_scaled, X_test_scaled, scaler
    """
    
    # Get scaling method
    scaler_type = logistic_config.get('scaler_type', 'standard')
    
    # Validate scaler type
    if scaler_type not in ['standard', 'minmax', 'none']:
        raise ValueError(f'scaler_type must be "standard", "minmax", or "none", got {scaler_type}')

    # Create scaler
    if scaler_type == 'standard':
        scaler = StandardScaler()
    elif scaler_type == 'minmax':
        scaler = MinMaxScaler()
    else:  # scaler_type == 'none'
        scaler = None

    # Apply scaling
    if scaler is not None:
        # Fit scaler on training data only
        X_train_scaled = scaler.fit_transform(X_train)
        # Apply same scaler to test data (no re-fitting)
        X_test_scaled = scaler.transform(X_test)
    else:
        X_train_scaled = X_train
        X_test_scaled = X_test

    print(f'Scaling method applied: {scaler_type}')
    
    return X_train_scaled, X_test_scaled, scaler

# Scale the features
X_train_scaled, X_test_scaled, scaler = scale_features_logistic(
    X_train, X_test, logistic_config
)

Scaling method applied: standard


## 4. Training and Evaluation