# Data Cleaning and Preprocessing

This notebook demonstrates how to clean and preprocess educational data from several Bangladeshi sources:
- BANBEIS (Bangladesh Bureau of Educational Information and Statistics)
- Education Board Results
- DSHE (Directorate of Secondary and Higher Education)
- DPE (Directorate of Primary Education)

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import sys

# Add project root to Python path
sys.path.append('..')
from src.data_processing.data_processor import DataProcessor

## 1. Load Raw Data

Load the raw student data through the project's `DataProcessor` and inspect its structure.

In [None]:
# Create the project's DataProcessor (imported from src.data_processing.data_processor)
processor = DataProcessor()

# Load the 'academic' student dataset; this cell loads only that one dataset.
# NOTE(review): later cells use the result as a pandas DataFrame — confirm
# against DataProcessor.load_student_data.
student_data = processor.load_student_data('academic')
# Last expression in the cell: shows the first rows as the cell output
student_data.head()

## 2. Data Quality Assessment

Check for common data quality issues:
- Missing values
- Duplicates
- Invalid values
- Inconsistent formats

In [None]:
def assess_data_quality(df):
    """Summarise basic quality indicators for a DataFrame.

    Args:
        df: The DataFrame to inspect.

    Returns:
        A dict with four entries, in this order:
        'total_rows' (number of rows), 'missing_values' (null count per
        column), 'duplicates' (number of fully duplicated rows) and
        'data_types' (dtype of each column).
    """
    null_counts = df.isnull().sum()
    duplicate_count = df.duplicated().sum()
    return dict(
        total_rows=len(df),
        missing_values=null_counts,
        duplicates=duplicate_count,
        data_types=df.dtypes,
    )

# Build the report once, then print each entry under a readable heading
quality_report = assess_data_quality(student_data)
print("Data Quality Report:")
for key in quality_report:
    value = quality_report[key]
    # 'missing_values' -> 'Missing Values'
    heading = key.replace('_', ' ').title()
    print(f"\n{heading}:")
    print(value)

## 3. Data Cleaning

Apply cleaning operations to address identified issues.

In [None]:
# Run the project's cleaning routine on the raw student data
cleaned_data = processor.clean_student_data(student_data)

# Report the shape before and after cleaning, one item per line
print(
    "\nCleaning Results:",
    f"Original shape: {student_data.shape}",
    f"Cleaned shape: {cleaned_data.shape}",
    sep="\n",
)

# Preview the first rows of the cleaned data as the cell output
cleaned_data.head()

## 4. Data Standardization

Standardize values and formats across different data sources.

In [None]:
def standardize_geographic_data(df):
    """Standardize division names to their current official spellings.

    Matching is case-insensitive and ignores surrounding whitespace. Both the
    older spellings ('Chittagong', 'Barisal') and the current ones
    ('Chattogram', 'Barishal') are recognized, so running the function on
    data that is already standardized leaves it unchanged.

    Args:
        df: DataFrame that may contain a 'division' column. It is not
            modified.

    Returns:
        A copy of ``df``. When a 'division' column is present, recognized
        names are replaced by their standard spelling. Unrecognized and
        missing values are kept as they are instead of being turned into
        NaN, so unexpected names stay visible for review.
    """
    # Lower-cased spelling -> standard name of the administrative division
    division_mapping = {
        'dhaka': 'Dhaka',
        'chittagong': 'Chattogram',
        'chattogram': 'Chattogram',
        'khulna': 'Khulna',
        'rajshahi': 'Rajshahi',
        'sylhet': 'Sylhet',
        'barisal': 'Barishal',
        'barishal': 'Barishal',
        'rangpur': 'Rangpur',
        'mymensingh': 'Mymensingh',
    }

    def _standardize(value):
        # Non-string entries (NaN, None, numbers) pass through untouched.
        if not isinstance(value, str):
            return value
        # Fall back to the original value so unknown names are not lost.
        return division_mapping.get(value.strip().lower(), value)

    df = df.copy()
    if 'division' in df.columns:
        df['division'] = df['division'].map(_standardize)
    return df

# Apply the division-name standardization to the cleaned data
standardized_data = standardize_geographic_data(cleaned_data)

# List the distinct division values after standardization; skipped when the
# data has no 'division' column
if 'division' in standardized_data.columns:
    print("\nUnique Divisions:")
    print(standardized_data['division'].unique())

## 5. Data Validation

Validate cleaned and standardized data against expected formats and ranges.

In [None]:
def validate_data(df):
    """Count records that violate the basic validation rules.

    Each check runs only when its column is present; otherwise its count
    stays 0. In every check a missing value counts as invalid.

    Rules:
        student_id: a string made of 'S' followed by exactly four digits.
        date_of_birth: parseable by ``pd.to_datetime``.
        gpa: numeric and within 0 to 5 inclusive.

    Args:
        df: DataFrame to validate. It is not modified.

    Returns:
        A dict of integer counts under the keys 'invalid_student_ids',
        'invalid_dates' and 'invalid_grades'.
    """
    import re  # local import so the cell does not depend on other cells

    validation_results = {
        'invalid_student_ids': 0,
        'invalid_dates': 0,
        'invalid_grades': 0
    }

    # Validate student IDs. Checking value by value keeps this working for
    # missing entries and non-string columns, which the .str accessor cannot
    # handle; fullmatch also rejects a trailing newline that '$' would allow.
    if 'student_id' in df.columns:
        id_pattern = re.compile(r'S\d{4}')
        validation_results['invalid_student_ids'] = sum(
            not (isinstance(value, str) and id_pattern.fullmatch(value))
            for value in df['student_id']
        )

    # Validate dates: anything that cannot be parsed becomes NaT and is counted
    if 'date_of_birth' in df.columns:
        parsed_dates = pd.to_datetime(df['date_of_birth'], errors='coerce')
        validation_results['invalid_dates'] = int(parsed_dates.isna().sum())

    # Validate grades: non-numeric entries are coerced to NaN, and NaN is
    # never "between" the bounds, so it is counted as invalid too
    if 'gpa' in df.columns:
        gpa = pd.to_numeric(df['gpa'], errors='coerce')
        validation_results['invalid_grades'] = int((~gpa.between(0, 5)).sum())

    return validation_results

# Validate the standardized data and print one "Rule Name: count" line per rule
validation_results = validate_data(standardized_data)
print("\nValidation Results:")
for key, value in validation_results.items():
    # 'invalid_student_ids' -> 'Invalid Student Ids'
    label = key.replace('_', ' ').title()
    print(f"{label}: {value}")

## 6. Save Processed Data

Save the cleaned and validated data for further analysis.

In [None]:
# Make sure the output folder exists; missing parent folders are created and
# an already existing folder is not an error
processed_dir = Path('../processed_data/cleaned')
processed_dir.mkdir(parents=True, exist_ok=True)

# Write the standardized data as CSV without the index column
output_file = processed_dir.joinpath('cleaned_student_data.csv')
standardized_data.to_csv(output_file, index=False)
print(f"\nProcessed data saved to: {output_file}")

## Summary

This notebook demonstrated:
1. Loading raw educational data
2. Assessing data quality
3. Cleaning and standardizing data
4. Validating processed data
5. Saving results for further analysis

Next steps:
- Perform exploratory data analysis
- Generate performance metrics
- Create visualizations
- Conduct statistical analysis