# Validation du Preprocessing

Validation du pipeline complet:
1. Absence de NaN
2. MathScore non modifié
3. Distributions avant/après
4. Statistiques descriptives

In [None]:
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import os

# Notebook-wide plotting defaults: seaborn's white grid theme and a wide
# default figure size for every figure that does not set its own.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## 1. Chargement des données

In [None]:
# Load the original (pre-pipeline) training data.
df_original = pl.read_csv('../data/X_numerical_grouped_cleaned_train.csv')

# Load the most recently written preprocessed export.
# Modification time is used instead of getctime: on Unix, ctime is the time of
# the last metadata change (chmod, rename, ...), and on Windows it is the
# creation time, which does not move when a file is overwritten. Neither
# reliably identifies the latest pipeline output.
preprocessed_files = glob.glob('../data/X_train_preprocessed_*.csv')
if preprocessed_files:
    latest_file = max(preprocessed_files, key=os.path.getmtime)
    df_preprocessed = pl.read_csv(latest_file)
    print(f"Fichier charg√©: {os.path.basename(latest_file)}")
else:
    # Later cells test `df_preprocessed is not None` before doing any work.
    print("‚ö†Ô∏è Aucun fichier preprocessed trouv√©")
    df_preprocessed = None

# Report both shapes and how many columns the pipeline removed.
print(f"\nOriginal: {df_original.shape}")
if df_preprocessed is not None:
    print(f"Preprocessed: {df_preprocessed.shape}")
    print(f"R√©duction: {df_original.shape[1] - df_preprocessed.shape[1]} colonnes")

## 2. Validation: Absence de NaN

In [None]:
if df_preprocessed is not None:
    # Polars distinguishes nulls (missing values) from floating-point NaN.
    # null_count() only reports the former, so NaN values in float columns are
    # counted separately and added to the per-column totals.
    nan_counts = df_preprocessed.null_count()
    float_dtypes = (pl.Float32, pl.Float64)

    missing_by_col = {}
    for col, null_count in zip(df_preprocessed.columns, nan_counts.row(0)):
        series = df_preprocessed.get_column(col)
        if series.dtype in float_dtypes:
            # `or 0` guards against a None sum on empty / all-null columns.
            float_nan_count = series.is_nan().sum() or 0
        else:
            float_nan_count = 0
        missing_by_col[col] = null_count + float_nan_count

    total_nans = sum(missing_by_col.values())

    print(f"Total NaN: {total_nans}")

    if total_nans > 0:
        # List only the offending columns, with their share of all rows.
        print("\n‚ö†Ô∏è Colonnes avec NaN:")
        for col, count in missing_by_col.items():
            if count > 0:
                print(f"  {col}: {count} ({count/len(df_preprocessed)*100:.2f}%)")
    else:
        print("‚úÖ Aucun NaN d√©tect√©")

## 3. Validation: MathScore non modifié

In [None]:
if df_preprocessed is not None and 'MathScore' in df_preprocessed.columns:
    if 'MathScore' in df_original.columns:
        # Compare the target column value by value, before vs. after.
        original_scores = df_original.select('MathScore').to_series().to_numpy()
        preprocessed_scores = df_preprocessed.select('MathScore').to_series().to_numpy()

        if np.array_equal(original_scores, preprocessed_scores, equal_nan=True):
            print("‚úÖ MathScore identique (aucune modification)")
        else:
            print("‚ö†Ô∏è MathScore a √©t√© modifi√©!")
            if original_scores.shape != preprocessed_scores.shape:
                # An element-wise comparison is undefined when the row counts
                # differ (NumPy raises or returns a scalar), so report the
                # length mismatch instead of a difference count.
                print(
                    f"  Nombre de lignes: {len(original_scores)} (original) "
                    f"vs {len(preprocessed_scores)} (preprocessed)"
                )
            else:
                differs = original_scores != preprocessed_scores
                # NaN != NaN is True, but the equality test above treats
                # matching NaN as equal; exclude those so the count agrees.
                if original_scores.dtype.kind == 'f' and preprocessed_scores.dtype.kind == 'f':
                    differs &= ~(np.isnan(original_scores) & np.isnan(preprocessed_scores))
                print(f"  Diff√©rences: {np.sum(differs)} valeurs")
    else:
        print("‚ö†Ô∏è MathScore absent du fichier original")
else:
    # Also reached when no preprocessed file was loaded at all.
    print("‚ö†Ô∏è MathScore absent du fichier preprocessed")

## 4. Distributions avant/après

In [None]:
if df_preprocessed is not None:
    # Columns present in both frames, in the original file's order.
    # A set makes each membership test O(1) instead of scanning the list.
    preprocessed_names = set(df_preprocessed.columns)
    common_cols = [col for col in df_original.columns if col in preprocessed_names]
    print(f"Colonnes communes: {len(common_cols)}")

    # Plot at most the first six shared columns (slicing already clamps to
    # the list length, so no explicit min() is needed).
    sample_cols = common_cols[:6]

    fig, axes = plt.subplots(2, 3, figsize=(15, 8))
    axes = axes.flatten()

    for ax, col in zip(axes, sample_cols):
        # Overlay both histograms; nulls are dropped before plotting.
        data_orig = df_original.select(col).to_series().drop_nulls().to_numpy()
        ax.hist(data_orig, bins=30, alpha=0.5, label='Original', color='blue')

        data_prep = df_preprocessed.select(col).to_series().drop_nulls().to_numpy()
        ax.hist(data_prep, bins=30, alpha=0.5, label='Preprocessed', color='orange')

        ax.set_title(col, fontsize=10)
        ax.legend()

    # Hide the panels left over when fewer than six columns are shared, so
    # the figure does not show blank axes.
    for unused_ax in axes[len(sample_cols):]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.show()

## 5. Statistiques descriptives

In [None]:
if df_preprocessed is not None:
    # Side-by-side summary of both frames: shape, null total, column dtypes.
    print("=" * 80)
    print("STATISTIQUES DESCRIPTIVES")
    print("=" * 80)

    # Total nulls are obtained by summing the single row that null_count()
    # returns, as the other validation cells do. DataFrame.sum(axis=1) is
    # deprecated in recent Polars releases and no longer accepted by 1.x.
    original_null_total = sum(df_original.null_count().row(0))
    preprocessed_null_total = sum(df_preprocessed.null_count().row(0))

    print("\nüìä Original:")
    print(f"  Shape: {df_original.shape}")
    print(f"  NaN total: {original_null_total}")
    print(f"  Types: {df_original.dtypes}")

    print("\nüìä Preprocessed:")
    print(f"  Shape: {df_preprocessed.shape}")
    print(f"  NaN total: {preprocessed_null_total}")
    print(f"  Types: {df_preprocessed.dtypes}")

    # Column reduction, as an absolute count and as a share of the original.
    n_original_cols = df_original.shape[1]
    n_preprocessed_cols = df_preprocessed.shape[1]

    print("\nüìâ R√©ductions:")
    print(f"  Colonnes: {n_original_cols} ‚Üí {n_preprocessed_cols} (-{n_original_cols - n_preprocessed_cols})")
    print(f"  R√©duction: {(1 - n_preprocessed_cols/n_original_cols)*100:.1f}%")

## 6. Résumé validation

In [None]:
if df_preprocessed is not None:
    # Recompute every check here so this cell does not depend on the earlier
    # validation cells having been run.
    print("\n" + "="*80)
    print("R√âSUM√â VALIDATION")
    print("="*80)

    checks = []

    # Check 1: no missing values. Polars nulls and floating-point NaN are
    # distinct, and null_count() only reports nulls, so NaN values in float
    # columns are added to the total.
    total_nans = sum(df_preprocessed.null_count().row(0))
    for series in df_preprocessed.get_columns():
        if series.dtype in (pl.Float32, pl.Float64):
            # `or 0` guards against a None sum on empty / all-null columns.
            total_nans += series.is_nan().sum() or 0
    checks.append(("Absence de NaN", total_nans == 0))

    # Check 2: MathScore unchanged. If the column is missing from either
    # frame the comparison cannot be made; record that as a failure rather
    # than skipping it, so the summary cannot report full success without
    # this check having run.
    if 'MathScore' in df_original.columns and 'MathScore' in df_preprocessed.columns:
        original_scores = df_original.select('MathScore').to_series().to_numpy()
        preprocessed_scores = df_preprocessed.select('MathScore').to_series().to_numpy()
        mathscore_ok = np.array_equal(original_scores, preprocessed_scores, equal_nan=True)
    else:
        mathscore_ok = False
    checks.append(("MathScore non modifi√©", mathscore_ok))

    # Check 3: the pipeline removed at least one column.
    reduction_ok = df_preprocessed.shape[1] < df_original.shape[1]
    checks.append(("R√©duction de colonnes", reduction_ok))

    # Print one status line per check.
    for check_name, passed in checks:
        status = "‚úÖ" if passed else "‚ùå"
        print(f"{status} {check_name}")

    all_passed = all(passed for _, passed in checks)

    if all_passed:
        print("\nüéâ Tous les checks sont pass√©s!")
    else:
        print("\n‚ö†Ô∏è Certains checks ont √©chou√©")