# Analyse X_train_preprocessed

Notebook d'analyse des donn√©es preprocess√©es pour valider les transformations appliqu√©es.

In [68]:
import polars as pl
import numpy as np
from pathlib import Path
import glob

## 1. Chargement des donn√©es

In [69]:
# Trouver le dernier fichier preprocessed
data_path = Path('../../data')
preprocessed_files = sorted(data_path.glob('X_train_preprocessed_*.csv'))

if not preprocessed_files:
    raise FileNotFoundError("Aucun fichier X_train_preprocessed_*.csv trouv√©")

latest_file = preprocessed_files[-1]
print(f"Fichier charg√©: {latest_file.name}")

# Charger les donn√©es
df_preprocessed = pl.read_csv(latest_file)
df_original = pl.read_csv(data_path / 'X_train.csv')

Fichier charg√©: X_train_preprocessed_20251129_180836.csv


## 2. Statistiques g√©n√©rales

In [70]:
print("="*80)
print("STATISTIQUES G√âN√âRALES")
print("="*80)

print("\nüìä DIMENSIONS")
print(f"  Donn√©es originales:     {df_original.shape}")
print(f"  Donn√©es preprocess√©es:  {df_preprocessed.shape}")
print(f"  R√©duction colonnes:     {df_original.shape[1] - df_preprocessed.shape[1]} ({(df_original.shape[1] - df_preprocessed.shape[1])/df_original.shape[1]*100:.1f}%)")

print("\nüìâ VALEURS MANQUANTES")
null_count_original = df_original.null_count().sum_horizontal()[0]
null_count_preprocessed = df_preprocessed.null_count().sum_horizontal()[0]
total_cells_original = df_original.shape[0] * df_original.shape[1]
total_cells_preprocessed = df_preprocessed.shape[0] * df_preprocessed.shape[1]

print(f"  Original:      {null_count_original:,} ({null_count_original/total_cells_original*100:.2f}%)")
print(f"  Preprocess√©:   {null_count_preprocessed:,} ({null_count_preprocessed/total_cells_preprocessed*100:.2f}%)")
print(f"  Am√©lioration:  {null_count_original - null_count_preprocessed:,} valeurs manquantes en moins")

STATISTIQUES G√âN√âRALES

üìä DIMENSIONS
  Donn√©es originales:     (1172086, 307)
  Donn√©es preprocess√©es:  (1172086, 110)
  R√©duction colonnes:     197 (64.2%)

üìâ VALEURS MANQUANTES
  Original:      213,250,711 (59.26%)
  Preprocess√©:   0 (0.00%)
  Am√©lioration:  213,250,711 valeurs manquantes en moins


In [71]:
# Afficher aper√ßu
print("\nüìã APER√áU DES DONN√âES PREPROCESS√âES (5 premi√®res lignes)")
df_preprocessed.head(5)


üìã APER√áU DES DONN√âES PREPROCESS√âES (5 premi√®res lignes)


Unnamed: 0_level_0,OECD,ADMINMODE,LANGTEST_COG,ST003D02T,ST004D01T,EFFORT1,EFFORT2,GRADE,IMMIG,IC171,ST038,ST034,PA042,IC173,WB154,IC174,ST021,PA183,WB163,PA166,ST250,ST296,ST353,ST349,WB164,ST062,ST254,IC170,AGE,COBN_S,WB165,FL169,WB166,ST355,IC177,ST345,…,reading_q1_total_timing,reading_q2_total_timing,reading_q3_total_timing,reading_q4_total_timing,reading_q5_total_timing,reading_q6_total_timing,reading_q7_total_timing,reading_q8_total_timing,math_q1_total_timing,math_q2_total_timing,math_q3_total_timing,math_q4_total_timing,math_q5_total_timing,math_q6_total_timing,math_q7_total_timing,math_q8_total_timing,math_q9_total_timing,math_q10_total_timing,math_q11_total_timing,math_q12_total_timing,science_q1_total_timing,science_q2_total_timing,science_q3_total_timing,science_q4_total_timing,science_q5_total_timing,science_q6_total_timing,science_q7_total_timing,science_q8_total_timing,science_q9_total_timing,science_q10_total_timing,Year,CNTRYID,CNTSCHID,CNTSTUID,Score_Support_Enseignant,OCOD1_grouped,OCOD2_grouped
i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64
384002,1.0,2.0,322.0,2.0,1.0,8.0,8.0,0.0,2.0,0.0,0.0,2.333333,0.0,0.0,2.0,0.0,0.0,0.0,6.5,0.0,4.0,1.0,0.0,0.0,0.0,0.333333,3.333333,0.0,15.75,52800.0,1.0,0.0,1.0,0.0,0.0,2.0,…,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,18.736,23.05525,162.72,110.641333,96.368333,71.8135,98.875,63.6894,37.232,88.274667,-1.0,-1.0,84.299333,90.156,66.612,73.5476,68.912,56.62325,104.1905,132.62325,105.785667,-1.0,2022.0,528.0,52800132.0,52801144.0,1.666667,3,2
1118072,0.0,2.0,803.0,11.0,2.0,7.0,6.0,0.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.42,3100.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,…,70.44475,10.03575,4.8738,67.025602,78.74843,8.368284,112.711,69.975711,50.234,80.788,48.059,116.718,61.003,34.117,56.901668,76.615,21.500334,27.087,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2018.0,31.0,3100106.0,3100424.0,1.666667,2,9
845454,1.0,2.0,493.0,5.0,1.0,8.0,10.0,0.0,1.0,3.5,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0,1.166667,0.0,3.25,0.0,0.0,8.0,1.5,16.0,25000.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,…,110.500602,64.958,35.507602,63.326145,4.517952,39.792332,92.942984,82.727664,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,81.443664,190.243406,36.257,148.0735,2.12125,90.540336,57.954,84.633,45.332,87.6865,2018.0,250.0,25000010.0,25005207.0,1.333333,3,3
1728613,2.0,2.0,156.0,3.0,2.0,8.0,10.0,0.0,1.0,1.6,0.0,2.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.6,0.0,0.75,0.0,0.333333,2.0,1.4,16.17,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,…,79.73975,68.604664,53.5655,61.712332,63.6005,114.817336,81.293336,59.014332,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,62.224668,83.289664,91.51075,100.6495,82.3785,74.34825,91.230664,49.789,37.855,-1.0,2015.0,971.0,97100240.0,97127584.0,0.666667,-1,-1
1083243,0.0,2.0,313.0,9.0,1.0,10.0,10.0,-1.0,1.0,0.0,6.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,15.5,60800.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,…,65.393,57.448,75.7315,126.398602,54.334801,4.223181,106.763336,41.3245,137.692,36.046,43.09575,82.2825,121.0105,191.303,77.024336,197.619672,64.659668,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2018.0,608.0,60800071.0,60802698.0,4.0,9,7


In [72]:
# Info sur les colonnes
print("\nüìã TYPES DE DONN√âES")
dtypes = df_preprocessed.schema
dtype_counts = {}
for dtype in dtypes.values():
    dtype_str = str(dtype)
    dtype_counts[dtype_str] = dtype_counts.get(dtype_str, 0) + 1

for dtype, count in sorted(dtype_counts.items(), key=lambda x: x[1], reverse=True):
    print(f"  {dtype}: {count} colonnes")


üìã TYPES DE DONN√âES
  Float64: 107 colonnes
  Int64: 3 colonnes


In [73]:
[col for col in df_preprocessed.columns if df_preprocessed[col].dtype == pl.String]

[]

## 3. V√©rification des transformations

### 3.1 Variables supprim√©es

In [74]:
# Variables redondantes ordinales qui devaient √™tre supprim√©es
redundant_ordinal = ['ST005', 'ST007', 'ST253', 'ST255', 'ST097']

# Variables m√©tadonn√©es cat√©gorielles qui devaient √™tre supprim√©es
metadata_categorical = [
    'Option_CT', 'Option_FL', 'Option_ICTQ', 'Option_PQ',
    'Option_TQ', 'Option_UH', 'Option_WBQ',
    'CYC', 'NatCen', 'SUBNATIO'
]

# Variables redondantes cat√©gorielles qui devaient √™tre supprim√©es
redundant_categorical = [
    'LANGTEST_PAQ', 'LANGTEST_QQQ', 'ST003D03T', 'ST001D01T',
    'PA008', 'PA162', 'OCOD3'
]

print("="*80)
print("V√âRIFICATION DES SUPPRESSIONS")
print("="*80)

print("\nüî¢ Variables ordinales redondantes (devaient √™tre supprim√©es):")
for var in redundant_ordinal:
    if var in df_original.columns:
        status = "‚ùå PR√âSENTE" if var in df_preprocessed.columns else "‚úÖ SUPPRIM√âE"
        print(f"  {var}: {status}")
    else:
        print(f"  {var}: ‚ö†Ô∏è  Absente du dataset original")

print("\nüè∑Ô∏è  Variables m√©tadonn√©es cat√©gorielles (devaient √™tre supprim√©es):")
for var in metadata_categorical:
    if var in df_original.columns:
        status = "‚ùå PR√âSENTE" if var in df_preprocessed.columns else "‚úÖ SUPPRIM√âE"
        print(f"  {var}: {status}")
    else:
        print(f"  {var}: ‚ö†Ô∏è  Absente du dataset original")

print("\nüè∑Ô∏è  Variables redondantes cat√©gorielles (devaient √™tre supprim√©es):")
for var in redundant_categorical:
    if var in df_original.columns:
        status = "‚ùå PR√âSENTE" if var in df_preprocessed.columns else "‚úÖ SUPPRIM√âE"
        print(f"  {var}: {status}")
    else:
        print(f"  {var}: ‚ö†Ô∏è  Absente du dataset original")

V√âRIFICATION DES SUPPRESSIONS

üî¢ Variables ordinales redondantes (devaient √™tre supprim√©es):
  ST005: ‚úÖ SUPPRIM√âE
  ST007: ‚úÖ SUPPRIM√âE
  ST253: ‚úÖ SUPPRIM√âE
  ST255: ‚úÖ SUPPRIM√âE
  ST097: ‚úÖ SUPPRIM√âE

üè∑Ô∏è  Variables m√©tadonn√©es cat√©gorielles (devaient √™tre supprim√©es):
  Option_CT: ‚úÖ SUPPRIM√âE
  Option_FL: ‚úÖ SUPPRIM√âE
  Option_ICTQ: ‚úÖ SUPPRIM√âE
  Option_PQ: ‚úÖ SUPPRIM√âE
  Option_TQ: ‚úÖ SUPPRIM√âE
  Option_UH: ‚úÖ SUPPRIM√âE
  Option_WBQ: ‚úÖ SUPPRIM√âE
  CYC: ‚úÖ SUPPRIM√âE
  NatCen: ‚úÖ SUPPRIM√âE
  SUBNATIO: ‚úÖ SUPPRIM√âE

üè∑Ô∏è  Variables redondantes cat√©gorielles (devaient √™tre supprim√©es):
  LANGTEST_PAQ: ‚úÖ SUPPRIM√âE
  LANGTEST_QQQ: ‚úÖ SUPPRIM√âE
  ST003D03T: ‚úÖ SUPPRIM√âE
  ST001D01T: ‚úÖ SUPPRIM√âE
  PA008: ‚úÖ SUPPRIM√âE
  PA162: ‚úÖ SUPPRIM√âE
  OCOD3: ‚úÖ SUPPRIM√âE


### 3.2 Scores composites

In [75]:
print("\n" + "="*80)
print("V√âRIFICATION DES SCORES COMPOSITES")
print("="*80)

composite_scores = [
    ('Score_Support_Parental', ['PA003', 'ST300']),
    ('Score_Support_Enseignant', ['ST100', 'ST270'])
]

for score_name, source_vars in composite_scores:
    print(f"\nüìä {score_name}")
    
    # V√©rifier pr√©sence des variables sources dans original
    sources_in_original = [v for v in source_vars if v in df_original.columns]
    print(f"  Variables sources dans original: {sources_in_original}")
    
    if score_name in df_preprocessed.columns:
        print(f"  ‚úÖ Score cr√©√©")
        # V√©rifier que les variables sources ont √©t√© supprim√©es
        sources_in_preprocessed = [v for v in source_vars if v in df_preprocessed.columns]
        if sources_in_preprocessed:
            print(f"  ‚ö†Ô∏è  Variables sources encore pr√©sentes: {sources_in_preprocessed}")
        else:
            print(f"  ‚úÖ Variables sources supprim√©es")
    else:
        if len(sources_in_original) >= 2:
            print(f"  ‚ö†Ô∏è  Score NON cr√©√© (sources disponibles)")
        else:
            print(f"  ‚ÑπÔ∏è  Score NON cr√©√© (sources manquantes dans original)")


V√âRIFICATION DES SCORES COMPOSITES

üìä Score_Support_Parental
  Variables sources dans original: ['PA003', 'ST300']
  ‚ö†Ô∏è  Score NON cr√©√© (sources disponibles)

üìä Score_Support_Enseignant
  Variables sources dans original: ['ST100', 'ST270']
  ‚úÖ Score cr√©√©
  ‚úÖ Variables sources supprim√©es


### 3.3 Regroupement ISCO

In [76]:
print("\n" + "="*80)
print("V√âRIFICATION DES REGROUPEMENTS ISCO")
print("="*80)

isco_vars = ['OCOD1', 'OCOD2']

for var in isco_vars:
    grouped_var = f"{var}_grouped"
    
    print(f"\nüìä {var}")
    
    if var in df_original.columns:
        original_unique = df_original.select(pl.col(var).n_unique()).item()
        print(f"  Cardinalit√© originale: {original_unique}")
        
        if var in df_preprocessed.columns:
            print(f"  ‚ùå Variable originale encore pr√©sente")
        else:
            print(f"  ‚úÖ Variable originale supprim√©e")
        
        if grouped_var in df_preprocessed.columns:
            grouped_unique = df_preprocessed.select(pl.col(grouped_var).n_unique()).item()
            print(f"  ‚úÖ Variable group√©e cr√©√©e")
            print(f"  Cardinalit√© group√©e: {grouped_unique}")
            print(f"  R√©duction: {original_unique} ‚Üí {grouped_unique} (-{(original_unique-grouped_unique)/original_unique*100:.1f}%)")
            
            # Afficher la distribution
            print(f"  Distribution:")
            distribution = df_preprocessed.group_by(grouped_var).agg(pl.count()).sort(grouped_var)
            print(distribution)
        else:
            print(f"  ‚ùå Variable group√©e NON cr√©√©e")
    else:
        print(f"  ‚ö†Ô∏è  Absente du dataset original")


V√âRIFICATION DES REGROUPEMENTS ISCO

üìä OCOD1
  Cardinalit√© originale: 774
  ‚úÖ Variable originale supprim√©e
  ‚úÖ Variable group√©e cr√©√©e
  Cardinalit√© group√©e: 11
  R√©duction: 774 ‚Üí 11 (-98.6%)
  Distribution:
shape: (11, 2)
‚îå‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î¨‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îê
‚îÇ OCOD1_grouped ‚îÜ count  ‚îÇ
‚îÇ ---           ‚îÜ ---    ‚îÇ
‚îÇ i64           ‚îÜ u32    ‚îÇ
‚ïû‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ï™‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ï°
‚îÇ -1            ‚îÜ 348119 ‚îÇ
‚îÇ 0             ‚îÜ 292    ‚îÇ
‚îÇ 1             ‚îÜ 43904  ‚îÇ
‚îÇ 2             ‚îÜ 167601 ‚îÇ
‚îÇ 3             ‚îÜ 64873  ‚îÇ
‚îÇ ‚Ä¶             ‚îÜ ‚Ä¶      ‚îÇ
‚îÇ 5             ‚îÜ 132603 ‚îÇ
‚îÇ 6             ‚îÜ 12139  ‚îÇ
‚îÇ 7             ‚îÜ 25148  ‚îÇ
‚îÇ 8             ‚îÜ 9340   ‚îÇ
‚îÇ 9             ‚îÜ 321833 ‚îÇ
‚îî‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î¥‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îò

üìä OCOD2
  Cardinalit√© originale: 775
  ‚úÖ Variable

(Deprecated in version 0.20.5)
  distribution = df_preprocessed.group_by(grouped_var).agg(pl.count()).sort(grouped_var)


## 4. Statistiques descriptives

In [77]:
print("\n" + "="*80)
print("STATISTIQUES DESCRIPTIVES DES COLONNES")
print("="*80)

# Colonnes num√©riques
numeric_cols = [col for col in df_preprocessed.columns if df_preprocessed[col].dtype in [pl.Int64, pl.Int32, pl.Float64, pl.Float32]]

if numeric_cols:
    print(f"\nüìä Variables num√©riques ({len(numeric_cols)} colonnes)")
    print(df_preprocessed.select(numeric_cols).describe())
else:
    print("\n‚ö†Ô∏è  Aucune variable num√©rique d√©tect√©e")


STATISTIQUES DESCRIPTIVES DES COLONNES

üìä Variables num√©riques (110 colonnes)
shape: (9, 111)
‚îå‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î¨‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î¨‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î¨‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î¨‚îÄ‚îÄ‚îÄ‚î¨‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î¨‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î¨‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î¨‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îê
‚îÇ statistic ‚îÜ           ‚îÜ OECD      ‚îÜ ADMINMODE ‚îÜ ‚Ä¶ ‚îÜ CNTSTUID  ‚îÜ Score_Sup ‚îÜ OCOD1_gro ‚îÜ OCOD2_gr ‚îÇ
‚îÇ ---       ‚îÜ ---       ‚îÜ ---       ‚îÜ ---       ‚îÜ   ‚îÜ ---       ‚îÜ port_Ense ‚îÜ uped      ‚îÜ ouped    ‚îÇ
‚îÇ str       ‚îÜ f64       ‚îÜ f64       ‚îÜ f64       ‚îÜ   ‚îÜ f64       ‚îÜ ignant    ‚îÜ ---       ‚îÜ ---      ‚îÇ
‚îÇ           ‚îÜ           ‚îÜ           ‚îÜ           ‚îÜ   ‚îÜ           ‚îÜ ---       ‚îÜ f64       ‚îÜ f64      ‚îÇ
‚îÇ           ‚îÜ           ‚îÜ           ‚îÜ           ‚îÜ   ‚îÜ           ‚îÜ f64       ‚îÜ           ‚îÜ       

In [78]:
# Distribution des valeurs manquantes par colonne
print("\nüìâ TOP 50 COLONNES AVEC LE PLUS DE VALEURS MANQUANTES")

missing_by_col = []
for col in df_preprocessed.columns:
    null_count = df_preprocessed[col].null_count()
    null_pct = (null_count / len(df_preprocessed)) * 100
    missing_by_col.append({'column': col, 'null_count': null_count, 'null_pct': null_pct})

missing_df = pl.DataFrame(missing_by_col).sort('null_count', descending=True).head(30)
print(missing_df)


üìâ TOP 50 COLONNES AVEC LE PLUS DE VALEURS MANQUANTES
shape: (30, 3)
‚îå‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î¨‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î¨‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îê
‚îÇ column       ‚îÜ null_count ‚îÜ null_pct ‚îÇ
‚îÇ ---          ‚îÜ ---        ‚îÜ ---      ‚îÇ
‚îÇ str          ‚îÜ i64        ‚îÜ f64      ‚îÇ
‚ïû‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ï™‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ï™‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ï°
‚îÇ              ‚îÜ 0          ‚îÜ 0.0      ‚îÇ
‚îÇ OECD         ‚îÜ 0          ‚îÜ 0.0      ‚îÇ
‚îÇ ADMINMODE    ‚îÜ 0          ‚îÜ 0.0      ‚îÇ
‚îÇ LANGTEST_COG ‚îÜ 0          ‚îÜ 0.0      ‚îÇ
‚îÇ ST003D02T    ‚îÜ 0          ‚îÜ 0.0      ‚îÇ
‚îÇ ‚Ä¶            ‚îÜ ‚Ä¶          ‚îÜ ‚Ä¶        ‚îÇ
‚îÇ WB164        ‚îÜ 0          ‚îÜ 0.0      ‚îÇ
‚îÇ ST062        ‚îÜ 0          ‚îÜ 0.0      ‚îÇ
‚îÇ ST254        ‚îÜ 0          ‚îÜ 0.0      ‚îÇ
‚îÇ IC170        ‚îÜ 0          ‚îÜ 0.0      ‚îÇ
‚îÇ AGE          ‚îÜ 0          ‚îÜ 0.0    

Il y a 28 colonnes o√π il reste des NaNs.

In [79]:
[col for col in df_preprocessed.columns if df_preprocessed[col].null_count() > 0]

[]

## 5. R√©sum√© des validations

In [80]:
print("\n" + "="*80)
print("R√âSUM√â DES VALIDATIONS")
print("="*80)

validations = []

# Check suppressions ordinales
ordinal_removed = sum(1 for v in redundant_ordinal if v in df_original.columns and v not in df_preprocessed.columns)
ordinal_expected = sum(1 for v in redundant_ordinal if v in df_original.columns)
validations.append(('Variables ordinales supprim√©es', ordinal_removed, ordinal_expected))

# Check suppressions m√©tadonn√©es
metadata_removed = sum(1 for v in metadata_categorical if v in df_original.columns and v not in df_preprocessed.columns)
metadata_expected = sum(1 for v in metadata_categorical if v in df_original.columns)
validations.append(('M√©tadonn√©es supprim√©es', metadata_removed, metadata_expected))

# Check suppressions redondances cat√©gorielles
redundant_removed = sum(1 for v in redundant_categorical if v in df_original.columns and v not in df_preprocessed.columns)
redundant_expected = sum(1 for v in redundant_categorical if v in df_original.columns)
validations.append(('Redondances cat√©gorielles supprim√©es', redundant_removed, redundant_expected))

# Check regroupements ISCO
isco_grouped = sum(1 for v in isco_vars if f"{v}_grouped" in df_preprocessed.columns)
isco_expected = sum(1 for v in isco_vars if v in df_original.columns)
validations.append(('Regroupements ISCO cr√©√©s', isco_grouped, isco_expected))

print("\n")
for name, actual, expected in validations:
    status = "‚úÖ" if actual == expected else "‚ö†Ô∏è"
    print(f"{status} {name}: {actual}/{expected}")

print("\n" + "="*80)


R√âSUM√â DES VALIDATIONS


‚úÖ Variables ordinales supprim√©es: 5/5
‚úÖ M√©tadonn√©es supprim√©es: 10/10
‚úÖ Redondances cat√©gorielles supprim√©es: 7/7
‚úÖ Regroupements ISCO cr√©√©s: 2/2

