# Data cleaning for 'enfants' dataset

This notebook reads the raw CSV exported from a form, cleans column names and values,
normalizes text (removes accents), and maps inconsistent entries to canonical values.
The comments below explain each step in plain English.


In [None]:
# Import required libraries:
# - pandas: data manipulation and I/O
# - unidecode: remove accents and normalize unicode characters to ASCII
import pandas as pd
import unidecode
import re

# Read the raw CSV file.
# Contact_Parent is forced to text: phone numbers are identifiers, not quantities.
# Left to type inference, a digits-only column would be parsed as a number, which
# drops leading zeros and turns values into e.g. "693547701.0" once cast to str.
df = pd.read_csv(r"../data/enfants_dirty.csv", dtype={"Contact_Parent": str})

df.head()


Unnamed: 0,start,end,Nom,Prenom,Sexe,Age,Niveau,Taille T-shirts,Eglise,Contact_Parent,...,_id,_uuid,_submission_time,_validation_status,_notes,_status,_submitted_by,__version__,_tags,_index
0,2025-11-28T10:02:00.000-00:00,2025-11-28T10:03:51.000-00:00,Koffi,Hermann,Garçon,10,"""""",L,Miséricorde,+225 971065484,...,617844653,c539d0cd-3e9b-4376-affd-528f42cff7a7,2025-11-28 10:02:01,"""""","""""",submitted_via_web,"""""",fMAmF1YjdSML2Pnh8NNrfv,"""""",1
1,2025-11-28T10:06:00.000-00:00,2025-11-28T10:07:53.000-00:00,GBAGBO,jean,Garçon,8,2,M,Beugre village,693547701,...,617480667,dfe99653-7d12-40a4-80bc-10a817044c8a,2025-11-28 10:06:01,"""""","""""",submitted_via_web,"""""",cg9JcblqkBd0fdC8FGJRae,"""""",2
2,2025-11-28T10:06:00.000-00:00,2025-11-28T10:07:22.000-00:00,diabate,mamadou,Garçon,5,1,S,Beugre village,+225 494415863,...,617341110,7733fb56-5c18-4116-a9fb-3fd05fb7becc,2025-11-28 10:06:01,"""""","""""",submitted_via_web,"""""",OxSKh3NLYb3dkkGraWLRmD,"""""",3
3,2025-11-28T10:14:00.000-00:00,2025-11-28T10:15:32.000-00:00,Bakayoko,franck,Garçon,14,4,XL,akouedo,0428142650,...,617331142,3d152038-798a-4ea5-8427-6fe88c9d3137,2025-11-28 10:14:01,"""""","""""",submitted_via_web,"""""",uaMvDKQU3I8uOghRQC7UPF,"""""",4
4,2025-11-28T10:16:00.000-00:00,2025-11-28T10:18:26.000-00:00,silue,akissi,Fille,9,"""""",L,MISÉRICORDE,0669631298,...,617373489,2ee1d263-48f8-4fbd-8dbc-d0950475b228,2025-11-28 10:16:01,"""""","""""",submitted_via_web,"""""",5gBnuYSNMjUA9m9j1MoPLn,"""""",5


## Remove unwanted metadata columns and rename columns

The original form export includes many technical columns (start, end, \_id, etc.).
We drop these to keep only the relevant survey fields and then rename the columns.


In [None]:
# Technical columns added by the form export; they carry no survey answers.
colonnes_techniques = [
    "start",
    "end",
    "_id",
    "_uuid",
    "_submission_time",
    "_validation_status",
    "_notes",
    "_status",
    "_submitted_by",
    "__version__",
    "_tags",
    "_index",
]

# Define new column names (header in the export -> name used in this notebook)
nouveaux_noms = {
    "Nom": "nom",
    "Prenom": "prenoms",
    "Sexe": "gender",
    "Age": "age",
    "Niveau": "niveau",
    "Taille T-shirts": "taille_tshirt",
    "Eglise": "eglise",
    "Contact_Parent": "contact_parent",
    "Allergies_Maladies": "allergie_maladie",
}

# errors="ignore" keeps this cell re-runnable: on a second run the technical
# columns are already gone, and without it drop() would raise KeyError.
# rename() already ignores keys that are not present.
df = df.drop(columns=colonnes_techniques, errors="ignore").rename(
    columns=nouveaux_noms
)

df.head()


Unnamed: 0,nom,prenoms,gender,age,niveau,taille_tshirt,eglise,contact_parent,allergie_maladie
0,Koffi,Hermann,Garçon,10,"""""",L,Miséricorde,+225 971065484,Aucune
1,GBAGBO,jean,Garçon,8,2,M,Beugre village,693547701,Asthme
2,diabate,mamadou,Garçon,5,1,S,Beugre village,+225 494415863,aucune
3,Bakayoko,franck,Garçon,14,4,XL,akouedo,0428142650,aucune
4,silue,akissi,Fille,9,"""""",L,MISÉRICORDE,0669631298,Aucune


## Inspect unique values for problematic columns

Look at messy columns before cleaning.


In [None]:
# Columns whose raw values are inspected before cleaning, in display order.
colonnes_a_inspecter = [
    "eglise",
    "allergie_maladie",
    "taille_tshirt",
    "niveau",
    "gender",
]

for position, colonne in enumerate(colonnes_a_inspecter):
    # A separator line goes between two columns, never before the first one.
    if position > 0:
        print("=" * 50)
    print(df[colonne].unique())


['Miséricorde' 'Beugre village' 'Akouedo' 'Bethesda' 'Vridi' 'Marchou'
 'Abobo Gare']
['Aucune' 'Asthme' 'Allergie à la poussière' 'Sinusite'
 'Allergie alimentaire']
['L' 'M' 'S' 'XL']
['3' '2' '1' '4']
['Garçon' 'Fille']


## Normalization functions

Functions to clean the `eglise`, `allergie_maladie` and `niveau` columns. The same cell also recodes `gender` to `M`/`F`.


In [None]:
def normaliser_eglise(x):
    """Map a raw church name to its canonical spelling.

    The value is trimmed, lower-cased and stripped of accents before the
    lookup, so spellings such as "MISÉRICORDE" and "Miséricorde" resolve to
    the same entry.

    Parameters
    ----------
    x : object
        Raw cell value from the ``eglise`` column.

    Returns
    -------
    str or None
        The canonical name when the cleaned value is a known key; otherwise
        the cleaned (lower-case, accent-free) value itself. ``None`` when
        ``x`` is missing.
    """
    # Keep missing values missing: str(NaN) would otherwise be stored as the
    # church name "nan".
    if pd.isna(x):
        return None

    x = str(x).strip().lower()
    x = unidecode.unidecode(x)

    mapping = {
        "misericorde": "Miséricorde",
        "beugre village": "Beugre village",
        "akouedo": "Akouedo",
        "bethesda": "Bethesda",
        "vridi": "Vridi",
        "marchou": "Marchou",
        "abobo gare": "Abobo Gare",
    }

    # Unknown names fall back to their cleaned form so that case/accent
    # variants of the same unknown name still collapse to one value.
    return mapping.get(x, x)


df["eglise"] = df["eglise"].apply(normaliser_eglise)
# dropna(): a missing church is None, which sorted() cannot order against strings.
print(sorted(df["eglise"].dropna().unique()))
print("=" * 50)


def normaliser_allergie(x):
    """Return the canonical label for an allergy / illness answer.

    Missing values give ``None``. Any other value is trimmed, lower-cased and
    stripped of accents, then looked up in a table of known spellings. Empty
    answers, the literal ``""`` and "aucune" all map to "Aucune". A value that
    is not in the table is returned in its cleaned form.
    """
    if pd.isna(x):
        return None

    cle = unidecode.unidecode(str(x).strip().lower())

    formes_canoniques = {
        # "No allergy" answers, including blank and placeholder cells.
        "": "Aucune",
        '""': "Aucune",
        "aucune": "Aucune",
        # Known conditions and their spelling variants.
        "asthme": "Asthme",
        "sinusite": "Sinusite",
        "poussiere": "Allergie à la poussière",
        "allergie a la poussiere": "Allergie à la poussière",
        "allergie poussiere": "Allergie à la poussière",
        "allergie alimentaire": "Allergie alimentaire",
    }
    return formes_canoniques.get(cle, cle)


df["allergie_maladie"] = df["allergie_maladie"].apply(normaliser_allergie)
# normaliser_allergie returns None for a missing answer; drop those before
# sorting, because sorted() raises TypeError when comparing None with a string.
print(sorted(df["allergie_maladie"].dropna().unique()))
print("=" * 50)


def calculer_niveau(age):
    """Derive the class level from a child's age.

    Parameters
    ----------
    age : object
        Age in years; anything accepted by ``int()`` (int, float, numeric text).

    Returns
    -------
    str or None
        "1" for ages 4-7, "2" for 8-9, "3" for 10-12, "4" for 13-19.
        ``None`` when the age cannot be converted to an integer or falls
        outside every band.
    """
    try:
        age = int(age)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None; ValueError: non-numeric text or NaN;
        # OverflowError: infinity. Anything else is a real error and must
        # surface instead of being swallowed by a bare except.
        return None

    # (lowest age, highest age, level), bounds inclusive.
    tranches = (
        (4, 7, "1"),
        (8, 9, "2"),
        (10, 12, "3"),
        (13, 19, "4"),
    )
    for age_min, age_max, niveau in tranches:
        if age_min <= age <= age_max:
            return niveau
    return None


def nettoyer_niveau(x):
    """Return the level as trimmed text, or ``None`` when it is not filled in.

    A level counts as not filled in when the cell is missing (NaN, None,
    pd.NA) or when its text is empty, the literal ``""`` or "nan".
    """
    # Test for missing values before the str() cast: str(None) is "None" and
    # str(pd.NA) is "<NA>", neither of which the text check below recognises.
    if pd.isna(x):
        return None
    x = str(x).strip()
    if x in ["", '""', "nan"]:
        return None
    return x


# Pass 1: turn blank / placeholder levels into missing values.
df["niveau"] = df["niveau"].map(nettoyer_niveau)

# Pass 2: keep a level that was filled in, otherwise derive it from the age.
df["niveau"] = [
    niveau if pd.notna(niveau) else calculer_niveau(age)
    for niveau, age in zip(df["niveau"], df["age"])
]

print(df["niveau"].unique())


# Recode the gender labels to one-letter codes.
gender_map = {"Garçon": "M", "Fille": "F"}
# map() alone replaces every label absent from gender_map by NaN, so running
# this cell a second time (values already "M"/"F") would wipe the column.
# Falling back to the current value keeps the cell re-runnable and leaves any
# unexpected label visible in the output below instead of erasing it.
df["gender"] = df["gender"].map(gender_map).fillna(df["gender"])
print(df["gender"].unique())


['Abobo Gare', 'Akouedo', 'Bethesda', 'Beugre village', 'Marchou', 'Miséricorde', 'Vridi']
['Allergie alimentaire', 'Allergie à la poussière', 'Asthme', 'Aucune', 'Sinusite']
['3' '2' '1' '4']


## Standardize personal names


In [None]:
def normaliser_nom(x):
    """Return a name trimmed, in title case and without accents.

    Missing values give ``None``; casting them with str() would otherwise
    store the text "Nan" as a person's name.
    """
    if pd.isna(x):
        return None
    return unidecode.unidecode(str(x).strip().title())


# The same rule applies to family names and given names.
df["nom"] = df["nom"].apply(normaliser_nom)
df["prenoms"] = df["prenoms"].apply(normaliser_nom)

df.head()


Unnamed: 0,nom,prenoms,gender,age,niveau,taille_tshirt,eglise,contact_parent,allergie_maladie
0,Koffi,Hermann,Garçon,10,3,L,Miséricorde,+225 971065484,Aucune
1,Gbagbo,Jean,Garçon,8,2,M,Beugre village,693547701,Asthme
2,Diabate,Mamadou,Garçon,5,1,S,Beugre village,+225 494415863,Aucune
3,Bakayoko,Franck,Garçon,14,4,XL,Akouedo,0428142650,Aucune
4,Silue,Akissi,Fille,9,2,L,Miséricorde,0669631298,Aucune


## Clean contact information

Trim whitespace from the contact numbers, then apply a normalization function that produces
an international-format phone number (Ivorian country code +225) with consistent spacing.


In [None]:
# Trim surrounding whitespace but leave missing contacts untouched: str(NaN)
# is the text "nan", which pd.isna() can no longer recognise as missing.
df["contact_parent"] = df["contact_parent"].apply(
    lambda x: x if pd.isna(x) else str(x).strip()
)
df.head()


Unnamed: 0,nom,prenoms,gender,age,niveau,taille_tshirt,eglise,contact_parent,allergie_maladie
0,Koffi,Hermann,Garçon,10,3,L,Miséricorde,+225 971065484,Aucune
1,Gbagbo,Jean,Garçon,8,2,M,Beugre village,693547701,Asthme
2,Diabate,Mamadou,Garçon,5,1,S,Beugre village,+225 494415863,Aucune
3,Bakayoko,Franck,Garçon,14,4,XL,Akouedo,0428142650,Aucune
4,Silue,Akissi,Fille,9,2,L,Miséricorde,0669631298,Aucune


### Normalize phone numbers

Define a function that removes non-digit characters and formats the number as
'+225 ########', keeping only its last 8 digits. This handles the common variants:
numbers written with or without the '225' country code, and with or without a leading '0'.


In [None]:
def normaliser_contact(num):
    """Format a phone number as ``"+225 ########"``.

    Every non-digit character is removed and the last 8 digits are kept as
    the local number. Taking the tail is what discards a leading "225"
    country code and/or a leading "0" on longer inputs.

    Parameters
    ----------
    num : object
        Raw contact value (text or number).

    Returns
    -------
    str or None
        ``"+225 "`` followed by at most 8 digits. ``None`` when ``num`` is
        missing or contains no digit at all (for example the text "nan").
        Inputs with fewer than 8 digits are returned with the digits they
        have, so malformed numbers stay recognisable.
    """
    if pd.isna(num):
        return None

    # A whole-number float such as 693547701.0 would contribute its ".0" as an
    # extra digit once cast to text.
    if isinstance(num, float) and num.is_integer():
        num = int(num)

    chiffres = re.sub(r"\D", "", str(num))
    if not chiffres:
        # Nothing to format; returning "+225 225" here would invent a number.
        return None

    # Slice the digits themselves, not a string that was first re-prefixed
    # with "225": otherwise short inputs get country-code digits mixed into
    # the local number.
    return "+225 " + chiffres[-8:]


# Replace each raw contact with its normalised form, then preview the frame.
contacts_normalises = df["contact_parent"].map(normaliser_contact)
df["contact_parent"] = contacts_normalises
df.head()


array(['+225 71065484', '+225 93547701', '+225 94415863', '+225 28142650',
       '+225 69631298', '+225 89759269', '+225 93324747', '+225 09882785',
       '+225 35428115', '+225 93810769', '+225 64675978', '+225 51148237',
       '+225 46837876', '+225 77498719', '+225 75504553', '+225 93702026',
       '+225 93330930', '+225 81978366', '+225 75491188', '+225 99732748',
       '+225 56001662', '+225 60914492', '+225 13644702', '+225 43490038',
       '+225 34882891', '+225 19888096', '+225 46841658', '+225 90428563',
       '+225 68063463', '+225 22472985', '+225 77544555', '+225 76069055',
       '+225 07732719', '+225 49821523', '+225 17701262', '+225 87230704',
       '+225 28279735', '+225 46831037', '+225 58125683', '+225 83458413',
       '+225 41299476', '+225 63430221', '+225 58771435', '+225 74751121',
       '+225 06991404', '+225 21350447', '+225 98667225', '+225 39141715',
       '+225 84307053', '+225 21254032', '+225 72478165', '+225 41586360',
       '+225 12894707', '

## Save cleaned CSV

Write the cleaned dataframe to a new CSV file, `../data/enfants_clean.csv` (relative to this notebook).
The DataFrame index is written as the first column; pass `index=False` to leave it out.


In [None]:
# Export the cleaned frame; index=True writes the row index as the first CSV column.
df.to_csv(path_or_buf="../data/enfants_clean.csv", index=True)
