In [None]:
import numpy as np
import pandas as pd

In [None]:
from cleanup_functions import clean_numeros
from cleanup_functions import classify_id

In [None]:
# Directory prefix for every input CSV. File names are appended to this string
# by plain concatenation, so it MUST end with a path separator.
path_to_data = 'data/'

## Read Dataset

In [None]:
# Load the four semicolon-separated source tables.
# NOTE: file names are appended to `path_to_data` verbatim, so that prefix must
# end with a path separator.

# Beneficiary identifiers (SIRET/SIREN) are digit strings that may start with "0".
# Reading these columns as text stops pandas from inferring a numeric dtype,
# which would drop leading zeros and introduce float artefacts ("E+13", ".0").
subventions_normalized_data = pd.read_csv(
    path_to_data+'subventions_normalized_data.csv',
    sep=';',
    dtype={'idbeneficiaire': str, 'nombeneficiaire': str},
    on_bad_lines='skip',  # malformed rows are dropped silently, so the row count may be lower than the file's
)
selected_communities_data = pd.read_csv(path_to_data+'selected_communities_data.csv', sep=';')
marches_publics_normalized_data = pd.read_csv(path_to_data+'marches_publics_normalized_data.csv', sep=';')
all_communities_data = pd.read_csv(path_to_data+'all_communities_data.csv', sep=';')

## Used Functions

Cleaning Number Formats in idbeneficiaire and nombeneficiaire

The function clean_numeros standardizes number formats in the idbeneficiaire and nombeneficiaire columns by handling common formatting issues.
Processing Rules:

    Preserves missing values (NaN, empty strings, or "non renseigné", "none", etc.).
    Fixes scientific notation with commas (e.g., '8,19672E+13' → '81967200000000'). Caveat: scientific notation is lossy — truncated digits and any leading zeros cannot be recovered, so the resulting identifier may be incorrect.
    Removes spaces within numbers (e.g., '086 257 568 00034' → '08625756800034').
    Removes decimal values with commas (e.g., '09869826600028,00' → '09869826600028').
    Removes .0 from whole numbers (e.g., '03986982660002.0' → '03986982660002').

In [None]:
# Normalise the number formatting of both columns that may hold a beneficiary
# identifier; `clean_numeros` is applied value by value and the column is
# overwritten with the cleaned result.
for id_column in ('idbeneficiaire', 'nombeneficiaire'):
    subventions_normalized_data[id_column] = subventions_normalized_data[id_column].apply(clean_numeros)

Cleaning Beneficiary ID Columns (idbeneficiaire and nombeneficiaire)

The function classify_id processes the idbeneficiaire and nombeneficiaire columns to identify whether they contain a SIRET (14 digits) or a SIREN (9 digits). The outcome for each row is encoded in the switch column:
Switch Column Values Explanation:

    0 → No changes applied.
    1 → idbeneficiaire is a SIRET, stored in siret_bene.
    2 → idbeneficiaire is a SIREN, stored in siren_bene.
    3 → nombeneficiaire is a SIRET, moved to siret_bene, and idbeneficiaire is placed in nombeneficiaire_new.
    4 → nombeneficiaire is a SIREN, moved to siren_bene, and idbeneficiaire is placed in nombeneficiaire_new.



In [None]:
# Run `classify_id` on every row (id + name pair) and expand its result into
# one column per returned element.
# NOTE(review): assumes classify_id returns exactly four values, in the order
# of the target columns below — confirm against cleanup_functions.
classified_rows = subventions_normalized_data.apply(
    lambda record: pd.Series(
        classify_id(record['idbeneficiaire'], record['nombeneficiaire'])
    ),
    axis=1,
)
subventions_normalized_data[
    ['siret_bene', 'siren_bene', 'nombeneficiaire_new', 'switch']
] = classified_rows