In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import requests
from time import sleep
import pyarrow.parquet as pq

# Set style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

# Set path to datasets
DATA_DIR = Path('../src/ose_core/data_ingestion/extracted_datasets')

print("Libraries imported successfully!")


Libraries imported successfully!


# 0. DECIDENTO DATA IMPORT

In [2]:
# Load company basic info
df_basic = pd.read_csv(DATA_DIR / '01_company_basic_info.csv', dtype={'siren': str, 'siret': str})

print(f"Dataset shape: {df_basic.shape}")
print(f"\nColumns: {list(df_basic.columns)}")
display(df_basic.head(10))


Dataset shape: (375, 10)

Columns: ['company_name', 'siren', 'siret', 'departement', 'resume_activite', 'raison_sociale_keyword', 'raison_sociale', 'last_modified', 'processedAt', 'updatedAt']


Unnamed: 0,company_name,siren,siret,departement,resume_activite,raison_sociale_keyword,raison_sociale,last_modified,processedAt,updatedAt
0,PROCONI,132066,,0,fabrication de plats cuisinés frais pour la gr...,PROCONI,PROCONI,2023-04-07T20:37:50,1763018431,1762042439
1,PAIN D'EPICES MULOT ET PETITJEAN,15751530,1575153000013.0,21,Spécialisée en pains d'épices et pâtisseries a...,PAIN D'EPICES MULOT ET PETITJEAN,PAIN D'EPICES MULOT ET PETITJEAN,2025-05-05T07:44:39,1761068226,1763052487
2,JULIEN MACK,16450298,1645029800043.0,92,Préparation industrielle de produits à base de...,JULIEN MACK,JULIEN MACK,2025-06-27T08:59:06,1761097489,1763052488
3,OROC BAT,46580031,4658003100022.0,64,"fabrication de conserves, charcuterie et jambo...",OROC BAT,OROC BAT,2025-05-05T04:38:43,1763401032,1763052541
4,MINOTERIE DU TRIEVES CORREARD ET FILS,57504649,5750464900013.0,38,fabrication de farines,MINOTERIE DU TRIEVES CORREARD ET FILS,MINOTERIE DU TRIEVES CORREARD ET FILS,2025-06-27T15:52:42,1763438582,1763943214
5,MAISON CANTINI FLANDIN,57821266,5782126600095.0,13,Fabrication transformation achat et vente de t...,MAISON CANTINI FLANDIN,MAISON CANTINI FLANDIN,2025-05-05T05:54:55,1761105532,1763052557
6,AUVRAY-AUXY EN GATINAIS,86781150,8678115000015.0,45,fabricant de volailles et produits transformés...,AUVRAY-AUXY EN GATINAIS,AUVRAY-AUXY EN GATINAIS,2025-05-27T08:34:13,1761118573,1763052587
7,BIRABEN,96780838,9678083800015.0,64,fabrication de plats préparés,BIRABEN,BIRABEN,2025-05-05T08:04:16,1762592602,1763052593
8,LAITERIE DE MAYOTTE,99378564,9937856400019.0,976,fabrication de lait liquide et de produits lai...,LAITERIE DE MAYOTTE,LAITERIE DE MAYOTTE,2025-09-25T02:24:42,1760928797,1763052595
9,SALAISONS CHAMBADE,300700119,30070011900039.0,71,"fabricant de produits à base de viande, charcu...",SALAISONS CHAMBADE,SALAISONS CHAMBADE,2025-05-20T18:28:55,1760538135,1763052889


In [3]:
# Summary statistics
print(f"\nSummary:")
print(f"- Total companies: {len(df_basic)}")
print(f"- Companies with SIRET: {df_basic['siret'].notna().sum()} ({df_basic['siret'].notna().sum()/len(df_basic)*100:.1f}%)")
print(f"- Companies with SIREN: {df_basic['siren'].notna().sum()} ({df_basic['siren'].notna().sum()/len(df_basic)*100:.1f}%)")

print(f"- Unique departments: {df_basic['departement'].nunique()}")
print(f"- Companies with activity description: {df_basic['resume_activite'].notna().sum()}")


Summary:
- Total companies: 375
- Companies with SIRET: 351 (93.6%)
- Companies with SIREN: 375 (100.0%)
- Unique departments: 87
- Companies with activity description: 374


In [4]:
# Load financial data
df_financial = pd.read_csv(DATA_DIR / '02_financial_data.csv', dtype={'siren': str, 'siret': str})
print(f" Original Shape:{df_financial.shape}")
df_financial.head(10)

 Original Shape:(375, 112)


Unnamed: 0,company_name,siren,siret,caConsolide,caGroupe,resultatExploitation,dateConsolide,kpi_2025_capital_social,kpi_2025_evolution_ca,kpi_2023_ca_france,...,kpi_2017_ca_consolide,kpi_2016_ca_consolide,kpi_2016_resultat_net_consolide,kpi_2023_ca_consolide,kpi_2023_resultat_net_consolide,kpi_2022_ca_consolide,kpi_2022_resultat_net_consolide,kpi_2021_ca_consolide,kpi_2021_resultat_net_consolide,kpi_2017_resultat_net_consolide
0,PROCONI,132066,,0.0,0.0,0.0,0.0,,,,...,,,,,,,,,,
1,PAIN D'EPICES MULOT ET PETITJEAN,15751530,1575153000013.0,0.0,0.0,76546.0,0.0,120000.0,1.0772,6729652.0,...,,,,,,,,,,
2,JULIEN MACK,16450298,1645029800043.0,0.0,0.0,670860.0,0.0,257600.0,,,...,,,,,,,,,,
3,OROC BAT,46580031,4658003100022.0,0.0,0.0,140333.0,0.0,350000.0,,5028967.0,...,,,,,,,,,,
4,MINOTERIE DU TRIEVES CORREARD ET FILS,57504649,5750464900013.0,0.0,0.0,473736.0,0.0,3000000.0,,,...,,,,,,,,,,
5,MAISON CANTINI FLANDIN,57821266,5782126600095.0,0.0,0.0,52265.0,0.0,,,,...,,,,,,,,,,
6,AUVRAY-AUXY EN GATINAIS,86781150,8678115000015.0,0.0,0.0,0.0,0.0,,,,...,,,,,,,,,,
7,BIRABEN,96780838,9678083800015.0,0.0,0.0,-64962.0,0.0,,,,...,,,,,,,,,,
8,LAITERIE DE MAYOTTE,99378564,9937856400019.0,,,1588313.0,,,,,...,,,,,,,,,,
9,SALAISONS CHAMBADE,300700119,30070011900039.0,0.0,0.0,0.0,0.0,,,,...,,,,,,,,,,


In [5]:
# Filter out zeros and negative values for better visualization
ca_consolide = df_financial['caConsolide'].replace(0, np.nan).dropna()
ca_groupe = df_financial['caGroupe'].replace(0, np.nan).dropna()

In [6]:
# Load workforce data
df_workforce = pd.read_csv(DATA_DIR / '03_workforce_data.csv', dtype={'siren': str, 'siret': str})
df_workforce.head(20)

Unnamed: 0,company_name,siren,siret,effectif,effectifConsolide,effectifEstime,effectifGroupe
0,PROCONI,132066,,0.0,0.0,75.0,0.0
1,PAIN D'EPICES MULOT ET PETITJEAN,15751530,1575153000013.0,35.0,0.0,0.0,0.0
2,JULIEN MACK,16450298,1645029800043.0,35.0,0.0,0.0,0.0
3,OROC BAT,46580031,4658003100022.0,21.0,0.0,0.0,0.0
4,MINOTERIE DU TRIEVES CORREARD ET FILS,57504649,5750464900013.0,23.0,0.0,30.0,0.0
5,MAISON CANTINI FLANDIN,57821266,5782126600095.0,35.0,0.0,0.0,0.0
6,AUVRAY-AUXY EN GATINAIS,86781150,8678115000015.0,35.0,0.0,0.0,0.0
7,BIRABEN,96780838,9678083800015.0,35.0,0.0,40.0,0.0
8,LAITERIE DE MAYOTTE,99378564,9937856400019.0,46.0,,,
9,SALAISONS CHAMBADE,300700119,30070011900039.0,35.0,0.0,0.0,0.0


In [7]:
# Load workforce data
df_workforce = pd.read_csv(DATA_DIR / '03_workforce_data.csv', dtype={'siren': str, 'siret': str})
df_workforce.head(20)

Unnamed: 0,company_name,siren,siret,effectif,effectifConsolide,effectifEstime,effectifGroupe
0,PROCONI,132066,,0.0,0.0,75.0,0.0
1,PAIN D'EPICES MULOT ET PETITJEAN,15751530,1575153000013.0,35.0,0.0,0.0,0.0
2,JULIEN MACK,16450298,1645029800043.0,35.0,0.0,0.0,0.0
3,OROC BAT,46580031,4658003100022.0,21.0,0.0,0.0,0.0
4,MINOTERIE DU TRIEVES CORREARD ET FILS,57504649,5750464900013.0,23.0,0.0,30.0,0.0
5,MAISON CANTINI FLANDIN,57821266,5782126600095.0,35.0,0.0,0.0,0.0
6,AUVRAY-AUXY EN GATINAIS,86781150,8678115000015.0,35.0,0.0,0.0,0.0
7,BIRABEN,96780838,9678083800015.0,35.0,0.0,40.0,0.0
8,LAITERIE DE MAYOTTE,99378564,9937856400019.0,46.0,,,
9,SALAISONS CHAMBADE,300700119,30070011900039.0,35.0,0.0,0.0,0.0


In [8]:
# Load classification flags
df_flags = pd.read_csv(DATA_DIR / '05_classification_flags.csv', dtype={'siren': str, 'siret': str})

print(f"Dataset shape: {df_flags.shape}")
display(df_flags.head(10))


Dataset shape: (375, 15)


Unnamed: 0,company_name,siren,siret,startup,radiee,entreprise_b2b,entreprise_b2c,fintech,cac40,entreprise_familiale,entreprise_familiale_ter,filtre_levee_fond,flag_type_entreprise,hasMarques,hasESV1Contacts
0,PROCONI,132066,,False,False,False,False,False,False,False,undefined,False,['secteur_industriel'],False,False
1,PAIN D'EPICES MULOT ET PETITJEAN,15751530,1575153000013.0,False,False,True,False,False,False,True,true,False,"['entreprise_familiale', 'entreprise_b2b', 'fi...",True,True
2,JULIEN MACK,16450298,1645029800043.0,False,False,True,False,False,False,False,undefined,False,"['entreprise_b2b', 'activite_rd', 'secteur_ind...",True,False
3,OROC BAT,46580031,4658003100022.0,False,False,True,False,False,False,False,undefined,False,"['entreprise_b2b', 'site_ecommerce', 'activite...",True,False
4,MINOTERIE DU TRIEVES CORREARD ET FILS,57504649,5750464900013.0,False,False,True,False,False,False,False,undefined,False,"['entreprise_b2b', 'activite_rd', 'secteur_ind...",False,False
5,MAISON CANTINI FLANDIN,57821266,5782126600095.0,False,False,True,False,False,False,False,undefined,False,"['entreprise_b2b', 'site_ecommerce', 'secteur_...",False,False
6,AUVRAY-AUXY EN GATINAIS,86781150,8678115000015.0,False,False,True,False,False,False,False,false,False,"['entreprise_b2b', 'site_ecommerce', 'activite...",False,False
7,BIRABEN,96780838,9678083800015.0,False,False,True,False,False,False,True,true,False,"['entreprise_familiale', 'entreprise_b2b', 'si...",False,False
8,LAITERIE DE MAYOTTE,99378564,9937856400019.0,False,False,True,False,False,False,False,undefined,False,"['entreprise_b2b', 'activite_rd', 'secteur_ind...",False,True
9,SALAISONS CHAMBADE,300700119,30070011900039.0,False,False,True,False,False,False,False,undefined,False,"['entreprise_b2b', 'secteur_industriel', 'cara...",False,False


In [9]:
# Load classification flags
df_flags = pd.read_csv(DATA_DIR / '05_classification_flags.csv', dtype={'siren': str, 'siret': str})

print(f"Dataset shape: {df_flags.shape}")
display(df_flags.head(10))


Dataset shape: (375, 15)


Unnamed: 0,company_name,siren,siret,startup,radiee,entreprise_b2b,entreprise_b2c,fintech,cac40,entreprise_familiale,entreprise_familiale_ter,filtre_levee_fond,flag_type_entreprise,hasMarques,hasESV1Contacts
0,PROCONI,132066,,False,False,False,False,False,False,False,undefined,False,['secteur_industriel'],False,False
1,PAIN D'EPICES MULOT ET PETITJEAN,15751530,1575153000013.0,False,False,True,False,False,False,True,true,False,"['entreprise_familiale', 'entreprise_b2b', 'fi...",True,True
2,JULIEN MACK,16450298,1645029800043.0,False,False,True,False,False,False,False,undefined,False,"['entreprise_b2b', 'activite_rd', 'secteur_ind...",True,False
3,OROC BAT,46580031,4658003100022.0,False,False,True,False,False,False,False,undefined,False,"['entreprise_b2b', 'site_ecommerce', 'activite...",True,False
4,MINOTERIE DU TRIEVES CORREARD ET FILS,57504649,5750464900013.0,False,False,True,False,False,False,False,undefined,False,"['entreprise_b2b', 'activite_rd', 'secteur_ind...",False,False
5,MAISON CANTINI FLANDIN,57821266,5782126600095.0,False,False,True,False,False,False,False,undefined,False,"['entreprise_b2b', 'site_ecommerce', 'secteur_...",False,False
6,AUVRAY-AUXY EN GATINAIS,86781150,8678115000015.0,False,False,True,False,False,False,False,false,False,"['entreprise_b2b', 'site_ecommerce', 'activite...",False,False
7,BIRABEN,96780838,9678083800015.0,False,False,True,False,False,False,True,true,False,"['entreprise_familiale', 'entreprise_b2b', 'si...",False,False
8,LAITERIE DE MAYOTTE,99378564,9937856400019.0,False,False,True,False,False,False,False,undefined,False,"['entreprise_b2b', 'activite_rd', 'secteur_ind...",False,True
9,SALAISONS CHAMBADE,300700119,30070011900039.0,False,False,True,False,False,False,False,undefined,False,"['entreprise_b2b', 'secteur_industriel', 'cara...",False,False


In [10]:
# Load KPI data
df_kpi = pd.read_csv(DATA_DIR / '07_kpi_data.csv', dtype={'siren': str, 'siret': str})

print(f"Dataset shape: {df_kpi.shape}")
print(f"\nColumns: {list(df_kpi.columns)}")
display(df_kpi.head(10))

Dataset shape: (3779, 28)

Columns: ['company_name', 'siren', 'siret', 'year', 'fonds_propres', 'ca_france', 'date_cloture_exercice', 'duree_exercice', 'salaires_traitements', 'charges_financieres', 'impots_taxes', 'ca_bilan', 'resultat_exploitation', 'dotations_amortissements', 'capital_social', 'code_confidentialite', 'resultat_bilan', 'annee', 'effectif', 'effectif_sous_traitance', 'filiales_participations', 'evolution_ca', 'subventions_investissements', 'ca_export', 'evolution_effectif', 'participation_bilan', 'ca_consolide', 'resultat_net_consolide']


Unnamed: 0,company_name,siren,siret,year,fonds_propres,ca_france,date_cloture_exercice,duree_exercice,salaires_traitements,charges_financieres,...,effectif,effectif_sous_traitance,filiales_participations,evolution_ca,subventions_investissements,ca_export,evolution_effectif,participation_bilan,ca_consolide,resultat_net_consolide
0,PAIN D'EPICES MULOT ET PETITJEAN,15751530,1575153000013,2023,2192166.0,6729652.0,2023-01-31,12.0,1394492.0,80993.0,...,,,,,,,,,,
1,PAIN D'EPICES MULOT ET PETITJEAN,15751530,1575153000013,2022,1614077.0,6247357.0,2022-01-31,12.0,1327711.0,81469.0,...,35.0,,,,,,,,,
2,PAIN D'EPICES MULOT ET PETITJEAN,15751530,1575153000013,2021,1497114.0,5296275.0,2021-01-31,12.0,1318083.0,66111.0,...,34.0,,,,,,,,,
3,PAIN D'EPICES MULOT ET PETITJEAN,15751530,1575153000013,2020,1577275.0,5710890.0,2020-01-31,12.0,1380952.0,70953.0,...,45.0,18930.0,1.0,,,,,,,
4,PAIN D'EPICES MULOT ET PETITJEAN,15751530,1575153000013,2019,1348804.0,5221375.0,2019-01-31,12.0,1230571.0,88389.0,...,43.0,15835.0,1.0,,,,,,,
5,PAIN D'EPICES MULOT ET PETITJEAN,15751530,1575153000013,2018,1492199.0,,2018-01-31,12.0,1372333.0,385712.0,...,44.0,,,,,,,,,
6,PAIN D'EPICES MULOT ET PETITJEAN,15751530,1575153000013,2017,1419433.0,5630041.0,2017-01-31,12.0,1394179.0,47878.0,...,44.0,,,,,,,,,
7,PAIN D'EPICES MULOT ET PETITJEAN,15751530,1575153000013,2016,,,2016-01-31,12.0,,,...,,,,,,,,,,
8,PAIN D'EPICES MULOT ET PETITJEAN,15751530,1575153000013,2015,,,2015-01-31,12.0,,,...,,,,,,,,,,
9,PAIN D'EPICES MULOT ET PETITJEAN,15751530,1575153000013,2025,,,,,,,...,,,,1.0772,,,,,,


In [11]:
# Summary
print(f"\nKPI Summary:")
print(f"Total records: {len(df_kpi)}")
print(f"Unique companies: {df_kpi['siren'].nunique()}")

# print(f"Years covered: {sorted(df_kpi['year'].unique())}")

print(f"\nRecords per year:")
print(df_kpi['year'].value_counts().sort_index())


KPI Summary:
Total records: 3779
Unique companies: 374

Records per year:
year
2010     15
2011     48
2012     58
2013    210
2014    242
2015    306
2016    291
2017    355
2018    282
2019    300
2020    330
2021    314
2022    342
2023    264
2024    278
2025    144
Name: count, dtype: int64


In [12]:
# Load signals data
df_signals = pd.read_csv(DATA_DIR / '08_signals.csv', dtype={'siren': str, 'siret': str})

print(f"Dataset shape: {df_signals.shape}")
print(f"\nColumns: {list(df_signals.columns)}")
display(df_signals.head(10))

Dataset shape: (2133, 12)

Columns: ['company_name', 'siren', 'siret', 'continent', 'country', 'departement', 'publishedAt', 'isMain', 'type', 'createdAt', 'companies_count', 'sirets_count']


Unnamed: 0,company_name,siren,siret,continent,country,departement,publishedAt,isMain,type,createdAt,companies_count,sirets_count
0,PAIN D'EPICES MULOT ET PETITJEAN,15751530,1575153000013,"[{'id': 6, 'label': 'Europe'}]","[{'id': 72, 'label': 'France'}]","[{'parent': 'Bourgogne-Franche-Comté', 'id': 2...",2021-09-30T00:00:00+02:00,True,"{'code': 'K1', 'id': 32, 'label': 'Investissem...",2020-09-07T15:14:38+02:00,1,1
1,PAIN D'EPICES MULOT ET PETITJEAN,15751530,1575153000013,"[{'id': 6, 'label': 'Europe'}]","[{'id': 72, 'label': 'France'}]","[{'parent': 'Bourgogne-Franche-Comté', 'id': 2...",2020-09-08T00:00:00+02:00,True,"{'code': 'L', 'id': 12, 'label': 'Levée de fon...",2020-09-07T15:14:12+02:00,1,1
2,PAIN D'EPICES MULOT ET PETITJEAN,15751530,1575153000013,,,,2016-09-21T00:00:00+02:00,True,"{'code': 'F', 'id': 6, 'label': ""Développement...",2016-09-20T10:45:13+02:00,1,1
3,PAIN D'EPICES MULOT ET PETITJEAN,15751530,1575153000013,"[{'id': 1, 'label': 'Asie'}]",,,2018-04-06T00:00:00+02:00,True,"{'code': 'F', 'id': 6, 'label': ""Développement...",2018-04-05T11:16:18+02:00,1,1
4,PAIN D'EPICES MULOT ET PETITJEAN,15751530,1575153000013,"[{'id': 1, 'label': 'Asie'}]",,,2018-04-06T00:00:00+02:00,True,"{'code': 'E', 'id': 5, 'label': 'Créations & o...",2018-04-05T11:15:32+02:00,1,1
5,PAIN D'EPICES MULOT ET PETITJEAN,15751530,1575153000013,"[{'id': 1, 'label': 'Asie'}]",,,2018-04-06T00:00:00+02:00,True,"{'code': 'H', 'id': 8, 'label': 'Activité inte...",2018-04-05T11:14:31+02:00,1,1
6,PAIN D'EPICES MULOT ET PETITJEAN,15751530,1575153000013,,,,2014-05-20T00:00:00+02:00,True,"{'code': 'X', 'id': 25, 'label': 'Actualité en...",2014-05-16T10:10:24+02:00,1,1
7,PAIN D'EPICES MULOT ET PETITJEAN,15751530,1575153000013,,,,2016-04-01T00:00:00+02:00,True,"{'code': 'U', 'id': 21, 'label': 'Nomination'}",2016-03-31T12:48:27+02:00,1,1
8,PAIN D'EPICES MULOT ET PETITJEAN,15751530,1575153000013,,"[{'id': 72, 'label': 'France'}]","[{'parent': 'Bourgogne-Franche-Comté', 'id': 2...",2016-09-21T00:00:00+02:00,True,"{'code': 'K1', 'id': 32, 'label': 'Investissem...",2016-09-20T10:45:56+02:00,1,1
9,PAIN D'EPICES MULOT ET PETITJEAN,15751530,1575153000013,,,,2016-04-01T00:00:00+02:00,True,"{'code': 'Hbis', 'id': 24, 'label': 'Activité ...",2016-03-28T19:33:31+02:00,1,1


# 1. master_df building

In [13]:
# ==== CONSTRUCTION DU MASTER PANEL FIRM-YEAR ====
# Identifiants + temps
ID_COLS = ["siren", "siret", "company_name"]
TIME_COL = "year"

# Targets (ce qu'on veut prédire en N+1)
CA_COL       = "ca_bilan"
EFFECTIF_COL = "effectif"
RESULT_COL   = "resultat_bilan"
TARGET_COLS = [CA_COL, EFFECTIF_COL, RESULT_COL]

# Features que à conserver (hors targets et hors data leakage)
FEATURE_COLS = [
    "fonds_propres",
    "salaires_traitements",
    "charges_financieres",
    "impots_taxes",
    "dotations_amortissements",
    "capital_social",
    "effectif_sous_traitance",
    "filiales_participations",
    "subventions_investissements",
    "participation_bilan",
]

# On vérifie que tout existe bien dans df_kpi
all_needed_cols = ID_COLS + [TIME_COL] + FEATURE_COLS + TARGET_COLS
missing = [c for c in all_needed_cols if c not in df_kpi.columns]
if missing:
    print("⚠️ Colonnes manquantes dans df_kpi :", missing)


In [14]:
# ==== CONSTRUCTION DU MASTER PANEL FIRM-YEAR ====
# Identifiants + temps
ID_COLS = ["siren", "siret", "company_name"]
TIME_COL = "year"

# Targets (ce qu'on veut prédire en N+1)
CA_COL       = "ca_bilan"
EFFECTIF_COL = "effectif"
RESULT_COL   = "resultat_bilan"
TARGET_COLS = [CA_COL, EFFECTIF_COL, RESULT_COL]

# Features que à conserver (hors targets et hors data leakage)
FEATURE_COLS = [
    "fonds_propres",
    "salaires_traitements",
    "charges_financieres",
    "impots_taxes",
    "dotations_amortissements",
    "capital_social",
    "effectif_sous_traitance",
    "filiales_participations",
    "subventions_investissements",
    "participation_bilan",
]

# On vérifie que tout existe bien dans df_kpi
all_needed_cols = ID_COLS + [TIME_COL] + FEATURE_COLS + TARGET_COLS
missing = [c for c in all_needed_cols if c not in df_kpi.columns]
if missing:
    print("⚠️ Colonnes manquantes dans df_kpi :", missing)


In [15]:
# On construit le panel firm-year propre
master_df = df_kpi[all_needed_cols].copy()

# Nettoyage minimal
master_df = master_df.dropna(subset=["siren", TIME_COL])
master_df["siren"] = master_df["siren"].astype(str)
master_df[TIME_COL] = master_df[TIME_COL].astype(int)

# Tri par entreprise + année (important pour la suite RNN)
master_df = master_df.sort_values(["siren", TIME_COL]).reset_index(drop=True)

print("master_df shape :", master_df.shape)

master_df shape : (3779, 17)


In [16]:
#UTIL Pourcentage de missing values par colonne
def nan_analyzer(df: pd.DataFrame):
    nan_percent = (
    df.isna()
        .mean() * 100
    )

    nan_percent = nan_percent.sort_values(ascending=False)

    print("📊 Pourcentage de valeurs manquantes par colonne :\n")
    display(nan_percent.to_frame("percent_nan"))

In [17]:
nan_analyzer(master_df)

📊 Pourcentage de valeurs manquantes par colonne :



Unnamed: 0,percent_nan
filiales_participations,99.259063
effectif_sous_traitance,95.263297
participation_bilan,95.236835
subventions_investissements,76.448796
dotations_amortissements,64.990738
charges_financieres,64.726118
salaires_traitements,64.011643
impots_taxes,63.641175
effectif,56.708124
fonds_propres,52.527124


In [18]:
cols_to_drop = [
    "filiales_participations",
    "effectif_sous_traitance",
    "participation_bilan",
    "siret"
]
master_df = master_df.drop(columns=cols_to_drop)

In [19]:
nan_analyzer(master_df)

📊 Pourcentage de valeurs manquantes par colonne :



Unnamed: 0,percent_nan
subventions_investissements,76.448796
dotations_amortissements,64.990738
charges_financieres,64.726118
salaires_traitements,64.011643
impots_taxes,63.641175
effectif,56.708124
fonds_propres,52.527124
capital_social,38.581635
ca_bilan,33.342154
resultat_bilan,29.319926


# 1.1 Call API to get more financial info

In [20]:
master_df["siren"] = master_df["siren"].astype(str).str.zfill(9)

siren_list = sorted(master_df["siren"].unique())
print(f"Nombre de SIREN uniques : {len(siren_list)}")

Nombre de SIREN uniques : 374


In [21]:
# UTIL pour récupérer les ratio par SIREN auprès de DATA.GOUV entre 2013 et 2025
# (possiblement adaptable pour passer les dates en fonction des années où on a vraiment de la data depuis Decidento)
def fetch_ratios_for_siren(siren: str, limit: int = 100, start_year: int =2013, end_year: int=2025) -> list[dict]:
    url = (
        f"https://data.economie.gouv.fr/api/explore/v2.1/catalog/datasets/ratios_inpi_bce/records"
        f"?where=siren%20%3D%20%22{siren}%22"
        f"%20AND%20date_cloture_exercice%3A%5B%22{start_year}-01-01%22%20TO%20%22{end_year}-12-31%22%5D"
        f"&limit={limit}"
    )

    r = requests.get(url, timeout=30)
    r.raise_for_status()
    return r.json().get("results", [])

Import full data from DATA.GOUV in df_ratio

In [22]:
all_rows = []

for i, s in enumerate(siren_list, start=1):
    s = str(s)  # sécurité
    rows = fetch_ratios_for_siren(s)
    all_rows.extend(rows)

    if i % 50 == 0 or i == len(siren_list):
        print(f"{i}/{len(siren_list)} traités — {len(all_rows)} lignes cumulées")

    sleep(0.1)

df_ratios = pd.DataFrame(all_rows)
df_ratios.head()

50/374 traités — 313 lignes cumulées
100/374 traités — 612 lignes cumulées
150/374 traités — 903 lignes cumulées
200/374 traités — 1187 lignes cumulées
250/374 traités — 1474 lignes cumulées
300/374 traités — 1830 lignes cumulées
350/374 traités — 2073 lignes cumulées
374/374 traités — 2194 lignes cumulées


Unnamed: 0,siren,date_cloture_exercice,chiffre_d_affaires,marge_brute,ebe,ebit,resultat_net,taux_d_endettement,ratio_de_liquidite,ratio_de_vetuste,...,caf_sur_ca,capacite_de_remboursement,marge_ebe,resultat_courant_avant_impots_sur_ca,poids_bfr_exploitation_sur_ca_jours,rotation_des_stocks_jours,credit_clients_jours,credit_fournisseurs_jours,type_bilan,confidentiality
0,15751530,2019-01-31,5221375,3881705,273008,-247638,-118356,381.035,337.573,50.386,...,7.767,12.671,5.228,-3.33,106.37,52.474,54.935,45.892,C,Public
1,15751530,2022-01-31,6247357,4543263,629177,65908,182001,357.639,423.713,39.307,...,10.855,8.462,10.012,2.293,110.553,58.304,32.115,51.689,C,Public
2,15751530,2020-01-31,5710890,4241952,581304,93684,253510,304.981,327.835,47.997,...,12.379,6.747,10.092,3.562,98.157,53.384,43.512,40.142,C,Public
3,15751530,2017-01-31,5630041,3993251,222334,32986,81667,291.683,238.398,25.921,...,0.675,107.241,3.886,1.15,111.478,43.771,52.348,62.406,C,Public
4,15751530,2021-01-31,5296275,3747966,307127,-62109,-55121,360.705,344.8,47.803,...,6.525,15.616,5.795,2.389,124.078,62.722,39.657,48.124,C,Public


In [23]:
df_ratios["date_cloture_exercice"] = pd.to_datetime(
    df_ratios["date_cloture_exercice"], errors="coerce"
)

In [24]:
nan_analyzer(df_ratios)

📊 Pourcentage de valeurs manquantes par colonne :



Unnamed: 0,percent_nan
resultat_courant_avant_impots_sur_ca,26.253418
poids_bfr_exploitation_sur_ca_jours,26.253418
rotation_des_stocks_jours,26.253418
marge_ebe,26.253418
caf_sur_ca,26.253418
poids_bfr_exploitation_sur_ca,26.253418
capacite_de_remboursement,26.071103
couverture_des_interets,25.934366
credit_clients_jours,24.111212
credit_fournisseurs_jours,24.020055


In [25]:
df_ratios

Unnamed: 0,siren,date_cloture_exercice,chiffre_d_affaires,marge_brute,ebe,ebit,resultat_net,taux_d_endettement,ratio_de_liquidite,ratio_de_vetuste,...,caf_sur_ca,capacite_de_remboursement,marge_ebe,resultat_courant_avant_impots_sur_ca,poids_bfr_exploitation_sur_ca_jours,rotation_des_stocks_jours,credit_clients_jours,credit_fournisseurs_jours,type_bilan,confidentiality
0,015751530,2019-01-31,5221375,3881705,273008,-247638,-118356,381.035,337.573,50.386,...,7.767,12.671,5.228,-3.330,106.370,52.474,54.935,45.892,C,Public
1,015751530,2022-01-31,6247357,4543263,629177,65908,182001,357.639,423.713,39.307,...,10.855,8.462,10.012,2.293,110.553,58.304,32.115,51.689,C,Public
2,015751530,2020-01-31,5710890,4241952,581304,93684,253510,304.981,327.835,47.997,...,12.379,6.747,10.092,3.562,98.157,53.384,43.512,40.142,C,Public
3,015751530,2017-01-31,5630041,3993251,222334,32986,81667,291.683,238.398,25.921,...,0.675,107.241,3.886,1.150,111.478,43.771,52.348,62.406,C,Public
4,015751530,2021-01-31,5296275,3747966,307127,-62109,-55121,360.705,344.800,47.803,...,6.525,15.616,5.795,2.389,124.078,62.722,39.657,48.124,C,Public
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2189,950451518,2019-12-31,4712576,2878409,316467,216983,161476,41.431,177.317,54.539,...,5.577,1.901,6.700,4.501,99.534,46.260,53.869,80.903,C,Public
2190,950451518,2022-12-31,7069788,4003556,330871,169720,185245,69.748,256.879,58.987,...,4.439,4.466,4.680,2.128,101.909,44.961,49.289,53.481,C,Public
2191,950451518,2017-12-31,3758993,2230558,173548,161700,285533,27.536,199.004,45.928,...,7.799,0.960,4.599,4.208,117.712,51.554,55.049,70.918,C,Public
2192,950451518,2021-12-31,6757132,4026707,622926,501666,383358,88.639,228.420,63.795,...,7.470,2.818,9.219,7.369,83.676,37.089,49.062,55.781,C,Public


In [26]:
set_master = set(master_df["siren"])
set_ratios = set(df_ratios["siren"])

print("SIREN master_df :", len(set_master))
print("SIREN df_ratios :", len(set_ratios))
print("SIREN communs   :", len(set_master & set_ratios))
print("master_df sans ratios :", len(set_master - set_ratios))
print("ratios sans master_df :", len(set_ratios - set_master))

SIREN master_df : 374
SIREN df_ratios : 328
SIREN communs   : 328
master_df sans ratios : 46
ratios sans master_df : 0


# 1.2 Récupération Data INSEE

In [27]:
# 1. Liste de SIREN à garder (depuis ton master_df)
target_sirens = (
    master_df["siren"]
    .astype(str)
    .str.zfill(9)
    .unique()
)
target_sirens_set = set(target_sirens)  # plus rapide pour isin

# 2. Chemin vers le gros fichier Sirene parquet
DATA_PATH = Path("../src/ose_core/data_ingestion/raw data/StockUniteLegale_utf8_INSEE.parquet")
pf = pq.ParquetFile(DATA_PATH)

cols = [
    "siren",
    "trancheEffectifsUniteLegale",
    "anneeEffectifsUniteLegale",
    "categorieEntreprise",
    "dateCreationUniteLegale",
    "activitePrincipaleUniteLegale",
]

chunks = []

# 3. Lecture par batches
for batch in pf.iter_batches(columns=cols, batch_size=100_000):
    df_batch = batch.to_pandas()
    df_batch["siren"] = df_batch["siren"].astype(str).str.zfill(9)

    # on ne garde que les siren qui t'intéressent
    df_batch = df_batch[df_batch["siren"].isin(target_sirens_set)]

    if not df_batch.empty:
        chunks.append(df_batch)

# 4. Concaténer chunks
df_sirene_filtered = pd.concat(chunks, ignore_index=True)

df_sirene_filtered.head(), df_sirene_filtered.shape


(       siren trancheEffectifsUniteLegale  anneeEffectifsUniteLegale  \
 0  015751530                          12                     2023.0   
 1  016450298                          12                     2023.0   
 2  046580031                          12                     2023.0   
 3  057504649                          12                     2023.0   
 4  057821266                          11                     2023.0   
 
   categorieEntreprise dateCreationUniteLegale activitePrincipaleUniteLegale  
 0                 PME              1957-01-01                        10.72Z  
 1                 ETI              1964-01-01                        10.13A  
 2                 PME              1965-01-01                        10.13A  
 3                 PME              1957-01-01                        10.61A  
 4                 PME              1957-01-01                        10.11Z  ,
 (374, 6))

In [28]:
df_sirene_filtered["anneeEffectifsUniteLegale"].value_counts()


anneeEffectifsUniteLegale
2023.0    368
Name: count, dtype: int64

In [29]:
tranche_to_median = {
    "00": 0,
    "01": 1.5,
    "02": 4,
    "03": 7.5,
    "11": 15,
    "12": 35,
    "21": 75,
    "22": 150,
    "31": 225,
    "32": 375,
    "41": 750,
    "42": 1500,
    "51": 3500,
    "52": 7500,
    "53": 12000
}

df_sirene_filtered["effectif_median_sirene"] = (
    df_sirene_filtered["trancheEffectifsUniteLegale"].map(tranche_to_median)
)


# 1.3 Merge of gathered data with master_df => master_clean

In [30]:
master_df.columns

Index(['siren', 'company_name', 'year', 'fonds_propres',
       'salaires_traitements', 'charges_financieres', 'impots_taxes',
       'dotations_amortissements', 'capital_social',
       'subventions_investissements', 'ca_bilan', 'effectif',
       'resultat_bilan'],
      dtype='object')

In [31]:
# 1. Merge Sirene → master_df
df_sirene_keep = df_sirene_filtered[["siren", "effectif_median_sirene"]]

master_df["siren"] = master_df["siren"].astype(str).str.zfill(9)

master_df = master_df.merge(df_sirene_keep, on="siren", how="left")

# 2. Imputer par forward/backward fill les effectifs existants si j'en ai sur une des années
master_df = (
    master_df
    .sort_values(["siren", "year"])
    .groupby("siren")
    .apply(lambda g: g.assign(effectif=g["effectif"].ffill().bfill()))
    .reset_index(drop=True)
)

# 3. Trouver les lignes où effectif sont encore NaN
mask_missing = master_df["effectif"].isna()

# 4. Imputer avec effectif_median_sirene
master_df.loc[mask_missing, "effectif"] = master_df.loc[
    mask_missing, "effectif_median_sirene"
]

# 5. Check
print("Remaining NaN :", master_df["effectif"].isna().sum())
master_df[["siren", "year", "effectif", "effectif_median_sirene"]].sample(8)


Remaining NaN : 11


  .apply(lambda g: g.assign(effectif=g["effectif"].ffill().bfill()))


Unnamed: 0,siren,year,effectif,effectif_median_sirene
2245,423554484,2017,74.0,75.0
2519,442306510,2020,32.0,75.0
1722,388400087,2017,27.0,75.0
285,311974661,2015,34.0,35.0
2702,483282752,2022,34.0,35.0
1743,389210923,2014,74.0,75.0
2572,448969592,2024,22.0,35.0
3606,810225532,2017,34.0,75.0


In [32]:
nan_analyzer(master_df)

📊 Pourcentage de valeurs manquantes par colonne :



Unnamed: 0,percent_nan
subventions_investissements,76.448796
dotations_amortissements,64.990738
charges_financieres,64.726118
salaires_traitements,64.011643
impots_taxes,63.641175
fonds_propres,52.527124
capital_social,38.581635
ca_bilan,33.342154
resultat_bilan,29.319926
effectif_median_sirene,1.667108


In [33]:
df_ratios

Unnamed: 0,siren,date_cloture_exercice,chiffre_d_affaires,marge_brute,ebe,ebit,resultat_net,taux_d_endettement,ratio_de_liquidite,ratio_de_vetuste,...,caf_sur_ca,capacite_de_remboursement,marge_ebe,resultat_courant_avant_impots_sur_ca,poids_bfr_exploitation_sur_ca_jours,rotation_des_stocks_jours,credit_clients_jours,credit_fournisseurs_jours,type_bilan,confidentiality
0,015751530,2019-01-31,5221375,3881705,273008,-247638,-118356,381.035,337.573,50.386,...,7.767,12.671,5.228,-3.330,106.370,52.474,54.935,45.892,C,Public
1,015751530,2022-01-31,6247357,4543263,629177,65908,182001,357.639,423.713,39.307,...,10.855,8.462,10.012,2.293,110.553,58.304,32.115,51.689,C,Public
2,015751530,2020-01-31,5710890,4241952,581304,93684,253510,304.981,327.835,47.997,...,12.379,6.747,10.092,3.562,98.157,53.384,43.512,40.142,C,Public
3,015751530,2017-01-31,5630041,3993251,222334,32986,81667,291.683,238.398,25.921,...,0.675,107.241,3.886,1.150,111.478,43.771,52.348,62.406,C,Public
4,015751530,2021-01-31,5296275,3747966,307127,-62109,-55121,360.705,344.800,47.803,...,6.525,15.616,5.795,2.389,124.078,62.722,39.657,48.124,C,Public
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2189,950451518,2019-12-31,4712576,2878409,316467,216983,161476,41.431,177.317,54.539,...,5.577,1.901,6.700,4.501,99.534,46.260,53.869,80.903,C,Public
2190,950451518,2022-12-31,7069788,4003556,330871,169720,185245,69.748,256.879,58.987,...,4.439,4.466,4.680,2.128,101.909,44.961,49.289,53.481,C,Public
2191,950451518,2017-12-31,3758993,2230558,173548,161700,285533,27.536,199.004,45.928,...,7.799,0.960,4.599,4.208,117.712,51.554,55.049,70.918,C,Public
2192,950451518,2021-12-31,6757132,4026707,622926,501666,383358,88.639,228.420,63.795,...,7.470,2.818,9.219,7.369,83.676,37.089,49.062,55.781,C,Public


In [34]:
# Harmonisation SIREN
master_df["siren"] = master_df["siren"].astype(str).str.zfill(9)
df_ratios["siren"] = df_ratios["siren"].astype(str).str.zfill(9)

# Harmonisation YEAR
df_ratios["date_cloture_exercice"] = pd.to_datetime(
    df_ratios["date_cloture_exercice"], errors="coerce"
)
df_ratios["year"] = df_ratios["date_cloture_exercice"].dt.year


In [35]:
ratio_cols = [c for c in df_ratios.columns if c not in ["siren", "year"]]

master_df["siren"] = master_df["siren"].astype(str).str.zfill(9)
df_ratios["siren"] = df_ratios["siren"].astype(str).str.zfill(9)

master_with_ratios = master_df.merge(
    df_ratios,
    on=["siren", "year"],
    how="left"
)

In [36]:
len(master_df), len(master_with_ratios)

(3779, 3796)

In [37]:
nan_analyzer(master_with_ratios)

📊 Pourcentage de valeurs manquantes par colonne :



Unnamed: 0,percent_nan
subventions_investissements,76.448894
dotations_amortissements,64.778714
charges_financieres,64.488936
salaires_traitements,63.777661
impots_taxes,63.408851
rotation_des_stocks_jours,58.667018
poids_bfr_exploitation_sur_ca_jours,58.667018
caf_sur_ca,58.667018
marge_ebe,58.667018
resultat_courant_avant_impots_sur_ca,58.667018


In [38]:
# combien de lignes par (siren, year) ?
dup_counts = (
    df_ratios
    .groupby(["siren", "year"])
    .size()
    .reset_index(name="n")
)

dup_counts[dup_counts["n"] > 1]

Unnamed: 0,siren,year,n
703,347742686,2022,2
999,388400087,2017,2
1117,400968681,2016,2
1119,400968681,2018,2
1120,400968681,2019,2
1144,402489777,2020,2
1394,433996675,2022,2
1592,498376474,2016,2
1593,498376474,2017,2
1594,498376474,2018,2


In [39]:
dup_mask = master_with_ratios.duplicated(["siren", "year"], keep=False)
master_with_ratios[dup_mask].head()

Unnamed: 0,siren,company_name,year,fonds_propres,salaires_traitements,charges_financieres,impots_taxes,dotations_amortissements,capital_social,subventions_investissements,...,caf_sur_ca,capacite_de_remboursement,marge_ebe,resultat_courant_avant_impots_sur_ca,poids_bfr_exploitation_sur_ca_jours,rotation_des_stocks_jours,credit_clients_jours,credit_fournisseurs_jours,type_bilan,confidentiality
1212,347742686,MADERN PLATS CUISINES,2022,776184.0,789871.0,11012.0,80439.0,64303.0,114336.0,5846.0,...,-4.615,-6.084,-4.154,-4.493,166.042,97.605,54.306,52.443,C,Public
1213,347742686,MADERN PLATS CUISINES,2022,776184.0,789871.0,11012.0,80439.0,64303.0,114336.0,5846.0,...,0.356,178.428,0.88,-0.778,306.321,207.232,85.745,70.248,C,Partiellement confidentiel
1723,388400087,SAS FROMAGERIE LE PIC,2017,686051.0,,,,,99990.0,26504.0,...,,,,,,,,,C,Partiellement confidentiel
1724,388400087,SAS FROMAGERIE LE PIC,2017,686051.0,,,,,99990.0,26504.0,...,,,,,,,,,C,Partiellement confidentiel
1940,400968681,MANDAR,2016,1920617.0,2468485.0,126957.0,44143.0,292647.0,900000.0,,...,101.175,0.182,19.959,2.732,34.647,0.0,0.0,60.996,K,Public


In [40]:
# Priorité pour la colonne confidentiality
priority = {
    "Partiellement confidentiel": 1,
    "Public": 2
}

# Ajouter une colonne “priorité” temporaire
master_with_ratios["conf_priority"] = (
    master_with_ratios["confidentiality"].map(priority)
)

# Trier par siren, year et priorité
master_sorted = (
    master_with_ratios
    .sort_values(["siren", "year", "conf_priority"])
)

# Garder la meilleure ligne (la plus prioritaire) par (siren, year)
master_dedup = (
    master_sorted
    .drop_duplicates(subset=["siren", "year"], keep="first")
)

# Enlever la colonne temporaire
master_dedup = master_dedup.drop(columns=["conf_priority"])


In [41]:
len(master_with_ratios), len(master_dedup)


(3796, 3779)

In [42]:
nan_analyzer(master_dedup)

📊 Pourcentage de valeurs manquantes par colonne :



Unnamed: 0,percent_nan
subventions_investissements,76.448796
dotations_amortissements,64.990738
charges_financieres,64.726118
salaires_traitements,64.011643
impots_taxes,63.641175
rotation_des_stocks_jours,58.904472
poids_bfr_exploitation_sur_ca_jours,58.904472
caf_sur_ca,58.904472
marge_ebe,58.904472
resultat_courant_avant_impots_sur_ca,58.904472


In [46]:
# On rappatrie de l'info qui était dans ratio_df vers les colonnes originelles du master_dedup
mask = master_dedup["ca_bilan"].isna() & master_dedup["chiffre_d_affaires"].notna()
master_dedup.loc[mask, "ca_bilan"] = master_dedup.loc[mask, "chiffre_d_affaires"]
master_dedup["ca_final"] = master_dedup["ca_bilan"]

In [47]:
nan_analyzer(master_dedup)

📊 Pourcentage de valeurs manquantes par colonne :



Unnamed: 0,percent_nan
subventions_investissements,76.448796
dotations_amortissements,64.990738
charges_financieres,64.726118
salaires_traitements,64.011643
impots_taxes,63.641175
rotation_des_stocks_jours,58.904472
poids_bfr_exploitation_sur_ca_jours,58.904472
caf_sur_ca,58.904472
poids_bfr_exploitation_sur_ca,58.904472
resultat_courant_avant_impots_sur_ca,58.904472


In [None]:
mask_res = master_dedup["resultat_bilan"].isna() & master_dedup["resultat_net"].notna()
master_dedup.loc[mask_res, "resultat_bilan"] = master_dedup.loc[mask_res, "resultat_net"]

# mask_res2 = master_dedup["resultat_net"].isna() & master_dedup["resultat_bilan"].notna()
# master_dedup.loc[mask_res2, "resultat_net"] = master_dedup.loc[mask_res2, "resultat_bilan"]

master_dedup["resultat_final"] = master_dedup["resultat_bilan"]

In [52]:
nan_analyzer(master_dedup)

📊 Pourcentage de valeurs manquantes par colonne :



Unnamed: 0,percent_nan
subventions_investissements,76.448796
dotations_amortissements,64.990738
charges_financieres,64.726118
salaires_traitements,64.011643
impots_taxes,63.641175
rotation_des_stocks_jours,58.904472
poids_bfr_exploitation_sur_ca_jours,58.904472
marge_ebe,58.904472
resultat_courant_avant_impots_sur_ca,58.904472
poids_bfr_exploitation_sur_ca,58.904472


In [53]:
# On drop les années antérieures à 2013
master_2013 = master_dedup[master_dedup["year"] >= 2013]

In [71]:
#Util V2
def nan_analyzer(df):
    percent_nan = df.isna().mean() * 100

    result = (
        percent_nan
        .reset_index()
        .rename(columns={"index": "column", 0: "percent_nan"})
        .sort_values(by="percent_nan", ascending=False)
        .reset_index(drop=True)
    )

    return result

In [87]:
analyse_colonnes = nan_analyzer(master_2013)
analyse_colonnes

Unnamed: 0,column,percent_nan
0,subventions_investissements,75.669765
1,dotations_amortissements,63.860033
2,charges_financieres,63.586659
3,salaires_traitements,62.848551
4,impots_taxes,62.465828
5,rotation_des_stocks_jours,57.545107
6,poids_bfr_exploitation_sur_ca_jours,57.545107
7,marge_ebe,57.545107
8,resultat_courant_avant_impots_sur_ca,57.545107
9,poids_bfr_exploitation_sur_ca,57.545107


In [111]:
#on drop les colonnes redondantes et non utiles:
cols_to_drop = ['effectif_median_sirene', 'type_bilan', 'ebit',
                'ca_bilan', 'chiffre_d_affaires', 'resultat_net', 'resultat_bilan', 'date_cloture_exercice', 'confidentiality']

# on fixe un seuil de NaN au dessus duquel on drop aussi:
# Seuil de suppression
threshold = 50

cols_to_drop2 = (
    analyse_colonnes[analyse_colonnes["percent_nan"] > threshold]["column"]
    .tolist()
)

cols_to_drop2

['subventions_investissements',
 'dotations_amortissements',
 'charges_financieres',
 'salaires_traitements',
 'impots_taxes',
 'rotation_des_stocks_jours',
 'poids_bfr_exploitation_sur_ca_jours',
 'marge_ebe',
 'resultat_courant_avant_impots_sur_ca',
 'poids_bfr_exploitation_sur_ca',
 'caf_sur_ca',
 'capacite_de_remboursement',
 'couverture_des_interets',
 'credit_clients_jours',
 'credit_fournisseurs_jours',
 'fonds_propres']

In [112]:
master_clean = master_2013.drop(columns=cols_to_drop2).drop(columns=cols_to_drop)

In [113]:
master_clean

Unnamed: 0,siren,company_name,year,capital_social,effectif,marge_brute,ebe,taux_d_endettement,ratio_de_liquidite,ratio_de_vetuste,autonomie_financiere,ca_final,resultat_final
0,015751530,PAIN D'EPICES MULOT ET PETITJEAN,2014,,44.0,,,,,,,6653070.0,388230.0
1,015751530,PAIN D'EPICES MULOT ET PETITJEAN,2015,,44.0,,,,,,,4905670.0,181450.0
2,015751530,PAIN D'EPICES MULOT ET PETITJEAN,2016,,44.0,,,,,,,4684680.0,137360.0
3,015751530,PAIN D'EPICES MULOT ET PETITJEAN,2017,120000.0,44.0,3993251.0,222334.0,291.683,238.398,25.921,20.832,5630040.0,81670.0
4,015751530,PAIN D'EPICES MULOT ET PETITJEAN,2018,120000.0,44.0,4214266.0,412612.0,380.550,338.112,57.127,18.302,5971010.0,87770.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3791,950451518,PSMA LA SABLAISE,2019,191160.0,29.0,2878409.0,316467.0,41.431,177.317,54.539,42.062,4712576.0,161476.0
3792,950451518,PSMA LA SABLAISE,2020,,34.0,,,,,,,,
3793,950451518,PSMA LA SABLAISE,2021,191160.0,34.0,4026707.0,622926.0,88.639,228.420,63.795,38.053,6757132.0,383358.0
3794,950451518,PSMA LA SABLAISE,2022,,35.0,4003556.0,330871.0,69.748,256.879,58.987,43.943,7069788.0,185245.0


In [114]:
nan_analyzer(master_clean)

Unnamed: 0,column,percent_nan
0,ratio_de_vetuste,43.93111
1,taux_d_endettement,42.728267
2,autonomie_financiere,42.700929
3,ratio_de_liquidite,42.700929
4,marge_brute,42.673592
5,ebe,42.673592
6,capital_social,38.162931
7,resultat_final,21.979224
8,ca_final,18.069984
9,effectif,0.300711


# 2. Imputing with real logic

In [115]:
# Ajout des flags de missingness qui permettront à un model RNN d'être plus performant (Paper “Deep Learning for Time Series Forecasting” (Lim & Zohdy))
for col in master_clean.columns.tolist():
    master_clean[f"{col}_was_nan"] = master_clean[col].isna().astype(int)

In [116]:
#IMPUTATION à pipeliner plus tard

# --- Étape 1 : tri par entreprise + année ---
master_clean = master_clean.sort_values(["siren", "year"]).reset_index(drop=True)

# --- Étape 2 : ffill + bfill par entreprise ---
master_clean= (
    master_clean
    .groupby("siren")
    .apply(lambda g: g.ffill().bfill())
    .reset_index(drop=True)
)

  .apply(lambda g: g.ffill().bfill())


In [117]:
nan_analyzer(master_clean)

Unnamed: 0,column,percent_nan
0,ratio_de_vetuste,6.697649
1,marge_brute,6.369601
2,ratio_de_liquidite,6.369601
3,taux_d_endettement,6.369601
4,ebe,6.369601
5,autonomie_financiere,6.369601
6,resultat_final,5.822854
7,ca_final,0.51941
8,capital_social,0.355385
9,effectif,0.300711


In [121]:
master_clean[master_clean['ca_final'].isna()]

Unnamed: 0,siren,company_name,year,capital_social,effectif,marge_brute,ebe,taux_d_endettement,ratio_de_liquidite,ratio_de_vetuste,...,capital_social_was_nan,effectif_was_nan,marge_brute_was_nan,ebe_was_nan,taux_d_endettement_was_nan,ratio_de_liquidite_was_nan,ratio_de_vetuste_was_nan,autonomie_financiere_was_nan,ca_final_was_nan,resultat_final_was_nan
1031,341842052,MARO-OCEANS,2017,700000.0,40.0,,,,,,...,1,0,1,1,1,1,1,1,1,1
1032,341842052,MARO-OCEANS,2018,700000.0,20.0,,,,,,...,1,0,1,1,1,1,1,1,1,1
1033,341842052,MARO-OCEANS,2019,700000.0,34.0,,,,,,...,1,0,1,1,1,1,1,1,1,1
1034,341842052,MARO-OCEANS,2020,700000.0,34.0,,,,,,...,1,0,1,1,1,1,1,1,1,1
1035,341842052,MARO-OCEANS,2021,700000.0,34.0,,,,,,...,1,0,1,1,1,1,1,1,1,1
1036,341842052,MARO-OCEANS,2022,700000.0,35.0,,,,,,...,1,0,1,1,1,1,1,1,1,1
1037,341842052,MARO-OCEANS,2023,700000.0,35.0,,,,,,...,1,0,1,1,1,1,1,1,1,1
1038,341842052,MARO-OCEANS,2024,700000.0,35.0,,,,,,...,0,0,1,1,1,1,1,1,1,1
1883,401692801,SCEA STURGEON,2018,1000000.0,34.0,,,,,,...,1,0,1,1,1,1,1,1,1,1
1884,401692801,SCEA STURGEON,2022,1000000.0,35.0,,,,,,...,1,0,1,1,1,1,1,1,1,1


In [123]:
# imputation CA basée sur l'effectif
effectif_to_ca = {
    1: 120000,
    2: 200000,
    5: 350000,
    10: 650000,
    20: 1200000,
    50: 3000000,
    100: 6000000
}

def estimate_ca_from_effectif(e):
    if pd.isna(e): return None
    for k, v in effectif_to_ca.items():
        if e <= k: return v
    return 8000000  # gros comptes

master_clean["ca_final"] = master_clean["ca_final"].fillna(
    master_clean["effectif"].apply(estimate_ca_from_effectif)
)

In [None]:
# imputation qu'on tente de moduler via le CA qui est désormais bien renseigné partout.
cols_to_impute = [
    "ratio_de_vetuste",
    "marge_brute",
    "ratio_de_liquidite",
    "taux_d_endettement",
    "ebe",
    "autonomie_financiere",
    "resultat_final"
]

ca_col = "ca_final"  # ou "ca_bilan" selon ce que tu utilises comme CA final

mean_ca = master_clean[ca_col].mean()

for col in cols_to_impute:
    mean_col = master_clean[col].mean()

    # Valeur imputée : mean_col × (CA_i / mean_CA)
    imputed_values = mean_col * (master_clean[ca_col] / mean_ca)

    master_clean[col] = master_clean[col].fillna(imputed_values)


In [125]:
nan_analyzer(master_clean)

Unnamed: 0,column,percent_nan
0,capital_social,0.355385
1,effectif,0.300711
2,company_name,0.0
3,siren,0.0
4,year,0.0
5,marge_brute,0.0
6,ebe,0.0
7,taux_d_endettement,0.0
8,ratio_de_liquidite,0.0
9,ratio_de_vetuste,0.0


In [126]:
# Imputation simple pour le reste

# 1. capital_social → médiane
median_capital = master_clean["capital_social"].median()
master_clean["capital_social"] = master_clean["capital_social"].fillna(median_capital)

# 2. effectif → médiane (et arrondi)
median_effectif = master_clean["effectif"].median()
master_clean["effectif"] = (
    master_clean["effectif"]
    .fillna(median_effectif)
    .round()
    .astype("Int64")
)
# on force minimum = 1
master_clean.loc[master_clean["effectif"] < 1, "effectif"] = 1


In [127]:
nan_analyzer(master_clean)

Unnamed: 0,column,percent_nan
0,siren,0.0
1,company_name,0.0
2,year,0.0
3,capital_social,0.0
4,effectif,0.0
5,marge_brute,0.0
6,ebe,0.0
7,taux_d_endettement,0.0
8,ratio_de_liquidite,0.0
9,ratio_de_vetuste,0.0


# Preprocess de df_signals avant merge avec master

In [128]:
import ast
from sklearn.preprocessing import OneHotEncoder

df_signals["siren"] = df_signals["siren"].astype(str)

# Extraire l'année de publishedAt de df_signals
# Conversion robuste en datetime
df_signals["publishedAt_dt"] = pd.to_datetime(
    df_signals["publishedAt"],
    errors="coerce",   # Si une ligne est complètement cheloue → NaT
    utc=True           # Force un timezone
)

# Extraction de l'année
df_signals["year"] = df_signals["publishedAt_dt"].dt.year

print(df_signals[["publishedAt", "publishedAt_dt", "year"]].head())
print("Type de publishedAt_dt :", df_signals["publishedAt_dt"].dtype)


                 publishedAt            publishedAt_dt  year
0  2021-09-30T00:00:00+02:00 2021-09-29 22:00:00+00:00  2021
1  2020-09-08T00:00:00+02:00 2020-09-07 22:00:00+00:00  2020
2  2016-09-21T00:00:00+02:00 2016-09-20 22:00:00+00:00  2016
3  2018-04-06T00:00:00+02:00 2018-04-05 22:00:00+00:00  2018
4  2018-04-06T00:00:00+02:00 2018-04-05 22:00:00+00:00  2018
Type de publishedAt_dt : datetime64[ns, UTC]


In [129]:
# Fonction pour extraire le code du signal
def extract_signal_code(x):
    if isinstance(x, dict):
        return x.get("code")
    if isinstance(x, str):
        try:
            d = ast.literal_eval(x)
            if isinstance(d, dict):
                return d.get("code")
        except:
            return np.nan
    return np.nan

df_signals["signal_code"] = df_signals["type"].apply(extract_signal_code)


In [130]:
signals_clean = df_signals.dropna(subset=["siren", "year", "signal_code"]).copy()

signals_clean = signals_clean[["siren", "year", "signal_code"]]
signals_clean.head()

# OneHotEncoder
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

onehot = ohe.fit_transform(signals_clean[["signal_code"]])

# On remet ça dans un DataFrame
df_onehot = pd.DataFrame(
    onehot,
    columns=[f"signal_{c}_count" for c in ohe.categories_[0]]
)

signals_encoded = pd.concat([signals_clean.reset_index(drop=True), df_onehot], axis=1)
signals_encoded.head()


Unnamed: 0,siren,year,signal_code,signal_A_count,signal_B_count,signal_D_count,signal_E_count,signal_F_count,signal_G_count,signal_H_count,...,signal_R_count,signal_S_count,signal_U_count,signal_W_count,signal_X_count,signal_Y_count,signal_Z_count,signal_Z1_count,signal_Z2_count,signal_Z3_count
0,15751530,2021,K1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,15751530,2020,L,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,15751530,2016,F,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,15751530,2018,F,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,15751530,2018,E,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [131]:
# Agréger par siren + year : on somme les colonnes one-hot
signals_agg = (
    signals_encoded
    .groupby(["siren", "year"])
    .sum()
    .reset_index()
)
signals_agg = signals_agg.drop(columns='signal_code')
print("signals_agg shape :", signals_agg.shape)
display(signals_agg.head())


signals_agg shape : (729, 27)


Unnamed: 0,siren,year,signal_A_count,signal_B_count,signal_D_count,signal_E_count,signal_F_count,signal_G_count,signal_H_count,signal_Hbis_count,...,signal_R_count,signal_S_count,signal_U_count,signal_W_count,signal_X_count,signal_Y_count,signal_Z_count,signal_Z1_count,signal_Z2_count,signal_Z3_count
0,15751530,2014,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,15751530,2015,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,15751530,2016,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,15751530,2018,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,15751530,2019,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [132]:
# Merge avec Master DF sur siren + year
master_final = master_clean.merge(
    signals_agg,
    on=["siren", "year"],
    how="left"
)

# Trouver toutes les colonnes de signaux
signal_cols = [c for c in master_final.columns if c.startswith("signal_")]

# Remplacer les NaN (années sans signaux) par 0
master_final[signal_cols] = master_final[signal_cols].fillna(0).astype(int)

print("master_final shape après merge signaux :", master_final.shape)

master_final shape après merge signaux : (3658, 51)


# Sequencing

In [None]:
def build_sequences_panel_3y_next(
    df: pd.DataFrame,
    id_col: str,
    time_col: str,
    target_cols,
    covariate_cols,
    seq_len: int = 3,
):
    all_X = []
    all_y = []
    meta_rows = []

    # data = [targets | covariates]
    all_cols = list(target_cols) + list(covariate_cols)
    n_targets = len(target_cols)

    df = df.sort_values([id_col, time_col]).copy()

    for company_id, company_df in df.groupby(id_col):
        company_df = company_df.sort_values(time_col)

        years = company_df[time_col].to_numpy()
        data  = company_df[all_cols].to_numpy(dtype=float)

        n = len(company_df)
        if n <= seq_len:
            continue

        X_company = []
        y_company = []

        for i in range(0, n - seq_len):
            window_years = years[i : i + seq_len + 1]  # 3 ans + année cible

            # on exige des années consécutives
            if not np.all(np.diff(window_years) == 1):
                continue

            # ✅ X = targets + covariates sur les 3 années passées
            Xi_feat = data[i : i + seq_len, :]      # (3, n_features_total)

            # ✅ y = targets de l'année suivante
            yi_tgt  = data[i + seq_len, :n_targets] # (3,)

            X_company.append(Xi_feat)
            y_company.append(yi_tgt)

            meta_rows.append({
                id_col: company_id,
                "year_input_start": int(window_years[0]),
                "year_input_end":   int(window_years[seq_len - 1]),
                "year_target":      int(window_years[-1]),
            })

        if len(X_company) == 0:
            continue

        all_X.append(np.array(X_company))
        all_y.append(np.array(y_company))

    X = np.concatenate(all_X, axis=0)   # (n_seq, 3, n_features_total)
    y = np.concatenate(all_y, axis=0)   # (n_seq, 3)
    meta_df = pd.DataFrame(meta_rows)

    print("X shape :", X.shape)
    print("y shape :", y.shape)
    print("meta_df shape :", meta_df.shape)

    return X, y, meta_df
