In [None]:
import pandas as pd
import numpy as np

# Load the two workbooks to compare (first sheet of each): the new Python
# output and the file that the messages below label "Fichier R".
chemin_py, chemin_r = "py1.xlsx", "val_py0r7.xlsx"
df_py2 = pd.read_excel(chemin_py, sheet_name=0)
df_r = pd.read_excel(chemin_r, sheet_name=0)

# Report the row count of each table before merging.
for etiquette, table in (("Fichier Python v1", df_py2), ("Fichier R", df_r)):
    print(f"{etiquette}: {len(table)} lignes")


Fichier Python v1: 2389 lignes
Fichier R: 2389 lignes


In [None]:
# Keep the join key plus every column of the R table whose name ends in ".y".
colonnes_y = [nom for nom in df_r.columns if nom.endswith(".y")]
cols_r = ["sej_id", *colonnes_y]
df_r_only = df_r.loc[:, cols_r].copy()

# Outer join on sej_id. indicator=True adds a "_merge" column telling whether
# each row was found on the left side, the right side, or both.
df_merged = pd.merge(
    df_py2,
    df_r_only,
    how="outer",
    on="sej_id",
    suffixes=(".x", ".y"),
    indicator=True,
)

print("\n=== RÃ‰SULTAT DU MERGE ===")
print(f"Total: {len(df_merged)}")
print(df_merged["_merge"].value_counts())



=== RÉSULTAT DU MERGE ===
Total: 2389
_merge
both          2389
left_only        0
right_only       0
Name: count, dtype: int64


In [None]:
# Python-side class column: use the unsuffixed name when it exists, otherwise
# fall back to the ".x" name.
col_py = "sej_classe" if "sej_classe" in df_merged.columns else "sej_classe.x"
col_r = "sej_classe.y"

# Row-wise agreement flag between the two classifications.
# Note: the comparison is False whenever either side is NaN.
df_merged["pyr_st"] = df_merged[col_py].eq(df_merged[col_r])

print("\n=== CONCORDANCE ===")
print(df_merged["pyr_st"].value_counts())

# Summing the boolean flag counts the agreeing rows.
total = len(df_merged)
concordants = df_merged["pyr_st"].sum()
taux = concordants / total * 100
print(f"\nTaux de concordance: {taux:.1f}%")
print(f"Cas restants: {total - concordants}")


=== CONCORDANCE ===
pyr_st
True     2366
False      23
Name: count, dtype: int64

Taux de concordance: 99.0%
Cas restants: 23


In [None]:
# Matrice de confusion
print("\n=== MATRICE DE CONFUSION ===")
confusion = pd.crosstab(
    df_merged[col_py].fillna("NA"), df_merged[col_r].fillna("NA"), margins=True
)
print(confusion)



=== MATRICE DE CONFUSION ===
sej_classe.y    0j  1j+  NA  sansLL   All
sej_classe                               
0j            1954    1   1       0  1956
1j+              0  159   0       3   162
sansLL           0   18   0     253   271
All           1954  178   1     256  2389

=== 23 CAS DISCORDANTS ===


In [None]:
# Filtrer les 23 cas discordants
discordants = df_merged[df_merged["pyr_st"] == False].copy()
print(f"\n=== {len(discordants)} CAS DISCORDANTS ===")
# RÃ©partition des discordances
print("\n=== RÃ‰PARTITION DES DISCORDANCES ===\n")

categories = [
    ("Python=0j, R=1j+", (discordants[col_py] == "0j") & (discordants[col_r] == "1j+")),
    (
        "Python=0j, R=sansLL",
        (discordants[col_py] == "0j") & (discordants[col_r] == "sansLL"),
    ),
    ("Python=1j+, R=0j", (discordants[col_py] == "1j+") & (discordants[col_r] == "0j")),
    (
        "Python=1j+, R=sansLL",
        (discordants[col_py] == "1j+") & (discordants[col_r] == "sansLL"),
    ),
    (
        "Python=sansLL, R=0j",
        (discordants[col_py] == "sansLL") & (discordants[col_r] == "0j"),
    ),
    (
        "Python=sansLL, R=1j+",
        (discordants[col_py] == "sansLL") & (discordants[col_r] == "1j+"),
    ),
]

for name, mask in categories:
    count = mask.sum()
    if count > 0:
        print(f"{name}: {count}")


=== 23 CAS DISCORDANTS ===

=== RÉPARTITION DES DISCORDANCES ===

Python=0j, R=1j+: 1
Python=1j+, R=sansLL: 3
Python=sansLL, R=1j+: 18


In [None]:
# Detailed analysis of all discordant rows.
banniere = "=" * 80
print("\n" + banniere)
print("ANALYSE DÃ‰TAILLÃ‰E DES CAS DISCORDANTS")
print(banniere)

# Flag the rows where both sides carry the same doc_id. The flag is only
# computed when both doc_id columns are present.
colonnes_doc = {"doc_id", "doc_id.y"}
if colonnes_doc.issubset(discordants.columns):
    discordants["same_doc"] = discordants["doc_id"].eq(discordants["doc_id.y"])
    print("\n=== MÃªme doc_id sÃ©lectionnÃ© ? ===")
    print(discordants["same_doc"].value_counts(dropna=False))


ANALYSE DÉTAILLÉE DES CAS DISCORDANTS

=== Même doc_id sélectionné ? ===
same_doc
False    19
True      4
Name: count, dtype: int64


In [None]:
# Split the discordant rows in two groups: same doc_id on both sides (but a
# different class), and a different doc_id on each side.
indicateur_doc = discordants["same_doc"]
same_doc = discordants[indicateur_doc.eq(True)].copy()
diff_doc = discordants[indicateur_doc.eq(False)].copy()

nb_meme, nb_diff = len(same_doc), len(diff_doc)
print(f"\nðŸŽ¯ MÃŠME doc_id, classification diffÃ©rente: {nb_meme} cas")
print(f"ðŸ”„ Doc_id DIFFÃ‰RENT: {nb_diff} cas")


🎯 MÊME doc_id, classification différente: 4 cas
🔄 Doc_id DIFFÉRENT: 19 cas


In [None]:
# Side-by-side listing of the rows that share a doc_id but not a class.
if len(same_doc):
    separateur = "=" * 60
    print("\n" + separateur)
    print(f"ðŸŽ¯ ANALYSE DES {len(same_doc)} CAS MÃŠME DOC")
    print(separateur)

    # Unsuffixed columns first, then their ".y" counterparts.
    colonnes_souhaitees = (
        ["sej_id", "doc_id", "sej_spe", "del_sorval", "del_val", "sdt_doclibre"]
        + [col_py]
        + ["sej_spe.y", "del_sorval.y", "del_val.y", "sdt_doclibre.y"]
        + [col_r]
    )
    # Columns absent from the table are dropped from the display.
    cols_same = []
    for colonne in colonnes_souhaitees:
        if colonne in same_doc.columns:
            cols_same.append(colonne)
    print(same_doc[cols_same].to_string())


🎯 ANALYSE DES 4 CAS MÊME DOC
         sej_id      doc_id      sej_spe  del_sorval  del_val  sdt_doclibre sej_classe    sej_spe.y  del_sorval.y  del_val.y sej_classe.y
39    240373975  38966139.0          MIP         0.0      0.0          True         0j          MIP           1.0        1.0          1j+
1086  250017901  39290636.0  PNEUMOLOGIE        26.0     26.0          True        1j+  PNEUMOLOGIE           NaN        NaN       sansLL
2275  259004249  39244790.0  OBSTETRIQUE        13.0     13.0          True        1j+  OBSTETRIQUE           NaN        NaN       sansLL
2360  259004788  39187223.0  OBSTETRIQUE         3.0      3.0          True        1j+  OBSTETRIQUE           NaN        NaN       sansLL


In [None]:
# Side-by-side listing of the rows where the two sides carry different doc_ids.
if len(diff_doc):
    separateur = "=" * 60
    print("\n" + separateur)
    print(f"ðŸ”„ ANALYSE DES {len(diff_doc)} CAS DOC DIFFÃ‰RENT")
    print(separateur)

    # Unsuffixed columns first, then their ".y" counterparts.
    colonnes_souhaitees = (
        ["sej_id", "sej_uf", "doc_id", "doc_libelle", "sej_spe", "del_sorval", "del_val"]
        + [col_py]
        + ["doc_id.y", "doc_libelle.y", "sej_spe.y", "del_sorval.y", "del_val.y"]
        + [col_r]
    )
    # Columns absent from the table are dropped from the display.
    cols_diff = []
    for colonne in colonnes_souhaitees:
        if colonne in diff_doc.columns:
            cols_diff.append(colonne)
    print(diff_doc[cols_diff].to_string())


🔄 ANALYSE DES 19 CAS DOC DIFFÉRENT
         sej_id  sej_uf      doc_id                                                doc_libelle           sej_spe  del_sorval  del_val sej_classe    doc_id.y                                              doc_libelle.y         sej_spe.y  del_sorval.y  del_val.y sej_classe.y
2     240281460     338  38998006.0                                      CR HDJ Oncologie Foch               NaN        -3.0      NaN     sansLL  39022332.0                CR Lettre de Liaison UnitÃ© Vanderbilt Foch         VANDERBILT          77.0       77.0          1j+
164   249050332     330  39269527.0                      CR Lettre de Liaison Pneumologie Foch       PNEUMOLOGIE         NaN      NaN     sansLL  39104496.0                      CR Lettre de Liaison Pneumologie Foch       PNEUMOLOGIE         104.0      104.0          1j+
272   249053689     438  38961734.0                                                CR Urgences               NaN        -2.0      NaN     sansL

In [None]:
# For each sdt_* criterion, count the discordant rows where the unsuffixed
# (or ".x") value equals the ".y" value.
barre = "=" * 60
print("\n" + barre)
print("COMPARAISON DES CRITÃˆRES SDT_*")
print(barre)

criteres = [
    "sdt_docven",
    "sdt_docval",
    "sdt_smere",
    "sdt_doccre",
    "sdt_doccref",
    "sdt_emere",
    "sdt_status",
]

colonnes_dispo = discordants.columns
for critere in criteres:
    col_y = f"{critere}.y"
    col_x = critere if critere in colonnes_dispo else f"{critere}.x"

    # A criterion is skipped when either of its two columns is absent.
    if col_x not in colonnes_dispo or col_y not in colonnes_dispo:
        continue

    # Equality is False when either value is NaN, so such rows do not count
    # as concordant.
    match = discordants[col_x].eq(discordants[col_y]).sum()
    total_c = len(discordants)
    pourcentage = match / total_c * 100
    print(f"{critere}: {match}/{total_c} concordants ({pourcentage:.0f}%)")


COMPARAISON DES CRITÈRES SDT_*
sdt_docven: 10/23 concordants (43%)
sdt_docval: 15/23 concordants (65%)
sdt_smere: 16/23 concordants (70%)
sdt_doccre: 19/23 concordants (83%)
sdt_doccref: 15/23 concordants (65%)
sdt_emere: 22/23 concordants (96%)
sdt_status: 9/23 concordants (39%)


In [None]:
# Focus on sdt_doclibre: value distribution on each side and number of
# discordant rows where the two sides differ.
print("\n" + "=" * 60)
print("FOCUS SUR sdt_doclibre")
print("=" * 60)

col_doclibre_py = (
    "sdt_doclibre" if "sdt_doclibre" in discordants.columns else "sdt_doclibre.x"
)
col_doclibre_r = "sdt_doclibre.y"

# Name the missing columns explicitly: the comparison used to be skipped
# silently, leaving only the banner in the output with no explanation.
colonnes_absentes = [
    colonne
    for colonne in (col_doclibre_py, col_doclibre_r)
    if colonne not in discordants.columns
]

if not colonnes_absentes:
    print("\nValeurs Python:")
    print(discordants[col_doclibre_py].value_counts(dropna=False))
    print("\nValeurs R:")
    print(discordants[col_doclibre_r].value_counts(dropna=False))

    # Rows where sdt_doclibre differs between the two sides.
    # Note: != is True when either value is NaN, so such rows are counted here.
    doclibre_diff = discordants[
        discordants[col_doclibre_py] != discordants[col_doclibre_r]
    ]
    print(f"\nCas oÃ¹ sdt_doclibre diffÃ¨re: {len(doclibre_diff)}")
else:
    print(
        f"\nComparaison impossible, colonne(s) absente(s): {', '.join(colonnes_absentes)}"
    )


FOCUS SUR sdt_doclibre


In [None]:
# Focus on the two delay columns: missing-value counts on each side, then a
# side-by-side table of the discordant rows.
ligne = "=" * 60
print("\n" + ligne)
print("FOCUS SUR del_sorval ET del_val")
print(ligne)

# Unsuffixed column names when present, otherwise the ".x" names.
colonnes_disc = discordants.columns
col_delsorval_py = "del_sorval" if "del_sorval" in colonnes_disc else "del_sorval.x"
col_delval_py = "del_val" if "del_val" in colonnes_disc else "del_val.x"

for titre, col_python, col_ref in (
    ("del_sorval", col_delsorval_py, "del_sorval.y"),
    ("del_val", col_delval_py, "del_val.y"),
):
    print(f"\n=== {titre} ===")
    print(f"Python NaN: {discordants[col_python].isna().sum()}")
    print(f"R NaN: {discordants[col_ref].isna().sum()}")

# Side-by-side view of the delays and classes for every discordant row.
print("\n=== Comparaison del_val Python vs R ===")
colonnes_souhaitees = (
    "sej_id",
    "doc_id",
    col_delsorval_py,
    col_delval_py,
    col_py,
    "del_sorval.y",
    "del_val.y",
    col_r,
)
# Columns absent from the table are dropped from the display.
cols_delval = [c for c in colonnes_souhaitees if c in colonnes_disc]
print(discordants[cols_delval].to_string())


FOCUS SUR del_sorval ET del_val

=== del_sorval ===
Python NaN: 13
R NaN: 4

=== del_val ===
Python NaN: 18
R NaN: 4

=== Comparaison del_val Python vs R ===
         sej_id      doc_id  del_sorval  del_val sej_classe  del_sorval.y  del_val.y sej_classe.y
2     240281460  38998006.0        -3.0      NaN     sansLL          77.0       77.0          1j+
39    240373975  38966139.0         0.0      0.0         0j           1.0        1.0          1j+
164   249050332  39269527.0         NaN      NaN     sansLL         104.0      104.0          1j+
272   249053689  38961734.0        -2.0      NaN     sansLL          71.0       71.0          1j+
727   250005560  39262570.0         NaN      NaN     sansLL          91.0       91.0          1j+
731   250005582         NaN         NaN      NaN     sansLL          38.0       38.0          1j+
802   250007617  39113930.0         NaN      NaN     sansLL         109.0      109.0          1j+
901   250011158         NaN         NaN      NaN     sans

In [None]:
# Check whether the doc_id of each discordant row is also attached to other
# stays (other sej_id values) in the merged table.
print("\n" + "=" * 60)
print("VÃ‰RIFICATION MULTI-SÃ‰JOURS")
print("=" * 60)

# Resolve the column names once and use them everywhere below. The previous
# `row.get(a) or row.get(b)` fallback tested truthiness, so a legitimate
# del_sorval of 0.0 was discarded and printed as None; the doc_id filter also
# hard-coded "doc_id" even though the row lookup allowed a "doc_id.x" fallback.
col_doc_multi = "doc_id" if "doc_id" in df_merged.columns else "doc_id.x"
col_del_multi = "del_sorval" if "del_sorval" in df_merged.columns else "del_sorval.x"

for _, row in discordants.iterrows():
    # .get returns None when the column is absent; None and NaN are both skipped.
    doc_id = row.get(col_doc_multi)
    sej_id = row["sej_id"]
    if pd.isna(doc_id):
        continue

    # Look for the same doc_id on any other stay of the merged table.
    autres_sejours = df_merged[
        (df_merged[col_doc_multi] == doc_id) & (df_merged["sej_id"] != sej_id)
    ]
    if len(autres_sejours) == 0:
        continue

    print(f"\nðŸ“Œ sej_id={sej_id}, doc_id={doc_id}")
    print(
        f"   Ce doc est aussi utilisÃ© par {len(autres_sejours)} autre(s) sÃ©jour(s):"
    )
    for _, autre in autres_sejours.iterrows():
        autre_sej = autre["sej_id"]
        autre_del = autre.get(col_del_multi)
        autre_classe = autre.get(col_py)
        print(
            f"   - sej_id={autre_sej}, del_sorval={autre_del}, classe={autre_classe}"
        )



VÉRIFICATION MULTI-SÉJOURS


In [None]:
# Write the discordant rows to a workbook for manual review (row index omitted).
fichier_export = "analyse_23_cas_restants.xlsx"
discordants.to_excel(fichier_export, index=False)
print(f"\nâœ… Fichier '{fichier_export}' crÃ©Ã©")


✅ Fichier 'analyse_23_cas_restants.xlsx' créé
