# Analyse de la prévalence de la maladie

## Nettoyage des données

Chargement des données

In [None]:
from getting_started import df_patient, df_pcr, pd

# Switch both raw frames to pandas' nullable dtypes in one pass.
df_patient, df_pcr = (
    raw_frame.convert_dtypes() for raw_frame in (df_patient, df_pcr)
)

Nettoyage et déduplication

In [None]:
from entity_resolution import detect_duplicates

# Rebind df_patient to the frame returned by the project's duplicate-detection helper.
# NOTE(review): re-running this cell feeds the already-processed frame back into
# detect_duplicates — confirm the helper tolerates that, or restart the kernel first.
df_patient = detect_duplicates(df_patient)

In [None]:
# Preview the first 10 rows of the frame returned by detect_duplicates.
df_patient.head(10)

In [None]:
# Every patient starts as its own cluster: dedup_id is initialised to the row's own index label.
df_patient["dedup_id"] = df_patient.index

# Keep only rows that HAVE a phone number. Missing numbers must be excluded because
# `duplicated` treats missing values as equal to each other. Only the phone column is
# checked: a bare `dropna()` would also discard every patient with any other missing
# field, silently losing most phone matches.
df = df_patient.loc[df_patient["phone_number"].notna(), ["phone_number"]]

# Restrict to phone numbers shared by at least two patient records.
df = df[df.duplicated("phone_number", keep=False)]

# All records sharing a phone number get the smallest index label of their group as
# dedup_id. `transform("min")` is linear, unlike a self-merge on phone_number.
df = (
    df.index.to_series()
    .groupby(df["phone_number"])
    .transform("min")
    .to_frame("dedup_id")
)

# Overwrite dedup_id in place for the matched records only (alignment is on the index).
df_patient.update(df)

df_patient

Taux de déduplication

In [None]:
# Fraction of patient records whose dedup_id is shared with at least one other record.
in_shared_cluster = df_patient["dedup_id"].duplicated(keep=False)

int(in_shared_cluster.sum()) / len(df_patient)

Conversion des résultats de test PCR en variable catégorielle

In [None]:
# Only the first character of each raw result is kept; anything other than "N" or "P"
# becomes a missing value in the categorical.
pcr_initial = df_pcr["pcr"].str[0]

# Ordered so that "N" < "P", which lets a later max() pick "P" when both are present.
df_pcr["pcr"] = pd.Categorical(pcr_initial, categories=["N", "P"], ordered=True)

df_pcr["pcr"].value_counts()

Jointure avec le référentiel des patients

In [None]:
# Attach the patient attributes to every PCR test row. The join is an inner join, so
# tests whose patient_id has no matching index label in df_patient are dropped.
# validate="m:1" makes pandas raise if df_patient's index is not unique.
df_pcr = pd.merge(
    df_pcr,
    df_patient,
    how="inner",
    left_on="patient_id",
    right_index=True,
    validate="m:1",
)

df_pcr["pcr"].value_counts()

In [None]:
# One boolean per deduplicated patient: True if at least one of their tests is "P".
# Comparing to "P" directly avoids `.replace` on a categorical column (deprecated in
# pandas 2.2) and treats missing results as not positive.
patient_is_positive = (
    (df_pcr["pcr"] == "P").groupby(df_pcr["dedup_id"]).any().rename("affected")
)

# Exactly one state per deduplicated patient: the first non-missing one in row order.
# df_pcr has one row per TEST, so joining the per-patient status back onto it would
# count a patient once per test in both `tested` and `affected`.
patient_state = (
    df_pcr[["dedup_id", "state"]]
    .dropna(subset=["state"])
    .drop_duplicates(subset="dedup_id")
    .set_index("dedup_id")
)

# Per state: number of distinct patients tested and number with a positive test.
df_prevalence_state = (
    patient_state.join(patient_is_positive, how="inner")
    .groupby("state")["affected"]
    .agg(tested="count", affected="sum")
)

df_prevalence_state.sort_values("affected", ascending=False)

In [None]:
# One boolean per deduplicated patient: True if at least one of their tests is "P".
# Comparing to "P" directly avoids `.replace` on a categorical column (deprecated in
# pandas 2.2) and treats missing results as not positive.
patient_is_positive = (
    (df_pcr["pcr"] == "P").groupby(df_pcr["dedup_id"]).any().rename("affected")
)

# Exactly one postcode per deduplicated patient: the first non-missing one in row order.
# df_pcr has one row per TEST, so joining the per-patient status back onto it would
# count a patient once per test in both `tested` and `affected`.
patient_postcode = (
    df_pcr[["dedup_id", "postcode"]]
    .dropna(subset=["postcode"])
    .drop_duplicates(subset="dedup_id")
    .set_index("dedup_id")
)

# Per postcode: number of distinct patients tested and number with a positive test.
df_prevalence_postcode = (
    patient_postcode.join(patient_is_positive, how="inner")
    .groupby("postcode")["affected"]
    .agg(tested="count", affected="sum")
)

df_prevalence_postcode.sort_values("affected", ascending=False)

Cartographie de la prévalence par État

In [None]:
from ipyleaflet import Choropleth, LegendControl, Map, basemaps
from branca.colormap import linear
import json

# Key used in choro_data for each state code of the prevalence table.
# NOTE(review): assumes these integers match the feature ids in aus_state.geojson — confirm.
STATE_FEATURE_IDS = {"nsw": 0, "vic": 1, "qld": 2, "sa": 3, "wa": 4, "tas": 5, "nt": 6, "act": 7}
# Key 8 has no state code in the prevalence table and is always given the value 0.
UNMAPPED_FEATURE_ID = 8

# Context manager so the file handle is closed as soon as the GeoJSON is parsed
# (GeoJSON is UTF-8 by specification, so the encoding is stated explicitly).
with open("aus_state.geojson", encoding="utf-8") as geojson_file:
    geo_data = json.load(geojson_file)

# Start every known feature at 0 so a state with no row in the prevalence table still
# has a value, then overlay the affected counts re-keyed by feature id.
choro_data = dict.fromkeys(STATE_FEATURE_IDS.values(), 0)
choro_data.update(
    df_prevalence_state["affected"].rename(index=STATE_FEATURE_IDS).to_dict()
)
choro_data[UNMAPPED_FEATURE_ID] = 0

choropleth = Choropleth(
    geo_data=geo_data,
    choro_data=choro_data,
    colormap=linear.YlOrRd_04,
    style={'fillOpacity': 0.8, 'dashArray': '5, 5'},
)

# Map centred on (-25.8, 136.8698) at zoom level 4, with the choropleth layer on top.
map_ = Map(basemap=basemaps.OpenStreetMap.BlackAndWhite, center=(-25.8, 136.8698), zoom=4)
map_.add_layer(choropleth)

map_