# Analyse de la prévalence de la maladie

## Nettoyage des données

Préambule

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Default figure geometry for every plot in this notebook: 12x8 at 100 DPI.
plt.rcParams.update({
    "figure.figsize": (12, 8),
    "figure.dpi": 100,
})

Chargement des données

In [None]:
from getting_started import df_patient, df_pcr, pd

# Switch both tables to the dtypes inferred by pandas' convert_dtypes().
df_patient, df_pcr = (table.convert_dtypes() for table in (df_patient, df_pcr))

Nettoyage et déduplication

In [None]:
from entity_resolution import detect_duplicates

# Run the project's duplicate-detection helper and keep its result as the working patient table.
# NOTE(review): what detect_duplicates adds, removes or flags is not visible in this notebook —
# check entity_resolution before relying on the shape of the returned frame.
df_patient = detect_duplicates(df_patient)

In [None]:
# Preview the first ten patient records after the duplicate-detection step.
df_patient.head(10)

In [None]:
import recordlinkage as rl

# Rows that are duplicated on every attribute except address_2.
# The previous subset, `df_patient.columns[::-1]`, only reversed the column order,
# so address_2 was still part of the comparison despite the stated intent.
subset = [col for col in df_patient.columns if col != "address_2"]
df = df_patient[df_patient.duplicated(subset=subset, keep=False)]

# Compare every candidate row with every other candidate row (no blocking).
pairs = rl.Index(rl.index.Full()).index(df)

# One exact-match feature per column.
comparator = rl.Compare()
for col in df.columns:
    comparator.exact(col, col, label=col)

features = comparator.compute(pairs, df)

# Smallest number of filled-in attributes found on any candidate row.
min_notna = df.notna().sum(axis=1).min()

# Pairs that agree on at least that many attributes.
features[features.sum(axis=1) >= min_notna].index

In [None]:
# Candidate pairs: records that share the same (non-missing) phone number.
df = df_patient.dropna(subset=["phone_number"])
pairs = rl.Index(rl.index.Block("phone_number")).index(df)

comparator = rl.Compare(n_jobs=4)

# Exact agreement on postcode and age.
for column in ("postcode", "age"):
    comparator.exact(column, column, label=column)

# Fuzzy agreement on names, including the case where given name and surname are swapped.
name_comparisons = (
    ("given_name", "given_name", "given_name"),
    ("surname", "surname", "surname"),
    ("given_name", "surname", "given_name_surname"),
    ("surname", "given_name", "surname_given_name"),
)
for left, right, label in name_comparisons:
    comparator.string(left, right, label=label, method="jarowinkler", threshold=.85)

# Street number, suburb, state and address comparisons are not used in this cell
# (they were disabled during exploration).

features = comparator.compute(pairs, df)

# Distribution of the number of agreeing features per candidate pair.
features.sum(axis=1).value_counts().sort_index()

# Threshold set at 3.

In [None]:
# Candidate pairs: records that share the same (non-missing) phone number.
df = df_patient.dropna(subset=["phone_number"])
pairs = rl.Index(rl.index.Block("phone_number")).index(df)

comparator = rl.Compare(n_jobs=4)

# Features are registered in the same order as before so the feature table keeps its column order.
comparator.exact("street_number", "street_number", label="street_number")
comparator.string("suburb", "suburb", label="suburb", method="jarowinkler", threshold=.9)
for column in ("postcode", "state", "age"):
    comparator.exact(column, column, label=column)

# Fuzzy agreement on address lines, including the case where the two lines are swapped.
address_comparisons = (
    ("address_1", "address_1", "address_1"),
    ("address_2", "address_2", "address_2"),
    ("address_1", "address_2", "address_1_2"),
    ("address_2", "address_1", "address_2_1"),
)
for left, right, label in address_comparisons:
    comparator.string(left, right, label=label, method="jarowinkler", threshold=.9)

features = comparator.compute(pairs, df)

# Distribution of the number of agreeing features per candidate pair.
features.sum(axis=1).value_counts().sort_index()

# Threshold set at 4.

In [None]:
# Candidate pairs: records that share the same (non-missing) postcode.
df = df_patient.dropna(subset=["postcode"])
pairs = rl.Index(rl.index.Block("postcode")).index(df)

comparator = rl.Compare(n_jobs=4)

# Features are registered in the same order as before so the feature table keeps its column order.
comparator.exact("street_number", "street_number", label="street_number")
comparator.string("suburb", "suburb", label="suburb", method="jarowinkler", threshold=.9)
for column in ("age", "phone_number"):
    comparator.exact(column, column, label=column)

# Fuzzy agreement on address lines, including the case where the two lines are swapped.
address_comparisons = (
    ("address_1", "address_1", "address_1"),
    ("address_2", "address_2", "address_2"),
    ("address_1", "address_2", "address_1_2"),
    ("address_2", "address_1", "address_2_1"),
)
for left, right, label in address_comparisons:
    comparator.string(left, right, label=label, method="jarowinkler", threshold=.9)

features = comparator.compute(pairs, df)

# Distribution of the number of agreeing features per candidate pair (threshold 4).
features.sum(axis=1).value_counts().sort_index()

In [None]:
# Candidate pairs: records that share the same (non-missing) postcode.
df = df_patient.dropna(subset=["postcode"])

pairs = rl.Index(rl.index.Block("postcode")).index(df)

comparator = rl.Compare(n_jobs=4)
comparator.exact("age", "age", label="age")
# Label fixed: this feature used to be labelled "age" as well, which gave the
# feature table two columns with the same name.
comparator.exact("phone_number", "phone_number", label="phone_number")
# Fuzzy agreement on names, including the case where given name and surname are swapped.
comparator.string("given_name", "given_name", label="given_name", method="jarowinkler", threshold=.85)
comparator.string("surname", "surname", label="surname", method="jarowinkler", threshold=.85)
comparator.string("given_name", "surname", label="given_name_surname", method="jarowinkler", threshold=.85)
comparator.string("surname", "given_name", label="surname_given_name", method="jarowinkler", threshold=.85)

features = comparator.compute(pairs, df)

# Distribution of the number of agreeing features per candidate pair (threshold 3).
features.sum(axis=1).value_counts().sort_index()

In [None]:
# Every patient starts as its own group: dedup_id defaults to the patient's own index value.
df_patient["dedup_id"] = df_patient.index.to_frame().astype("Int64")

# For each record whose phone number appears more than once, find the smallest
# patient_id among the records carrying that phone number (self-join on phone_number).
# The index is expected to be named "patient_id", since that column is selected after reset_index().
# NOTE(review): dropna() has no subset, so a record with *any* missing attribute is
# left out of this grouping — confirm that is intended.
df = (
    df_patient[df_patient.duplicated("phone_number", keep=False)]
    .dropna()
    .reset_index()
    [["phone_number", "patient_id"]]
    .pipe(lambda phones: phones.merge(phones, on="phone_number"))
    .drop(columns="phone_number")
    .groupby(["patient_id_x"])
    .min()
    .rename_axis("patient_id")
    .rename(columns={"patient_id_y": "dedup_id"})
    .astype("Int64")
)

# Overwrite the default dedup_id wherever a shared-phone group was found.
df_patient.update(df)
df_patient = df_patient.convert_dtypes()

df_patient

In [None]:
# Print column dtypes and non-null counts of the patient table after dedup_id was added.
df_patient.info()

Taux de déduplication

In [None]:
# Share of patient records whose dedup_id is carried by at least one other record.
shares_dedup_id = df_patient.dedup_id.duplicated(keep=False)
len(df_patient[shares_dedup_id]) / len(df_patient)

Conversion des résultats de test PCR en variable catégorielle

In [None]:
# Keep only the first character of each result and store it as an ordered categorical (N < P).
# Any first character other than "N" or "P" becomes a missing value.
# NOTE(review): confirm the raw result strings always start with an upper-case N or P.
first_letter = df_pcr.pcr.str[0]
df_pcr["pcr"] = pd.Categorical(first_letter, categories=["N", "P"], ordered=True)

df_pcr["pcr"].value_counts()

Jointure avec le référentiel des patients

In [None]:
# Attach to each test the dedup_id of the patient it belongs to.
# validate="m:1" makes pandas raise if a key is not unique in the patient index.
# The default inner join drops tests whose patient_id is absent from the patient table.
dedup_lookup = df_patient["dedup_id"]
df_pcr = df_pcr.merge(
    dedup_lookup,
    left_on="patient_id",
    right_index=True,
    validate="m:1",
)

df_pcr.pcr.value_counts()

## Prévalence à l'échelle nationale

In [None]:
# One row per deduplicated patient. "affected" is the maximum result across all of
# the patient's tests; the pcr categorical is ordered N < P, so it is "P" as soon
# as one test is positive.
# Several patient records can share one dedup_id. Only one representative record
# per dedup_id (the first in table order) is joined: merging the full patient
# table would bring every duplicate record back and count the same person
# several times in the prevalence figures.
df_prevalence = (
    df_pcr[["dedup_id", "pcr"]]
    .groupby("dedup_id").max()
    .rename(columns={"pcr": "affected"})
    .merge(df_patient.drop_duplicates(subset="dedup_id"), on="dedup_id")
    .set_index("dedup_id")
)

# Guard the invariant the rest of the analysis relies on: one row per person.
assert df_prevalence.index.is_unique, "df_prevalence must hold one row per dedup_id"

df_prevalence

## Prévalence par catégorie d'âge

In [None]:
# Age bands are left-closed so that they match the "0-9", "10-19", "20-29",
# "30-39", "40+" labels used when plotting: [0, 10), [10, 20), [20, 30),
# [30, 40), [40, inf).
# The previous right-closed bins put ages 10, 20, 30 and 40 in the band below
# and left ages above 100 without a band.
df_prevalence["age_category"] = pd.cut(
    df_prevalence.age,
    bins=[0, 10, 20, 30, 40, float("inf")],
    right=False,
    ordered=True,
)

# Turn the N/P categorical into 0/1 through object dtype instead of replace():
# replace() on a categorical column is deprecated in recent pandas and can leave
# a categorical column, which sum() rejects. Missing results stay missing, so
# they are not counted as tested.
# observed=False keeps every age band in the result, even an empty one, so the
# table always has one row per label.
df_prevalence_age = (
    df_prevalence
    .assign(affected=df_prevalence["affected"].astype("object").map({"N": 0, "P": 1}))
    .groupby("age_category", observed=False)["affected"]
    .agg(affected="sum", tested="count")
)

In [None]:
fig, ax = plt.subplots(ncols=2, figsize=(20, 8))
pie_ax, bar_ax = ax
labels = ["0-9", "10-19", "20-29", "30-39", "40+"]

# Left panel: how the affected patients split across age bands.
df_prevalence_age["affected"].plot.pie(ax=pie_ax, labels=labels)
pie_ax.set_title("Distribution of affected patients")
pie_ax.yaxis.set_visible(False)

# Right panel: affected and tested counts per age band, stacked on top of each other.
df_prevalence_age.plot.bar(stacked=True, rot=False, ax=bar_ax)
# Assigning to `_` keeps the list of tick labels out of the cell output.
_ = bar_ax.xaxis.set_ticklabels(labels)

## Prévalence par état

In [None]:
# Affected and tested patient counts per state.
# The N/P categorical is turned into 0/1 through object dtype instead of
# replace(): replace() on a categorical column is deprecated in recent pandas
# and can leave a categorical column, which sum() rejects. Missing results stay
# missing, so they are not counted as tested.
df_prevalence_state = (
    df_prevalence
    .assign(affected=df_prevalence["affected"].astype("object").map({"N": 0, "P": 1}))
    .groupby("state")["affected"]
    .agg(affected="sum", tested="count")
)

In [None]:
fig, ax = plt.subplots(ncols=2, figsize=(20, 8))
pie_ax, bar_ax = ax

# Left panel: how the affected patients split across states.
df_prevalence_state["affected"].plot.pie(ax=pie_ax)
pie_ax.set_title("Distribution of affected patients")
pie_ax.yaxis.set_visible(False)

# Right panel: affected and tested counts per state, stacked on top of each other.
df_prevalence_state.plot.bar(stacked=True, rot=False, ax=bar_ax)

## Cartography

Choropleth

In [None]:
from ipyleaflet import Choropleth
from branca.colormap import linear
import json

# Load the state boundaries. The context manager closes the file once it is
# parsed (the previous bare open() left the handle open), and the encoding is
# explicit because GeoJSON is UTF-8.
with open("aus_state.geojson", encoding="utf-8") as geojson_file:
    geo_data = json.load(geojson_file)

# Number of affected patients per state, keyed by integer ids instead of state codes.
# NOTE(review): ids 0-7 presumably match the feature ids in aus_state.geojson — confirm.
choro_data = (
    df_prevalence_state["affected"]
    .rename(index={
        "nsw": 0,
        "vic": 1,
        "qld": 2,
        "sa": 3,
        "wa": 4,
        "tas": 5,
        "nt": 6,
        "act": 7
    })
).to_dict()

choro_data[8] = 0   # Other territories

choropleth = Choropleth(
    geo_data=geo_data,
    choro_data=choro_data,
    colormap=linear.YlOrRd_04,
    style={'fillOpacity': 0.6, 'dashArray': '5, 5'},
    name="choropleth"
)

Marker cluster

In [None]:
from ipyleaflet import AwesomeIcon, Marker, MarkerCluster

# Postcode -> coordinates lookup, keeping the first coordinate pair listed for each postcode.
postcode_coords = (
    pd.read_csv(
        "australian_postcodes.csv",
        index_col="id",
        usecols=("id","postcode", "long", "lat"),
        dtype={
            "id": "int64",
            "postcode": "str",
            "long": "float",
            "lat": "float",
        }
    )
    .rename(columns={"long": "longitude", "lat": "latitude"})
    .drop_duplicates("postcode", keep="first")
    .set_index("postcode")
)

# One [latitude, longitude] pair per row of df_prevalence whose postcode is in the lookup.
# NOTE(review): postcodes are read as strings here — confirm df_prevalence.postcode
# has a compatible dtype, otherwise the join cannot match.
# NOTE(review): every row of df_prevalence gets a marker, whatever its "affected" value — confirm.
matched = df_prevalence.merge(postcode_coords, left_on="postcode", right_index=True)
locations = matched[["latitude", "longitude"]].values.tolist()

# A single icon object shared by all markers.
icon = AwesomeIcon(
    name="plus-square",
    icon_color="white",
    marker_color="black",
)

markers = [Marker(icon=icon, location=lat_lon) for lat_lon in locations]

marker_cluster = MarkerCluster(markers=markers, name="clusters")

Interactive map

In [None]:
from ipywidgets import Layout
from ipyleaflet import Choropleth, Map, basemaps
from ipyleaflet import LayersControl, SearchControl

# Base map
# NOTE(review): confirm this basemap provider is still available in the installed
# ipyleaflet version.
map_ = Map(
    basemap=basemaps.OpenStreetMap.BlackAndWhite,
    center=(-25.8, 136.8698),
    zoom=5,
    scroll_wheel_zoom=True,
    layout=Layout(width="100%", height="800px")
)

# Layer control
layer_control = LayersControl(position="topleft")


# Search control
# Icon name fixed: it was misspelled "cirle", which is not a valid icon name.
# Marker and AwesomeIcon come from the import in the marker-cluster cell.
search_marker = Marker(icon=AwesomeIcon(name="circle"))

search_control = SearchControl(
    position="topright",
    url="https://nominatim.openstreetmap.org/search?format=json&q={s}",
    zoom=10,
    marker=search_marker,
)

# Compose layers
map_.add_layer(choropleth)
map_.add_layer(marker_cluster)

# Compose controls
map_.add_control(layer_control)
map_.add_control(search_control)

# Display map
map_