# Analyse de la prévalence de la maladie

## Nettoyage des données

Chargement des données

In [1]:
from getting_started import df_patient, df_pcr, pd

df_patient = df_patient.convert_dtypes()
df_pcr = df_pcr.convert_dtypes()

Nettoyage et déduplication

In [2]:
from entity_resolution import detect_duplicates

df_patient = detect_duplicates(df_patient)

In [3]:
df_patient.head(10)

Unnamed: 0_level_0,given_name,surname,street_number,address_1,suburb,postcode,state,date_of_birth,age,phone_number,address_2
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
221958,matisse,clarke,13,rene street,ellenbrook,2527,nsw,1971-07-08,32,08 86018809,westella
771155,joshua,elrick,23,andrea place,east preston,2074,nsw,1912-09-21,34,02 97793152,foxdown
231932,alice,conboy,35,mountain circuit,prospect,2305,nsw,1981-09-05,22,02 20403934,
465838,sienna,craswell,39,cumberlegeicrescent,henty,3620,vic,1984-08-09,30,02 62832318,jodane
359178,joshua,bastiaans,144,lowrie street,campbell town,4051,qld,1934-04-30,31,03 69359594,
744167,ky,laing,448,nyawi place,barmera,3556,vic,1905-09-19,32,03 59872070,
210268,matthew,laing,11,barnes place,laurieton,2160,nsw,1906-10-18,29,02 86925029,
832180,jack,renfrey,27,osmand street,maribyrnong,2170,nsw,1961-05-18,31,03 15575583,dhurringill
154886,adele,ryan,76,house circuit,new farm,2200,nsw,1943-01-02,33,07 37444521,
237337,breeanne,wynne,12,cowper street,bonnet bay,2062,nsw,1903-06-06,35,08 24888117,


In [4]:
df_patient["dedup_id"] = df_patient.index.to_frame()

df = df_patient[df_patient.duplicated("phone_number", keep=False)].dropna().reset_index()

df = df[["phone_number", "patient_id"]]
df = df.merge(df, on="phone_number").drop(columns="phone_number")
df = df.groupby(["patient_id_x"]).min()
df = df.rename_axis("patient_id").rename(columns={"patient_id_y": "dedup_id"})

df_patient.update(df)

df_patient

Unnamed: 0_level_0,given_name,surname,street_number,address_1,suburb,postcode,state,date_of_birth,age,phone_number,address_2,dedup_id
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
221958,matisse,clarke,13,rene street,ellenbrook,2527,nsw,1971-07-08,32,08 86018809,westella,221958
771155,joshua,elrick,23,andrea place,east preston,2074,nsw,1912-09-21,34,02 97793152,foxdown,771155
231932,alice,conboy,35,mountain circuit,prospect,2305,nsw,1981-09-05,22,02 20403934,,231932
465838,sienna,craswell,39,cumberlegeicrescent,henty,3620,vic,1984-08-09,30,02 62832318,jodane,465838
359178,joshua,bastiaans,144,lowrie street,campbell town,4051,qld,1934-04-30,31,03 69359594,,359178
...,...,...,...,...,...,...,...,...,...,...,...,...
368617,abby,,19,john cleland crescent,boyne island,2290,nsw,1957-02-10,,02 92489054,glenmore,368617
796658,dale,priest,19,bellchambers crescent,hoppers crossing,3033,vic,1959-06-19,31,07 20549476,,796658
511885,alessia,mawlai,4,miller street,smithfield,3616,vic,NaT,26,03 83466737,,511885
985932,hannah,mason,1,halligan place,beaumaris,2443,nsw,1914-03-04,25,04 41125514,,985932


Taux de déduplication

In [5]:
len(df_patient[df_patient.dedup_id.duplicated(keep=False)]) / len(df_patient)

0.016992396795427873

Conversion des résultats de test PCR en variable catégorielle

In [6]:
df_pcr.pcr = pd.Categorical(df_pcr.pcr.str[0], categories=["N", "P"], ordered=True)

df_pcr.pcr.value_counts()

N    6616
P    2184
Name: pcr, dtype: int64

Jointure avec le référentiel de patient

In [7]:
df_pcr = df_pcr.merge(df_patient, left_on="patient_id", right_index=True, validate="m:1")

df_pcr.pcr.value_counts()

N    6491
P    2141
Name: pcr, dtype: int64

In [8]:
df_prevalence_state = (
    df_pcr[["dedup_id", "pcr"]]
    .groupby("dedup_id").max()
    .replace({"N": 0, "P": 1})
    .merge(df_pcr[["dedup_id", "state"]], on="dedup_id")
    .groupby("state").aggregate({"dedup_id": "count", "pcr": "sum"})
    .rename(columns={"dedup_id": "tested", "pcr": "affected"})
)

df_prevalence_state.sort_values("affected", ascending=False)

Unnamed: 0_level_0,tested,affected
state,Unnamed: 1_level_1,Unnamed: 2_level_1
nsw,2898,739
vic,2135,516
qld,1639,410
wa,808,198
sa,677,175
tas,234,66
act,156,36
nt,75,20


In [9]:
df_prevalence_postcode = (
    df_pcr[["dedup_id", "pcr"]]
    .groupby("dedup_id").max()
    .replace({"N": 0, "P": 1})
    .merge(df_pcr[["dedup_id", "postcode"]], on="dedup_id")
    .groupby("postcode").aggregate({"dedup_id": "count", "pcr": "sum"})
    .rename(columns={"dedup_id": "tested", "pcr": "affected"})
)

df_prevalence_postcode.sort_values("affected", ascending=False)

Unnamed: 0_level_0,tested,affected
postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
2170,35,18
4740,31,13
4305,27,9
4215,24,9
3690,17,8
...,...,...
3786,1,0
3787,2,0
3791,1,0
3796,2,0


Carto

In [37]:
from ipyleaflet import Choropleth, LegendControl, Map
from branca.colormap import linear
import json

geo_data = json.load(open("states.geojson"))

choro_data = df_prevalence_state["affected"].rename(index={"nsw":0, "vic":1, "qld":2, "sa":3, "wa":4, "tas":5, "nt":6, "act":7}).to_dict()

choropleth = Choropleth(
    geo_data=geo_data,
    choro_data=choro_data,
    colormap=linear.YlOrRd_04,
    style={'fillOpacity': 0.8, 'dashArray': '5, 5'},
)

map_ = Map(center=(-25.8, 136.8698), zoom=4)
map_.add_layer(choropleth)

map_

Map(center=[-25.8, 136.8698], controls=(ZoomControl(options=['position', 'zoom_in_text', 'zoom_in_title', 'zoo…