## PM 2.5 and deaths due to overdose

In [1]:
import os

import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt

from air_brain.data.get_data import DATA_DIR
from air_brain.util.air import PM25, SO2
from air_brain.util.od import od

## Connect overdose death location to PM 2.5 AQI
1. Read in Allegheny County overdose deaths, marked with timestamp, incident zipcode, and drugs involved
   - Will use zipcode to estimate a latitude/longitude for each incident in Allegheny County
   - Some deaths occurred outside Allegheny County; these are not included in this analysis
2. Generate PM 2.5 AQI for each overdose death, based on
   - Mean/median of all measurement stations
   - Inverse distance weighting (IDW) of all station measurements
   - TODO Kriging

In [2]:
# Overdose deaths, as returned by the project's od() loader.
od_df = od()

# Per-date, per-zipcode PM 2.5 estimates; parse `date` so it can serve as a merge key.
pm25 = pd.read_csv(os.path.join(DATA_DIR, "pm25_zipcode.csv"))
pm25["date"] = pd.to_datetime(pm25["date"])

# Outer join keeps both sides: date/zipcode pairs with no recorded death and
# deaths with no PM 2.5 estimate. validate="m:1" makes pandas raise if the
# PM 2.5 table has duplicate (date, zipcode) keys.
df = od_df.merge(
    pm25,
    how="outer",
    on=["date", "zipcode"],
    validate="m:1",
)

In [3]:
# A row lacks a PM 2.5 estimate (`idw` is NaN) when its date is outside the PM 2.5
# measurement range or its zipcode is outside Allegheny County; those rows
# cannot be analysed.
has_pm25 = df["idw"].notna()

# Set the unmatched rows aside so they can later be compared with deaths that
# do have PM 2.5 data.
no_pm25 = df.loc[~has_pm25].copy()
print(len(no_pm25))

# NOTE(review): `df` is overwritten here, so re-running this cell without
# re-running the merge cell first leaves `no_pm25` empty.
df = df.loc[has_pm25].copy()
df.head()

2968


Unnamed: 0,death_date_and_time,manner_of_death,age,sex,race,case_dispo,combined_od1,combined_od2,combined_od3,combined_od4,...,combined_od6,combined_od7,combined_od8,combined_od9,combined_od10,zipcode,case_year,date,geometry,idw
2437,NaT,,,,,,,,,,...,,,,,,15006.0,,2016-01-01,POINT (-79.8814 40.6312),28.871941
2438,NaT,,,,,,,,,,...,,,,,,15007.0,,2016-01-01,POINT (-79.931 40.6478),28.831643
2439,NaT,,,,,,,,,,...,,,,,,15014.0,,2016-01-01,POINT (-79.7414 40.6082),29.16376
2440,NaT,,,,,,,,,,...,,,,,,15015.0,,2016-01-01,POINT (-80.0811 40.6372),28.783201
2441,NaT,,,,,,,,,,...,,,,,,15017.0,,2016-01-01,POINT (-80.1153 40.3472),29.317682


In [4]:
# One row per (date, zipcode) pair that has PM 2.5 data:
#   od_count - number of non-null `case_dispo` entries, used as the overdose death count
#   idw      - largest `idw` value within the group
# NOTE(review): a death whose `case_dispo` is missing would not be counted -
# confirm that column is always populated for real cases.
# TODO subset by type of overdose??
per_day = (
    df.groupby(["date", "zipcode"])
    .agg(od_count=("case_dispo", "count"), idw=("idw", "max"))
    .reset_index()
)

# Overdose deaths are rare on any given day, so also keep a 0/1 flag for
# "at least one overdose death in this zipcode on this date".
per_day["od_bin"] = per_day["od_count"].gt(0).astype(int)

## Relationship between overdose death count and PM 2.5
For each date and zipcode, we have an overdose death count and a PM 2.5 estimate.

In [5]:
# Linear mixed model: daily overdose death count regressed on the PM 2.5
# estimate (`idw`), with zipcode as the grouping factor.
# NOTE(review): this treats a sparse count as a Gaussian outcome; a Poisson-type
# model may be a better fit - confirm before interpreting the coefficients.
md = smf.mixedlm(
    formula="od_count ~ idw",
    data=per_day,
    groups=per_day["zipcode"],
)
mdf = md.fit()
mdf.summary()



0,1,2,3
Model:,MixedLM,Dependent Variable:,od_count
No. Observations:,463204,Method:,REML
No. Groups:,142,Scale:,0.0092
Min. group size:,3262,Log-Likelihood:,427092.6296
Max. group size:,3262,Converged:,Yes
Mean group size:,3262.0,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,0.009,0.001,7.745,0.000,0.007,0.011
idw,0.000,0.000,0.692,0.489,-0.000,0.000
Group Var,0.000,0.000,,,,


In [6]:
# Linear mixed model: binary "any overdose death" flag regressed on the PM 2.5
# estimate (`idw`), with zipcode as the grouping factor.
# NOTE(review): this is a linear probability model on a 0/1 outcome; a logistic
# mixed model may be more appropriate - confirm before interpreting.
md = smf.mixedlm(
    formula="od_bin ~ idw",
    data=per_day,
    groups=per_day["zipcode"],
)
mdf = md.fit()
mdf.summary()



0,1,2,3
Model:,MixedLM,Dependent Variable:,od_bin
No. Observations:,463204,Method:,REML
No. Groups:,142,Scale:,0.0088
Min. group size:,3262,Log-Likelihood:,439392.9716
Max. group size:,3262,Converged:,Yes
Mean group size:,3262.0,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,0.009,0.001,7.872,0.000,0.007,0.011
idw,0.000,0.000,0.535,0.593,-0.000,0.000
Group Var,0.000,0.000,,,,
