# Merge

Combine data files into a CSV that's ready for analysis

In [29]:
import pandas as pd

Import data files

In [30]:
# Load the processed death records. The two date columns are parsed as
# datetimes and every other listed column is pinned to an explicit dtype.
deaths_df = pd.read_csv(
    "../input/processed/death-records.csv",
    parse_dates=["date_of_death", "date_of_birth"],
    dtype={
        # Text columns. Several spellings of similar columns are listed
        # (e.g. three variants of the father's last name) -- presumably the
        # header varies between source files; verify against the input.
        **dict.fromkeys(
            [
                "last_name",
                "first_name",
                "middle_name",
                "sex",
                "place_of_birth_state_or_foreign_country",
                "place_of_death_county",
                "fathers_last_name",
                "file_name",
                "death_facility_name_location",
                "place_of_death_address_street_number",
                "place_of_death_address_street",
                "place_of_death_city",
                "father's_last_name",
                "place_of_death_facility",
                "place_of_death_facility_name_location",
                "place_of_death_address_street_name",
                "export_date",
                "father_last_name",
            ],
            str,
        ),
        # Year columns are read as integers.
        "file_year": int,
        "year_of_death": int,
    },
)

In [31]:
# County populations; reshaped further down from one column per year into
# one row per (county, year).
pops_df = pd.read_csv(
    filepath_or_buffer="../input/raw/dof/county-populations.csv",
)

In [32]:
# Recent temperatures by county, with the "date" column parsed as datetimes
# so it can be joined against the death dates.
recent_temps_df = pd.read_csv(
    filepath_or_buffer="../input/processed/recent-temps-by-county.csv",
    parse_dates=["date"],
)

In [33]:
# Historic temperatures by county; joined later on county, month and day.
historic_temps_df = pd.read_csv(
    filepath_or_buffer="../input/processed/historic-temps-by-county.csv",
)

Aggregate

In [34]:
# Count death records per (date of death, county of death) pair, then give
# the key columns the short names used by the rest of the notebook.
deaths_by_day = (
    deaths_df
    .groupby(["date_of_death", "place_of_death_county"])
    .size()
    .reset_index(name="deaths")
    .rename(
        columns={
            "date_of_death": "date",
            "place_of_death_county": "county",
        }
    )
)

Backfill: add a zero-death row for every county/date combination missing from the counts

In [35]:
# Make sure every county has a row for every date that appears anywhere in
# deaths_by_day, filling the combinations with no recorded deaths with 0.
#
# This is done with an explicit reindex over the date x county product
# instead of unstack(...).stack(..., dropna=False): the `dropna` argument
# belongs to the legacy DataFrame.stack implementation that pandas is phasing
# out (see the `future_stack` parameter), so the old spelling does not
# survive newer pandas releases.
#
# The result is unchanged: rows are built in (date, county) order, given a
# fresh integer index, then sorted by county and date. As before, "deaths"
# becomes float whenever at least one combination had to be filled.
# NOTE: only dates present for at least one county are covered; a date with
# no deaths in any county still has no rows.
backfilled_deaths_by_day = (
    deaths_by_day.set_index(["date", "county"])
    .reindex(
        pd.MultiIndex.from_product(
            [
                deaths_by_day["date"].sort_values().unique(),
                deaths_by_day["county"].sort_values().unique(),
            ],
            names=["date", "county"],
        )
    )
    .fillna(0)
    .reset_index()
    .sort_values(["county", "date"])
)

Annotate

In [36]:
# Name of the weekday (e.g. "Saturday") for each row's date.
backfilled_deaths_by_day["day_name"] = (
    backfilled_deaths_by_day["date"].dt.day_name()
)

In [37]:
# True for rows whose weekday name is Saturday or Sunday.
backfilled_deaths_by_day["is_weekend"] = (
    backfilled_deaths_by_day["day_name"].isin(["Saturday", "Sunday"])
)

In [38]:
# Day of the month; used later as a join key against the historic temperatures.
backfilled_deaths_by_day["day"] = (
    backfilled_deaths_by_day["date"].dt.day
)

In [39]:
# Calendar month; used later as a join key against the historic temperatures.
backfilled_deaths_by_day["month"] = (
    backfilled_deaths_by_day["date"].dt.month
)

In [40]:
# Calendar year; used later as a join key against the yearly populations.
backfilled_deaths_by_day["year"] = (
    backfilled_deaths_by_day["date"].dt.year
)

Merge populations into deaths

In [41]:
# Reshape the population table from one column per year into one row per
# (county, year): name the column axis "year" and the stacked values
# "population" up front so no renaming is needed after the reshape.
pops_by_year = (
    pops_df
    .set_index("county")
    .rename_axis(columns="year")
    .stack()
    .rename("population")
    .reset_index()
)

In [42]:
# The years come from column labels, so convert them to integers to match
# the "year" column derived from the death dates.
pops_by_year["year"] = pops_by_year["year"].astype(int)

In [43]:
# Upper-case county names before joining on them.
pops_by_year["county"] = pops_by_year["county"].str.upper()

In [44]:
# Attach each county's population for the matching year to every daily row.
# validate="many_to_one" makes pandas raise a MergeError if pops_by_year ever
# holds more than one row for a (county, year) pair. Without it, duplicated
# population rows could fan out the join and offset rows dropped by the inner
# join, so a plain row-count comparison could pass on a wrong result.
pops_merge = backfilled_deaths_by_day.merge(
    pops_by_year,
    on=["county", "year"],
    how="inner",
    validate="many_to_one",
)

In [45]:
# Number of rows after the population merge.
pops_merge.shape[0]

211642

In [46]:
# The population merge must neither drop nor add rows.
assert backfilled_deaths_by_day.shape[0] == pops_merge.shape[0]

Merge temps

In [47]:
# Upper-case county names before joining on them.
recent_temps_df["county"] = recent_temps_df["county"].str.upper()

In [48]:
# Join the observed temperatures onto the daily rows. The join is inner, so
# only (county, date) pairs present in both frames are kept.
recent_temps_merge = pd.merge(
    pops_merge,
    recent_temps_df,
    how="inner",
    on=["county", "date"],
)

In [49]:
# NOTE(review): this row-count check is disabled. The merge in the cell above
# uses how="inner", so rows without a match on (county, date) are dropped and
# the three lengths need not agree -- confirm whether the check should be
# restored in a weaker form or removed.
#assert len(recent_temps_merge) == len(pops_merge) == len(recent_temps_df)

In [50]:
# Preview the first five merged rows.
recent_temps_merge.head(n=5)

Unnamed: 0,date,county,deaths,day_name,is_weekend,day,month,year,population,tmax_f
0,2010-01-02,ALAMEDA,30.0,Saturday,True,2,1,2010,1516721,61.6
1,2010-01-03,ALAMEDA,32.0,Sunday,True,3,1,2010,1516721,57.1
2,2010-01-04,ALAMEDA,26.0,Monday,False,4,1,2010,1516721,57.0
3,2010-01-05,ALAMEDA,29.0,Tuesday,False,5,1,2010,1516721,54.3
4,2010-01-06,ALAMEDA,23.0,Wednesday,False,6,1,2010,1516721,55.7


In [51]:
# Upper-case county names before joining on them.
historic_temps_df["county"] = historic_temps_df["county"].str.upper()

In [52]:
# Join the historic temperatures by county and calendar day (month + day),
# keeping only rows present in both frames, then expose the historic
# "tmax_95" column under the name used for the heat-event threshold.
historic_temps_merge = pd.merge(
    recent_temps_merge,
    historic_temps_df,
    how="inner",
    on=["county", "month", "day"],
).rename(columns={"tmax_95": "heat_event_threshold_f"})

In [53]:
# After the historic merge, the data should contain exactly the months May
# through October. Compare as sets so the check does not depend on the order
# in which months first appear in the frame, and so an unexpected number of
# distinct months fails as an AssertionError rather than as a NumPy
# shape-mismatch error from an element-wise comparison.
assert set(historic_temps_merge["month"].unique()) == {5, 6, 7, 8, 9, 10}, (
    "historic_temps_merge should contain exactly the months 5 through 10"
)

In [54]:
# Round the threshold to a whole number of degrees before comparing against it.
historic_temps_merge["heat_event_threshold_f"] = (
    historic_temps_merge["heat_event_threshold_f"].round(decimals=0)
)

Calculate

In [55]:
# A day counts as a heat event when its observed maximum temperature is at or
# above the (rounded) threshold for that county and calendar day.
historic_temps_merge["is_heat_event"] = historic_temps_merge["tmax_f"].ge(
    historic_temps_merge["heat_event_threshold_f"]
)

Export

In [56]:
# Write the final per-day, per-county table without the DataFrame index.
historic_temps_merge.to_csv(
    path_or_buf="../output/totals-by-day.csv",
    index=False,
)