In [71]:
import json
import urllib
import urllib.request

import numpy as np
import pandas as pd

## California

In [3]:
# Download the hospitalization resource from the California open-data CKAN
# DataStore API and load its records into a DataFrame. The `todays_date`
# column is renamed to `date` so it shares a name with the cases table.
url = 'https://data.ca.gov/api/3/action/datastore_search?resource_id=42d33765-20fd-44b8-a978-b083b7542225&limit=100000'
# The `with` block closes the HTTP response as soon as the body is read,
# even if reading or decoding raises.
with urllib.request.urlopen(url) as fileobj:
    hospitalizations_str = fileobj.read().decode("utf-8")
hospitalizations_dict = json.loads(hospitalizations_str)
# The rows live under result -> records in the decoded JSON payload.
hospitalizations = pd.DataFrame(hospitalizations_dict["result"]["records"]).rename(columns={"todays_date":"date"})



In [4]:
# Download the case/death resource from the California open-data CKAN
# DataStore API and load its records into a DataFrame.
url = 'https://data.ca.gov/api/3/action/datastore_search?resource_id=926fd08f-cc91-4828-af38-bd45de97f8c3&limit=100000'
# The `with` block closes the HTTP response as soon as the body is read,
# even if reading or decoding raises.
with urllib.request.urlopen(url) as fileobj:
    cases_str = fileobj.read().decode("utf-8")
cases_dict = json.loads(cases_str)
# The rows live under result -> records in the decoded JSON payload.
cases = pd.DataFrame(cases_dict["result"]["records"])




In [5]:
# Build the California table: one row per case record, with the matching
# hospitalization columns attached (left join keeps every case row).
#
# The join keys are spelled out. Without `on=`, pd.merge joins on every column
# name the two frames share; DataStore records carry a per-resource `_id` row
# number, so `_id` would silently become part of the key and hospital rows
# would only match by coincidence.
# NOTE(review): assumes `county` and `date` are the only columns the two
# resources are meant to share -- confirm against the live schemas.
ca_data = (
    cases.drop(columns="_id", errors="ignore")
    .merge(
        hospitalizations.drop(columns="_id", errors="ignore"),
        how="left",
        on=["county", "date"],
    )
    # Only the per-day "new" counts are kept; the running totals are dropped.
    .drop(columns=["totalcountconfirmed", "totalcountdeaths"])
    .assign(state="California")
    .rename(columns={"newcountconfirmed": "cases", "newcountdeaths": "deaths"})
)
ca_data["date"] = pd.to_datetime(ca_data["date"])


## Washington

In [6]:
# Read the New York Times county-level CSV, discard its `fips` column and
# convert the `date` column to pandas datetimes.
nyt_data = (
    pd.read_csv("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv")
    .drop(columns="fips")
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
)



In [7]:
# Keep only the rows whose `state` value is exactly "Washington".
wa_data = nyt_data.loc[nyt_data["state"].eq("Washington")]

## Oregon

In [8]:
# Keep only the rows whose `state` value is exactly "Oregon".
or_data = nyt_data.loc[nyt_data["state"].eq("Oregon")]

## Aggregate

In [199]:
# Stack the three state tables into one frame with a fresh 0..n-1 index.
#
# The frames do not share the same columns, so pandas has to decide whether to
# sort the column axis. `sort=True` pins the sorting behaviour explicitly
# instead of relying on a default that changed between pandas versions (and
# emitted a FutureWarning), keeping the column order of the saved CSVs stable.
# NOTE(review): ca_data's cases/deaths come from the `newcount*` columns;
# confirm the NYT cases/deaths are on the same basis before comparing states.
all_data_raw = pd.concat([wa_data, or_data, ca_data], sort=True, ignore_index=True)
# Identifier and headline columns go first; every other column follows in the
# order produced by the concat above.
first_columns = ["state", "county", "date", "cases", "deaths"]
new_col_order = first_columns + [x for x in all_data_raw.columns if x not in first_columns]
all_data_raw = all_data_raw[new_col_order]

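FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.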


  """Entry point for launching an IPython kernel.


# Negative case/death numbers

In [200]:
def infer_num(i, full_list):
    """Estimate a replacement for ``full_list[i]`` from nearby valid values.

    The estimate is the arithmetic mean of up to two of the closest
    non-negative entries before position ``i`` and up to two of the closest
    non-negative entries after it. Entries that fail the ``x >= 0`` test
    (negative values, and NaN since comparisons with NaN are false) are
    skipped when looking for neighbours.

    Parameters
    ----------
    i : int
        Position of the value to replace.
    full_list : sequence of numbers
        The complete series the position belongs to.

    Returns
    -------
    number
        Mean of the selected neighbours, or ``0`` when there is no
        non-negative value on either side (previously this case raised
        ``ZeroDivisionError``).
    """
    # Slicing with [-2:] / [:2] already copes with fewer than two candidates.
    before = [x for x in full_list[:i] if x >= 0][-2:]
    after = [x for x in full_list[(i + 1):] if x >= 0][:2]
    neighbours = before + after
    if not neighbours:
        # Nothing usable to average; fall back to zero.
        return 0
    return sum(neighbours) / len(neighbours)

def impute_cases(series):
    """Return the values of ``series`` as a list with negatives replaced.

    Accepts any iterable of numbers (a pandas Series or a plain list; the
    original implementation only worked on array-likes supporting ``< 0``).

    * No negative value: the values are returned unchanged as a list.
    * A single negative value on its own: it is replaced by ``0``.
    * Otherwise every negative value is replaced, left to right, by
      ``infer_num(position, values)``. Replacements are written back as they
      are computed, so later positions see the earlier replacements.

    Parameters
    ----------
    series : iterable of numbers

    Returns
    -------
    list
        A new list; the input object itself is not modified.
    """
    values = list(series)
    # Positions are collected before any replacement happens.
    negative_positions = [pos for pos, value in enumerate(values) if value < 0]
    if not negative_positions:
        return values
    if len(values) == 1:
        # A lone negative value has no neighbours to borrow from.
        values[0] = 0
        return values
    for pos in negative_positions:
        values[pos] = infer_num(pos, values)
    return values

def impute_cases_df(df):
    """Return a copy of ``df`` with negative ``cases`` / ``deaths`` imputed.

    Both columns are passed through ``impute_cases``. The input frame is left
    untouched: this function is used with ``groupby(...).apply``, and pandas
    does not support applied functions that mutate the group they receive.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``cases`` and ``deaths`` columns.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and column order as ``df``.
    """
    return df.assign(
        cases=impute_cases(df["cases"]),
        deaths=impute_cases(df["deaths"]),
    )


In [201]:
# Impute negative counts separately within each (state, county) series.
# group_keys=False keeps the original row index on the result; without it,
# newer pandas versions prepend `state`/`county` as index levels, which then
# clash with the columns of the same name in later groupby calls.
all_data = all_data_raw.groupby(["state", "county"], group_keys=False).apply(impute_cases_df)



## Save to file

In [202]:
# Collapse the county rows into one row per state and date by summing.
# The groupby `sum` method replaces the Python builtin passed to `agg`, and
# numeric_only=True restricts the total to numeric columns so that text
# columns such as `county` are left out rather than concatenated.
all_data_agg = all_data.groupby(["state", "date"]).sum(numeric_only=True).reset_index()

In [203]:
# Write both tables to CSV files in the working directory, leaving the
# DataFrame index out of the files.
all_data.to_csv(path_or_buf="county_data.csv", index=False)      # county-level rows
all_data_agg.to_csv(path_or_buf="state_data.csv", index=False)   # per state and date