In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the county-level CSV. The text columns are read with dtype `str` so
# their values are kept as written in the file rather than being interpreted
# (a numeric-looking identifier stays a string). `str` is used instead of
# `np.string_` because that alias no longer exists as of NumPy 2.0.
df_counties_original = pd.read_csv(
    "us-counties.csv",
    dtype={
        "date": str,
        "county": str,
        "state": str,
        "fips": str,
        "cases": np.int32,
        "deaths": np.int32,
    },
)
# Replace NaN placeholders with plain None for missing entries.
df_counties_original = df_counties_original.replace({np.nan: None})
# Display only: the sorted copy is shown as the cell output and is not
# assigned, so `df_counties_original` keeps the file's row order.
df_counties_original.sort_values("fips", ascending=True)

In [None]:
# Every distinct county per state, as {"name": ..., "state": ...} dicts.
# States are visited in order of first appearance, and so are the counties
# within each state.
county_list = [
    {"name": county_name, "state": state_name}
    for state_name in df_counties_original["state"].unique()
    for county_name in df_counties_original.loc[
        df_counties_original["state"] == state_name, "county"
    ].unique()
]

print(len(county_list), county_list)


In [None]:
# Distinct date strings in ascending order, plus the smallest and largest of
# them (the bounds of the reporting window).
date_str_list = np.sort(df_counties_original["date"].unique())
dt_str_start = date_str_list.min()
dt_str_end = date_str_list.max()


In [None]:
from datetime import datetime

# Parse the first and last date strings, then enumerate one timestamp per
# calendar day between them (both endpoints included).
dt_start = datetime.strptime(dt_str_start, "%Y-%m-%d")
dt_end = datetime.strptime(dt_str_end, "%Y-%m-%d")
dt_range = pd.date_range(start=dt_start, end=dt_end)
print(len(dt_range), dt_range)

# The same days rendered back to "YYYY-MM-DD" strings.
dt_range_str = [day.strftime("%Y-%m-%d") for day in dt_range]
print(len(dt_range_str), dt_range_str)

In [None]:
import multiprocessing as mp

# Value stored in a county's series for any date that has no row.
missing_date = 0

# Manager-backed list proxy: records appended by worker processes end up in
# this one shared list.
manager = mp.Manager()
county_csv_new_mp = manager.list()
def generate_county_data(county_list):
    """Build one summary record per county and append it to ``county_csv_new_mp``.

    For each entry of ``county_list`` the matching rows of
    ``df_counties_original`` are walked across every day of ``dt_range`` to
    produce a case series and a death series with exactly one value per day.

    Reads the module-level names ``df_counties_original``, ``dt_start``,
    ``dt_end``, ``dt_range`` and ``missing_date``; writes to
    ``county_csv_new_mp``.

    Args:
        county_list: sized sequence of dicts with the keys ``"name"`` and
            ``"state"``. Each pair must match at least one row of
            ``df_counties_original`` (the fips lookup takes the first match).

    Returns:
        None. Each record is appended to ``county_csv_new_mp`` with the keys
        ``county``, ``state``, ``fips``, ``dt_start``, ``dt_end``,
        ``num_days``, ``dt_first_case``, ``dt_first_death``, ``cases_ts`` and
        ``deaths_ts``.
    """
    worker = mp.current_process()
    # Pool workers carry a non-empty `_identity` tuple; the main process does
    # not, so fall back to the OS pid instead of failing with an IndexError
    # when the function is called directly.
    identity = getattr(worker, "_identity", ())
    pid = identity[0] if identity else worker.pid
    print(pid)
    counter = 0

    for county in county_list:
        counter += 1
        if counter % 10 == 1:
            print("Process {} is working on {}/{}".format(pid, counter, len(county_list)))

        # Select this county's rows once; the per-day lookups below then scan
        # only these rows instead of re-filtering the whole frame every day.
        county_rows = df_counties_original.loc[
            (df_counties_original['county'] == county['name'])
            & (df_counties_original['state'] == county['state'])
        ]
        county_dates = county_rows['date']

        county_record = {}
        county_record["county"] = county['name']
        county_record["state"] = county['state']
        # fips is taken from the first matching row.
        county_record["fips"] = county_rows.iloc[0].fips

        # dt_start, dt_end, num_days (the date strings carry a trailing "-")
        county_record["dt_start"] = dt_start.strftime("%Y-%m-%d")+"-"
        county_record["dt_end"] = dt_end.strftime("%Y-%m-%d")+"-"
        county_record["num_days"] = len(dt_range)

        # case, death
        cases_ts = []
        deaths_ts = []
        first_case_dt = None
        first_death_dt = None
        for dt in dt_range:
            dt_key = dt.strftime("%Y-%m-%d")
            r = county_rows.loc[county_dates == dt_key]
            if len(r) == 0:
                # A gap is only reported once the county has started
                # reporting; days before its first row are expected to be
                # absent.
                if first_case_dt is not None:
                    print("{county} {state} missing data on {dt}".format(county=county_record["county"],
                                                                         state=county_record["state"],
                                                                         dt=dt_key))
                cases_ts.append(missing_date)
                deaths_ts.append(missing_date)
                continue

            if len(r) > 1:
                # Report the duplicate, but still record the first row so the
                # series keeps one value per day and stays aligned with
                # dt_range.
                print("{county} {state} duplicate data on {dt}".format(county=county_record["county"],
                                                                         state=county_record["state"],
                                                                         dt=dt_key))
            if first_case_dt is None:
                first_case_dt = dt
            cases_ts.append(int(r.iloc[0].cases))
            deaths_ts.append(int(r.iloc[0].deaths))
            if first_death_dt is None and deaths_ts[-1] > 0:
                first_death_dt = dt

        if first_case_dt is None:
            print("{county} {state} first case date None".format(county=county_record["county"],
                                                                         state=county_record["state"]))
        county_record["dt_first_case"] = first_case_dt.strftime("%Y-%m-%d")+"-" if first_case_dt is not None else None
        county_record["dt_first_death"] = first_death_dt.strftime("%Y-%m-%d")+"-" if first_death_dt is not None else None

        # Sanity checks: both series must hold one value per day of dt_range.
        if len(cases_ts) != county_record["num_days"]:
            print("{county} {state} missing case data".format(county=county_record["county"],
                                                                         state=county_record["state"]))
        if len(deaths_ts) != county_record["num_days"]:
            print("{county} {state} missing death data".format(county=county_record["county"],
                                                                         state=county_record["state"]))
        county_record["cases_ts"] = {"values": cases_ts}
        county_record["deaths_ts"] = {"values": deaths_ts}

        county_csv_new_mp.append(county_record)
# Number of worker processes; the county list is cut into this many chunks.
N = 8
grouped_country_list = np.array_split(county_list, N)
# NOTE(review): the workers read notebook-level globals (the DataFrame, the
# date range, the shared list). That holds when workers are forked from this
# process -- confirm on platforms whose default start method is "spawn".
pool = mp.Pool(processes=N)  # Instantiate the pool here
try:
    # Blocks until every chunk has been processed; records are collected in
    # county_csv_new_mp, so the return value is not needed.
    pool.map(generate_county_data, grouped_country_list)
finally:
    # Always release the worker processes, even if a chunk raised.
    pool.close()
    pool.join()

print(len(county_csv_new_mp))

In [None]:
# Copy the manager-backed proxy into an ordinary in-process list.
county_csv_new = [*county_csv_new_mp]
county_csv_new

In [None]:
# One row per county record, plus a "County, State" label for each row.
data_df = pd.DataFrame(county_csv_new)
indices = data_df["county"] + (", " + data_df["state"])
indices

In [None]:
# Re-key the frame by the "County, State" labels.
data_df = data_df.set_index(keys=indices)
data_df

In [None]:
# Nested mapping of the form {index label: {column name: value}}.
data_dict = data_df.to_dict("index")

In [None]:
# Largest case and death value seen for each (county, state) pair, ordered
# from the highest case count down.
df = (
    df_counties_original
    .groupby(['county', 'state'], sort=False)[['cases', 'deaths']]
    .max()
    .sort_values("cases", ascending=False)
)
# "County, State" labels in that same order.
county_state_list = [county + ", " + state for county, state in df.index.tolist()]
meta = {
    "dates": dt_range_str,
    "states": county_state_list,
    "cases": df["cases"].tolist(),
    "deaths": df["deaths"].tolist(),
}
# Stored next to the per-county entries under the reserved "metadata" key.
data_dict["metadata"] = meta


In [None]:
#data_df.to_json("./nyt_counties_data.json", orient='index')
import json

# Serialise the per-county records and the metadata entry into one JSON file.
with open('./nyt_counties_data.json', 'w') as json_file:
    json.dump(data_dict, fp=json_file)

In [None]:
import geopandas as gpd

In [None]:
# Read the county geometry file and preview its first five rows.
old_counties_geojson_df = gpd.read_file("counties_update_new.geojson")
old_counties_geojson_df.head()

In [None]:
# Distinct values of the NAME column.
# NOTE(review): other cells treat NAME as the county name (states live in
# `state_name`), so this variable's name looks misleading -- confirm.
state_name_list2 = old_counties_geojson_df["NAME"].unique()

In [None]:
# Every distinct NAME per state_name in the geometry file, as
# {"name": ..., "state": ...} dicts (same shape as `county_list`).
county_list_2 = [
    {"name": county_name, "state": state_name}
    for state_name in old_counties_geojson_df["state_name"].unique()
    for county_name in old_counties_geojson_df.loc[
        old_counties_geojson_df["state_name"] == state_name, "NAME"
    ].unique()
]

print(len(county_list_2), county_list_2)

In [None]:
# Print each entry of `county_list` that has no identical entry in
# `county_list_2`, followed by how many such entries there are.
missing_county_counter = 0
for n in county_list:
    if n in county_list_2:
        continue
    print(n)
    missing_county_counter += 1
print(missing_county_counter)

In [None]:
# Left join: keep every geometry row and attach the matching county record
# where NAME/state_name equals county/state.
final_df = pd.merge(
    left=old_counties_geojson_df[["NAME", "state_name", "population", "geometry"]],
    right=data_df,
    how='left',
    left_on=['NAME', 'state_name'],
    right_on=['county', 'state'],
)
final_df


In [None]:
# Keep only the columns that go into the export, in this order.
final_df = final_df[[
    "NAME",
    "state_name",
    "population",
    "fips",
    "dt_start",
    "dt_end",
    "dt_first_case",
    "dt_first_death",
    "cases_ts",
    "deaths_ts",
    "geometry",
]]
final_df

In [None]:
# Number of rows in the merged frame.
final_df.shape[0]

In [None]:
# Write the merged frame to disk as GeoJSON.
final_df.to_file(filename="nyt_counties_data.geojson", driver="GeoJSON")
print("done")