In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
import geopandas as gpd

FIRST READ DATA

In [None]:
# Load the county-level report. The text columns are read as Python strings so
# that values such as FIPS codes are not parsed as numbers; the two counters
# are read as 32-bit integers.
# `str` is used instead of `np.string_`, which was removed in NumPy 2.0.
df_counties_original = pd.read_csv(
    "./us-counties.csv",
    dtype={
        "date": str,
        "county": str,
        "state": str,
        "fips": str,
        "cases": np.int32,
        "deaths": np.int32,
    },
)
# Transform unknown (missing) FIPS codes to the placeholder '00000'.
df_counties_original['fips'] = df_counties_original['fips'].fillna('00000')
# Display only: the sorted frame is shown but not assigned back.
df_counties_original.sort_values("fips", ascending=True)


Generate GeoJSON

In [None]:
# Reshape the long report into one row per (state, county, fips), with the
# remaining value columns spread across one column per date.
pivot_counties = df_counties_original.pivot_table(
    index=['state', 'county', 'fips'],
    columns=['date'],
)
# Dates on which a county has no row become 0 instead of NaN.
pivot_counties = pivot_counties.fillna(0)
pivot_counties

Get Date Info

In [None]:
# The date labels are the columns found under the 'cases' group of the pivot.
cases_by_date = pivot_counties['cases']
date = cases_by_date.columns
date

In [None]:
# Sort the date labels and keep the smallest / largest as the range bounds.
date_str = np.sort(date)
dt_str_start = date_str.min()
dt_str_end = date_str.max()

In [None]:
# Parse the bounds, build the daily range between them (both ends included)
# and keep a second copy formatted back into "%Y-%m-%d" strings.
DATE_FORMAT = "%Y-%m-%d"
dt_start = datetime.strptime(dt_str_start, DATE_FORMAT)
dt_end = datetime.strptime(dt_str_end, DATE_FORMAT)
dt_range = pd.date_range(start=dt_start, end=dt_end)
print(len(dt_range), dt_range)
dt_range_str = [day.strftime(DATE_FORMAT) for day in dt_range]
print(len(dt_range_str), dt_range_str)

Add Cases Time Series, First Case Date, Death Time Series, First Death Date

In [None]:
# import json
# pivot_counties['cases_ts'] = json.dumps({"values": pivot_counties['cases'].values.tolist()[0]})
# pivot_counties['deaths_ts'] =  json.dumps({"values": pivot_counties['deaths'].values.tolist()[0]})

# Serialise each row's per-date values into one comma-separated string,
# once for the 'cases' columns and once for the 'deaths' columns.
case_rows = pivot_counties['cases'].values.tolist()
pivot_counties['cases_ts'] = [','.join(str(value) for value in row) for row in case_rows]
death_rows = pivot_counties['deaths'].values.tolist()
pivot_counties['deaths_ts'] = [','.join(str(value) for value in row) for row in death_rows]

pivot_counties

In [None]:
# Latest-date value and its difference from the second-to-last date column,
# for both cases and deaths.
yesterday = date.values[-2]
cases_per_date = pivot_counties['cases']
deaths_per_date = pivot_counties['deaths']
pivot_counties['today_case'] = cases_per_date[dt_str_end]
pivot_counties['today_new_case'] = cases_per_date[dt_str_end] - cases_per_date[yesterday]
pivot_counties['today_death'] = deaths_per_date[dt_str_end]
pivot_counties['today_new_death'] = deaths_per_date[dt_str_end] - deaths_per_date[yesterday]

In [None]:
# First date on which each row shows a positive count.
# idxmax over an all-False row returns the first date column, so rows that
# never show a positive value are blanked out afterwards. The check looks at
# every date column (not only the last one): a row that is zero-filled on the
# final date but positive earlier must keep its first date.
has_case = pivot_counties['cases'] > 0
has_death = pivot_counties['deaths'] > 0
pivot_counties['dt_first_case'] = has_case.idxmax(axis=1)
pivot_counties['dt_first_death'] = has_death.idxmax(axis=1)
# Deal with rows that have no cases / no deaths on any date.
pivot_counties.loc[~has_case.any(axis=1), 'dt_first_case'] = np.nan
pivot_counties.loc[~has_death.any(axis=1), 'dt_first_death'] = np.nan
pivot_counties.head(5)


Read County Boundary GeoJSON file

In [None]:
# Load the county boundary layer and preview its first rows.
county_boundary_path = r"./counties_update_new.geojson"
old_counties_geojson_df = gpd.read_file(county_boundary_path)
old_counties_geojson_df.head(5)


Old Data Structure

In [None]:
# Keep only the derived summary columns, turn the (state, county, fips) index
# back into ordinary columns and flatten the column labels to plain names.
summary_fields = ['cases_ts','deaths_ts','dt_first_case','dt_first_death','today_case','today_new_case', 'today_death','today_new_death']
report_df = pivot_counties[summary_fields].reset_index()
report_df.columns = ['state','county','fips'] + summary_fields
report_df.head(5)

In [None]:
# Sanity check: sum of `today_case` over every report row, to compare with
# the reference figure noted below.
report_df['today_case'].sum()
# Truth from report: 528422

In [None]:
# Distinct "state,county" labels found in the boundary file.
geojson_labels = old_counties_geojson_df['state_name'] + "," + old_counties_geojson_df['NAME']
geojson_counties = geojson_labels.unique()
geojson_counties

In [None]:
# Distinct "state,county" labels found in the case report.
report_labels = report_df['state'] + "," + report_df['county']
report_counties = report_labels.unique()
report_counties

In [None]:
# Print every "state,county" label of the boundary file that has no match in
# the case report. Membership is tested against a set: `in` on a NumPy array
# rescans the whole array with an element-wise comparison on every iteration.
report_county_set = set(report_counties)
for county in geojson_counties:
    if county not in report_county_set:
        print(county)

In [None]:
# Print every "state,county" label of the case report that has no match in
# the boundary file (the loop variable holds the combined label, not just a
# state). Membership is tested against a set: `in` on a NumPy array rescans
# the whole array with an element-wise comparison on every iteration.
geojson_county_set = set(geojson_counties)
for state in report_counties:
    if state not in geojson_county_set:
        print(state)

In [None]:
# Report rows whose state is one of three listed territories.
territory_names = ['Guam', 'Northern Mariana Islands', 'Virgin Islands']
report_df[report_df['state'].isin(territory_names)]

In [None]:
# Rows carrying both the placeholder FIPS '00000' and the county 'Unknown'.
is_placeholder_fips = report_df['fips'] == '00000'
is_unknown_county = report_df['county'] == 'Unknown'
exception = report_df.loc[is_placeholder_fips & is_unknown_county, ['county','state','today_case']]

In [None]:
# Show the rows that have the placeholder FIPS and an 'Unknown' county.
exception

In [None]:
# Sum of `today_case` over the 'Unknown' county rows.
exception['today_case'].sum()
# Cases which have no county-level info: 6558

In [None]:
# Every report row whose county is named 'Cass', with its state and latest count.
is_cass = report_df['county'] == 'Cass'
report_df.loc[is_cass, ['county','state','today_case']]

In [None]:
# Attach the report columns to the boundary rows. This is a left join on
# (county name, state name), so every boundary row is kept and rows without a
# matching report entry get missing values in the report columns.
boundary_columns = old_counties_geojson_df[["NAME", "state_name", "population", "geometry"]]
final_df = boundary_columns.merge(
    report_df,
    how='left',
    left_on=['NAME','state_name'],
    right_on=['county','state'],
)
final_df.columns

In [None]:
# Keep the output columns in their final order, then add the constant
# date-range descriptors to every row.
# The explicit .copy() makes the subset an independent frame, so the column
# assignments below do not trigger pandas' SettingWithCopyWarning.
output_columns = ["NAME", "state_name", "population", "fips", "dt_first_case", "dt_first_death", "cases_ts", "deaths_ts", 'today_case','today_new_case', 'today_death','today_new_death', "geometry"]
final_df = final_df[output_columns].copy()
final_df['dt_start'] = dt_str_start
final_df['dt_end'] = dt_str_end
final_df['dt_unit'] = "day"
final_df

In [None]:
# Set dt_start on the rows that have a case time series.
# A single .loc call is required here: the chained form
# `final_df[mask]['dt_start'] = ...` assigns into a temporary copy and leaves
# final_df unchanged.
has_case_series = final_df["cases_ts"].notna()
final_df.loc[has_case_series, 'dt_start'] = dt_str_start
final_df.head(5)

In [None]:
# Rows without report data get 0 instead of NaN in the four "today" counters.
today_columns = ['today_case','today_new_case','today_death','today_new_death']
final_df[today_columns] = final_df[today_columns].fillna(0)


In [None]:
# Sanity check: sum of `today_case` over the merged frame, to compare with
# the reference totals noted below.
final_df['today_case'].sum()
# Truth total from case report: 528422
# Previous total without Kansas City: 522461
# Adding Kansas City: 522759 + Unknown cases: 6558 = 529317?

In [None]:
# Display the merged frame for inspection before it is exported.
final_df

In [None]:
# Inspect the merged row(s) whose NAME is 'New York City'.
is_new_york_city = final_df['NAME'] == 'New York City'
final_df[is_new_york_city]

In [None]:
# Write the merged frame to disk as GeoJSON, then report completion.
output_geojson_path = r"./nyt_counties_data.geojson"
final_df.to_file(output_geojson_path, driver='GeoJSON', encoding='utf-8')
print("done")

New Data Structure (Full version)

In [None]:
# pivot_counties.columns = ['_'.join(col).strip() for col in pivot_counties.columns.values]
# report_df_2 = pivot_counties.reset_index()

In [None]:
# report_df_2.head(5)

In [None]:
# pivot_counties.head(5)

In [None]:
# final_df_2 = pd.merge(old_counties_geojson_df[["NAME", "state_name", "population", "geometry"]], report_df_2, how='right', left_on=['NAME','state_name'], right_on = ['county','state'])
# final_df_2 = final_df_2.drop(['state','county'], axis = 1)
# final_df_2['dt_start'] = dt_start
# final_df_2['dt_end'] = dt_end
# final_df_2.head(5)

In [None]:
# final_df_2.to_file(r"./nyt_counties_data_full.geojson", driver='GeoJSON', encoding='utf-8')
# print("done")

Generate JSON file

In [None]:
# county_list = []
# for state in df_counties_original.state.unique():
#     for county in df_counties_original.loc[(df_counties_original['state'] == state)].county.unique():
#         county_list.append({"name": county, "state": state})
 
# print(len(county_list), county_list)


In [None]:
# data_df = report_df.set_index(report_df["county"]+", "+report_df["state"])
# data_dict =data_df.to_dict(orient='index')

In [None]:
# df = df_counties_original.groupby(['county', 'state'], sort=False)[['cases', 'deaths']].max().sort_values("cases", ascending=False)
# county_state_list = list(map(lambda x: x[0]+", "+x[1], df.index.tolist()))
# meta = {"dates": dt_range_str,"states": county_state_list, "cases": df["cases"].values.tolist(), "deaths": df["deaths"].values.tolist()}
# data_dict["metadata"] = meta

In [None]:
# data_dict

In [None]:
# import json
# with open('./nyt_counties_data.json', 'w') as outfile:
#     json.dump(data_dict, outfile)