In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the per-state time series from "us-states.csv" (columns: date, state,
# fips, cases, deaths).
# The text columns are read as plain `str`: the `np.string_` alias used
# previously was removed in NumPy 2.0 and raises AttributeError there, while
# `str` makes pandas parse the columns the same way (Python string objects).
# NOTE(review): reading `fips` as text presumably preserves leading zeros — confirm.
df_states_original = pd.read_csv(
    "us-states.csv",
    dtype={
        "date": str,
        "state": str,
        "fips": str,
        "cases": np.int32,
        "deaths": np.int32,
    },
)
# Represent missing values as None instead of NaN.
df_states_original = df_states_original.replace({np.nan: None})
# Display only: the sorted copy is shown as the cell output and is not stored.
df_states_original.sort_values("fips", ascending=True)

In [None]:
# Highest reported case count per state, largest first (display only).
(
    df_states_original
    .groupby("state", sort=False)["cases"]
    .max()
    .sort_values(ascending=False)
)


In [None]:
# Distinct report dates in ascending order, plus the earliest and latest one.
date_str_list = np.sort(df_states_original["date"].unique())
dt_str_start = date_str_list.min()
dt_str_end = date_str_list.max()

In [None]:
# Every distinct value of the "state" column; print how many there are and
# the values themselves.
state_name_list = df_states_original["state"].unique()
print(len(state_name_list), state_name_list)


In [None]:
from datetime import datetime

# Parse the earliest/latest report dates and build the continuous daily range
# between them (both ends included).
dt_start = datetime.strptime(dt_str_start, "%Y-%m-%d")
dt_end = datetime.strptime(dt_str_end, "%Y-%m-%d")
dt_range = pd.date_range(start=dt_start, end=dt_end)

# The same range rendered as "YYYY-MM-DD" strings.
dt_range_str = [day.strftime("%Y-%m-%d") for day in dt_range]
print(len(dt_range), dt_range)

In [None]:
import json  # not used in this cell; needed by the JSON export cell further down

# Build one summary record per state:
#   state, fips            - identifying fields
#   dt_start, dt_end       - the reporting window shared by all states
#   num_days               - number of days in that window
#   dt_first_case/_death   - first day with a row / first day with deaths > 0
#   cases_ts, deaths_ts    - {"values": [...]} with one entry per day of dt_range
missing_data = 0  # filler for days without a row for the state (alternative: np.NAN)
state_csv_new = []

# The window is identical for every state, so format it once.
# NOTE(review): the trailing "-" on every date string is reproduced from the
# original code; presumably a downstream consumer relies on it — confirm.
window_start_str = dt_start.strftime("%Y-%m-%d") + "-"
window_end_str = dt_end.strftime("%Y-%m-%d") + "-"
num_days = int(len(dt_range))
day_keys = [(day, day.strftime("%Y-%m-%d")) for day in dt_range]

for state in state_name_list:
    # Filter the frame once per state instead of once per (state, day).
    state_rows = df_states_original.loc[df_states_original["state"] == state]

    state_record = {}
    state_record["state"] = state
    state_record["fips"] = state_rows["fips"].iloc[0]
    state_record["dt_start"] = window_start_str
    state_record["dt_end"] = window_end_str
    state_record["num_days"] = num_days

    # Index this state's (cases, deaths) pairs by date string; a list per
    # date lets duplicate rows be detected.
    counts_by_date = {}
    for date_key, cases, deaths in zip(
        state_rows["date"], state_rows["cases"], state_rows["deaths"]
    ):
        counts_by_date.setdefault(date_key, []).append((int(cases), int(deaths)))

    cases_ts = []
    deaths_ts = []
    first_case_dt = None
    first_death_dt = None
    for dt, dt_key in day_keys:
        day_counts = counts_by_date.get(dt_key, [])
        if not day_counts:
            # Days before the state's first row are expected to be empty;
            # only gaps after it are reported.
            if first_case_dt is not None:
                print("{state} missing data on {dt}".format(state=state, dt=dt_key))
            cases_ts.append(missing_data)
            deaths_ts.append(missing_data)
            continue

        if len(day_counts) > 1:
            # Previously nothing was appended for a duplicated date, which
            # shifted every later value one slot earlier relative to the
            # date axis. Warn, then use the first row so the series stays
            # aligned with dt_range.
            print("{state} {dt} duplicate".format(state=state, dt=dt_key))

        day_cases, day_deaths = day_counts[0]
        if first_case_dt is None:
            first_case_dt = dt
        cases_ts.append(day_cases)
        deaths_ts.append(day_deaths)
        if first_death_dt is None and day_deaths > 0:
            first_death_dt = dt

    state_record["dt_first_case"] = (
        first_case_dt.strftime("%Y-%m-%d") + "-" if first_case_dt is not None else None
    )
    state_record["dt_first_death"] = (
        first_death_dt.strftime("%Y-%m-%d") + "-" if first_death_dt is not None else None
    )

    # Exactly one value is appended per day, so both series must span the window.
    assert len(cases_ts) == len(deaths_ts) == num_days, state

    # Series are stored as nested dicts (not as joined or JSON-encoded strings).
    state_record["cases_ts"] = {"values": cases_ts}
    state_record["deaths_ts"] = {"values": deaths_ts}

    state_csv_new.append(state_record)

# Preview only the first records; the full list is long.
state_csv_new[:2]
  

In [None]:
# One row per state record, indexed by state name; preview the first five rows.
data_df = (
    pd.DataFrame(state_csv_new)
    .set_index("state")
)
data_df.head(n=5)

In [None]:
 # Display the frame as a mapping keyed by index value (one inner dict of
 # column -> value per row); the result is shown, not stored.
 data_df.to_dict(orient="index")

In [None]:
 # Keep the index-keyed mapping (index value -> {column: value}) for export.
 data_dict = data_df.to_dict(orient="index")

In [None]:

# Per-state maxima of cases and deaths, ordered by case count (descending).
df = (
    df_states_original
    .groupby("state", sort=False)[["cases", "deaths"]]
    .max()
    .sort_values("cases", ascending=False)
)

# Summary block stored under the "metadata" key next to the per-state entries:
# the date axis plus the states and their maxima in the ranking order above.
meta = {
    "dates": dt_range_str,
    "states": df.index.to_list(),
    "cases": df["cases"].values.tolist(),
    "deaths": df["deaths"].values.tolist(),
}
data_dict["metadata"] = meta

In [None]:
# Serialise the per-state mapping (including its "metadata" entry) to disk.
with open("./nyt_states_data.json", "w") as json_file:
    json.dump(data_dict, json_file)

In [None]:
import geopandas as gpd

In [None]:
# Load the state geometries from "states_update.geojson" and preview five rows.
old_states_geojson_df = gpd.read_file("states_update.geojson")
old_states_geojson_df.head(n=5)

In [None]:
# Distinct values of the NAME column in the geometry file.
state_name_list2 = old_states_geojson_df["NAME"].unique()

In [None]:
# Number of distinct NAME values found in the geometry file (display only).
len(state_name_list2)

In [None]:
# Print every NAME from the geometry file that does not occur among the
# state names of the case data.
for geo_name in state_name_list2:
    if geo_name in state_name_list:
        continue
    print(geo_name)

In [None]:
# Keep only the columns needed for the export and index the rows by NAME;
# preview the first five rows.
subset_old_df = (
    old_states_geojson_df[["NAME", "STUSPS", "STATEFP", "population", "geometry"]]
    .set_index("NAME")
)
subset_old_df.head(n=5)

In [None]:
# Row count of the trimmed geometry frame (display only).
len(subset_old_df)

In [None]:
# Attach the per-state record columns to the geometry rows, matching the
# geometry frame's NAME index against the index of data_df (join's default
# left join, so every geometry row is kept).
final_df = subset_old_df.join(data_df, on="NAME")

In [None]:
# Show the joined frame as the cell output.
final_df

In [None]:
# Write the joined geometry + record frame to disk with the GeoJSON driver.
final_df.to_file("nyt_states_data.geojson", driver="GeoJSON")