In [1]:
# README
# Uses the covid19india.org API to extract district-wise data.
# As of 5th May 2020 the extraction is reliable only for 'confirmed' cases.
# For state-level data there are more reliable APIs.


In [2]:
import json
import random
from pathlib import Path
import collections

import pandas as pd
import numpy as np

%load_ext autoreload
%autoreload 2
# Convenience monkeypatch: Path(...).ls() returns the entries of a directory as a list.
Path.ls = lambda x: list(x.iterdir())
# from urllib.parse import quote_plus

In [3]:
# Granularity of the extracted data: the record field used as the region key.
#   'detecteddistrict' - district
#   'detectedcity'     - city (as of 4th May unreliable: entries are accurate at district level only)
#   'detectedstate'    - state
REGION_FIELDNAME = 'detecteddistrict'

# Region label embedded in the output filename.
REGION_OUTPUT_FILE_REG_SPECCIFIER='district'

# Which case type to extract. *ONLY* confirmed cases are reliably extracted from these APIs.
# From 20th Apr onwards: Hospitalized = confirmed.
# Options: 'confirmed' (default), 'Hospitalized', 'Recovered', 'Deceased'.
DATA_TYPE = 'confirmed'

In [4]:
from datetime import date
from dateutil.relativedelta import relativedelta

# Work with yesterday's date, rendered as a compact YYYYMMDD stamp
# (e.g. 20200508).
today = date.today()
yesterday = today + relativedelta(days=-1)
current_date = f"{yesterday:%Y%m%d}"

print("d1 =", current_date)

d1 = 20200508


In [5]:
import urllib.request, json
from urllib.error import HTTPError

# def get_raw_data(raw_web_url="https://api.covid19india.org/raw_data.json"):
#     with urllib.request.urlopen(raw_web_url) as url:
#         data_dict = json.loads(url.read().decode())
#         return data_dict["raw_data"]


def get_stats_history(stats_history_url="https://api.rootnet.in/covid19-in/stats/history"):
    """Fetch a stats-history JSON document, falling back to a local backup.

    Args:
        stats_history_url: URL of the JSON document to download.

    Returns:
        The decoded JSON payload. If the download fails, the decoded contents of
        ``history.json`` (looked up in the current working directory) instead.

    Raises:
        OSError: if the download fails and ``history.json`` cannot be opened.
        json.JSONDecodeError: if the text that was read is not valid JSON.
    """
    # Local import keeps the function self-contained. URLError is the parent
    # class of HTTPError, so catching it also covers failures that never yield
    # an HTTP response (DNS errors, refused connections, no network).
    from urllib.error import URLError

    try:
        with urllib.request.urlopen(stats_history_url) as response:
            return json.loads(response.read().decode())
    except URLError:
        print(f"Using local backup of {stats_history_url}")
        with open("history.json") as f:
            return json.loads(f.read())

def get_stats_history_district(stats_history_url="https://api.covid19india.org/raw_data.json"):
    """Fetch the raw patient-level JSON document, falling back to a local backup.

    Args:
        stats_history_url: URL of the JSON document to download.

    Returns:
        The decoded JSON payload. If the download fails, the decoded contents of
        ``history.json`` (looked up in the current working directory) instead.

    Raises:
        OSError: if the download fails and ``history.json`` cannot be opened.
        json.JSONDecodeError: if the text that was read is not valid JSON.
    """
    # Local import keeps the function self-contained. URLError is the parent
    # class of HTTPError, so catching it also covers failures that never yield
    # an HTTP response (DNS errors, refused connections, no network).
    from urllib.error import URLError

    try:
        with urllib.request.urlopen(stats_history_url) as response:
            return json.loads(response.read().decode())
    except URLError:
        print(f"Using local backup of {stats_history_url}")
        # NOTE(review): this is the same backup file name that get_stats_history
        # reads, although the default URLs differ — confirm the backup really
        # holds this endpoint's data.
        with open("history.json") as f:
            return json.loads(f.read())
        
# The raw data is split across several JSON files covering different time
# ranges, so this expects a list of source URLs.
def get_stats_history_district_multiple(
    stats_history_urls=[
        "https://api.covid19india.org/raw_data1.json",
        "https://api.covid19india.org/raw_data2.json",
    ]
):
    """Download several raw-data JSON documents and concatenate their records.

    Args:
        stats_history_urls: URLs to fetch, in order. Each document must have a
            top-level ``"raw_data"`` list. The default list is only read, never
            mutated.

    Returns:
        A single list holding the ``"raw_data"`` entries of every document in
        URL order, or ``None`` if any download fails (a failure message is
        printed and partial results are discarded).

    Raises:
        json.JSONDecodeError: if a downloaded document is not valid JSON.
        KeyError: if a document has no ``"raw_data"`` key.
    """
    # Local import keeps the function self-contained. URLError is the parent
    # class of HTTPError, so network-level failures are reported the same way
    # as HTTP error responses instead of escaping as an uncaught exception.
    from urllib.error import URLError

    data_list = []
    try:
        for stats_history_url in stats_history_urls:
            with urllib.request.urlopen(stats_history_url) as response:
                records = json.loads(response.read().decode())['raw_data']
            # Progress output: number of records in each source file.
            print (len(records))
            data_list.extend(records)
        return data_list
    except URLError:
        print("Failure! Source URLs not found!!!")
        # There is no local backup for these sources; callers must handle None.
        return None
        
# def get_state_data(
#     case_count_url="https://api.covid19india.org/state_district_wise.json",
# ):
#     with urllib.request.urlopen(case_count_url) as url:
#         data_dict = json.loads(url.read().decode())
#         return data_dict


# def get_case_count(data, state="Karnataka"):
#     df = pd.DataFrame(data[state]["districtData"])
#     df = df.transpose()
#     df.reset_index(inplace=True)
#     df.rename(columns={"confirmed": "CaseCount", "index": "District"}, inplace=True)
#     df.drop(columns=["lastupdatedtime"], inplace=True)
#     return df

In [6]:
from typing import List, Dict
# Two batches of raw records: the function's default sources first, then
# raw_data3.json separately.
stats_pre27Apr: List = get_stats_history_district_multiple()
stats_post27Apr: List = get_stats_history_district_multiple(
    stats_history_urls=["https://api.covid19india.org/raw_data3.json"]
)

17306
10585
9051


In [7]:
def _fetch_raw_data_frame(raw_data_url):
    """Download one raw-data JSON document and return its records as a DataFrame.

    Args:
        raw_data_url: URL of a JSON document with a top-level ``"raw_data"`` list.

    Returns:
        A DataFrame with one row per entry of ``"raw_data"``.
    """
    with urllib.request.urlopen(raw_data_url) as response:
        payload = json.loads(response.read().decode())
    return pd.DataFrame.from_records(payload['raw_data'])


# One frame per source file, kept separate for ad-hoc inspection.
raw_data1_df = _fetch_raw_data_frame('https://api.covid19india.org/raw_data1.json')
raw_data2_df = _fetch_raw_data_frame('https://api.covid19india.org/raw_data2.json')
raw_data3_df = _fetch_raw_data_frame('https://api.covid19india.org/raw_data3.json')

In [8]:
# set([i['currentstatus'] for i in stats_pre27Apr])
# [i for i in stats_pre27Apr if i['currentstatus']=='Migrated']

In [9]:
# Output column names follow the JHU time-series layout.
REGION = 'Province/State'
CASE_COLUMNS = [REGION, 'date', 'counts']

if DATA_TYPE == 'confirmed':
    # Every individual record counts as one case on its announcement date.
    individual_rows = [(rec[REGION_FIELDNAME], rec['dateannounced'], 1)
                       for rec in stats_pre27Apr]
    # In the aggregated batch, confirmed cases are the rows marked 'Hospitalized'.
    aggregated_status = 'Hospitalized'
else:
    # Other case types are dated by when the record's status changed.
    individual_rows = [(rec[REGION_FIELDNAME], rec['statuschangedate'], 1)
                       for rec in stats_pre27Apr
                       if rec['currentstatus'] == DATA_TYPE]
    aggregated_status = DATA_TYPE

# Aggregated records carry their own case count; rows with a missing or blank
# 'numcases' are skipped.
aggregated_rows = [(rec[REGION_FIELDNAME], rec['dateannounced'], int(rec['numcases']))
                   for rec in stats_post27Apr
                   if rec['currentstatus'] == aggregated_status
                   and rec.get('numcases', '') != '']

# Naming the columns at construction keeps the frames well-formed even when a
# batch yields no rows.
district_df_ind = pd.DataFrame(individual_rows, columns=CASE_COLUMNS)
district_df_agg = pd.DataFrame(aggregated_rows, columns=CASE_COLUMNS)

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# Empty frames are left out so they cannot influence the resulting dtypes.
non_empty_frames = [frame for frame in (district_df_ind, district_df_agg) if not frame.empty]
district_df = pd.concat(non_empty_frames) if non_empty_frames else pd.DataFrame(columns=CASE_COLUMNS)

In [10]:
# display(district_df)

In [11]:
import pandas as pd
from datetime import datetime

# Every calendar day from 22 Jan 2020 up to today, as dd/mm/YYYY labels.
_calendar_days = pd.date_range("2020-01-22", datetime.today(), freq="D")
date_list = _calendar_days.strftime("%d/%m/%Y")

In [12]:
# Daily counts: one row per region, one column per date label found in the data.
# The string 'sum' replaces np.sum, which newer pandas versions warn about when
# it is passed as an aggregation callable.
district_df_pivot = pd.pivot_table(
    district_df,
    values='counts',
    index=[REGION],
    columns=['date'],
    aggfunc={'counts': 'sum'},
    fill_value=0,
)
# Align the columns to date_list: days without any record become explicit
# zeros, and date labels that are not in date_list are dropped.
district_df_pivot_dated = district_df_pivot.reindex(date_list, axis=1).fillna(0)
# Running total per region across the date columns.
district_df_final = district_df_pivot_dated.cumsum(axis=1)

In [13]:
# Daily (non-cumulative) counts per region, aligned to the full date range.
# The region name is the index here, so a single region is inspected with e.g.
# district_df_pivot_dated.loc[['Pune']]
display(district_df_pivot_dated)

Unnamed: 0_level_0,22/01/2020,23/01/2020,24/01/2020,25/01/2020,26/01/2020,27/01/2020,28/01/2020,29/01/2020,30/01/2020,31/01/2020,...,30/04/2020,01/05/2020,02/05/2020,03/05/2020,04/05/2020,05/05/2020,06/05/2020,07/05/2020,08/05/2020,09/05/2020
Province/State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,...,131,260,511,-107,349,217,428,448,359,0
Adilabad,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,...,0,0,0,0,0,0,0,0,0,0
Agar Malwa,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,...,0,0,0,0,0,0,1,0,0,0
Agra,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,...,38,29,39,60,32,12,15,15,36,0
Ahmedabad,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,...,249,267,250,274,259,349,291,275,269,0
Ahmednagar,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,...,0,0,0,1,-1,11,0,0,0,0
Aizawl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,...,0,0,0,0,0,0,0,0,0,0
Ajmer,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,...,4,11,4,3,4,5,5,5,9,11
Akola,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,...,0,0,10,13,-7,8,20,16,22,0
Alappuzha,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,...,0,0,0,0,0,0,0,0,0,0


## Transform to the JHU data format
See the [JHU CSSE time-series file on GitHub](https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv) for the reference format.

In [14]:
# Prepend the JHU metadata columns. The membership check keeps this cell
# re-runnable: DataFrame.insert raises ValueError if the column already exists.
# NOTE(review): Lat/Long are the same fixed values for every row — presumably
# placeholders rather than real per-region coordinates; confirm before mapping.
for position, (column, value) in enumerate(
    [("Country/Region", "India"), ("Lat", 20), ("Long", 70)]
):
    if column not in district_df_final.columns:
        district_df_final.insert(position, column, value)

In [15]:
# Convert the dd/mm/YYYY column labels to the JHU style m/d/yy (no zero
# padding). The labels are built from the month/day numbers because the
# "%-m"/"%-d" strftime codes are a platform extension and not available
# everywhere. The loop variable is deliberately not called `date`, so it does
# not shadow the imported datetime.date class.
jhu_column_names = {}
for day_label in date_list:
    parsed_day = datetime.strptime(day_label, "%d/%m/%Y")
    jhu_column_names[day_label] = f"{parsed_day.month}/{parsed_day.day}/{parsed_day:%y}"

# A single rename with the full mapping avoids copying the frame once per column.
district_df_final = district_df_final.rename(columns=jhu_column_names)

In [16]:
# Preview the first rows of the cumulative, JHU-formatted table.
district_df_final.head()

Unnamed: 0_level_0,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,...,4/30/20,5/1/20,5/2/20,5/3/20,5/4/20,5/5/20,5/6/20,5/7/20,5/8/20,5/9/20
Province/State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
,India,20,70,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8425.0,8685.0,9196.0,9089.0,9438.0,9655.0,10083.0,10531.0,10890.0,10890.0
Adilabad,India,20,70,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0
Agar Malwa,India,20,70,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8.0,8.0,8.0,8.0,8.0,8.0,9.0,9.0,9.0,9.0
Agra,India,20,70,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,470.0,499.0,538.0,598.0,630.0,642.0,657.0,672.0,708.0,708.0
Ahmedabad,India,20,70,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3026.0,3293.0,3543.0,3817.0,4076.0,4425.0,4716.0,4991.0,5260.0,5260.0


In [17]:
# Output path: case type, region granularity and the date stamp are part of the name.
fname = f'../data/time_series_covid19_{DATA_TYPE}_India_{REGION_OUTPUT_FILE_REG_SPECCIFIER}_{current_date}.csv'
# Create the target directory if needed so the export also works on a fresh checkout.
Path(fname).parent.mkdir(parents=True, exist_ok=True)
district_df_final.to_csv(fname, header=True)

In [18]:
# !cat $fname

In [19]:
# Echo the path of the exported CSV.
fname

'../data/time_series_covid19_confirmed_India_district_20200508.csv'

In [20]:
# Spot check: move the region name out of the index into a column, then show
# the row for Pune.
temp = district_df_final.reset_index()
temp.loc[temp[REGION].eq('Pune')]

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,4/30/20,5/1/20,5/2/20,5/3/20,5/4/20,5/5/20,5/6/20,5/7/20,5/8/20,5/9/20
393,Pune,India,20,70,0.0,0.0,0.0,0.0,0.0,0.0,...,876.0,944.0,967.0,1024.0,1646.0,1690.0,1715.0,1757.0,1805.0,1805.0
