In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline 
import seaborn as sns
pd.set_option("display.max_columns", 100)
import os
import geojson
from geojson import Feature, FeatureCollection
import requests
from bs4 import BeautifulSoup
import json
import requests
from datetime import datetime
from geopy import geocoders
import random

In [115]:
# Download the latest patient-level raw data and cache it next to the notebook data.
url = "https://api.covid19india.org/csv/latest/raw_data.csv"
r = requests.get(url)
# Fail loudly on an HTTP error instead of silently saving an error page as the CSV.
r.raise_for_status()
# Write as UTF-8 (what pd.read_csv expects by default); the context manager
# closes the file even if the write fails.
with open('../data/26042020/raw_data.csv', 'w', encoding='utf-8') as f:
    f.write(r.text)

In [26]:
# Download the latest state-wise daily confirmed/deceased/recovered counts.
url = "https://api.covid19india.org/csv/latest/state_wise_daily.csv"
r = requests.get(url)
# Fail loudly on an HTTP error instead of silently saving an error page as the CSV.
r.raise_for_status()
# Write as UTF-8 (what pd.read_csv expects by default); the context manager
# closes the file even if the write fails.
with open('../data/26042020/statewise_daily_confirmed_deceased_recovered.csv', 'w', encoding='utf-8') as f:
    f.write(r.text)

### Clusters

In [1016]:
# Load the patient-level raw data and keep only rows that have an announcement date.
df = pd.read_csv("../data/26042020/raw_data.csv")
df.dropna(subset=["Date Announced"], inplace=True)
# Re-number the rows 0..n-1. The loops in this and the following cells read with
# positional `df.iloc[i]` but write with label-based `df.at[i, ...]`; the two only
# address the same row when the index has no gaps left behind by `dropna`.
df.reset_index(drop=True, inplace=True)
df.rename(columns={"Contracted from which Patient (Suspected)":"contracted_from", "Current Status":"status", "Notes":"notes",
                   "Detected City":"city", "Detected District":"district", "Detected State": "state","State code":"state_code",
                   "Date Announced":"date", "Type of transmission":"transmission_type", "Age Bracket":"age", "Gender":"gender",
                   "State Patient Number": "state_pid", "Patient Number":"pid"}, inplace=True)
# Work on an independent copy of the selected columns so the in-place writes
# below do not target a view of the full frame.
df = df[["pid", "date", "city", "district", "state", "contracted_from", "transmission_type", "notes"]].copy()
total_case_count = len(df)

### District name mapping
# `district_name_mapping`, `district_name_resolution` and `resolve_district_name`
# are defined elsewhere in the notebook.
for i in range(len(df)):
    district_name = df.iloc[i]["district"]
    state_name = df.iloc[i]["state"]

    # First apply the plain rename table ...
    if district_name in district_name_mapping:
        district_name = district_name_mapping[district_name]

    # ... then resolve names that need the state to be disambiguated.
    if district_name in district_name_resolution:
        district_name = resolve_district_name(state_name, district_name)

    df.at[i, "district"] = district_name



In [1017]:
## Get root node of "contracted_from"
# For every patient, follow the "contracted_from" references (P<id> -> P<id> -> ...)
# and store the last reference reached in "derived_contracted_from".
for i in range(len(df)):
    cf = df.iloc[i]["contracted_from"]
    visited_pids = set()  # guards against cyclic references (P1 -> P2 -> P1)

    while cf==cf:  # NaN != NaN, so the walk stops at an empty reference
        if cf.startswith("https"):
            # A URL instead of a patient/event id: nothing to follow.
            break

        if cf in ("E0", "E1"):
            # Event ids are roots by definition.
            df.at[i, "derived_contracted_from"] = cf
            break

        # "P123", "P123, P456" or "P123,P456": only the first patient id is followed.
        cfpid = int(cf.split("P")[1].split(",")[0])
        if cfpid in visited_pids:
            # Cycle detected: keep the last derived value instead of looping forever.
            break
        visited_pids.add(cfpid)

        df.at[i, "derived_contracted_from"] = "P"+str(cfpid)

        source_rows = df[df["pid"]==cfpid]
        if len(source_rows) == 0:
            # The referenced patient is not in the data; treat it as the root.
            break
        cf = source_rows["contracted_from"].values[0]

In [1018]:
# Map to finite clusters
# Assign each patient a "cluster" label (and, where known, a "cluster_location").
# Explicit "contracted_from" ids are matched first; otherwise the free-text notes
# are classified by the is_in_*_list helpers, and the first matching branch wins,
# so the order of the elif chain determines the label.
for i in range(len(df)):
    cf = df.iloc[i]["contracted_from"]
    dcf = df.iloc[i]["derived_contracted_from"]  # read here but not used in this loop
    notes = df.iloc[i]["notes"]
    
    # IF both notes and contracted from are empty, skip this entry
    # (x != x is True only for NaN)
    if ((cf!=cf) and (notes!=notes)):
        continue

    if ((cf == "E0") or (cf == "P4862") or (cf == "P531") ):
        df.at[i, "cluster"] = "Delhi Religious meeting"
        df.at[i, "cluster_location"] = "New Delhi"

    elif ((cf == "P689") or (cf == "P1215") ):
        df.at[i, "cluster"] = "Mysuru Pharmaceutical industry"
        df.at[i, "cluster_location"] = "Mysore"
    
    elif cf == "P182":
        df.at[i, "cluster"] = "Punjab Preacher"
        df.at[i, "cluster_location"] = "Shaheed Bhagat Singh Nagar"
    
    elif cf == "P20410":
        df.at[i, "cluster"] = "Bengaluru scrap segregation worker"
        df.at[i, "cluster_location"] = "Bangalore"
    
    elif cf == "P6":
        df.at[i, "cluster"] = "Italian tourists in Rajasthan"
        df.at[i, "cluster_location"] = "Italy"
        # This branch also overwrites the patient's district for two states.
        if df.iloc[i]["state"] == "Rajasthan":
            df.at[i, "district"] = "Jaipur"
        elif df.iloc[i]["state"] == "Haryana":
            df.at[i, "district"] = "Gurgaon"
        
    elif cf == "P301":
        df.at[i, "cluster"] = "Thai national in Tamil Nadu"
        df.at[i, "cluster_location"] = "Erode"
    
    elif cf == "E1":
        df.at[i, "cluster"] = "Contact with UK returnee"
        df.at[i, "cluster_location"] = "Gautam Buddha Nagar"
    
    elif cf == "P2868":
        df.at[i, "cluster"] = "Dubai returnee, hosted feast for 1500 people"
        df.at[i, "cluster_location"] = "Morena"
    
    elif cf == "P10454":
        df.at[i, "cluster"] = "Doctor from Bethany Hospital"
        df.at[i, "cluster_location"] = "East Khasi Hills"
    
    else:
        # No known source id: fall back to keyword matching on the notes.
        # NOTE(review): the helpers test `keyword in notes`, so a row with a
        # non-empty contracted_from but NaN notes would raise TypeError here -- confirm
        # that such rows do not occur in the data.
        if ((is_in_iran_evacuee_list(notes)) or ((df.iloc[i]["district"]=="Evacuees*") and (df.iloc[i]["state"]=="Rajasthan"))):
            df.at[i, "cluster"] = "Iran evacuees"
            df.at[i, "cluster_location"] = "Iran"
            df.at[i, "district"] = "Jodhpur"

        elif is_in_travel_list(notes):
            # is_in_travel_list returns the extracted place name (or NaN when a
            # travel keyword matches without a place); both are truthy.
            travelled_from_place = is_in_travel_list(notes)
            df.at[i, "cluster"] = "Travel History"
            df.at[i, "cluster_location"] = travelled_from_place

        elif is_in_family_list(notes):
            df.at[i, "cluster"] = "Family member"
        
        elif is_in_contact_list(notes):
            df.at[i, "cluster"] = "Close Contact"
        
        elif is_in_healthcare_list(notes):
            df.at[i, "cluster"] = "Healthcare worker"
        
        elif is_in_worker_list(notes):
            df.at[i, "cluster"] = "Domestic Worker"
        
        elif is_in_no_travel_history_list(notes):
            df.at[i, "cluster"] = "No travel history"
        
        elif is_in_delhi_religious_list(notes):
            # Notes-based match: also back-fill the source ids with the event id E0.
            df.at[i, "cluster"] = "Delhi Religious meeting"
            df.at[i, "cluster_location"] = "New Delhi"
            df.at[i, "contracted_from"] = "E0"
            df.at[i, "derived_contracted_from"] = "E0"
        
        elif is_in_pharma_company_list(notes):
            # NOTE(review): this back-fills "E1", which the `cf == "E1"` branch above
            # labels "Contact with UK returnee" -- confirm the event id is intended.
            df.at[i, "cluster"] = "Mysuru Pharmaceutical industry"
            df.at[i, "cluster_location"] = "Mysore"
            df.at[i, "contracted_from"] = "E1"
            df.at[i, "derived_contracted_from"] = "E1"

        elif is_in_misc_list(notes):
            # The matched keyword itself becomes the cluster label.
            cluster_category = is_in_misc_list(notes)
            df.at[i, "cluster"] = cluster_category

        elif is_in_skip_list(notes):
            # Known notes that are deliberately left unclassified.
            continue;
            
        else:
            # Unrecognised notes are printed so new keywords can be added to the lists.
            print(notes)
            continue;

In [1019]:
### Get cluster location for derived_contracted_from
# For patients that have no cluster_location yet, use the district of the patient
# they (directly or indirectly) contracted the infection from.

def _source_patient_district(patient_ref):
    """Return the district of the patient named by a "P<id>[, ...]" reference.

    Only the first id of a comma-separated reference is used. Returns None when
    no row with that patient id exists in `df`.
    """
    source_pid = int(patient_ref.split("P")[1].split(",")[0])
    source_rows = df[df["pid"]==source_pid]
    if len(source_rows) == 0:
        return None
    return source_rows["district"].values[0]


for i in range(len(df)):
    cf = df.iloc[i]["contracted_from"]
    dcf = df.iloc[i]["derived_contracted_from"]
    cl = df.iloc[i]["cluster_location"]

    # Only rows whose cluster location is still null are filled (NaN != NaN).
    if cl==cl:
        continue

    # Prefer the derived (root) source; fall back to the direct source.
    if dcf==dcf:
        patient_ref = dcf if dcf not in ("E0", "E1") else None
    elif cf==cf:
        patient_ref = cf if (cf not in ("E0", "E1") and "http" not in cf) else None
    else:
        patient_ref = None

    # Event ids (E0/E1) and URLs do not name a patient. Previously the assignment
    # ran anyway and reused the lookup result left over from an earlier row.
    if patient_ref is None:
        continue

    source_district = _source_patient_district(patient_ref)
    if source_district is not None:
        df.at[i, "cluster_location"] = source_district





In [1020]:
# Normalise abbreviated / misspelt cluster locations to a canonical name;
# values without an entry in the table are kept unchanged.
cluster_district_mapping = {
    "Saudi": "Saudi Arabia",
    "Evacuees*": "Iran",
    "Abhudhabi": "Abu dhabi",
    "WB": "West Bengal",
}
df["cluster_location"] = df["cluster_location"].apply(
    lambda location: cluster_district_mapping.get(location, location)
)

# Drop rows whose extracted "location" is one of these values.
cluster_district_skip = ["to", "the", "Asansole"]
for skipped_location in cluster_district_skip:
    keep_row = df["cluster_location"] != skipped_location
    df = df[keep_row]

In [1035]:
### Removing isolated clusters
def _has_at_least_ten_cases(cluster_rows):
    """Tell whether a cluster group is large enough (10 or more rows) to keep."""
    return len(cluster_rows) >= 10

# Keep only the rows that belong to a sufficiently large cluster, then persist them.
df = df.groupby("cluster").filter(_has_at_least_ten_cases)
df.to_csv("../data/extracted_data_api/code_data/clusters.csv", index=False)

### Cluster table

In [1036]:
# Build the per-cluster case-count table shown next to the network map.
min_case_count_cluster_table = 10
dfc = df.groupby("cluster").count()["pid"].reset_index()
dfc.rename(columns={"pid":"case_count"}, inplace=True)
dfc = dfc[dfc["case_count"]>=min_case_count_cluster_table]
# "No travel history" is a notes category, not a transmission cluster.
dfc = dfc[dfc["cluster"] != "No travel history"]
# Expand the acronym for display. A boolean mask must go through `.loc`;
# `.at` only accepts a single row label / column label pair.
dfc.loc[dfc["cluster"]=="SARI", "cluster"] = "Severe Acute Respiratory Infections"
dfc.sort_values("case_count", ascending=False, inplace=True)
dfc.to_csv("../data/extracted_data_api/code_data/cluster_table.csv", index=False)
dfc

Unnamed: 0,cluster,case_count
3,Delhi Religious meeting,913
10,Travel History,621
5,Family member,294
6,Iran evacuees,61
8,Mysuru Pharmaceutical industry,53
1,Close Contact,33
0,Bengaluru scrap segregation worker,20
7,Italian tourists in Rajasthan,16
9,Thai national in Tamil Nadu,13
2,Contact with UK returnee,11


### Cluster network map

In [56]:
# Reload the cluster assignments written by the clustering cells above.
df = pd.read_csv("../data/extracted_data_api/code_data/clusters.csv")
df = df[["pid", "date", "district", "state", "contracted_from", "derived_contracted_from", "cluster", "cluster_location"]]

# Handle special case
# A boolean mask must go through `.loc`; `.at` only accepts a single row label.
df.loc[(df["district"]=="Italians*") & (df["state"]=="Rajasthan"), "district"] = "Jaipur"

## To avoid loop onto same node in network map, remove district==cluster_location
# (only the null rows are dropped; the equality filter itself is disabled)
df.dropna(subset=["district", "cluster_location"], inplace=True)
##df = df[df["district"] != df["cluster_location"]]

### Cluster type mapping
# Give every cluster label a small integer id, in order of first appearance.
cluster_type_mapping = {}
cluster_list = df["cluster"].unique()
for i in range(len(cluster_list)):
    cluster_name = cluster_list[i]
    if cluster_name not in cluster_type_mapping.keys():
        cluster_type_mapping[cluster_name] = int(i)
df["cluster_type"] = df["cluster"].apply(lambda x: cluster_type_mapping[x])

## Remove unmapped districts
# `district_name_unmapped` is defined elsewhere in the notebook.
for i in range(len(district_name_unmapped)):
    df = df[df["district"] != district_name_unmapped[i]]

In [57]:
# Attach coordinates to every patient row: a random point inside the bounding box
# of the patient's district, and a fixed point for the cluster the patient belongs to.
dfcm = df.copy()
dfcm = dfcm[["date", "district", "cluster_location", "cluster", "cluster_type", "derived_contracted_from"]]

# District centres and bounding boxes (columns used below: district, latitude,
# longitude, minx, maxx, miny, maxy).
dfll = pd.read_csv("../data/extracted_data_api/code_data/district_center_bounds.csv")
district_list = dfll["district"].unique()

# Centres/bounds for cluster locations that are not districts in dfll.
dfcl = pd.read_csv("../data/extracted_data_api/code_data/cluster_center_bounds.csv")
dfcl.rename(columns={"location":"cluster_location", "latitude":"cluster_latitude",
                     "longitude":"cluster_longitude"}, inplace=True)


# Cache: derived_contracted_from id -> [cluster_latitude, cluster_longitude], so
# every patient sharing the same source id gets the same cluster coordinates.
cluster_pid_list = {}
# Counter used to build synthetic "U<n>" cache keys for rows without a source id.
unknown_dcf_id = 0

for i in range(len(dfcm)):
    district_name = dfcm.iloc[i]["district"]
    cluster_location = dfcm.iloc[i]["cluster_location"]
    dcf = dfcm.iloc[i]["derived_contracted_from"]
    
    # Patient position: only when the district is not NaN and matches exactly one
    # row of dfll; otherwise patient_latitude/longitude are left unset for the row.
    if (district_name==district_name):
        dft = dfll[dfll["district"]==district_name]
        if len(dft)==1:
            minx = dft["minx"].values[0]
            maxx = dft["maxx"].values[0]
            miny = dft["miny"].values[0]
            maxy = dft["maxy"].values[0]
            # Uniform random point in the bounding box (not the district polygon);
            # no seed is set in this cell, so positions differ between runs.
            dfcm.at[dfcm.index[i], "patient_latitude"] = random.uniform(miny, maxy)    
            dfcm.at[dfcm.index[i], "patient_longitude"] = random.uniform(minx, maxx)
        elif len(dft)==0:
            print(district_name + " -- No Location found")
        elif len(dft)>1:
            print(district_name + " -- More than one Location found")

    # Cluster position: reuse the cached coordinates for a known source id.
    # A NaN dcf never matches a key, so such rows always take the else branch.
    if dcf in cluster_pid_list.keys():
        cluster_latitude = cluster_pid_list[dcf][0]
        cluster_longitude = cluster_pid_list[dcf][1]
    else:
        if cluster_location in district_list:
            # The cluster location is a district: use its stored centre.
            # (minx..maxy are assigned but unused; the random placement is commented out.)
            dft = dfll[dfll["district"]==cluster_location]
            minx = dft["minx"].values[0]
            maxx = dft["maxx"].values[0]
            miny = dft["miny"].values[0]
            maxy = dft["maxy"].values[0]
            cluster_latitude = dft["latitude"].values[0] #random.uniform(miny, maxy)
            cluster_longitude = dft["longitude"].values[0] #random.uniform(minx, maxx)
        else:
            # Otherwise look the location up in the cluster-centre table.
            dft = dfcl[dfcl["cluster_location"]==cluster_location]
            if len(dft)==1:
                minx = dft["minx"].values[0]
                maxx = dft["maxx"].values[0]
                miny = dft["miny"].values[0]
                maxy = dft["maxy"].values[0]
                cluster_latitude = dft["cluster_latitude"].values[0]  #random.uniform(miny, maxy)
                cluster_longitude = dft["cluster_longitude"].values[0]  #random.uniform(minx, maxx)
            else:
                # Zero or several matches: no cluster coordinates for this row.
                cluster_latitude = np.nan
                cluster_longitude = np.nan
                

        # Rows without a source id get a unique synthetic key so they are cached
        # individually and never shared.
        if dcf!=dcf:
            dcf = "U" + str(unknown_dcf_id)
            unknown_dcf_id+=1
        #cpl = {dcf:(cluster_latitude, cluster_longitude)}
        #cluster_pid_list.append()
        cluster_pid_list[dcf] = [cluster_latitude, cluster_longitude]

    dfcm.at[dfcm.index[i], "cluster_latitude"] = cluster_latitude
    dfcm.at[dfcm.index[i], "cluster_longitude"] = cluster_longitude


In [58]:
# Keep only rows with cluster coordinates, parse the dates and save a first
# version of the network-map table.
dfcm.dropna(subset=["cluster_latitude", "cluster_longitude"], inplace=True)
dfcm["date"] = pd.to_datetime(dfcm["date"], dayfirst=True)
dfcm.reset_index(inplace=True)
dfcm.to_csv("../data/extracted_data_api/code_data/cluster_network_map.csv", index=False)


### Combine with daily cases and deaths
# National ("TT") daily counts, turned into cumulative totals per status.
dfc = pd.read_csv("../data/26042020/statewise_daily_confirmed_deceased_recovered.csv")[["Date", "Status", "TT"]]
dfc.rename(columns={"Date":"date", "Status":"status"}, inplace=True)
dfc["date"] = pd.to_datetime(dfc["date"])
dfc1 = dfc[dfc["status"]=="Confirmed"].copy()
del dfc1["status"]
dfc1.rename(columns={"TT":"total_cases"}, inplace=True)
dfc1["total_cases"] = dfc1["total_cases"].cumsum()
# Cumulative deaths are prepared but not joined below (that join is disabled).
dfc2 = dfc[dfc["status"]=="Deceased"].copy()
del dfc2["status"]
dfc2.rename(columns={"TT":"total_deaths"}, inplace=True)
dfc2["total_deaths"] = dfc2["total_deaths"].cumsum()

# Up to and including init_date the cumulative case count is derived from the
# patient-level raw data; after init_date the state-wise daily file is used.
dfr = pd.read_csv("../data/26042020/raw_data.csv")
dfr.rename(columns={"Date Announced":"date"}, inplace=True)
dfr["date"] = pd.to_datetime(dfr["date"], dayfirst=True)
# pd.Timestamp replaces the removed `pd.datetime` alias.
init_date = pd.Timestamp(2020, 3, 14)
dfinit_case_count = dfr.groupby("date").count()["Patient Number"].reset_index()
dfinit_case_count.rename(columns={"Patient Number":"total_cases"}, inplace=True)
dfinit_case_count["total_cases"] = dfinit_case_count["total_cases"].cumsum()
dfinit_case_count = dfinit_case_count[dfinit_case_count["date"] <= init_date]

dfc1 = dfc1[dfc1["date"]>init_date]
# pd.concat replaces the removed DataFrame.append (same row order, same columns).
dfc1 = pd.concat([dfinit_case_count, dfc1])

# Attach the cumulative case count of the announcement date to every patient row.
dfcm = dfcm.set_index("date").join(dfc1.set_index("date")).reset_index()
#dfcm = dfcm.set_index("date").join(dfc2.set_index("date")).reset_index()

dfcm.to_csv("../data/extracted_data_api/code_data/cluster_network_map.csv", index=False)
dfcm

Unnamed: 0,date,index,district,cluster_location,cluster,cluster_type,derived_contracted_from,patient_latitude,patient_longitude,cluster_latitude,cluster_longitude,total_cases
0,2020-01-30,0,Thrissur,Wuhan,Travel History,0,,10.548925,76.717859,30.595105,114.299935,1
1,2020-02-02,1,Alappuzha,Wuhan,Travel History,0,,9.423722,76.502106,30.595105,114.299935,2
2,2020-02-03,2,Kasaragod,Wuhan,Travel History,0,,12.760040,74.984361,30.595105,114.299935,3
3,2020-03-02,3,East Delhi,Austria,Travel History,0,,28.587916,77.335460,47.200034,13.199959,5
4,2020-03-02,4,Hyderabad,Dubai,Travel History,0,,17.446530,78.492699,25.065700,55.171300,5
5,2020-03-03,5,Jaipur,Italy,Travel History,0,,26.945979,75.577603,42.638426,12.674297,6
6,2020-03-04,6,Gurgaon,Italy,Italian tourists in Rajasthan,1,P6,28.252581,77.014259,42.638426,12.674297,28
7,2020-03-04,7,Gurgaon,Italy,Italian tourists in Rajasthan,1,P6,28.235628,76.968028,42.638426,12.674297,28
8,2020-03-04,8,Gurgaon,Italy,Italian tourists in Rajasthan,1,P6,28.470262,77.066151,42.638426,12.674297,28
9,2020-03-04,9,Gurgaon,Italy,Italian tourists in Rajasthan,1,P6,28.521585,76.970157,42.638426,12.674297,28


In [23]:
# Fixed colour per cluster label; palette taken from
# https://sashat.me/2017/01/11/list-of-20-simple-distinct-colors/
cluster_color_mapping = {
    "Mysuru Pharmaceutical industry": "#f032e6",                # Magenta
    "Bengaluru scrap segregation worker": "#800000",            # Maroon
    "Travel History": "#f58231",                                # Orange
    "Italian tourists in Rajasthan": "#3cb44b",                 # Green
    "Delhi Religious meeting": "#4363d8",                       # Blue
    "Iran evacuees": "#911eb4",                                 # Purple
    "Family member": "#ff0000",                                 # Red (was Yellow "#ffe119")
    "Close Contact": "#000075",                                 # Navy blue (was Lime "#bcf60c")
    "Thai national in Tamil Nadu": "#008080",                   # Teal
    "Dubai returnee, hosted feast for 1500 people": "#9a6324",  # Brown
    "Contact with UK returnee": "#808000",                      # Olive
}

cluster_color_mapping

{'Mysuru Pharmaceutical industry': '#f032e6',
 'Bengaluru scrap segregation worker': '#800000',
 'Travel History': '#f58231',
 'Italian tourists in Rajasthan': '#3cb44b',
 'Delhi Religious meeting': '#4363d8',
 'Iran evacuees': '#911eb4',
 'Family member': '#ff0000',
 'Close Contact': '#000075',
 'Thai national in Tamil Nadu': '#008080',
 'Dubai returnee, hosted feast for 1500 people': '#9a6324',
 'Contact with UK returnee': '#808000'}

### % of districts without the disease

In [48]:
# Build the per-district table behind the "districts without the disease" chart.
dfcc = pd.read_csv("../data/extracted_data_api/code_data/districtwise_case_death_growth_density_withDelhiSeparate.csv")
total_population = dfcc["population"].sum()
# Share (in whole percent) of districts, and of population, with zero cases.
outbreak_free_rows = dfcc[dfcc["cases"]==0]
per_outbreak_free_districts = np.round(len(outbreak_free_rows)/len(dfcc)*100)
per_outbreak_free_population = np.round(outbreak_free_rows["population"].sum()/total_population*100)

# Label of the form "<District-Name>, <state code>" (spaces in the name become dashes).
district_label = dfcc["district"].str.replace(" ", "-", regex=False) + ", " + dfcc["state_code"]

# Build all columns at once instead of growing the frame cell by cell with `.at`.
df = pd.DataFrame({
    "id": "district." + district_label,
    # Case count shifted by one, so districts without cases get the value 1.
    "value": dfcc["cases"].astype(int) + 1,
    "district": district_label,
    # The two percentages are repeated on every row.
    "per_outbreak_free_districts": int(per_outbreak_free_districts),
    "per_outbreak_free_population": int(per_outbreak_free_population),
})
# Shuffling (unseeded, so the row order differs between runs)
df = df.sample(frac=1).reset_index(drop=True)
df.to_csv("../data/extracted_data_api/code_data/outbreak_free_districts.csv", index=False)
df

Unnamed: 0,id,value,district,per_outbreak_free_districts,per_outbreak_free_population
0,"district.Sahibganj, JH",1,"Sahibganj, JH",44,31
1,"district.Kamrup, AS",2,"Kamrup, AS",44,31
2,"district.Buldana, MH",12,"Buldana, MH",44,31
3,"district.Kasaragod, KL",169,"Kasaragod, KL",44,31
4,"district.Karnal, HR",7,"Karnal, HR",44,31
5,"district.Haora, WB",24,"Haora, WB",44,31
6,"district.Dindori, MP",1,"Dindori, MP",44,31
7,"district.Kiphire, NL",1,"Kiphire, NL",44,31
8,"district.Baksa, AS",1,"Baksa, AS",44,31
9,"district.Alwar, RJ",8,"Alwar, RJ",44,31


### Recovery rate

In [112]:
# Daily confirmed counts per state -> cumulative confirmed counts.
dfc = pd.read_csv("../data/extracted_data_api/statewise_daily_confirmed.csv")
del dfc["TT"], dfc["Unnamed: 39"]
state_code_list = dfc.columns[1:]
dfcc = dfc.set_index("date").cumsum().reset_index()

# Daily recovered counts per state -> cumulative recovered counts; the state
# columns get an "_r" suffix so they can be joined next to the confirmed ones.
dfr = pd.read_csv("../data/extracted_data_api/statewise_daily_recovered.csv")
del dfr["TT"], dfr["Unnamed: 39"]
dfr.columns = ["date"] + [col+"_r" for col in dfr.columns if col in state_code_list]
dfr = dfr.set_index("date").cumsum().reset_index()

# Recovery rate = cumulative recovered / cumulative confirmed, per state and date.
dfcr = dfcc.set_index("date").join(dfr.set_index("date")).reset_index()
dfrr = dfcc.copy()
for col in state_code_list:
    dfrr[col] = np.round(dfcr[col+"_r"] / dfcr[col],2)
dfrr.dropna(axis=1, how="all", inplace=True)
### Replace nan with -1, handle in color_Scale in javascript
dfrr.fillna(value=-1, inplace=True)
### Concat total case count at top of table
dfrr = pd.concat([pd.DataFrame(dfc.sum(axis=0, numeric_only=True)).transpose(), dfrr], ignore_index=True, axis=0, sort=True)
# sort=True orders the columns alphabetically, which leaves "date" last;
# move it back to the front.
columns = ["date"] + list(dfrr.columns[:len(dfrr.columns)-1])
dfrr = dfrr[columns]
# The totals row has no date. A boolean mask must go through `.loc`;
# `.at` only accepts a single row label.
dfrr.loc[pd.isnull(dfrr["date"]), "date"] = "total_count"
dfrr.to_csv("../data/extracted_data_api/code_data/state_recovery_rate.csv", index=False)
dfrr

Unnamed: 0,date,AN,AP,AR,AS,BR,CH,CT,DD,DL,DN,GA,GJ,HP,HR,JH,JK,KA,KL,LA,LD,MH,ML,MN,MP,MZ,NL,OR,PB,PY,RJ,SK,TG,TN,TR,UP,UT,WB
0,total_count,17.0,757.0,1.0,35.0,126.0,27.0,36.0,0.0,2156.0,0.0,7.0,2178.0,39.0,255.0,46.0,380.0,418.0,426.0,18.0,0.0,5218.0,12.0,2.0,1552.0,1.0,0.0,79.0,251.0,7.0,1735.0,0.0,928.0,1596.0,2.0,1337.0,46.0,392.0
1,14-Mar-20,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,,0.14,,-1.0,-1.0,-1.0,0.0,-1.0,0.0,0.0,0.16,-1.0,,0.0,-1.0,-1.0,-1.0,-1.0,,-1.0,0.0,-1.0,0.33,,0.0,0.0,-1.0,0.33,-1.0,-1.0
2,15-Mar-20,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,,0.29,,-1.0,-1.0,-1.0,0.0,-1.0,0.0,0.0,0.12,-1.0,,0.0,-1.0,-1.0,-1.0,-1.0,,-1.0,0.0,-1.0,0.75,,0.33,0.0,-1.0,0.31,-1.0,-1.0
3,16-Mar-20,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,,0.29,,-1.0,-1.0,-1.0,0.0,-1.0,0.0,0.0,0.11,-1.0,,0.0,-1.0,-1.0,-1.0,-1.0,,0.0,0.0,0.0,0.75,,0.25,1.0,-1.0,0.31,0.0,-1.0
4,17-Mar-20,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,,0.25,,-1.0,-1.0,-1.0,0.0,-1.0,0.0,0.0,0.11,-1.0,,0.0,-1.0,-1.0,-1.0,-1.0,,0.0,0.0,0.0,0.75,,0.2,1.0,-1.0,0.33,0.0,0.0
5,18-Mar-20,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,,0.2,,-1.0,-1.0,-1.0,0.0,-1.0,0.0,0.0,0.11,0.0,,0.0,-1.0,-1.0,-1.0,-1.0,,0.0,0.0,0.0,0.43,,0.08,0.5,-1.0,0.29,0.0,0.0
6,19-Mar-20,-1.0,0.0,-1.0,-1.0,-1.0,0.0,0.0,,0.21,,-1.0,0.0,-1.0,0.0,-1.0,0.0,0.0,0.11,0.0,,0.0,-1.0,-1.0,-1.0,-1.0,,0.0,0.0,0.0,0.33,,0.06,0.33,-1.0,0.45,0.0,0.0
7,20-Mar-20,-1.0,0.0,-1.0,-1.0,-1.0,0.0,0.0,,0.25,,-1.0,0.0,0.0,0.0,-1.0,0.0,0.07,0.08,0.0,,0.0,-1.0,-1.0,0.0,-1.0,,0.0,0.0,0.0,0.18,,0.05,0.33,-1.0,0.38,0.0,0.0
8,21-Mar-20,-1.0,0.0,-1.0,-1.0,-1.0,0.0,0.0,,0.19,,-1.0,0.0,0.0,0.0,-1.0,0.0,0.05,0.06,0.0,,0.0,-1.0,-1.0,0.0,-1.0,,0.0,0.0,0.0,0.12,,0.05,0.17,-1.0,0.32,0.0,0.0
9,22-Mar-20,-1.0,0.0,-1.0,-1.0,0.0,0.0,0.0,,0.19,,-1.0,0.0,0.0,0.0,-1.0,0.0,0.04,0.04,0.0,,0.0,-1.0,-1.0,0.0,-1.0,,0.0,0.0,0.0,0.11,,0.04,0.11,-1.0,0.3,0.0,0.0


### Statewise charts

In [138]:
# pd.Timestamp replaces the removed `pd.datetime` alias.
init_date = pd.Timestamp(2020, 3, 15)

# District-wise time series from init_date onwards, with the state code attached.
df = pd.read_csv("../data/extracted_data_api/districtwise_time_series.csv")
df["date"] = pd.to_datetime(df["date"])
df = df[df["date"]>=init_date]
dfsc = pd.read_csv("../data/extracted_data_api/state_codes.csv")
# reset_index() leaves a gap-free 0..n-1 index, which the positional-read /
# label-write loop below relies on.
df = df.set_index("state").join(dfsc.set_index("state")).reset_index()

### Map district names
# `district_name_mapping`, `district_name_resolution` and `resolve_district_name`
# are defined elsewhere in the notebook.
for i in range(len(df)):
    district_name = df.iloc[i]["district"]
    state_name = df.iloc[i]["state"]

    # First apply the plain rename table ...
    if district_name in district_name_mapping:
        district_name = district_name_mapping[district_name]

    # ... then resolve names that need the state to be disambiguated.
    if district_name in district_name_resolution:
        district_name = resolve_district_name(state_name, district_name)

    df.at[i, "district"] = district_name

In [143]:
# Build a wide table with one row per district: state, state_code, then the
# cumulative case count for every date.
district_list = df["district"].unique()
district_frames = []  # one single-row frame per district, concatenated once at the end
for i in range(len(district_list)):
    district_name = district_list[i]

    # Skip the NaN "district" (NaN != NaN).
    if district_name==district_name:
        dft = df[df["district"]==district_name].copy()
        dft["count"] = dft["count"].cumsum()
        state_name = dft["state"].unique()[0]
        state_code = dft["state_code"].unique()[0]
        dft.reset_index(inplace=True)
        del dft["index"], dft["state"], dft["district"], dft["state_code"]
        dft.rename(columns={"count":district_name}, inplace=True)
        dft[district_name] = dft[district_name].astype(str)
        dft["date"] = pd.to_datetime(dft["date"]).dt.date
        # Add two pseudo-date rows with negative labels; sort_index() then moves
        # them in front of the real dates (order: state_code, state, dates...).
        dft.at[-1, "date"] = "state"
        dft.at[-1, district_name] = state_name
        dft.at[-2, "date"] = "state_code"
        dft.at[-2, district_name] = state_code
        dft.sort_index(inplace=True)
        # Transpose so the dates become columns and the district becomes the row.
        dft = dft.transpose()
        new_header = dft.iloc[0]
        dft = dft[1:]
        dft.columns = new_header
        dft.reset_index(inplace=True)
        dft.rename(columns={"index":"district"}, inplace=True)

        district_frames.append(dft)

# A single pd.concat replaces the repeated (and since removed) DataFrame.append,
# and no longer depends on the first entry of district_list being a valid name.
dfd = pd.concat(district_frames) if district_frames else pd.DataFrame()

dfd.to_csv("../data/extracted_data_api/code_data/districtwise_daily_case_count.csv", index=False)

### topojson to geojson

In [None]:
##https://gis.stackexchange.com/questions/207731/generating-random-coordinates-in-multipolygon-in-python
import random
from shapely.geometry import Point

def generate_random(number, polygon):
    """Return `number` random points that lie inside `polygon`.

    Candidates are drawn uniformly from the polygon's bounding box and kept only
    when `polygon.contains` accepts them (rejection sampling). The result is a
    list of `Point` objects; it is empty when `number` is zero or negative.
    """
    west, south, east, north = polygon.bounds
    accepted_points = []
    while len(accepted_points) < number:
        candidate = Point(random.uniform(west, east), random.uniform(south, north))
        if not polygon.contains(candidate):
            continue  # outside the polygon: draw again
        accepted_points.append(candidate)
    return accepted_points

In [922]:
#https://gist.github.com/perrygeo/1e767e42e8bc54ad7262
#https://github.com/sgillies/topojson/blob/master/topojson.py

import json
import sys
from topojson_geometry import geometry
from shapely.geometry import asShape
from shapely.geometry import shape

# Convert every state's TopoJSON file to GeoJSON and collect, for every district,
# its centroid and bounding box in `dfll`.
dfll = pd.DataFrame()
idx = 0

topojson_dir = "../data/extracted_data_api/code_data/state_geojson_data_howIndiaLives/"
geojson_dir = "../data/extracted_data_api/code_data/geojson_map/"

for topojson_name in os.listdir(topojson_dir):
    # Ignore anything in the directory that is not a "<state>.json" file.
    if not topojson_name.endswith(".json"):
        continue

    state_name = topojson_name.split(".json")[0]
    topojson_path = topojson_dir + topojson_name
    geojson_path = geojson_dir + topojson_name

    with open(topojson_path, 'r') as fh:
        topology = json.loads(fh.read())

    features = topology['objects'][state_name+"_district"]['geometries']
    scale = topology['transform']['scale']
    trans = topology['transform']['translate']

    # Decode every TopoJSON geometry into a GeoJSON feature.
    fc = {'type': "FeatureCollection", 'features': []}
    for feature_id, tf in enumerate(features):
        feature = {'id': feature_id, 'type': "Feature"}
        feature['properties'] = tf['properties'].copy()

        geommap = geometry(tf, topology['arcs'], scale, trans)
        # `shape` replaces the deprecated `asShape`; buffer(0) is applied before
        # the validity check, as in the original code.
        geom = shape(geommap).buffer(0)
        assert geom.is_valid
        feature['geometry'] = geom.__geo_interface__

        fc['features'].append(feature)

    # Save geojson file
    with open(geojson_path, 'w') as dest:
        dest.write(json.dumps(fc))

    # Compute district center
    for feature in fc["features"]:
        geom = shape(feature["geometry"])
        district_name = feature["properties"]["district"]
        # Same name normalisation as for the case data (`district_name_mapping`,
        # `district_name_resolution`, `resolve_district_name` are defined elsewhere).
        if district_name in district_name_mapping:
            district_name = district_name_mapping[district_name]

        if district_name in district_name_resolution:
            district_name = resolve_district_name(state_name, district_name)

        dfll.at[idx, "district"] = district_name
        dfll.at[idx, "latitude"] = geom.centroid.y
        dfll.at[idx, "longitude"] = geom.centroid.x

        minx, miny, maxx, maxy = geom.bounds
        dfll.at[idx, "minx"] = minx
        dfll.at[idx, "maxx"] = maxx
        dfll.at[idx, "miny"] = miny
        dfll.at[idx, "maxy"] = maxy

        idx+=1

# Hard-coded bounding box override for Bhadrak. A boolean mask must go through
# `.loc`; `.at` only accepts a single row label.
bhadrak_bounds = (86.27553642278104, 20.735154747771226, 86.97753404572204, 21.236571014401317)
minx, miny, maxx, maxy = bhadrak_bounds
is_bhadrak = dfll["district"]=="Bhadrak"
dfll.loc[is_bhadrak, "minx"] = minx
dfll.loc[is_bhadrak, "maxx"] = maxx
dfll.loc[is_bhadrak, "miny"] = miny
dfll.loc[is_bhadrak, "maxy"] = maxy

dfll.to_csv("../data/extracted_data_api/code_data/district_center_bounds.csv", index=False)

In [1100]:
"""
df = pd.read_csv("../data/extracted_data_api/code_data/clusters.csv")
cluster_location_list = df["cluster_location"].unique()

## One time - Get lat, long of cluster, foreign travel centers
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
geolocator = Nominatim(user_agent="myGeocoder")

#cluster_district_mapping = {"Saudi":"Saudi Arabia", "Evacuees*":"Iran", "Abhudhabi":"Abu dhabi", "WB":"West Bengal"}
cluster_district_skip = ["Shahid Bhagat Singh Nagar", "Thenkashi"]

#dfcll = pd.DataFrame()
#idx = 0
for loc in cluster_location_list[147:]:
    if ((not pd.isnull(loc)) and (loc not in cluster_district_skip)):

        try:
            location = geolocator.geocode(loc, timeout=10) #, geometry="geojson")
            print(loc + " " + str(location.latitude), str(location.longitude))
            dfcll.at[idx, "location"] = loc
            dfcll.at[idx, "latitude"] = location.latitude
            dfcll.at[idx, "longitude"] = location.longitude
            dfcll.at[idx, "minx"] = location.raw["boundingbox"][2]
            dfcll.at[idx, "maxx"] = location.raw["boundingbox"][3]
            dfcll.at[idx, "miny"] = location.raw["boundingbox"][0]
            dfcll.at[idx, "maxy"] = location.raw["boundingbox"][1]
            
        except GeocoderTimedOut as e:
            dfcll.at[idx, "location"] = loc
            dfcll.at[idx, "latitude"] = np.nan
            dfcll.at[idx, "longitude"] = np.nan
            dfcll.at[idx, "minx"] = np.nan
            dfcll.at[idx, "maxx"] = np.nan
            dfcll.at[idx, "miny"] = np.nan
            dfcll.at[idx, "maxy"] = np.nan
            print(loc + " Error: geocode failed on input %s with message %s"%(loc, e.message))

        idx+=1
dfcll.to_csv("../data/extracted_data_api/code_data/cluster_center_bounds.csv", index=False)
"""
#print("One time")

Dehradun 30.3255646 78.0436813
Pollachi 10.669422350000001 77.01017959253107
Surat 45.9383 3.2553
Hindupur 13.826383 77.4937723
Maharashtra 19.531932 76.0554568
Kottayam 9.62857045 76.6455250291479


In [3]:
def is_in_pharma_company_list(notes):
    """Return True when `notes` contains one of the pharma-company keywords."""
    keywords = ("Pharmaceutical company employee", "Pharma Company Worker")
    return any(keyword in notes for keyword in keywords)

def is_in_delhi_religious_list(notes):
    """Return True when `notes` mentions attendance at the Delhi religious conference."""
    keywords = ("Attended Delhi Religious Conference",)
    return any(keyword in notes for keyword in keywords)
    
def is_in_travel_list(notes):
    """Extract the place a patient travelled from/to out of the free-text notes.

    Returns
    -------
    str
        The word following the first matching travel phrase (two words when the
        first one is "United", "New", "Sri", "Abu", "West" or "Uttar"), cut at the
        first ",", ".", ";" or ")".
    float (NaN)
        When only a generic travel keyword matches and no place can be extracted.
        NaN is truthy, so callers that test the result in an `if` treat this as a match.
    False
        When the notes do not mention travel at all.
    """
    travel_list = ["Travelled from ", "Travelled to ", "Tavelled from ", "Travel history to ", "Travel History to ",
                  "foreign travel from ", "travelled to ", "Returned from ", "Travel HIstory to ",
                  "History of travel to ", "Evacuees from "]

    # NOTE(review): "No history of travel" and the lower-case "travel history" also
    # match notes that state the absence of travel -- confirm this is intended.
    travel_place_unknown_list = ["No history of travel", "Travel history", "travel history", "Foreign Travel",
                                ]

    two_word_prefixes = ("United", "New", "Sri", "Abu", "West", "Uttar")

    for phrase in travel_list:
        if phrase in notes:
            words = notes.split(phrase)[1].split(" ")
            travelled_from_place = words[0]
            # Two-word place names; the length check avoids an IndexError when the
            # first word is also the last word of the notes.
            if travelled_from_place in two_word_prefixes and len(words) > 1:
                travelled_from_place = words[0] + " " + words[1]
            # Cut trailing punctuation and anything after it, one delimiter at a time.
            for delimiter in (",", ".", ";", ")"):
                travelled_from_place = travelled_from_place.split(delimiter)[0]
            return travelled_from_place

    for phrase in travel_place_unknown_list:
        if phrase in notes:
            return np.nan

    return False

def is_in_iran_evacuee_list(notes):
    """Return True when `notes` identifies the patient as an evacuee from Iran."""
    keywords = ("Evacuees from Iran", "Evacuee from Iran")
    return any(keyword in notes for keyword in keywords)

def is_in_misc_list(notes):
    """Find the first miscellaneous category phrase contained in a case note.

    Parameters
    ----------
    notes : str
        Free-text notes of one case record. Non-string values (for example
        the float NaN pandas uses for an empty cell) never match.

    Returns
    -------
    str or bool
        The first entry of ``misc_list`` (in list order) that occurs in
        ``notes``, or False when none does.
    """
    # Empty cells arrive as float NaN; `phrase in nan` would raise TypeError.
    if not isinstance(notes, str):
        return False
    misc_list = ["Co-passenger", "Cab driver",  "Indian Army jawan", "Travelled in a car with",
                "Supertech Capetown, Sector 74, NOIDA", "Tourist", "BSF officer" , "Dharavi", "Domestic Travel",
                "SARI", "Police", "Journalist", "Influenza like illness", "Conatct with patient", "Influenza like Illness"]
    return next((phrase for phrase in misc_list if phrase in notes), False)

def is_in_no_travel_history_list(notes):
    """Tell whether a case note states that the patient has no travel history.

    Parameters
    ----------
    notes : str
        Free-text notes of one case record. Non-string values (for example
        the float NaN pandas uses for an empty cell) never match.

    Returns
    -------
    bool
        True if any known phrase occurs in ``notes``, otherwise False.
    """
    # Empty cells arrive as float NaN; `phrase in nan` would raise TypeError.
    if not isinstance(notes, str):
        return False
    no_travel_history_list = ["No travel history", "No Travel History", "No foreign travel",
                             "No history of travelling abroad",]
    return any(phrase in notes for phrase in no_travel_history_list)

def is_in_healthcare_list(notes):
    """Find the first healthcare-role phrase contained in a case note.

    Parameters
    ----------
    notes : str
        Free-text notes of one case record. Non-string values (for example
        the float NaN pandas uses for an empty cell) never match.

    Returns
    -------
    str or bool
        The first entry of ``healthcare_list`` (in list order) that occurs
        in ``notes``, or False when none does.
    """
    # Empty cells arrive as float NaN; `phrase in nan` would raise TypeError.
    if not isinstance(notes, str):
        return False
    healthcare_list = ["Doctor", "Staff of Private Hospital", "Nurse", "Hospital staff", "Health worker",
                      "Sanitation worker", "Attendent", "Hospital Staff"]
    return next((phrase for phrase in healthcare_list if phrase in notes), False)
    
def is_in_skip_list(notes):
    """Tell whether a case note matches a phrase that is deliberately ignored.

    Parameters
    ----------
    notes : str
        Free-text notes of one case record. Non-string values (for example
        the float NaN pandas uses for an empty cell) never match.

    Returns
    -------
    bool
        True if any phrase of ``skip_list`` occurs in ``notes``, otherwise
        False.
    """
    # Empty cells arrive as float NaN; `phrase in nan` would raise TypeError.
    if not isinstance(notes, str):
        return False
    skip_list = ["Factory Manager contact with businessman from Italy", "Was in contact with",
                "Have international history and were brought in from the Mumbai Airport", "Sector 78; Contracted from his boss",
                 "Reached Chennai MGR Central from Delhi via Train on 12.03.2020", "Travelled to UK", "Details awaited",
                 "Bangar Hospital Compounder","Admitted on Swine Flu suspicion; COVID19 confirmed",
                 "Worked at a beauty salon that Chandigarh’s first coronavirus patient visited",
                 "Landed in Mangalore airport, had been to Kasturba Medical College Attavar",
                 "No international travel history, has travelled to  Bilaspur by train",
                 "One more person, who is under observation at Dehradun's Doon Hospita", "Details Awaited",
                 "Sonipat", "Anganwadi Worker, Visited Velha, Attended Marriage, Tested Positive on 24th March, 2020",
                 "Native of", "details awaited", "Travel history (first case from Ramganj?)", "Travelled with ",
                 "Matched ", "A 50-year-old man, a railway guard, tested positive at the All India Institute of Medical Sciences, Bhopal",
                 "Match", "IPD patient at of Bangar Hospital", "earlier tested positive", "Woman in 40s",
                 "Worked at", "Patient is from", "Admit Noble Hospital on 2nd April, 2020", "Further Details not known",
                 "From ", "Belongs to", "Admitted to", "Admit to", "Visited", "Organized Medical camp with participation of NRIs",
                 "The detailed investigation is under process", "admit", "confimed positive", "No Info", "Classified as",
                 "They work at a bakery in Jawaharpur, Dera Bassi", "Source of", "Travelle from", "Returned to",
                 "ambulance", "Travelld from", "As per MohFW update", "Conference", "History of travel from Hidupur on 2 wheeler on 7/4/2020",
                 "Neighour of", "WB returnees", "Kolkata Returnees", "New case", "Trancing under process"
                ]
    return any(phrase in notes for phrase in skip_list)
    
def is_in_family_list(notes):
    """Tell whether a case note links the patient to a family member or relative.

    Besides kinship terms, ``family_list`` also contains phrases such as
    "Close contact of", "Friends of", "Employee of" and "Contact of", which
    are listed in ``is_in_contact_list`` as well.
    NOTE(review): a caller that checks this function before
    ``is_in_contact_list`` labels those notes as family - confirm intended.

    Parameters
    ----------
    notes : str
        Free-text notes of one case record. Non-string values (for example
        the float NaN pandas uses for an empty cell) never match.

    Returns
    -------
    bool
        True if any phrase of ``family_list`` occurs in ``notes``, otherwise
        False.
    """
    # Empty cells arrive as float NaN; `phrase in nan` would raise TypeError.
    if not isinstance(notes, str):
        return False
    family_list = ["Father of", "Mother of", "Wife of", "Son of", "Daughter of", "Brother of", "Sister of", "Son in law of",
                  "Daughter in law of", "Close contact of", "Family members of" ,"Relative of",
                  "Friends of", "Friend of", "First contacts", "family members", "Family members", "Spouse of",
                  "Father", "Mother", "Related to", "Employee of", "Contact of", "contact of", "husband",
                  "Daughter in Law", "Uncle of", "Aunt of", "Close relative of", "Relatives of", "Grandmother of",
                  "Family member of", "Cousin of", "Brother-in-law of", "Husband of", "Sister in law of", "Family",
                   "family", "relative", "Sister-in-law of"
                  ]
    return any(phrase in notes for phrase in family_list)

def is_in_contact_list(notes):
    """Tell whether a case note describes contact with a case or local transmission.

    Parameters
    ----------
    notes : str
        Free-text notes of one case record. Non-string values (for example
        the float NaN pandas uses for an empty cell) never match.

    Returns
    -------
    bool
        True if any phrase of ``contact_list`` occurs in ``notes``,
        otherwise False.
    """
    # Empty cells arrive as float NaN; `phrase in nan` would raise TypeError.
    if not isinstance(notes, str):
        return False
    contact_list = ["Close contact of", "neighbour of", "Friends of", "Friend of", "First contacts",
                    "Employee of", "Contact of", "contact of", "Contact", "contact", "Local transmission",
                   "locally transmitted", "Local Transmission", "Tenant of", "Neighbour of", "neighbour of",
                   "Neighbor of", "Co-worker", "Came in touch with a positive person"]
    return any(phrase in notes for phrase in contact_list)

def is_in_worker_list(notes):
    """Tell whether a case note describes the patient as a cook or domestic worker of someone.

    Parameters
    ----------
    notes : str
        Free-text notes of one case record. Non-string values (for example
        the float NaN pandas uses for an empty cell) never match.

    Returns
    -------
    bool
        True if any phrase of ``worker_list`` occurs in ``notes``, otherwise
        False.
    """
    # Empty cells arrive as float NaN; `phrase in nan` would raise TypeError.
    if not isinstance(notes, str):
        return False
    worker_list = ["Cook of", "Domestic worker of"]
    return any(phrase in notes for phrase in worker_list)

In [4]:
# Maps compact state/UT identifiers to display names. Keys are lowercase with
# spaces removed (and the word "and" dropped, e.g. 'jammukashmir'); values are
# the spaced, capitalised names. resolve_district_name() accepts both forms.
# Where the compact identifiers originate is not visible in this cell - TODO confirm.
state_name_mapping = {'odisha':'Odisha', 'telangana':'Telangana', 'meghalaya':'Meghalaya','karnataka':'Karnataka',
                     'haryana':'Haryana','bihar':'Bihar','andhrapradesh':'Andhra Pradesh','jammukashmir':'Jammu and Kashmir',
                     'westbengal':'West Bengal','kerala':'Kerala','chhattisgarh':'Chhattisgarh','andamannicobarislands':'Andaman and Nicobar Islands',
                     'jharkhand':'Jharkhand','ladakh':'Ladakh','uttarpradesh':'Uttar Pradesh','mizoram':'Mizoram','lakshadweep':'Lakshadweep',
                     'nagaland':'Nagaland','tamilnadu':'Tamil Nadu','dadranagarhaveli':'Dadra and Nagar Haveli','delhi':'Delhi',
                     'puducherry':'Puducherry','madhyapradesh':'Madhya Pradesh','arunachalpradesh':'Arunachal Pradesh','uttarakhand':'Uttarakhand',
                     'manipur':'Manipur','tripura':'Tripura','gujarat':'Gujarat','goa':'Goa','assam':'Assam','maharashtra':'Maharashtra',
                     'punjab':'Punjab','sikkim':'Sikkim','rajasthan':'Rajasthan','chandigarh':'Chandigarh','himachalpradesh':'Himachal Pradesh'}

# Maps abbreviated state/UT spellings to the full names that appear as values
# in state_name_mapping. Presumably the abbreviated spellings come from the
# "How India Lives" data set (going by the variable name) - TODO confirm.
state_name_mapping_howindialives = {"Jammu & Kashmir":"Jammu and Kashmir",
                                    "A& N Islands":"Andaman and Nicobar Islands",
                                    "D & N Haveli":"Dadra and Nagar Haveli"}

# District spelling fixes. The "District name mapping" loop replaces a row's
# `district` value with the mapped spelling when the value is a key here.
# The target spellings presumably match the boundary/geo data set the cases
# are joined against - TODO confirm.
# NOTE(review): "Noney" maps to the literal string "None" and "Chittoor" to
# lowercase "chittoor"; verify these are the target spellings and not typos.
district_name_mapping = {"Jagitial":"Jagtial", "Jangoan":"Jangaon", "Kumuram Bheem Asifabad":"Komaram Bheem Asifabad",
                         "Mahabubnagar": "Mahbubnagar", "Ranga Reddy": "Rangareddy", "Yadadri Bhuvanagiri":"Yadadri Bhongir",
                         "Jayashankar": "Jayashankar Bhupalapally",
                         "West Khasi Hills":"West khasi Hills", "East Jaintia Hills":"East Jainta Hills",
                        "Kalaburagi":"Gulbarga", "Belagavi":"Belgaum", "Bagalkote":"Bagalkot", "Ballari":"Bellary",
                        "Shivamogga":"Shimoga", "Chikkamagaluru":"Chikmagalur", "Bengaluru Rural":"Bangalore Rural",
                        "Bengaluru":"Bangalore", "Chamarajanagara":"Chamarajanagar", "Tumakuru":"Tumkur", "Mysuru":"Mysore","Vijayapura":"Bijapur",
                        "Gurugram":"Gurgaon", "Mahendragarh":"Narnaul","Charki Dadri":"Ch-Dadri",
                        "West Champaran": "Pashchim Champaran", "East Champaran":"Purba Champaran", "Kaimur Bhabhua":"Kaimur (Bhabua)", "Kaimur Bhabua":"Kaimur (Bhabua)",
                        "S.P.S. Nellore":"Nellore", "Chittoor":"chittoor",
                        "South 24 Parganas":"South Twenty Four Parganas", "Medinipur West":"Paschim Medinipur", "Medinipur East":"Purba Medinipur",
                        "Darjeeling":"Darjiling", "Purulia":"Puruliya", "North 24 Parganas":"North Twenty Four Parganas", "Hooghly":"Hugli",
                        "Howrah":"Haora", "Cooch Behar":"Koch Bihar",
                        "Gariaband":"Gariyaband", "Kabeerdham":"Kawardha", "Janjgir Champa":"Janjgir - Champa", "Bametara":"Bemetara",
                        "Baloda Bazar":"Balodabazar",
                        "North and Middle Andaman":"North & Middle Andman", "South Andaman":"South Andman","Nicobars":"Nicobar",
                        "Saraikela-kharsawan":"Saraikela-Kharsawan",
                        "Leh":"Leh(Ladakh)",
                        "Amroha":"J P Nagar (Amroha)", "Kheri":"Lakhimpur Kheri", "Kasganj":"Kanshiram Nagar", "Prayagraj":"Allahabad",
                        "Bhadohi":"Sant Ravidas Nagar (Bhadohi)", "Amethi":"C S M Nagar (Amethi)",
                        "Thoothukudi":"Thoothukkudi","Dadra and Nagar Haveli":"Uni District UT", 
                        "Khargone":"Khargone (West Nimar)", "Khandwa":"Khandwa (East Nimar)", "Ashoknagar":"Ashok Nagar",
                        "Kra Daadi":"Kra daadi", "Lower Dibang Valley":"Lower Dibang valley", "Upper Dibang Valley":"Dibang Valley",
                        "Haridwar":"Hardwar", "Noney":"None", "Pherzawl":"Pherjawl", "Sipahijala":"Sepahijala District","Gomati":"Gomati District",
                        "Khowai":"Khowai District", "Unokoti":"Unakoti District", "The Dangs":"Dang", "Chota Udaipur":"Chhota Udepur",
                        "Aravalli":"Arvalli",
                        "S.A.S. Nagar":"Mohali", "Sri Muktsar Sahib":"Muktsar","Jalore":"Jalor",
                        "Jhunjhunu":"Jhunjhunun", "Lahul and Spiti":"Lahul & Spiti","North  District":"North District"
                        }


# District names that resolve_district_name() disambiguates by appending the
# state in parentheses; each of them is handled there for two different states.
district_name_resolution = ["Aurangabad", "Bijapur", "Raigarh", "Bilaspur", "Balrampur", "Hamirpur", "Pratapgarh"]

# Earlier version of the unmapped-district list, disabled by wrapping it in a
# bare string literal: the statement below assigns nothing and has no effect.
"""
district_name_unmapped_old = ["Muzaffarabad", "Mirpur", "Shi Yomi", "Pakke Kessang", "Pakke-Kessang", "Lepa Rada", "Pauri Garhwal",
                         "Charaideo", "Majuli", "Biswanath", "Hojai", "South Salmara Mancachar",
                         "Shahid Bhagat Singh Nagar", "West Karbi Anglong"]
"""

# Second table of district spelling fixes. Several target spellings (e.g.
# "Nellore", "Mohali", "Jalor", "Ch-Dadri") are the same ones used as values
# in district_name_mapping. Presumably the keys are the spellings found on
# Wikipedia (going by the variable name) - TODO confirm against the caller.
district_name_wiki_mapping = {"Sri Potti Sriramulu Nellore":"Nellore", "Kadapa":"Y.S.R.",
    "Dantewada":"Dakshin Bastar Dantewada", "Janjgir-Champa":"Janjgir - Champa","Kabirdham":"Kawardha", "Kanker":"Uttar Bastar Kanker",
    "Ahmedabad":"Ahmadabad", "Banaskantha": "Banas Kantha", "Dahod":"Dohad", "Devbhoomi Dwarka":"Devbhumi Dwarka",
    "Kutch":"Kachchh", "Mehsana":"Mahesana", "Panchmahal":"Panch Mahals", "Sabarkantha":"Sabar Kantha",
    "Charkhi Dadri":"Ch-Dadri", "Hissar":"Hisar", "Yamuna Nagar":"Yamunanagar",
    "Lahaul and Spiti":"Lahul & Spiti",
    "East Singhbhum":"Purbi Singhbhum", "West Singhbhum":"Pashchimi Singhbhum", "Hazaribag":"Hazaribagh", "Koderma":"Kodarma", "Seraikela Kharsawan":"Saraikela-Kharsawan",
    "Bangalore Urban":"Bangalore","Chamarajnagar":"Chamarajanagar","Chikkaballapur":"Chikkaballapura",
    "Narsinghpur":"Narsimhapur", "Ahmednagar":"Ahmadnagar", "Beed":"Bid", "Buldhana":"Buldana", "Gondia":"Gondiya", "Mumbai City":"Mumbai","Mumbai suburban":"Mumbai Suburban",
    "Raigad":"Raigarh (Maharashtra)","Ri Bhoi":"Ribhoi", "Angul":"Anugul", "Boudh (Bauda)":"Baudh", "Bargarh (Baragarh)":"Bargarh", "Balasore":"Baleshwar",
    "Debagarh (Deogarh)":"Debagarh", "Jajpur":"Jajapur", "Jagatsinghpur":"Jagatsinghapur","Kendujhar (Keonjhar)":"Kendujhar", "Nabarangpur":"Nabarangapur", "Subarnapur (Sonepur)":"Subarnapur",
    "Chittorgarh":"Chittaurgarh", "Dholpur":"Dhaulpur", "East Sikkim":"East District", "North Sikkim":"North District", "South Sikkim":"South District", "West Sikkim":"West District",
    "Kanchipuram":"Kancheepuram", "Kanyakumari":"Kanniyakumari", "Nilgiris":"The Nilgiris", "Tirupur":"Tiruppur",
    "Tiruvallur":"Thiruvallur","Tiruvarur":"Thiruvarur", "Komaram Bheem":"Komaram Bheem Asifabad","Jayashankar Bhupalpally":"Jayashankar Bhupalapally", "Medchal-Malkajgiri":"Medchal Malkajgiri",
    "Sepahijala":"Sepahijala District", "Bagpat":"Baghpat", "Barabanki":"Bara Banki", "Gautam Buddh Nagar":"Gautam Buddha Nagar",
    "Maharajganj":"Mahrajganj", "Raebareli":"Rae Bareli", "Shravasti":"Shrawasti","Bandipora":"Bandipore", "Baramulla":"Baramula",
    "Poonch":"Punch", "Shopian":"Shupiyan","Mahé":"Mahe","Pondicherry":"Puducherry",
    "Kaimur":"Kaimur (Bhabua)", "Sahibzada Ajit Singh Nagar":"Mohali","Jalore":"Jalor"
}

# District names without a mapping - presumably names that have no counterpart
# in the target boundary data set (TODO confirm against the code that uses it).
# "Niwari" was listed twice; the duplicate entry has been removed.
# NOTE(review): "Sahibzada Ajit Singh Nagar" and "Jalore" are also keys of
# district_name_wiki_mapping - confirm they still belong here.
district_name_unmapped = ["Muzaffarabad", "Mirpur", "Shi Yomi", "Pakke Kessang", "Pakke-Kessang", "Lepa Rada", "Pauri Garhwal",
                         "Charaideo", "Majuli", "Biswanath", "Hojai", "South Salmara Mancachar", "South Salmara", "Bishwanath",
                         "Shahid Bhagat Singh Nagar", "West Karbi Anglong",
                         "Gaurela-Pendra-Marwahi", "Chachaura-Binaganj", "Maihar", "Niwari", "Nagda", "Noklak",
                         "Sahibzada Ajit Singh Nagar", "Jalore", "Chengalpattu", "Kallakurichi", "Mayiladuthurai",
                         "Ranipet","Tenkasi","Tirupattur", "Daman", "Diu", "Tirupathur"]

def resolve_district_name(state_name, district_name):
    """Qualify a district name that occurs in two different states.

    Parameters
    ----------
    state_name : str
        State of the record, either in compact form ("uttarpradesh") or in
        display form ("Uttar Pradesh"). The comparison is exact.
    district_name : str
        District name to qualify.

    Returns
    -------
    str
        ``"<district> (<State>)"`` when the (state, district) pair is one of
        the known ambiguous combinations, otherwise ``district_name``
        unchanged.
    """
    # district -> {accepted state spellings: qualified district name}
    ambiguous_districts = {
        "Aurangabad": {
            ("bihar", "Bihar"): "Aurangabad (Bihar)",
            ("maharashtra", "Maharashtra"): "Aurangabad (Maharashtra)",
        },
        "Bijapur": {
            ("karnataka", "Karnataka"): "Bijapur (Karnataka)",
            ("chhattisgarh", "Chhattisgarh"): "Bijapur (Chhattisgarh)",
        },
        "Raigarh": {
            ("maharashtra", "Maharashtra"): "Raigarh (Maharashtra)",
            ("chhattisgarh", "Chhattisgarh"): "Raigarh (Chhattisgarh)",
        },
        "Bilaspur": {
            ("himachalpradesh", "Himachal Pradesh"): "Bilaspur (Himachal Pradesh)",
            ("chhattisgarh", "Chhattisgarh"): "Bilaspur (Chhattisgarh)",
        },
        "Balrampur": {
            ("uttarpradesh", "Uttar Pradesh"): "Balrampur (Uttar Pradesh)",
            ("chhattisgarh", "Chhattisgarh"): "Balrampur (Chhattisgarh)",
        },
        "Hamirpur": {
            ("uttarpradesh", "Uttar Pradesh"): "Hamirpur (Uttar Pradesh)",
            ("himachalpradesh", "Himachal Pradesh"): "Hamirpur (Himachal Pradesh)",
        },
        "Pratapgarh": {
            ("uttarpradesh", "Uttar Pradesh"): "Pratapgarh (Uttar Pradesh)",
            ("rajasthan", "Rajasthan"): "Pratapgarh (Rajasthan)",
        },
    }

    candidates = ambiguous_districts.get(district_name, {})
    for accepted_states, qualified_name in candidates.items():
        if state_name in accepted_states:
            return qualified_name
    return district_name
  

### Misc

In [None]:
# NOTE(review): orphaned fragment, not runnable as-is. It opens with `elif`
# without a preceding `if`, its indentation levels are inconsistent, and
# `df`, `i`, `notes`, `district` and `state` are not defined in this cell.
# Each branch labels row `i` with a cluster named after the phrase found in
# `notes` and copies the row's own district/state into the cluster columns.
elif "No travel history" in notes:
            df.at[i, "cluster"] = "No travel history"
            df.at[i, "cluster_district"] = district
            df.at[i, "cluster_state"] = state
        
        elif "Co-passenger" in notes:
            df.at[i, "cluster"] = "Co-passenger"
            df.at[i, "cluster_district"] = district
            df.at[i, "cluster_state"] = state

        elif "Cab driver" in notes:
            df.at[i, "cluster"] = "Cab driver"
            df.at[i, "cluster_district"] = district
            df.at[i, "cluster_state"] = state
        
        elif "Doctor" in notes:
            df.at[i, "cluster"] = "Doctor"
            df.at[i, "cluster_district"] = district
            df.at[i, "cluster_state"] = state
        
        elif "Indian Army jawan" in notes:
            df.at[i, "cluster"] = "Indian Army jawan"
            df.at[i, "cluster_district"] = district
            df.at[i, "cluster_state"] = state

In [None]:
# Map to finite clusters
#
# Every row gets a `cluster` label (and, where known, `cluster_district` /
# `cluster_state`). Rows whose `contracted_from` id is a known source
# patient/event are labelled from the table below; all other rows are
# classified from their free-text `notes` with the is_in_*_list helpers.

# `contracted_from` id -> (cluster, cluster_district, cluster_state)
known_source_clusters = {
    "E0": ("Delhi Religious meeting", "New Delhi", "Delhi"),
    "P4862": ("Delhi Religious meeting", "New Delhi", "Delhi"),
    "P531": ("Delhi Religious meeting", "New Delhi", "Delhi"),
    "P689": ("Mysuru Pharmaceutical industry", "Mysore", "Karnataka"),
    "P1215": ("Mysuru Pharmaceutical industry", "Mysore", "Karnataka"),
    "P182": ("Punjab Preacher", "Shaheed Bhagat Singh Nagar", "Punjab"),
    "P20410": ("Bengaluru scrap segregation worker", "Bangalore", "Karnataka"),
    "P6": ("Italian tourists in Rajasthan", "Jaipur", "Rajasthan"),
    "P301": ("Thai national in Tamil Nadu", "Erode", "Tamil Nadu"),
    "E1": ("Contact with UK returnee", "Gautam Buddha Nagar", "Uttar Pradesh"),
    "P2868": ("Dubai returnee, hosted feast for 1500 people", "Morena", "Madhya Pradesh"),
    "P10454": ("Doctor from Bethany Hospital", "East Khasi Hills", "Meghalaya"),
}


def _cluster_fields_from_notes(notes):
    """Classify one free-text note into the columns to write for its row.

    The checks run in a fixed priority order; the first one that matches wins.

    Returns
    -------
    dict or None
        Column name -> value for a recognised note, an empty dict for a note
        on the skip list (nothing is written), or None when the note matches
        no list at all.
    """
    if is_in_iran_evacuee_list(notes):
        return {"cluster": "Iran evacuee", "cluster_district": "Iran", "cluster_state": "Iran"}

    # May be NaN (travel mentioned, place unknown); NaN is truthy on purpose.
    travelled_from_place = is_in_travel_list(notes)
    if travelled_from_place:
        return {"cluster": "Travel History",
                "cluster_district": travelled_from_place,
                "cluster_state": travelled_from_place}

    if is_in_family_list(notes):
        return {"cluster": "Family member"}
    if is_in_contact_list(notes):
        return {"cluster": "Close Contact"}
    if is_in_healthcare_list(notes):
        return {"cluster": "Healthcare worker"}
    if is_in_worker_list(notes):
        return {"cluster": "Domestic Worker"}
    if is_in_no_travel_history_list(notes):
        return {"cluster": "No travel history"}

    if is_in_delhi_religious_list(notes):
        return {"cluster": "Delhi Religious meeting",
                "cluster_district": "New Delhi",
                "cluster_state": "Delhi",
                "contracted_from": "E0",
                "derived_contracted_from": "E0"}

    if is_in_pharma_company_list(notes):
        # NOTE(review): "E1" is labelled "Contact with UK returnee" in
        # known_source_clusters, while the pharma cluster's known source ids
        # are P689/P1215 - confirm that "E1" is the intended id here.
        return {"cluster": "Mysuru Pharmaceutical industry",
                "cluster_district": "Mysore",
                "cluster_state": "Karnataka",
                "contracted_from": "E1",
                "derived_contracted_from": "E1"}

    cluster_category = is_in_misc_list(notes)
    if cluster_category:
        return {"cluster": cluster_category}

    if is_in_skip_list(notes):
        return {}

    return None


# Rows are read by position and written by their real index label, so the
# result lands on the right row even when `df` no longer has a 0..n-1 index
# (for example after dropna() without reset_index()).
for position, row_label in enumerate(df.index):
    record = df.iloc[position]
    cf = record["contracted_from"]
    notes = record["notes"]

    # If both notes and contracted from are empty, skip this entry
    if pd.isna(cf) and pd.isna(notes):
        continue

    if cf in known_source_clusters:
        cluster, cluster_district, cluster_state = known_source_clusters[cf]
        fields = {"cluster": cluster,
                  "cluster_district": cluster_district,
                  "cluster_state": cluster_state}
    elif not isinstance(notes, str):
        # Unknown source patient and no free text (NaN): nothing to classify.
        continue
    else:
        fields = _cluster_fields_from_notes(notes)
        if fields is None:
            # Unrecognised note: print it so the phrase lists can be extended.
            print(notes)
            continue
        if not fields:
            continue

    for column, value in fields.items():
        df.at[row_label, column] = value