## All COVID-19 Cases by Country

In [43]:
import pandas as pd
import numpy as np
import os

In [44]:
# Specific to JHCSSE data
def clean_data(file):
    df = pd.read_csv(file)
    df = df.drop(['Lat','Long','Province/State'], axis=1)
    df.rename(columns={'Country/Region': 'Country_Region'}, inplace = True)
    df = df.groupby('Country_Region').sum()
    df.rename(index={'US':'United States',
        'Taiwan*':'Taiwan',
        'Korea, South':'South Korea',
        'Congo (Brazzaville)':'Congo',
        'Congo (Kinshasa)':'Democratic Republic of Congo',
        'Czechia':'Czech Republic',
        'Gambia, The':'Gambia',},inplace=True)
    return df

#### Confirmed Cases

In [45]:
# All Countries
confirmed_df = clean_data('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv')
confirmed_df.head(1)

Unnamed: 0_level_0,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,1/31/20,...,4/4/20,4/5/20,4/6/20,4/7/20,4/8/20,4/9/20,4/10/20,4/11/20,4/12/20,4/13/20
Country_Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,0,0,0,0,0,0,0,0,0,0,...,299,349,367,423,444,484,521,555,607,665


In [48]:
# Global row
global_row = pd.Series(confirmed_df.sum(), name='Global')
confirmed_df = confirmed_df.append(global_row)
confirmed_df.tail()

Unnamed: 0_level_0,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,1/31/20,...,4/4/20,4/5/20,4/6/20,4/7/20,4/8/20,4/9/20,4/10/20,4/11/20,4/12/20,4/13/20
Country_Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Western Sahara,0,0,0,0,0,0,0,0,0,0,...,0,4,4,4,4,4,4,4,6,6
Yemen,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,1,1
Zambia,0,0,0,0,0,0,0,0,0,0,...,39,39,39,39,39,39,40,40,43,45
Zimbabwe,0,0,0,0,0,0,0,0,0,0,...,9,9,10,11,11,11,13,14,14,17
Global,555,654,941,1434,2118,2927,5578,6166,8234,9927,...,1197408,1272115,1345101,1426096,1511104,1595350,1691719,1771514,1846679,1917319


In [49]:
# Top Countries
n = 15
last_day = confirmed_df.columns[-1]
top_confirmed_df = confirmed_df.nlargest(n, last_day)
top_confirmed_df.drop(index='Global', inplace=True)
top_confirmed_df.index

Index(['United States', 'Spain', 'Italy', 'France', 'Germany',
       'United Kingdom', 'China', 'Iran', 'Turkey', 'Belgium', 'Netherlands',
       'Switzerland', 'Canada', 'Brazil'],
      dtype='object', name='Country_Region')

#### Fatalities

In [6]:
# All Countries
fatalities_df = clean_data('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv')

# Global row
global_row = pd.Series(fatalities_df.sum(), name='Global')
fatalities_df = fatalities_df.append(global_row)

# Top Countries
top_fatalities_df = fatalities_df.loc[fatalities_df.index.isin(top_confirmed_df.index)]

#### Recovered

In [7]:
# All Countries
recovered_df = clean_data('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv')

# Global row
global_row = pd.Series(recovered_df.sum(), name='Global')
recovered_df = recovered_df.append(global_row)

# Top Countries
top_recovered_df = recovered_df.loc[recovered_df.index.isin(top_confirmed_df.index)]

#### Infected = Confirmed - Recovered

In [8]:
# All Countries
infected_df = confirmed_df-recovered_df

# Global row
global_row = pd.Series(infected_df.sum(), name='Global')
infected_df = infected_df.append(global_row)

# Top Countries
top_infected_df = infected_df.loc[infected_df.index.isin(top_confirmed_df.index)]

In [18]:
# peak = top_infected_df.loc['China','2/18/20']
peak = top_infected_df.loc['United States','4/12/20']
peak

522325

#### Most recent

In [10]:
def recent_stat(country_name, df):
    lookup = list(df.loc[df.index == country_name].values[0])
    lookup.reverse()
    most_recent = next((i for i in lookup if i.any()), None)
    return most_recent

# print(recent_stat("United States", top_infected_df))
# print(recent_stat("Italy", top_infected_df))

#### All Cases

In [11]:
countries = confirmed_df.index
all_df = pd.DataFrame(columns=['Country_Region', 'Confirmed', 'Recovered', 'Fatalities'], data=[])
for country in countries:
    all_df = all_df.append({'Country_Region': country,
                            'Confirmed': recent_stat(country, confirmed_df),
                            'Recovered': recent_stat(country, recovered_df),
                            'Fatalities': recent_stat(country, fatalities_df),
                           }, ignore_index=True)

In [12]:
# First Day Values
def first_day(df, country):
    row = list(df.loc[df.index == country].values[0])
    col = next((i for i, j in enumerate(row) if j.any()), None)     
    first_day = df.columns[col]
    return first_day

# Example:
# print(first_day(top_confirmed_df, "Italy"))
# print(first_day(top_fatalities_df, "Italy"))

In [13]:
# First Day column
firsts = [] 
for country in countries:
    firsts.append(first_day(confirmed_df, country))
all_df['First Day'] = firsts

In [14]:
# Global row
all_df = all_df.append(all_df.sum(), ignore_index=True)
all_df = all_df.replace(all_df.iloc[-1,0], 'Global')
all_df.replace(all_df.iloc[-1, -1], all_df['First Day'].values.min(), inplace=True)
all_df.tail()

Unnamed: 0,Country_Region,Confirmed,Recovered,Fatalities,First Day
182,Yemen,1,,,4/10/20
183,Zambia,45,30.0,2.0,3/18/20
184,Zimbabwe,17,,3.0,3/20/20
185,Global,1917319,448655.0,119482.0,1/22/20
186,Global,3834638,897326.0,238964.0,1/22/20


In [15]:
# Save JSON
# all_df.to_json('all.json', orient='records')

## Reorganizing Data to First Day of Confirmed Case

In [16]:
# New dataframes with all non-zero values at beginning
def first_day_df(df):
    dates = df.columns.tolist()
    days = range(len(dates))
    
    new_df = pd.DataFrame(columns=days)

    countries = df.index              # list of countries
    for country in countries:
        # Grab the country's row
        row = list(df.loc[df.index == country].values[0])
        # Grab the column index of first non-zero value
        col = next((i for i, j in enumerate(row) if j.any()), None)     
        # Calculate date
        first_day = df.columns[col]
        # Reset first day
        row = row[col:] + row[:col]
        # New rearranged dataframe
        new_df = new_df.append([row])

    # Same countries indexed
    new_df.index = df.index
    new_df = new_df.replace(0,np.NaN)
    
    return new_df

In [17]:
first_confirmed_top = first_day_df(top_confirmed_df)
first_infected_top = first_day_df(top_infected_df)

first_confirmed_all = first_day_df(confirmed_df)
# first_fatalities_all = first_day_df(fatalities_df)
# first_recovered_all = first_day_df(recovered_df)
# first_infected_all = first_day_df(infected_df)

#### By Region

In [60]:
# regions_df = pd.read_csv('../demographics/regions.csv')
# confirmed_byregion = pd.merge(regions_df, all_df, on='Country_Region', how='inner')
# confirmed_byregion.sort_values('First Day', inplace=True)
# confirmed_byregion.head(40)

In [61]:
# centralasia = confirmed_byregion.loc[confirmed_byregion['Region'].str.contains('Central Asia')]
# centralasia.sort_values('First Day')