# Data generation notebook

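Processes the COVID-19 time series (confirmed, deaths, recovered) from the CSSEGISandData/COVID-19 repository and generates cleaned, per-country CSV files in `generated/` for easier use.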

In [1]:
import pandas as pd

## Load data

In [2]:
# Source CSV for each time series, keyed by the name used for it in the rest of
# the notebook (and, at the end, for the generated file name).
source_urls = {
    'confirmed': 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv',
    'deaths': 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv',
    'recovered': 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv',
}

# Download every series into its own DataFrame.
data_frames = {name: pd.read_csv(url) for name, url in source_urls.items()}

## Remove cruises

In [3]:
# Cruise ships are listed in the 'Country/Region' column alongside real
# countries; drop their rows from every series.
cruise_ships = ['Diamond Princess', 'MS Zaandam']

for key, frame in data_frames.items():
    is_cruise = frame['Country/Region'].isin(cruise_ships)
    data_frames[key] = frame[~is_cruise]

## Rename countries

In [4]:
# Source label -> label used by this project.
# NOTE(review): presumably these targets match the `name` column of
# generated/population.csv, which is merged on country name later - confirm.
country_renames = {
    "Cote d'Ivoire": 'Ivory Coast',
    "Cabo Verde": 'Cape Verde',
    "Congo (Brazzaville)": 'Congo',
    "Congo (Kinshasa)": 'Democratic Republic of the Congo',
    "Czechia": 'Czech Republic',
    "Holy See": 'Vatican City',
    "Korea, South": 'South Korea',
    "Taiwan*": 'Taiwan',
    "US": 'United States',
    "West Bank and Gaza": 'State of Palestine',
    "Burma": 'Myanmar',
}

for key in data_frames.keys():
    # Work on a copy so the (filtered) frame produced by the previous cell is
    # not mutated and no SettingWithCopyWarning is raised.
    df = data_frames[key].copy()

    # Rename in the 'Country/Region' column only, in a single pass. A
    # frame-wide replace would scan every date column once per rename and
    # would also rewrite any 'Province/State' value that happens to equal one
    # of the keys. Matching is on exact values (no regex), so "Taiwan*" is
    # treated literally.
    df['Country/Region'] = df['Country/Region'].replace(country_renames)

    data_frames[key] = df

## Separate 'countries' such as Greenland

In [5]:
# Provinces/states that should be reported as countries of their own rather
# than being folded into their parent country.
standalone_regions = ['Greenland', 'French Guiana']

for frame in data_frames.values():
    # For the matching rows, the new country label is the province/state name
    # itself. The frame is updated in place, so the dict entry needs no
    # reassignment.
    is_standalone = frame['Province/State'].isin(standalone_regions)
    frame.loc[is_standalone, 'Country/Region'] = frame.loc[is_standalone, 'Province/State']

## Group by country

In [6]:
for key, frame in data_frames.items():
    per_province = frame.drop(columns='Province/State')

    # Everything after the first three columns is summed per country;
    # the coordinates are averaged instead.
    summed_columns = per_province.columns[3:]
    aggregations = {
        'Lat': 'mean',
        'Long': 'mean',
        **{column: 'sum' for column in summed_columns},
    }

    # One row per country, with 'Country/Region' restored as a regular column.
    grouped = per_province.groupby('Country/Region').agg(aggregations)
    data_frames[key] = grouped.reset_index()

## Reformat dates

In [7]:
for frame in data_frames.values():
    # The first three column labels are kept as they are; the remaining labels
    # are parsed as dates and rewritten as YYYY-MM-DD strings.
    fixed_labels = list(frame.columns[:3])
    iso_dates = pd.to_datetime(frame.columns[3:]).strftime("%Y-%m-%d")

    # Relabel in place; the dict entry keeps pointing at the same frame.
    frame.columns = fixed_labels + list(iso_dates)

## Add world population column

In [8]:
population_df = pd.read_csv('generated/population.csv')

for key, frame in data_frames.items():
    # Left join keeps every country row; countries without a match in the
    # population table end up with a missing population.
    with_population = (
        frame
        .merge(population_df, how='left', left_on='Country/Region', right_on='name')
        .drop(columns='name')
    )

    # Move 'population' right after the country name. The slice skips the
    # first column (the country) and the last one.
    # NOTE(review): this assumes 'population' is the last column after the
    # merge, i.e. population.csv holds only `name` and `population` - confirm.
    column_order = ['Country/Region', 'population', *with_population.columns[1:-1]]

    data_frames[key] = with_population[column_order].copy()

## Add sick dataset (confirmed - recovered)

In [9]:
confirmed = data_frames['confirmed']
recovered = data_frames['recovered']

# The subtraction below is row-aligned, so both frames must list the same
# countries in the same order.
assert (confirmed['Country/Region'] == recovered['Country/Region']).all()

# Everything after the first four columns is a per-date count.
# NOTE(review): assumes the leading columns are Country/Region, population,
# Lat, Long, as produced by the preceding cells - confirm.
count_columns = confirmed.columns[4:]

# Start from the confirmed frame and replace the counts with
# confirmed - recovered.
sick = confirmed.copy()
still_sick = confirmed.loc[:, count_columns] - recovered.loc[:, count_columns]
sick.loc[:, count_columns] = still_sick
data_frames['sick'] = sick

## Add world row

In [10]:
for key, frame in data_frames.items():
    # Use the first row as a template so the new row carries the same labels
    # as the frame's columns, then overwrite every field.
    world_row = frame.loc[0, :].copy()
    world_row['Country/Region'] = "World"
    world_row['population'] = frame['population'].sum()
    world_row['Lat'] = 0
    world_row['Long'] = 0

    # Column-wise totals for everything after the first four columns.
    totals = frame.loc[:, frame.columns[4:]].sum()
    world_row.loc[world_row.index[4:]] = totals

    # Put the world row first and renumber the rows from 0.
    world_frame = pd.DataFrame(world_row).T
    data_frames[key] = pd.concat([world_frame, frame]).reset_index(drop=True)

## Write back data

In [11]:
# Write one CSV per series into generated/, without the row index.
for key, frame in data_frames.items():
    frame.to_csv(f'generated/{key}.csv', index=False)