# Data generation notebook

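Downloads the CSSEGISandData COVID-19 global time series (confirmed, deaths, recovered), cleans and aggregates them per country, and writes the results to `generated/` as CSV files that are easier to use.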

In [1]:
import pandas as pd

## Load data

In [2]:
# One DataFrame per global time series, downloaded from the
# CSSEGISandData/COVID-19 GitHub repository. The three files differ only in
# the series name embedded in the file name, which is also used as the dict key.
data_frames = {
    series: pd.read_csv(
        'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/'
        'csse_covid_19_data/csse_covid_19_time_series/'
        f'time_series_covid19_{series}_global.csv'
    )
    for series in ('confirmed', 'deaths', 'recovered')
}

## Remove cruise ships

In [3]:
# Entries in 'Country/Region' that are filtered out of every frame.
excluded_regions = ['Diamond Princess', 'MS Zaandam']

for key, frame in data_frames.items():
    # Keep only the rows whose 'Country/Region' is not one of the excluded labels.
    is_excluded = frame['Country/Region'].isin(excluded_regions)
    data_frames[key] = frame[~is_excluded]

## Normalize country names

In [4]:
# Old label -> new label for values of the 'Country/Region' column.
# NOTE(review): the new labels are presumably chosen to match the `name`
# column of generated/population.csv used in the merge further down — confirm.
country_renames = {
    "Cote d'Ivoire": 'Ivory Coast',
    "Cabo Verde": 'Cape Verde',
    "Congo (Brazzaville)": 'Congo',
    "Congo (Kinshasa)": 'Democratic Republic of the Congo',
    "Czechia": 'Czech Republic',
    "Holy See": 'Vatican City',
    "Korea, South": 'South Korea',
    "Taiwan*": 'Taiwan',
    "US": 'United States',
    "West Bank and Gaza": 'State of Palestine',
    "Burma": 'Myanmar',
}

for key in data_frames.keys():
    # The nested-dict form of DataFrame.replace restricts the exact-match
    # replacement to the 'Country/Region' column and applies the whole mapping
    # in a single call. A bare df.replace(old, new) per name would instead
    # scan every column of the frame once per name and could also rewrite a
    # matching value in 'Province/State'.
    data_frames[key] = data_frames[key].replace({'Country/Region': country_renames})

## Group by country

In [5]:
for key, frame in data_frames.items():
    without_province = frame.drop(columns='Province/State')

    # Aggregation per column: Lat and Long are averaged over a country's rows,
    # and every column from position 3 onward is summed.
    # NOTE(review): the positional slice assumes the first three remaining
    # columns are 'Country/Region', 'Lat', 'Long' — confirm against the CSV layout.
    aggregations = {
        'Lat': 'mean',
        'Long': 'mean',
        **{column: 'sum' for column in without_province.columns[3:]},
    }

    # Collapse to one row per country and turn the group key back into a column.
    data_frames[key] = (
        without_province
        .groupby('Country/Region')
        .agg(aggregations)
        .reset_index()
    )

## Add world population column

In [6]:
population_df = pd.read_csv('generated/population.csv')

for key, frame in data_frames.items():
    # Left join keeps every row of the frame; rows with no matching `name`
    # in population_df end up with a missing population value.
    with_population = (
        frame
        .merge(population_df, how='left', left_on='Country/Region', right_on='name')
        .drop(columns='name')
    )

    # Put 'population' right after 'Country/Region'. The slice takes every
    # column between the first and the last one.
    # NOTE(review): this assumes 'population' is the last column after the
    # merge, i.e. population.csv only contributes `name` and `population` — confirm.
    middle_columns = list(with_population.columns[1:-1])
    column_order = ['Country/Region', 'population'] + middle_columns

    # .copy() stores an independent frame rather than a selection of the merged one.
    data_frames[key] = with_population[column_order].copy()

## Write back data

In [7]:
# Write each processed frame to generated/<key>.csv, leaving out the
# DataFrame index.
for key, frame in data_frames.items():
    frame.to_csv(f'generated/{key}.csv', index=False)