In [124]:
import numpy as np
import pandas as pd
import datetime
import matplotlib.pyplot as plt
from pandas_profiling import ProfileReport
from pandas.plotting import register_matplotlib_converters
import seaborn as sns
sns.set()

In [114]:
df_confirmed = pd.read_csv('./time_series_covid19_confirmed_global.csv')
df_deaths = pd.read_csv('./time_series_covid19_deaths_global.csv')
df_recovered = pd.read_csv('./time_series_covid19_recovered_global.csv')

format_columns = {
    'Province/State': 'subregion',
    'Country/Region': 'country',
    'Lat': 'lat',
    'Long':'long'
}

# Renombrar
df_confirmed = df_confirmed.rename(columns = format_columns)
df_deaths = df_deaths.rename(columns = format_columns )
df_recovered = df_recovered.rename(columns = format_columns)

In [115]:
# Forma 'tidy'
df_confirmed = df_confirmed.melt(id_vars = ['country', 'subregion', 'lat', 'long'], var_name='date_RAW', value_name='confirmed')
df_deaths = df_deaths.melt(id_vars = ['country', 'subregion', 'lat', 'long'], var_name='date_RAW', value_name='dead')
df_recovered = df_recovered.melt(id_vars = ['country', 'subregion', 'lat', 'long'], var_name='date_RAW', value_name='recovered')


In [116]:
# Fechas
df_confirmed = df_confirmed.assign(date = pd.to_datetime(df_confirmed.date_RAW, format='%m/%d/%y'))
df_deaths = df_deaths.assign(date = pd.to_datetime(df_confirmed.date_RAW, format='%m/%d/%y'))
df_recovered = df_recovered.assign(date = pd.to_datetime(df_confirmed.date_RAW, format='%m/%d/%y'))

df_confirmed.drop(columns = ['date_RAW'], inplace=True)
df_deaths.drop(columns = ['date_RAW'], inplace=True)
df_recovered.drop(columns = ['date_RAW'], inplace=True)



In [117]:
# Ordenando datos
df_confirmed = (df_confirmed.filter(['country', 'subregion', 'date','lat','long', 'confirmed']).sort_values(['country','subregion','date','lat','long']).reset_index(drop=True))
df_deaths = (df_deaths.filter(['country', 'subregion', 'date','lat','long', 'dead']).sort_values(['country','subregion','date','lat','long']).reset_index(drop=True))
df_recovered = (df_recovered.filter(['country', 'subregion', 'date','lat','long', 'recovered']).sort_values(['country','subregion','date','lat','long']).reset_index(drop=True))

In [118]:
# Combinando df
# botando info repetida
df_recovered.drop(columns=['lat','long'], inplace=True)
df_deaths.drop(columns=['lat','long'], inplace=True)

df_AllData = (df_confirmed.merge(df_deaths, on = ['country', 'subregion', 'date'], how='left').merge(df_recovered, on = ['country', 'subregion', 'date'], how='left'))
df_AllData

Unnamed: 0,country,subregion,date,lat,long,confirmed,dead,recovered
0,Afghanistan,,2020-01-22,33.939110,67.709953,0,0,0.0
1,Afghanistan,,2020-01-22,33.939110,67.709953,0,0,0.0
2,Afghanistan,,2020-01-23,33.939110,67.709953,0,0,0.0
3,Afghanistan,,2020-01-24,33.939110,67.709953,0,0,0.0
4,Afghanistan,,2020-01-25,33.939110,67.709953,0,0,0.0
...,...,...,...,...,...,...,...,...
303769,Zimbabwe,,2022-10-15,-19.015438,29.154857,257827,5605,
303770,Zimbabwe,,2022-10-16,-19.015438,29.154857,257893,5606,
303771,Zimbabwe,,2022-10-17,-19.015438,29.154857,257893,5606,
303772,Zimbabwe,,2022-10-18,-19.015438,29.154857,257893,5606,


In [103]:
df_AllData.head()

Unnamed: 0,country,subregion,date,lat,long,confirmed,dead,recovered
0,Afghanistan,,2020-01-22,33.93911,67.709953,0,0,0.0
1,Afghanistan,,2020-01-22,33.93911,67.709953,0,0,0.0
2,Afghanistan,,2020-01-23,33.93911,67.709953,0,0,0.0
3,Afghanistan,,2020-01-24,33.93911,67.709953,0,0,0.0
4,Afghanistan,,2020-01-25,33.93911,67.709953,0,0,0.0


In [123]:
# Remove duplicates
pd.set_option('display.max_rows', 180)
(df_AllData
    .filter(['country'])
    .drop_duplicates()
)

df_AllData

Unnamed: 0,country,subregion,date,lat,long,confirmed,dead,recovered
0,Afghanistan,,2020-01-22,33.939110,67.709953,0,0,0.0
1,Afghanistan,,2020-01-22,33.939110,67.709953,0,0,0.0
2,Afghanistan,,2020-01-23,33.939110,67.709953,0,0,0.0
3,Afghanistan,,2020-01-24,33.939110,67.709953,0,0,0.0
4,Afghanistan,,2020-01-25,33.939110,67.709953,0,0,0.0
...,...,...,...,...,...,...,...,...
303769,Zimbabwe,,2022-10-15,-19.015438,29.154857,257827,5605,
303770,Zimbabwe,,2022-10-16,-19.015438,29.154857,257893,5606,
303771,Zimbabwe,,2022-10-17,-19.015438,29.154857,257893,5606,
303772,Zimbabwe,,2022-10-18,-19.015438,29.154857,257893,5606,


In [111]:
(df_AllData.filter(['date']).drop_duplicates())
(df_AllData.filter(['long', 'lat']).agg(['min','max']))
df_AllData


df_AllData = df_AllData.fillna(value = {'subregion':'', 'lat': np.NaN, 'long': np.NaN, 'recovered': 0})

In [125]:
profile = ProfileReport(df_AllData)
profile

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

In [None]:
# Casos