In [1]:
!pip install plotly==4.14.3
!pip install plotly_express -U

Collecting plotly==4.14.3
[?25l  Downloading https://files.pythonhosted.org/packages/1f/f6/bd3c17c8003b6641df1228e80e1acac97ed8402635e46c2571f8e1ef63af/plotly-4.14.3-py2.py3-none-any.whl (13.2MB)
[K     |████████████████████████████████| 13.2MB 279kB/s 
Installing collected packages: plotly
  Found existing installation: plotly 4.4.1
    Uninstalling plotly-4.4.1:
      Successfully uninstalled plotly-4.4.1
Successfully installed plotly-4.14.3
Collecting plotly_express
  Downloading https://files.pythonhosted.org/packages/d4/d6/8a2906f51e073a4be80cab35cfa10e7a34853e60f3ed5304ac470852a08d/plotly_express-0.4.1-py2.py3-none-any.whl
Installing collected packages: plotly-express
Successfully installed plotly-express-0.4.1


In [2]:
import pandas as pd
import requests
import os
import urllib
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
import math
import datetime
import numpy as np

pio.templates.default = "simple_white"

In [30]:
#Predição do encoding do arquivo
def predict_encoding(file_path, n_lines=20):
    '''Predict a file's encoding using chardet'''
    import chardet
    with open(file_path, 'rb') as f:
        rawdata = b''.join([f.readline() for _ in range(n_lines)])

def color_location(location):
  if location == 'Brazil':
    return '#90d595'
  else:
    return '#f2f3f2'  

def fill_title_and_subtitle(title:str, subtitle:str):
    return f"{title}<br><sub>{subtitle} (<a href='https://www.linkedin.com/in/ricardocoelhoandrade/'>linkedin.com/in/ricardocoelhoandrade/</a>)</sub>"

def fm_quantity(num):
    if np.isnan(num):
      num = 0
    else:
      num = int(num)
    
    i_offset = 15 # change this if you extend the symbols!!!
    prec = 3
    fmt = '.{p}g'.format(p=prec)
    symbols = ['Y', 'T', 'G', 'M', 'K', '', 'm', 'u', 'n']

    e = math.log10(abs(num))
    if e >= i_offset + 3:
        return '{:{fmt}}'.format(num, fmt=fmt)
    for i, sym in enumerate(symbols):
        e_thresh = i_offset - 3 * i
        if e >= e_thresh:
            return '{:{fmt}}{sym}'.format(num/10.**e_thresh, fmt=fmt, sym=sym)
    return '{:{fmt}}'.format(num, fmt=fmt)    

### Source https://data.humdata.org/dataset/covid-19-vaccinations

In [31]:
remote_file_name = "https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/vaccinations/vaccinations.csv"
covid_file = os.path.join("vaccinations.csv")
urllib.request.urlretrieve(remote_file_name, covid_file) 
df_vac = pd.read_csv(covid_file,sep=',',encoding=predict_encoding(covid_file),error_bad_lines=False)

In [32]:
remote_file_name = "https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/vaccinations/locations.csv"
covid_file = os.path.join("locations.csv")
urllib.request.urlretrieve(remote_file_name, covid_file) 
df_loc = pd.read_csv(covid_file,sep=',',encoding=predict_encoding(covid_file),error_bad_lines=False)

In [33]:
remote_file_name = "https://covid19.who.int/WHO-COVID-19-global-data.csv"
covid_file = os.path.join("WHO-COVID-19-global-data.csv")
urllib.request.urlretrieve(remote_file_name, covid_file) 
df_cases = pd.read_csv(covid_file,sep=',',encoding=predict_encoding(covid_file),error_bad_lines=False)

### EDA

In [34]:
print(df_cases.shape, df_vac.shape)

(107616, 8) (10234, 12)


In [35]:
df_vac.dtypes

location                                object
iso_code                                object
date                                    object
total_vaccinations                     float64
people_vaccinated                      float64
people_fully_vaccinated                float64
daily_vaccinations_raw                 float64
daily_vaccinations                     float64
total_vaccinations_per_hundred         float64
people_vaccinated_per_hundred          float64
people_fully_vaccinated_per_hundred    float64
daily_vaccinations_per_million         float64
dtype: object

In [36]:
df_cases.dtypes

Date_reported        object
Country_code         object
Country              object
WHO_region           object
New_cases             int64
Cumulative_cases      int64
New_deaths            int64
Cumulative_deaths     int64
dtype: object

In [37]:
df_cases.rename(columns={'Date_reported': 'date', 'New_cases': 'cases', 'Country' : 'location'}, inplace=True)

In [38]:
# convert dates
df_cases['date'] =  pd.to_datetime(df_cases['date'], format='%Y-%m-%d', infer_datetime_format=True)

In [39]:
df_vac['date'] =  pd.to_datetime(df_vac['date'], format='%Y-%m-%d', infer_datetime_format=True)

In [40]:
df_vac.tail()

Unnamed: 0,location,iso_code,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,daily_vaccinations_per_million
10229,Zimbabwe,ZWE,2021-03-29,81610.0,69751.0,11859.0,2471.0,5434.0,0.55,0.47,0.08,366.0
10230,Zimbabwe,ZWE,2021-03-30,85866.0,72944.0,12922.0,4256.0,5810.0,0.58,0.49,0.09,391.0
10231,Zimbabwe,ZWE,2021-03-31,91880.0,76995.0,14885.0,6014.0,5712.0,0.62,0.52,0.1,384.0
10232,Zimbabwe,ZWE,2021-04-01,105307.0,87791.0,17516.0,13427.0,6617.0,0.71,0.59,0.12,445.0
10233,Zimbabwe,ZWE,2021-04-02,124753.0,103815.0,20938.0,19446.0,8156.0,0.84,0.7,0.14,549.0


In [41]:
# Clean rows that identify region codes from vaccination dataset
df_vac = df_vac.drop(df_vac[df_vac['iso_code'].str.contains('OWID')].index)

In [42]:
# replace country names
df_cases = df_cases.replace('United States of America', 'United States')
df_cases = df_cases.replace('The United Kingdom', 'United Kingdom')
df_cases = df_cases.replace('Venezuela (Bolivarian Republic of)', 'Venezuela')
df_cases = df_cases.replace('Iran (Islamic Republic of)', 'Iran')
df_cases = df_cases.replace('Republic of Moldova', 'Moldova')
df_cases = df_cases.replace('Bolivia (Plurinational State of)', 'Bolivia')

In [43]:
df_join = pd.merge(df_cases, df_vac, on=['location','date'])

In [44]:
df_join.dtypes

date                                   datetime64[ns]
Country_code                                   object
location                                       object
WHO_region                                     object
cases                                           int64
Cumulative_cases                                int64
New_deaths                                      int64
Cumulative_deaths                               int64
iso_code                                       object
total_vaccinations                            float64
people_vaccinated                             float64
people_fully_vaccinated                       float64
daily_vaccinations_raw                        float64
daily_vaccinations                            float64
total_vaccinations_per_hundred                float64
people_vaccinated_per_hundred                 float64
people_fully_vaccinated_per_hundred           float64
daily_vaccinations_per_million                float64
dtype: object

In [45]:
len(df_cases.location.unique())

236

In [46]:
len(df_vac.location.unique())

157

In [47]:
len(df_join.location.unique())

144

In [48]:
df_vac[~df_vac['location'].isin(df_join['location'])].location.unique()

array(['Cape Verde', "Cote d'Ivoire", 'Faeroe Islands',
       'Falkland Islands', 'Hong Kong', 'Laos', 'Macao', 'Palestine',
       'Russia', 'Saint Helena', 'South Korea', 'Taiwan', 'Vietnam'],
      dtype=object)

In [49]:
df_join = df_join[['date','location','cases','daily_vaccinations','people_vaccinated','total_vaccinations']]

In [50]:
print("Last update dataset cases: {}".format(df_cases.date.describe(datetime_is_numeric=True)['max']))

Last update dataset cases: 2021-04-02 00:00:00


In [51]:
print("Last update dataset vaccination: {}".format(df_vac.date.describe(datetime_is_numeric=True)['max']))

Last update dataset vaccination: 2021-04-02 00:00:00


In [52]:
df_vac.dtypes

location                                       object
iso_code                                       object
date                                   datetime64[ns]
total_vaccinations                            float64
people_vaccinated                             float64
people_fully_vaccinated                       float64
daily_vaccinations_raw                        float64
daily_vaccinations                            float64
total_vaccinations_per_hundred                float64
people_vaccinated_per_hundred                 float64
people_fully_vaccinated_per_hundred           float64
daily_vaccinations_per_million                float64
dtype: object

In [53]:
df = df_vac.groupby(['location']).daily_vaccinations_raw.max().sort_values(ascending=True).head(20).to_frame()
#df['quantity'] = df['total_vaccinations_per_hundred'].apply(lambda x: fm_quantity(x)) 
df

Unnamed: 0_level_0,daily_vaccinations_raw
location,Unnamed: 1_level_1
Algeria,30.0
New Zealand,70.0
Saint Lucia,346.0
Namibia,350.0
Faeroe Islands,454.0
San Marino,657.0
Cayman Islands,1202.0
Gibraltar,1330.0
Anguilla,1421.0
Macao,1825.0


In [54]:
df = df_vac.groupby(['location']).total_vaccinations.max().sort_values(ascending=True).tail(20).to_frame()
df['quantity'] = df['total_vaccinations'].apply(lambda x: fm_quantity(x)) 
df

Unnamed: 0_level_0,total_vaccinations,quantity
location,Unnamed: 1_level_1,Unnamed: 2_level_1
Bangladesh,5370431.0,5.37M
Canada,5968907.0,5.97M
Poland,6462769.0,6.46M
Morocco,8171651.0,8.17M
Spain,8342155.0,8.34M
United Arab Emirates,8491382.0,8.49M
Mexico,8644446.0,8.64M
Israel,10057609.0,10.1M
Italy,10769413.0,10.8M
Chile,10780764.0,10.8M


In [55]:
title = fill_title_and_subtitle("World Vaccination", "Vaccinations in the World")
 
fig = px.bar(df, x=df['total_vaccinations'],y=df.index, orientation='h', height=700, text='quantity', title=title)
 
fig.update_yaxes(showticklabels=True, visible=True, title='')
fig.update_xaxes(showticklabels=True, visible=False, title='')
fig.update_traces(marker_coloraxis=None)
fig.update_layout(yaxis_categoryorder = 'total ascending')
fig.show()
fig.write_html(os.path.join('world-vaccination.html'))

In [56]:
df_join['vaccination_per_cases'] = (df_join['cases'] / df_join['daily_vaccinations'] ) * 100
df_join['vaccination_per_cases']

0            NaN
1       2.121434
2       1.316752
3       1.170446
4       1.170446
          ...   
8480    0.073611
8481    0.292599
8482    0.332633
8483    0.362702
8484    0.171653
Name: vaccination_per_cases, Length: 8485, dtype: float64

In [57]:
def plot_daily(df_join, pais):
  title = fill_title_and_subtitle("Vaccine vs New Cases in {}".format(pais), "Comparing the total number of daily new cases and daily vaccinations")
  df = df_join[df_join['location'] == pais]
  fig = go.Figure(data=[
                  go.Bar(
                      name="New cases",
                      x=df['date'], 
                      y=df['cases'],
                      marker_color="crimson"
                  ),
                  go.Bar(
                      name="Vaccined",
                      x=df['date'], 
                      y=df['daily_vaccinations'],
                      marker_color="lightseagreen",
                  )
              ])
  fig.update_yaxes(showticklabels=True, visible=True, title='')
  fig.update_xaxes(showticklabels=True, visible=True, title='')
  fig.update_traces(marker_coloraxis=None)
  fig.update_layout(
          title=title,
          plot_bgcolor='rgba(0,0,0,0)',
          barmode='stack',
          hovermode="x"
      )
  #fig.update_layout(annotations=annotations)
  fig.show()
  fig.write_html(os.path.join('{}-vaccination-day.html'.format(pais.lower().replace(' ','-'))))

In [58]:
plot_daily(df_join, 'Brazil')
plot_daily(df_join, 'Spain')

In [59]:
from google.colab import files

In [63]:
for filename in os.listdir():
    if filename.endswith(".html"):
        files.download(os.path.join(filename)) 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>