In [3]:
import pandas as pd
import numpy as np

## Load data from CSV file

In [4]:
# Read the vaccination records from the CSV file and preview the first rows.
csv_path = '../data/country_vaccinations.csv'
dataset = pd.read_csv(csv_path)
dataset.head()

Unnamed: 0,country,iso_code,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,daily_vaccinations_per_million,vaccines,source_name,source_website
0,Afghanistan,AFG,2021-02-22,0.0,0.0,,,,0.0,0.0,,,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi...",World Health Organization,https://covid19.who.int/
1,Afghanistan,AFG,2021-02-23,,,,,1367.0,,,,34.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi...",World Health Organization,https://covid19.who.int/
2,Afghanistan,AFG,2021-02-24,,,,,1367.0,,,,34.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi...",World Health Organization,https://covid19.who.int/
3,Afghanistan,AFG,2021-02-25,,,,,1367.0,,,,34.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi...",World Health Organization,https://covid19.who.int/
4,Afghanistan,AFG,2021-02-26,,,,,1367.0,,,,34.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi...",World Health Organization,https://covid19.who.int/


In [5]:
# (number of rows, number of columns) of the loaded frame.
dataset.shape

(86512, 15)

In [6]:
# Column dtypes as inferred by read_csv; `date` is still a plain object column at this point.
dataset.dtypes

country                                 object
iso_code                                object
date                                    object
total_vaccinations                     float64
people_vaccinated                      float64
people_fully_vaccinated                float64
daily_vaccinations_raw                 float64
daily_vaccinations                     float64
total_vaccinations_per_hundred         float64
people_vaccinated_per_hundred          float64
people_fully_vaccinated_per_hundred    float64
daily_vaccinations_per_million         float64
vaccines                                object
source_name                             object
source_website                          object
dtype: object

### Convert the `date` column to a datetime type

In [7]:
# Parse the textual dates once, then store the result back in the same column.
parsed_dates = pd.to_datetime(dataset['date'])
dataset['date'] = parsed_dates
dataset.dtypes

country                                        object
iso_code                                       object
date                                   datetime64[ns]
total_vaccinations                            float64
people_vaccinated                             float64
people_fully_vaccinated                       float64
daily_vaccinations_raw                        float64
daily_vaccinations                            float64
total_vaccinations_per_hundred                float64
people_vaccinated_per_hundred                 float64
people_fully_vaccinated_per_hundred           float64
daily_vaccinations_per_million                float64
vaccines                                       object
source_name                                    object
source_website                                 object
dtype: object

## Data Visualization and pre-processing

In [8]:
# Count the missing values in every column.
dataset.isna().sum()

country                                    0
iso_code                                   0
date                                       0
total_vaccinations                     42905
people_vaccinated                      45218
people_fully_vaccinated                47710
daily_vaccinations_raw                 51150
daily_vaccinations                       299
total_vaccinations_per_hundred         42905
people_vaccinated_per_hundred          45218
people_fully_vaccinated_per_hundred    47710
daily_vaccinations_per_million           299
vaccines                                   0
source_name                                0
source_website                             0
dtype: int64

```shell
 Because the columns [total_vaccinations, people_vaccinated, people_fully_vaccinated, daily_vaccinations_raw,
total_vaccinations_per_hundred, people_vaccinated_per_hundred, people_fully_vaccinated_per_hundred] have too many missing values
(roughly 50-60% of the 86,512 rows each), we analyze the dataset using the features [country, vaccines].
```

In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
dataset.describe()

Unnamed: 0,total_vaccinations,people_vaccinated,people_fully_vaccinated,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,daily_vaccinations_per_million
count,43607.0,41294.0,38802.0,35362.0,86213.0,43607.0,41294.0,38802.0,86213.0
mean,45929640.0,17705080.0,14138300.0,270599.6,131305.5,80.188543,40.927317,35.523243,3257.049157
std,224600400.0,70787310.0,57139200.0,1212427.0,768238.8,67.913577,29.290759,28.376252,3934.31244
min,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,526410.0,349464.2,243962.2,4668.0,900.0,16.05,11.37,7.02,636.0
50%,3590096.0,2187310.0,1722140.0,25309.0,7343.0,67.52,41.435,31.75,2050.0
75%,17012300.0,9152520.0,7559870.0,123492.5,44098.0,132.735,67.91,62.08,4682.0
max,3263129000.0,1275541000.0,1240777000.0,24741000.0,22424290.0,345.37,124.76,122.37,117497.0


In [10]:
# Raise the global row-display limit so long value_counts listings are not truncated.
pd.options.display.max_rows = 500
# Number of records per vaccine combination.
dataset['vaccines'].value_counts()

Johnson&Johnson, Moderna, Oxford/AstraZeneca, Pfizer/BioNTech                                                                 7608
Moderna, Oxford/AstraZeneca, Pfizer/BioNTech                                                                                  6263
Oxford/AstraZeneca                                                                                                            6022
Oxford/AstraZeneca, Pfizer/BioNTech                                                                                           4629
Johnson&Johnson, Moderna, Novavax, Oxford/AstraZeneca, Pfizer/BioNTech                                                        3564
Johnson&Johnson, Oxford/AstraZeneca, Sinopharm/Beijing                                                                        2484
Moderna, Pfizer/BioNTech                                                                                                      2309
Pfizer/BioNTech                                                                    

In [11]:
# Number of records per country.
dataset['country'].value_counts()

Norway                              482
Latvia                              480
Denmark                             476
United States                       471
Russia                              470
Canada                              470
China                               470
Israel                              466
Qatar                               463
Liechtenstein                       463
Switzerland                         463
Mexico                              461
Costa Rica                          460
Germany                             458
Slovenia                            458
Estonia                             458
Lithuania                           458
Italy                               458
Czechia                             458
Romania                             457
France                              457
Greece                              457
Poland                              457
Hungary                             457
Ireland                             456


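### The United Kingdom is made up of 4 nations: England, Wales, Scotland and Northern Ireland. Each of them appears in the data as a separate country with 443 records.
### Since they are constituent parts of the United Kingdom, we remove all records of these 4 nations so that they are not counted separately from the United Kingdom.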

In [12]:
# Drop the rows whose country is one of the UK's constituent nations,
# then re-check the per-country record counts.
uk_nations = ['Northern Ireland', 'Wales', 'Scotland', 'England']
is_uk_nation = dataset['country'].isin(uk_nations)
dataset = dataset.loc[~is_uk_nation]
dataset['country'].value_counts()

Norway                              482
Latvia                              480
Denmark                             476
United States                       471
Canada                              470
China                               470
Russia                              470
Israel                              466
Liechtenstein                       463
Qatar                               463
Switzerland                         463
Mexico                              461
Costa Rica                          460
Italy                               458
Slovenia                            458
Germany                             458
Czechia                             458
Lithuania                           458
Estonia                             458
France                              457
Poland                              457
Hungary                             457
Greece                              457
Romania                             457
Bulgaria                            456


### Explore which vaccines are used in each country

In [17]:

import plotly.express as px

# The map only needs the country -> vaccine-combination pairing, not every daily
# record. Deduplicating keeps every distinct (country, iso_code, vaccines) row, so
# the rendered map is unchanged while the figure no longer embeds the full frame.
country_vaccines = dataset[['country', 'iso_code', 'vaccines']].drop_duplicates()

vaccine_map = px.choropleth(
    country_vaccines,
    locations='iso_code',  # interpreted as ISO-3 codes (plotly's default locationmode)
    color='vaccines',
    hover_name='country',  # show the readable country name on hover
)
vaccine_map.update_layout(height=300, margin={"r": 0, "t": 0, "l": 0, "b": 0})
vaccine_map.show()