In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [2]:
# Load the per-country daily vaccination records from the CSV file.
df = pd.read_csv(filepath_or_buffer="country_vaccination_stats.csv")

### Question - 4

Code Implementation Task: Implement code to fill (impute) the missing values in the `daily_vaccinations` column, per country, with the minimum daily vaccination number of the relevant country.  
Note: If a country does not have any valid vaccination number yet, fill it with “0” (zero). 
Please provide the link to your code as the answer to this question.

In [4]:
# Preview the first 40 rows of the raw data.
df.iloc[:40]

Unnamed: 0,country,date,daily_vaccinations,vaccines
0,Argentina,12/29/2020,,Sputnik V
1,Argentina,12/30/2020,15656.0,Sputnik V
2,Argentina,12/31/2020,15656.0,Sputnik V
3,Argentina,1/1/2021,11070.0,Sputnik V
4,Argentina,1/2/2021,8776.0,Sputnik V
5,Argentina,1/3/2021,7400.0,Sputnik V
6,Argentina,1/4/2021,6483.0,Sputnik V
7,Argentina,1/5/2021,7984.0,Sputnik V
8,Argentina,1/6/2021,8173.0,Sputnik V
9,Argentina,1/7/2021,8363.0,Sputnik V


In [5]:
# Count the missing values in each column.
missing_mask = df.isna()
missing_mask.sum()

country                0
date                   0
daily_vaccinations    60
vaccines               0
dtype: int64

In [6]:
# Number of missing daily_vaccinations values per country.
# The nulls appear to be spread uniformly (each country has 1).
df["daily_vaccinations"].isna().groupby(df["country"]).sum()

country
Argentina               1
Austria                 1
Bahrain                 1
Belgium                 1
Brazil                  1
Bulgaria                1
Canada                  1
Chile                   1
China                   1
Costa Rica              1
Croatia                 1
Cyprus                  1
Czechia                 1
Denmark                 1
Ecuador                 1
England                 1
Estonia                 1
Finland                 1
France                  1
Germany                 1
Gibraltar               1
Greece                  1
Hungary                 1
Iceland                 1
India                   1
Indonesia               1
Ireland                 1
Isle of Man             1
Israel                  1
Italy                   1
Kuwait                  1
Latvia                  1
Lithuania               1
Luxembourg              1
Malta                   1
Mexico                  1
Netherlands             1
Northern Ireland        1
Norw

In [7]:
# Distinct country names, in order of first appearance.
df["country"].unique()

array(['Argentina', 'Austria', 'Bahrain', 'Belgium', 'Brazil', 'Bulgaria',
       'Canada', 'Chile', 'China', 'Costa Rica', 'Croatia', 'Cyprus',
       'Czechia', 'Denmark', 'Ecuador', 'England', 'Estonia', 'Finland',
       'France', 'Germany', 'Gibraltar', 'Greece', 'Hungary', 'Iceland',
       'India', 'Indonesia', 'Ireland', 'Isle of Man', 'Israel', 'Italy',
       'Kuwait', 'Latvia', 'Lithuania', 'Luxembourg', 'Malta', 'Mexico',
       'Netherlands', 'Northern Ireland', 'Norway', 'Oman', 'Panama',
       'Poland', 'Portugal', 'Romania', 'Russia', 'Saudi Arabia',
       'Scotland', 'Serbia', 'Seychelles', 'Singapore', 'Slovakia',
       'Slovenia', 'Spain', 'Sweden', 'Switzerland', 'Turkey',
       'United Arab Emirates', 'United Kingdom', 'United States', 'Wales'],
      dtype=object)

In [8]:
# Total daily_vaccinations per country, smallest first, to spot any country
# that has no valid vaccination number.
# NOTE(review): NaNs are skipped by sum(), so a total of 0.0 is only a hint --
# it does not by itself distinguish "all values missing" from "all values zero".
df.groupby("country")["daily_vaccinations"].sum().sort_values()

country
Kuwait                         0.0
Ecuador                      108.0
Isle of Man                  546.0
Iceland                     5367.0
Luxembourg                  5454.0
Panama                      5746.0
Gibraltar                  10780.0
Cyprus                     12047.0
Seychelles                 14751.0
Malta                      17191.0
Latvia                     20287.0
Estonia                    24448.0
Costa Rica                 24969.0
Bulgaria                   28259.0
Oman                       32549.0
Slovenia                   49932.0
Singapore                  52490.0
Croatia                    58762.0
Chile                      70078.0
Lithuania                  71120.0
Norway                     71592.0
Slovakia                   82443.0
Finland                    85460.0
Bahrain                   114506.0
Netherlands               115472.0
Switzerland               119239.0
Sweden                    125389.0
Ireland                   125638.0
Serbia      

In [9]:
# Inspect Ecuador's rows: it has just one valid value.
df.loc[df["country"] == "Ecuador"]

Unnamed: 0,country,date,daily_vaccinations,vaccines
384,Ecuador,1/21/2021,,Pfizer/BioNTech
385,Ecuador,1/22/2021,108.0,Pfizer/BioNTech


In [10]:
# Per the task, a country with no valid daily_vaccinations value at all is
# imputed with 0. Such countries are detected from the data (in this dataset
# that appears to be only Kuwait) instead of being hard-coded, and only the
# daily_vaccinations column is written, so NaNs in any other column are never
# overwritten with 0.
valid_counts = df.groupby("country")["daily_vaccinations"].transform("count")  # non-null values per country, aligned to rows
df.loc[valid_counts.eq(0), "daily_vaccinations"] = 0

In [11]:
# Minimum daily vaccination number for each country, as {country: minimum}.
# A single groupby pass replaces the per-country boolean-mask scan of the frame.
dictmin = df.groupby("country")["daily_vaccinations"].min().to_dict()
dictmin

{'Argentina': 6483.0,
 'Austria': 3368.0,
 'Bahrain': 943.0,
 'Belgium': 1.0,
 'Brazil': 112.0,
 'Bulgaria': 472.0,
 'Canada': 866.0,
 'Chile': 51.0,
 'China': 187500.0,
 'Costa Rica': 240.0,
 'Croatia': 989.0,
 'Cyprus': 534.0,
 'Czechia': 1822.0,
 'Denmark': 1994.0,
 'Ecuador': 108.0,
 'England': 37625.0,
 'Estonia': 309.0,
 'Finland': 459.0,
 'France': 5000.0,
 'Germany': 19721.0,
 'Gibraltar': 520.0,
 'Greece': 549.0,
 'Hungary': 1339.0,
 'Iceland': 63.0,
 'India': 112150.0,
 'Indonesia': 11823.0,
 'Ireland': 550.0,
 'Isle of Man': 90.0,
 'Israel': 6887.0,
 'Italy': 926.0,
 'Kuwait': 0.0,
 'Latvia': 338.0,
 'Lithuania': 311.0,
 'Luxembourg': 62.0,
 'Malta': 259.0,
 'Mexico': 1300.0,
 'Netherlands': 4000.0,
 'Northern Ireland': 1563.0,
 'Norway': 224.0,
 'Oman': 817.0,
 'Panama': 6.0,
 'Poland': 4300.0,
 'Portugal': 2791.0,
 'Romania': 712.0,
 'Russia': 3357.0,
 'Saudi Arabia': 14153.0,
 'Scotland': 3047.0,
 'Serbia': 1150.0,
 'Seychelles': 500.0,
 'Singapore': 2800.0,
 'Slovakia': 

In [12]:
# Impute each missing daily_vaccinations value with its own country's minimum.
# transform("min") returns a Series aligned to df's rows; it is NaN for a
# country that has no valid value, so the trailing fillna(0) applies the
# task's "fill with zero" rule for such countries as well.
country_min = df.groupby("country")["daily_vaccinations"].transform("min")
df["daily_vaccinations"] = df["daily_vaccinations"].fillna(country_min).fillna(0)

In [13]:
# Index labels of rows that still contain a missing value in any column
# (expected to be empty after imputation). Vectorized instead of iterrows().
rows_with_nan = df.index[df.isna().any(axis=1)].tolist()
rows_with_nan

[]

In [14]:
# Confirm that Kuwait's daily vaccination number was filled with 0 (zero).
df.loc[df["country"] == "Kuwait"]

Unnamed: 0,country,date,daily_vaccinations,vaccines
750,Kuwait,12/28/2020,0.0,Pfizer/BioNTech


In [15]:
# Top-3 countries by median daily vaccinations (imputed data).
# Select the numeric column before aggregating: median() over the string
# columns (date, vaccines) raises TypeError on pandas >= 2.0.
(
    df.groupby("country")[["daily_vaccinations"]]
    .median()
    .sort_values(by="daily_vaccinations", ascending=False)
    .head(3)
)

Unnamed: 0_level_0,daily_vaccinations
country,Unnamed: 1_level_1
United States,378253.0
China,276786.0
India,173922.0


### Question- 5

Code Implementation Task: Implement code to list the top-3 countries with the highest median daily vaccination numbers,
 using the version of the dataset in which missing values have been imputed.

In [16]:
# Compute the median daily vaccination number per country on the imputed
# data, sort descending, then list the top-3 countries with the highest median.
# The numeric column is selected before aggregating because median() over the
# string columns (date, vaccines) raises TypeError on pandas >= 2.0.
(
    df.groupby("country")[["daily_vaccinations"]]
    .median()
    .sort_values(by="daily_vaccinations", ascending=False)
    .head(3)
)

Unnamed: 0_level_0,daily_vaccinations
country,Unnamed: 1_level_1
United States,378253.0
China,276786.0
India,173922.0


### Question - 6 

What is the total number of vaccinations done on 1/6/2021 (MM/DD/YYYY), using the version of the dataset in which missing values have been imputed? Please provide just the number as the answer.

In [17]:
# Convert the 'date' column from object (strings) to datetime.
# The file stores dates as MM/DD/YYYY (e.g. 12/29/2020), so the format is
# given explicitly rather than left to per-value inference.
df["date"] = pd.to_datetime(df["date"], format="%m/%d/%Y")

In [18]:
# Total vaccinations done on 1/6/2021 (MM/DD/YYYY), i.e. 6 January 2021.
# An explicit Timestamp avoids relying on how the ambiguous string
# "01-06-2021" would be parsed (month-first vs day-first), and a single .loc
# lookup replaces the chained boolean-mask / column selection.
target_date = pd.Timestamp(year=2021, month=1, day=6)
df.loc[df["date"] == target_date, "daily_vaccinations"].sum()

1485255.0