# Exploratory Analysis

# Contents
1. Importing libraries
2. Data consistency check
3. Top 10 leading and lagging countries in vaccination progress
4. Top 20 countries with the highest and lowest of fully vaccinated people per hundred?
5. What are the global average vaccinations by month?
6. What are the top 20 countries having highest count for daily vaccinations?

# 1. Importing Libraries and Data

In [1]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import os
import seaborn as sns


In [2]:
#Creating common folder path
path = r'C:\Users\Mukund\Desktop\Career Foundry\Covid-19 Vaccination Progress' 

In [3]:
path

'C:\\Users\\Mukund\\Desktop\\Career Foundry\\Covid-19 Vaccination Progress'

In [4]:
# Importing Corona vaccination Data
df_CVN = pd.read_csv(os.path.join(path, 'Data', 'Prepared_Data', 'Country_Vaccinations_Checked.csv'), index_col = False)

In [5]:
# Checking for data
df_CVN.head()

Unnamed: 0,Country,iso_code,Date,Total_No_Of_vaccinations,Total_No_People_Vaccinated,Total_No_Of_People_Fully_Vaccinated,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,Total_No_People_Vaccinated_Per_Hundred,Total_No_OF_People_Fully_Vaccinated_Per_Hundred,daily_vaccinations_per_million,vaccines,source_name,source_website
0,Afghanistan,AFG,22/02/2021,0.0,0.0,,,,0.0,0.0,,,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi...",World Health Organization,https://covid19.who.int/
1,Afghanistan,AFG,23/02/2021,,,,,1367.0,,,,35.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi...",World Health Organization,https://covid19.who.int/
2,Afghanistan,AFG,24/02/2021,,,,,1367.0,,,,35.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi...",World Health Organization,https://covid19.who.int/
3,Afghanistan,AFG,25/02/2021,,,,,1367.0,,,,35.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi...",World Health Organization,https://covid19.who.int/
4,Afghanistan,AFG,26/02/2021,,,,,1367.0,,,,35.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi...",World Health Organization,https://covid19.who.int/


In [6]:
df_CVN.shape

(36063, 15)

# 2. Data Consistency Check

In [7]:
# Renaming the column names
df_CVN.rename(columns = {'Total_No_Of_vaccinations':'Total_Vaccinations',
                        'Total_No_People_Vaccinated':'Total_Ppl_Vaccinated', 'Total_No_Of_People_Fully_Vaccinated':'Total_Ppl_Fully_Vaccinated',
                         'Total_No_People_Vaccinated_Per_Hundred':'Total_ppl_Vaccinated_Per_Hundred',
                         'Total_No_OF_People_Fully_Vaccinated_Per_Hundred':'Total_ppl_Fully_Vaccinated_Per_Hundred'
                        }, inplace = True)

In [8]:
df_CVN.columns

Index(['Country', 'iso_code', 'Date', 'Total_Vaccinations',
       'Total_Ppl_Vaccinated', 'Total_Ppl_Fully_Vaccinated',
       'daily_vaccinations_raw', 'daily_vaccinations',
       'total_vaccinations_per_hundred', 'Total_ppl_Vaccinated_Per_Hundred',
       'Total_ppl_Fully_Vaccinated_Per_Hundred',
       'daily_vaccinations_per_million', 'vaccines', 'source_name',
       'source_website'],
      dtype='object')

In [9]:
df_CVN.isnull().sum()

Country                                       0
iso_code                                      0
Date                                          0
Total_Vaccinations                        15995
Total_Ppl_Vaccinated                      16913
Total_Ppl_Fully_Vaccinated                19761
daily_vaccinations_raw                    19510
daily_vaccinations                          238
total_vaccinations_per_hundred            15995
Total_ppl_Vaccinated_Per_Hundred          16913
Total_ppl_Fully_Vaccinated_Per_Hundred    19761
daily_vaccinations_per_million              238
vaccines                                      0
source_name                                   0
source_website                                0
dtype: int64

Since the vaccination columns are cumulative, I am not going to remove the null values.

In [10]:
# Checking for duplicate records
df_dups = df_CVN[df_CVN.duplicated()]

In [11]:
df_dups

Unnamed: 0,Country,iso_code,Date,Total_Vaccinations,Total_Ppl_Vaccinated,Total_Ppl_Fully_Vaccinated,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,Total_ppl_Vaccinated_Per_Hundred,Total_ppl_Fully_Vaccinated_Per_Hundred,daily_vaccinations_per_million,vaccines,source_name,source_website


No duplicates found

In [12]:
# Checking for mixed-type data
for col in df_CVN.columns.tolist():
  mixed_type = (df_CVN[[col]].applymap(type) != df_CVN[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_CVN[mixed_type]) > 0:
    print (col)

# 3. Top 10 leading and lagging countries in vaccination progress

In [13]:
# Top 10 leading countries in vaccination progress 

df_CVN_10 = df_CVN.groupby('Country')['Total_Vaccinations'].max().reset_index()
df_CVN_10 = df_CVN_10.sort_values('Total_Vaccinations', ascending=False)
pd.set_option('display.float_format', lambda x: '%.0f'% x)
df_CVN_10.head(10)


Unnamed: 0,Country,Total_Vaccinations
40,China,1782525000
90,India,508664759
208,United States,351400930
27,Brazil,152989038
99,Japan,99651092
74,Germany,95010040
207,United Kingdom,86466264
69,France,76469330
200,Turkey,76029390
91,Indonesia,74275263


In [14]:
# Top 5 smallest countries with vaccinations progress
df_CVN_10 = df_CVN_10.sort_values('Total_Vaccinations', ascending=False).tail(10)
df_CVN_10

Unnamed: 0,Country,Total_Vaccinations
135,Nauru,14784
85,Haiti,14074
215,Wallis and Futuna,9372
163,Saint Helena,7892
24,Bonaire Sint Eustatius and Saba,7391
203,Tuvalu,4772
66,Falkland Islands,4407
130,Montserrat,2791
143,Niue,1216
156,Pitcairn,83


# 4. Top 20 countries with the highest and lowest of fully vaccinated people per hundred?

In [15]:
# Top 10 countries with highest fully vaccinated people per hundred

df_ful_vac = df_CVN.groupby('Country')['Total_ppl_Fully_Vaccinated_Per_Hundred'].max().reset_index()
df_ful_vac = df_ful_vac.sort_values('Total_ppl_Fully_Vaccinated_Per_Hundred', ascending=False)
pd.set_option('display.float_format', lambda x: '%.0f'% x)
df_ful_vac.head(20)

Unnamed: 0,Country,Total_ppl_Fully_Vaccinated_Per_Hundred
76,Gibraltar,116
122,Malta,90
156,Pitcairn,77
89,Iceland,75
36,Cayman Islands,73
206,United Arab Emirates,72
95,Isle of Man,70
174,Seychelles,70
168,San Marino,69
100,Jersey,68


In [16]:
# Top 10 countries with lowest fully vaccinated people per hundred

df_ful_vac = df_ful_vac.sort_values('Total_ppl_Fully_Vaccinated_Per_Hundred', ascending=False).tail(20)
df_ful_vac

Unnamed: 0,Country,Total_ppl_Fully_Vaccinated_Per_Hundred
152,Papua New Guinea,0.0
175,Sierra Leone,0.0
111,Liberia,0.0
20,Benin,0.0
201,Turkmenistan,0.0
38,Chad,0.0
83,Guinea-Bissau,0.0
191,Syria,0.0
211,Vanuatu,0.0
216,Yemen,0.0


# 5. What are the global average vaccinations by month?

In [24]:
# Find the global average total vaccinations by month
df_CVN['Date'] = pd.to_datetime(df_CVN['Date'])
avg = df_CVN.groupby(df_CVN['Date'].dt.strftime('%B'))['daily_vaccinations'].mean().sort_values().reset_index()
avg

Unnamed: 0,Date,daily_vaccinations
0,February,84416
1,December,90125
2,March,91597
3,January,91728
4,April,103977
5,November,109604
6,October,110120
7,September,110215
8,August,133456
9,May,138248


In [25]:
avg.to_csv(os.path.join(path, 'Data', 'Prepared_Data', 'Global_Monthly_Vaacinations.csv'))

# 6. What are the top 20 countries having highest count for daily vaccinations?

In [26]:
# Top 20 leading countries in daily vaccination 

df_CVN_20 = df_CVN.groupby('Country')['daily_vaccinations'].max().reset_index()
df_CVN_20 = df_CVN_20.sort_values('daily_vaccinations', ascending=False)
pd.set_option('display.float_format', lambda x: '%.0f'% x)
df_CVN_20.head(20)

Unnamed: 0,Country,daily_vaccinations
40,China,22424286
90,India,6276153
208,United States,3384387
99,Japan,1858539
27,Brazil,1520483
200,Turkey,1264431
149,Pakistan,1136545
125,Mexico,1088095
91,Indonesia,1054431
74,Germany,870295
