# CONTENT LIST

1. Importing Libraries
2. Importing Data
3. Data Consistency Checks
4. Descriptive Analysis

# 1. Importing Libraries

In [9]:
# Importing Libraries
import pandas as pd
import numpy as np
import os

# 2. Importing Data

In [10]:
# Creating common folder path
# Project root folder; the 'Data' sub-folders used for reading and writing files are joined onto it.
# NOTE(review): absolute, machine-specific Windows path — adjust before running on another machine.
path = r'C:\Users\Mukund\Desktop\Career Foundry\Covid-19 Vaccination Progress' 

In [11]:
path

'C:\\Users\\Mukund\\Desktop\\Career Foundry\\Covid-19 Vaccination Progress'

In [12]:
# Importing Country Vaccination data
# Reads Data/Original_Data/country_vaccinations.csv (located under `path`) into df_CVN.
# index_col = False tells pandas not to use the first CSV column as the row index.
df_CVN = pd.read_csv(os.path.join(path, 'Data', 'Original_Data', 'country_vaccinations.csv'), index_col = False)

In [13]:
df_CVN.head()

Unnamed: 0,country,iso_code,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,daily_vaccinations_per_million,vaccines,source_name,source_website
0,Afghanistan,AFG,22/02/2021,0.0,0.0,,,,0.0,0.0,,,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi...",World Health Organization,https://covid19.who.int/
1,Afghanistan,AFG,23/02/2021,,,,,1367.0,,,,35.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi...",World Health Organization,https://covid19.who.int/
2,Afghanistan,AFG,24/02/2021,,,,,1367.0,,,,35.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi...",World Health Organization,https://covid19.who.int/
3,Afghanistan,AFG,25/02/2021,,,,,1367.0,,,,35.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi...",World Health Organization,https://covid19.who.int/
4,Afghanistan,AFG,26/02/2021,,,,,1367.0,,,,35.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi...",World Health Organization,https://covid19.who.int/


In [14]:
df_CVN.shape

(36063, 15)

In [15]:
df_CVN.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36063 entries, 0 to 36062
Data columns (total 15 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   country                              36063 non-null  object 
 1   iso_code                             36063 non-null  object 
 2   date                                 36063 non-null  object 
 3   total_vaccinations                   20068 non-null  float64
 4   people_vaccinated                    19150 non-null  float64
 5   people_fully_vaccinated              16302 non-null  float64
 6   daily_vaccinations_raw               16553 non-null  float64
 7   daily_vaccinations                   35825 non-null  float64
 8   total_vaccinations_per_hundred       20068 non-null  float64
 9   people_vaccinated_per_hundred        19150 non-null  float64
 10  people_fully_vaccinated_per_hundred  16302 non-null  float64
 11  daily_vaccinations_per_milli

In [16]:
# Renaming the column names
# Only the columns listed below get new labels; every other column keeps its original name.
# NOTE(review): the new labels are not uniform ('Of' vs 'OF', one without 'Of', lowercase
# 'vaccinations') — they are kept verbatim here.
df_CVN.rename(
    {
        'country': 'Country',
        'date': 'Date',
        'total_vaccinations': 'Total_No_Of_vaccinations',
        'people_vaccinated': 'Total_No_People_Vaccinated',
        'people_fully_vaccinated': 'Total_No_Of_People_Fully_Vaccinated',
        'people_vaccinated_per_hundred': 'Total_No_People_Vaccinated_Per_Hundred',
        'people_fully_vaccinated_per_hundred': 'Total_No_OF_People_Fully_Vaccinated_Per_Hundred',
    },
    axis = 'columns',   # apply the mapping to column labels, not to the row index
    inplace = True,     # modify df_CVN itself rather than returning a renamed copy
)

In [17]:
df_CVN.columns

Index(['Country', 'iso_code', 'Date', 'Total_No_Of_vaccinations',
       'Total_No_People_Vaccinated', 'Total_No_Of_People_Fully_Vaccinated',
       'daily_vaccinations_raw', 'daily_vaccinations',
       'total_vaccinations_per_hundred',
       'Total_No_People_Vaccinated_Per_Hundred',
       'Total_No_OF_People_Fully_Vaccinated_Per_Hundred',
       'daily_vaccinations_per_million', 'vaccines', 'source_name',
       'source_website'],
      dtype='object')

# 3. Data Consistency Checks

In [18]:
# Checking for number of missing values in dataset
df_CVN.isnull().sum()

Country                                                0
iso_code                                               0
Date                                                   0
Total_No_Of_vaccinations                           15995
Total_No_People_Vaccinated                         16913
Total_No_Of_People_Fully_Vaccinated                19761
daily_vaccinations_raw                             19510
daily_vaccinations                                   238
total_vaccinations_per_hundred                     15995
Total_No_People_Vaccinated_Per_Hundred             16913
Total_No_OF_People_Fully_Vaccinated_Per_Hundred    19761
daily_vaccinations_per_million                       238
vaccines                                               0
source_name                                            0
source_website                                         0
dtype: int64

The null values in the columns above do not affect our calculations, and the values in the total-vaccination columns are cumulative. Therefore, we don't need to clean this data.

In [19]:
# Checking for duplicate records
# Keeps every row that repeats an earlier row in all columns; the first occurrence is not flagged.
df_dups = df_CVN.loc[df_CVN.duplicated(keep = 'first')]

In [20]:
df_dups

Unnamed: 0,Country,iso_code,Date,Total_No_Of_vaccinations,Total_No_People_Vaccinated,Total_No_Of_People_Fully_Vaccinated,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,Total_No_People_Vaccinated_Per_Hundred,Total_No_OF_People_Fully_Vaccinated_Per_Hundred,daily_vaccinations_per_million,vaccines,source_name,source_website


No duplicates found

In [21]:
# Checking for mixed-type data
# Prints the name of every column whose cells hold more than one Python type.
for col in df_CVN.columns.tolist():
  # Series.map(type) gives the Python type of each cell. It replaces DataFrame.applymap,
  # which is deprecated since pandas 2.1, and needs no .iloc[0] reference row, so an
  # empty dataframe no longer raises an IndexError.
  col_types = df_CVN[col].map(type)
  # More than one distinct type means at least one cell differs from the first one.
  if col_types.nunique() > 1:
    print (col)

No mixed-type data found

# 4. Descriptive Analysis

In [22]:
# Checking for mean, max, min and other parameters of columns
df_CVN.describe()

Unnamed: 0,Total_No_Of_vaccinations,Total_No_People_Vaccinated,Total_No_Of_People_Fully_Vaccinated,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,Total_No_People_Vaccinated_Per_Hundred,Total_No_OF_People_Fully_Vaccinated_Per_Hundred,daily_vaccinations_per_million
count,20068.0,19150.0,16302.0,16553.0,35825.0,20068.0,19150.0,16302.0,35825.0
mean,15018780.0,6722639.0,3987322.0,246149.9,123640.7,34.540485,21.884808,14.992973,3526.776776
std,85157440.0,24367510.0,13885630.0,1319599.0,883062.9,38.942081,22.271319,18.100962,4620.617784
min,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,153475.5,126189.0,57734.0,4695.0,854.0,3.7375,3.0525,1.63,453.0
50%,1075853.0,764670.0,446146.5,22761.0,6855.0,18.42,13.085,7.16,2026.0
75%,5421286.0,3580238.0,2120094.0,103233.0,38110.0,54.8675,37.1475,22.815,5268.0
max,1782525000.0,622000000.0,223299000.0,24741000.0,22424290.0,232.88,116.77,116.11,118759.0


In [23]:
# Checking the data type of each column
df_CVN.dtypes

Country                                             object
iso_code                                            object
Date                                                object
Total_No_Of_vaccinations                           float64
Total_No_People_Vaccinated                         float64
Total_No_Of_People_Fully_Vaccinated                float64
daily_vaccinations_raw                             float64
daily_vaccinations                                 float64
total_vaccinations_per_hundred                     float64
Total_No_People_Vaccinated_Per_Hundred             float64
Total_No_OF_People_Fully_Vaccinated_Per_Hundred    float64
daily_vaccinations_per_million                     float64
vaccines                                            object
source_name                                         object
source_website                                      object
dtype: object

In [24]:
# Exporting df_CVN dataframe under Prepared Data folder
# index=False leaves the row index out of the written CSV file.
df_CVN.to_csv(os.path.join(path, 'Data', 'Prepared_Data', 'Country_Vaccinations_Checked.csv'),index=False)