In [1]:
# Dependencies & Setup
import pandas as pd

# Files to Load
country_codes_csv = "initial-data/country-codes.csv"
vaccinations_csv = "initial-data/vaccination-data.csv"
daily_cases_csv = "initial-data/WHO-COVID-19-global-data.csv"

# Read the files & store into pandas dataframes
country_data = pd.read_csv(country_codes_csv)
vaccination_data = pd.read_csv(vaccinations_csv)
case_data = pd.read_csv(daily_cases_csv)

In [2]:
# Create a clean vaccine dataframe
vaccine_df = pd.merge(country_data, vaccination_data, left_on="Alpha-3 code", right_on="ISO3")
vaccine_df.drop(['COUNTRY', 'ISO3'], axis=1, inplace=True)
vaccine_df.rename(columns={'Latitude (average)': 'Latitude', 'Longitude (average': 'Longitude'}, inplace=True)
vaccine_df = vaccine_df.drop_duplicates(subset=["Alpha-3 code"])
vaccine_df

#country_data
#vaccination_data

Unnamed: 0,Country,Alpha-2 code,Alpha-3 code,Numeric code,Latitude,Longitude (average),WHO_REGION,DATA_SOURCE,DATE_UPDATED,TOTAL_VACCINATIONS,PERSONS_VACCINATED_1PLUS_DOSE,TOTAL_VACCINATIONS_PER100,PERSONS_VACCINATED_1PLUS_DOSE_PER100,PERSONS_FULLY_VACCINATED,PERSONS_FULLY_VACCINATED_PER100,VACCINES_USED,FIRST_VACCINE_DATE,NUMBER_VACCINES_TYPES_USED
0,Afghanistan,AF,AFG,4,33.0000,65.0,EMRO,REPORTING,2022-03-06,5597130,4952744.0,14.378,12.723,4281934.0,11.000,"Beijing CNBG - BBIBP-CorV,Janssen - Ad26.COV 2...",2021-02-22,4.0
1,Albania,AL,ALB,8,41.0000,20.0,EURO,REPORTING,2022-02-20,2707658,1284034.0,94.100,45.118,1192155.0,41.889,"AstraZeneca - Vaxzevria,Gamaleya - Gam-Covid-V...",2021-01-13,5.0
2,Algeria,DZ,DZA,12,28.0000,3.0,AFRO,REPORTING,2022-02-20,13631683,7456361.0,31.086,17.004,6076272.0,13.857,"Beijing CNBG - BBIBP-CorV,Gamaleya - Gam-Covid...",2021-01-30,4.0
3,American Samoa,AS,ASM,16,-14.3333,-170.0,WPRO,REPORTING,2022-02-16,85050,42212.0,154.084,76.475,37805.0,68.491,"Janssen - Ad26.COV 2-S,Moderna - Spikevax,Pfiz...",2020-12-21,3.0
4,Andorra,AD,AND,20,42.5000,1.6,EURO,REPORTING,2022-02-13,142420,57797.0,184.300,75.872,53250.0,69.903,"AstraZeneca - Vaxzevria,Moderna - Spikevax,Pfi...",2021-01-20,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
227,"Virgin Islands, British",VG,VGB,92,18.5000,-64.5,AMRO,REPORTING,2022-02-25,36610,18923.0,121.077,62.582,17325.0,57.297,"AstraZeneca - Vaxzevria,Janssen - Ad26.COV 2-S",2021-02-11,2.0
228,Wallis and Futuna,WF,WLF,876,-13.3000,-176.2,WPRO,REPORTING,2022-02-18,15633,6450.0,139.009,57.354,6399.0,56.900,Moderna - Spikevax,2021-03-19,1.0
229,Yemen,YE,YEM,887,15.0000,48.0,EMRO,REPORTING,2022-03-06,784792,624837.0,2.631,2.095,384655.0,1.290,"Janssen - Ad26.COV 2-S,SII - Covishield,Sinova...",2021-04-20,3.0
230,Zambia,ZM,ZMB,894,-15.0000,30.0,AFRO,REPORTING,2022-03-03,2858338,2510296.0,15.548,13.655,1883799.0,10.247,"Beijing CNBG - BBIBP-CorV,Janssen - Ad26.COV 2...",2021-04-14,3.0


In [3]:
# Create a clean cases dataframe
cases_df = pd.merge(country_data, case_data, left_on="Alpha-2 code", right_on="Country_code")
cases_df.drop(['Country_code', 'Country_y'], axis=1, inplace=True)
cases_df.rename(columns={'Country_x': 'Country','Latitude (average)': 'Latitude', 'Longitude (average': 'Longitude'}, inplace=True)
cases_df = cases_df.drop_duplicates(subset=["Alpha-3 code", "Date_reported"])
cases_df

Unnamed: 0,Country,Alpha-2 code,Alpha-3 code,Numeric code,Latitude,Longitude (average),Date_reported,WHO_region,New_cases,Cumulative_cases,New_deaths,Cumulative_deaths
0,Afghanistan,AF,AFG,4,33.0,65.0,2020-01-03,EMRO,0,0,0,0
1,Afghanistan,AF,AFG,4,33.0,65.0,2020-01-04,EMRO,0,0,0,0
2,Afghanistan,AF,AFG,4,33.0,65.0,2020-01-05,EMRO,0,0,0,0
3,Afghanistan,AF,AFG,4,33.0,65.0,2020-01-06,EMRO,0,0,0,0
4,Afghanistan,AF,AFG,4,33.0,65.0,2020-01-07,EMRO,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
188566,Zimbabwe,ZW,ZWE,716,-20.0,30.0,2022-02-25,AFRO,378,234967,2,5390
188567,Zimbabwe,ZW,ZWE,716,-20.0,30.0,2022-02-26,AFRO,500,235467,2,5392
188568,Zimbabwe,ZW,ZWE,716,-20.0,30.0,2022-02-27,AFRO,336,235803,1,5393
188569,Zimbabwe,ZW,ZWE,716,-20.0,30.0,2022-02-28,AFRO,0,235803,0,5393


In [4]:
# Export the cleaned files to csv
vaccine_df.to_csv("cleaned-data/vaccine-data.csv", index = False)
cases_df.to_csv("cleaned-data/cases-data.csv", index = False)