# Vaccination dataset - cleaning

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the pre-cleaned EU/EEA vaccination export and convert the week column
# to datetimes. With errors='coerce', any value that cannot be parsed becomes
# NaT instead of raising.
vaccination = pd.read_csv(
    '../datasets_Tableau_clean/vaccination_EU_EEA_clean.csv'
).assign(
    YearWeekISO=lambda frame: pd.to_datetime(frame['YearWeekISO'], errors='coerce')
)

In [3]:
# Inspect the column dtypes and non-null counts of the freshly loaded frame.
vaccination.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 239853 entries, 0 to 239852
Data columns (total 11 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   Unnamed: 0        239853 non-null  int64         
 1   YearWeekISO       239853 non-null  datetime64[ns]
 2   ReportingCountry  239853 non-null  object        
 3   FirstDose         239853 non-null  int64         
 4   SecondDose        239853 non-null  int64         
 5   DoseAdditional1   239853 non-null  int64         
 6   UnknownDose       239853 non-null  int64         
 7   Region            239853 non-null  object        
 8   TargetGroup       239853 non-null  object        
 9   Vaccine           239853 non-null  object        
 10  Population        239853 non-null  int64         
dtypes: datetime64[ns](1), int64(6), object(4)
memory usage: 20.1+ MB


In [4]:
# Columns that are not carried forward: the index column saved in the CSV,
# the region, and the additional/unknown dose counts.
to_drop = ['Unnamed: 0', 'Region', 'DoseAdditional1', 'UnknownDose']

# Order the rows chronologically, discard the unused columns, then express
# both dose counts per 100,000 of the `Population` column.
vaccination_clean = (
    vaccination
    .sort_values('YearWeekISO')
    .drop(columns=to_drop)
    .assign(
        first_dose_100k=lambda frame: frame['FirstDose'] / frame['Population'] * 100000,
        second_dose_100k=lambda frame: frame['SecondDose'] / frame['Population'] * 100000,
    )
)

In [5]:
# Keep only the identifying columns and the per-100k rates; the raw dose
# counts and the population used to normalise them are dropped here.
raw_count_columns = ['FirstDose', 'SecondDose', 'Population']
vax_df = vaccination_clean.drop(columns=raw_count_columns)
vax_df

Unnamed: 0,YearWeekISO,ReportingCountry,TargetGroup,Vaccine,first_dose_100k,second_dose_100k
14400,2020-12-14,DK,LTCF,COM,0.000000,0.000000
14399,2020-12-14,DK,HCW,COM,0.000000,0.000000
14398,2020-12-14,DK,Age25_49,COM,0.017174,0.000000
14397,2020-12-14,DK,ALL,COM,0.017174,0.000000
14403,2020-12-21,DK,Age25_49,COM,0.068696,0.000000
...,...,...,...,...,...,...
124338,2022-02-28,FI,Age5_9,MOD,0.000000,0.000000
124339,2022-02-28,FI,Age5_9,JANSS,0.000000,0.000000
124340,2022-02-28,FI,Age5_9,JANSS,0.000000,0.000000
124342,2022-02-28,FI,Age5_9,COM,0.217183,6.334507


In [7]:
# Reorder so the week/country keys and the two rates come first, followed by
# the two categorical breakdown columns.
new_columns = [
    'YearWeekISO',
    'ReportingCountry',
    'first_dose_100k',
    'second_dose_100k',
    'TargetGroup',
    'Vaccine',
]
vax_df = vax_df.loc[:, new_columns]

# Preview the first rows, then write the detailed (ungrouped) dataset to disk.
display(vax_df.head())
# NOTE(review): without index=False the row index is written as an unnamed
# first column (it is read back as 'Unnamed: 0') — confirm consumers expect it.
vax_df.to_csv('../datasets_Tableau_clean/vax_dataset.csv')

In [8]:
# Remove the categorical breakdown columns before aggregating by country/week.
# Rebinding (rather than inplace=True) leaves the previously displayed/saved
# frame object untouched, and errors='ignore' keeps this cell safe to re-run:
# a second execution no longer raises KeyError for the already-dropped columns.
vax_df = vax_df.drop(columns=['TargetGroup', 'Vaccine'], errors='ignore')

In [9]:
# Collapse to one row per country and week by summing the per-100k rates over
# every remaining row, then order the result by country and week.
# NOTE(review): the cumulative first-dose totals derived from these sums later
# in this notebook exceed 100,000 per 100k for ES, which suggests the sum
# double-counts (e.g. an aggregate target group added to its sub-groups) —
# confirm against the source data dictionary.
weekly_keys = ['ReportingCountry', 'YearWeekISO']
rate_columns = ['first_dose_100k', 'second_dose_100k']
vax_df_grouped = (
    vax_df
    .groupby(weekly_keys, as_index=False)[rate_columns]
    .agg('sum')
    .sort_values(weekly_keys)
)
vax_df_grouped

Unnamed: 0,ReportingCountry,YearWeekISO,first_dose_100k,second_dose_100k
0,AT,2021-01-04,711.016121,0.000000
1,AT,2021-01-11,1914.647507,9.100036
2,AT,2021-01-18,2101.748735,103.965099
3,AT,2021-01-25,712.521559,395.503279
4,AT,2021-02-01,419.118433,1789.583807
...,...,...,...,...
1830,SK,2022-01-31,138.057445,379.580104
1831,SK,2022-02-07,81.423661,313.070678
1832,SK,2022-02-14,44.742705,229.558291
1833,SK,2022-02-21,28.124509,151.029531


In [21]:
# Add a running total of each weekly rate within every country. The result
# follows the current row order, so it relies on vax_df_grouped already being
# sorted by week inside each country.
cumulative_targets = {
    'first_dose_100k': 'cmltve_first_dose',
    'second_dose_100k': 'cmltve_second_dose',
}
for weekly_col, running_col in cumulative_targets.items():
    vax_df_grouped[running_col] = (
        vax_df_grouped.groupby('ReportingCountry')[weekly_col].cumsum()
    )

In [24]:
# Spot-check the weekly and cumulative figures for a single country (ES).
vax_df_grouped.loc[vax_df_grouped['ReportingCountry'].eq('ES')]

Unnamed: 0,ReportingCountry,YearWeekISO,first_dose_100k,second_dose_100k,cmltve_first_dose,cmltve_second_dose
555,ES,2021-01-04,2639.190812,0.000000,2639.190812,0.000000
556,ES,2021-01-11,3179.209583,12.879069,5818.400395,12.879069
557,ES,2021-01-18,1577.390169,550.789356,7395.790564,563.668425
558,ES,2021-01-25,532.575699,1814.822651,7928.366263,2378.491076
559,ES,2021-02-01,440.457398,2800.536222,8368.823661,5179.027298
...,...,...,...,...,...,...
611,ES,2022-01-31,182.083753,130.313952,178281.159371,147552.640976
612,ES,2022-02-07,163.382060,718.350354,178444.541432,148270.991330
613,ES,2022-02-14,144.350785,774.618110,178588.892217,149045.609440
614,ES,2022-02-21,97.004995,454.863955,178685.897212,149500.473395


In [25]:
# Write the per-country weekly table (with its cumulative columns) to disk.
# NOTE(review): no index=False, so the row index is also written as an unnamed
# first column — confirm downstream readers expect it.
vax_df_grouped.to_csv('../datasets_Tableau_clean/vax_grouped_dataset.csv')

### Conclusions regarding the data
The "vax_grouped_dataset.csv" file is now ready. It contains the cleaned data and the corresponding cumulative columns. Because it is aggregated by week, it has a lower level of detail than the new cases dataset; therefore, if we want to join the two, we will have to aggregate the "new_cases_dataset" by week.