In [1]:
import pandas as pd
import numpy as np

# Load the vaccination records; the path is resolved relative to the kernel's working directory.
vaccinations = pd.read_csv('country_vaccinations.csv')

In [2]:
# Preview the first rows to see the available columns and typical values.
vaccinations.head()

Unnamed: 0,country,iso_code,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,daily_vaccinations_per_million,vaccines,source_name,source_website
0,Algeria,DZA,2021-01-29,0.0,,,,,0.0,,,,Sputnik V,Ministry of Health,https://www.aps.dz/regions/116777-blida-covid-...
1,Algeria,DZA,2021-01-30,30.0,,,30.0,30.0,0.0,,,1.0,Sputnik V,Ministry of Health,https://www.aps.dz/regions/116777-blida-covid-...
2,Andorra,AND,2021-01-25,576.0,576.0,,,,0.75,0.75,,,Pfizer/BioNTech,Government of Andorra,https://www.govern.ad/comunicats/item/12379-se...
3,Andorra,AND,2021-01-26,,,,,66.0,,,,854.0,Pfizer/BioNTech,Government of Andorra,https://www.govern.ad/comunicats/item/12379-se...
4,Andorra,AND,2021-01-27,,,,,66.0,,,,854.0,Pfizer/BioNTech,Government of Andorra,https://www.govern.ad/comunicats/item/12379-se...


In [3]:
# Boolean mask of missing cells, then the fraction of missing values per column.
flags = vaccinations.isna()
missing_per_column = flags.sum()
missing_per_column.div(len(flags))

country                                0.000000
iso_code                               0.092759
date                                   0.000000
total_vaccinations                     0.347554
people_vaccinated                      0.469667
people_fully_vaccinated                0.684932
daily_vaccinations_raw                 0.464579
daily_vaccinations                     0.037965
total_vaccinations_per_hundred         0.347554
people_vaccinated_per_hundred          0.469667
people_fully_vaccinated_per_hundred    0.684932
daily_vaccinations_per_million         0.037965
vaccines                               0.000000
source_name                            0.000000
source_website                         0.000000
dtype: float64

# Население стран

В датасете есть 4 пары колонок — `daily_vaccinations` с `daily_vaccinations_per_million`, `total_vaccinations` с `total_vaccinations_per_hundred`, `people_vaccinated` с `people_vaccinated_per_hundred` и `people_fully_vaccinated` с `people_fully_vaccinated_per_hundred`, — значения в которых прямо пропорциональны друг другу, а коэффициент пропорциональности определяется населением государства. С помощью этих колонок мы можем оценить население, а при заполнении пропусков — заполнять только одну колонку из пары (вторую восстанавливать, используя прямую пропорциональность).

In [7]:
# (absolute column, per-capita column, per-capita scale) for every proportional pair.
COLUMN_PAIRS = [
    ('daily_vaccinations', 'daily_vaccinations_per_million', 1e6),
    ('total_vaccinations', 'total_vaccinations_per_hundred', 100),
    ('people_vaccinated', 'people_vaccinated_per_hundred', 100),
    ('people_fully_vaccinated', 'people_fully_vaccinated_per_hundred', 100),
]

dfs = []
for absolute_col, relative_col, scale in COLUMN_PAIRS:
    # Only rows where both members of the pair are present can yield an estimate.
    pair = vaccinations[['country', absolute_col, relative_col]].dropna()

    # population ~= absolute / per-capita * scale; a zero per-capita value gives
    # +/-inf (or NaN for 0/0), which is turned into NaN and discarded below.
    estimate = pair[absolute_col].div(pair[relative_col]).mul(scale)
    estimate = estimate.replace([np.inf, -np.inf], np.nan)

    dfs.append(
        pd.DataFrame({'country': pair['country'], 'population': estimate}).dropna()
    )

# One row per usable observation; the original row labels are kept.
population = pd.concat(dfs)

In [8]:
# Show the per-observation population estimates (countries repeat, one row per usable record).
population

Unnamed: 0,country,population
1,Algeria,3.000000e+07
3,Andorra,7.728337e+04
4,Andorra,7.728337e+04
5,Andorra,7.728337e+04
6,Andorra,7.728337e+04
...,...,...
2550,Wales,3.040000e+06
2551,Wales,3.270000e+06
2552,Wales,3.088750e+06
2553,Wales,3.257500e+06


In [10]:
# Collapse the per-observation estimates into a single figure per country:
# a plain dict mapping country name -> mean estimated population.
#
# The 'population' column is selected explicitly. The first positional
# parameter of GroupBy.mean() is `numeric_only`, not a column name, so passing
# the column name there only works because a non-empty string is truthy.
population_mean = population.groupby('country')['population'].mean().to_dict()
population_mean

{'Algeria': 30000000.0,
 'Andorra': 77200.9520208199,
 'Argentina': 45123188.21164181,
 'Austria': 8984106.187045515,
 'Bahrain': 1701570.5120426612,
 'Bangladesh': 161122457.03014934,
 'Belgium': 11213888.00350198,
 'Bermuda': 62277.85002810513,
 'Bolivia': 12009033.224400872,
 'Brazil': 210667680.37306544,
 'Bulgaria': 6935279.44185545,
 'Canada': 37829002.34185554,
 'Cayman Islands': 65726.93613698632,
 'Chile': 19201532.63086156,
 'China': 1442524078.5830615,
 'Costa Rica': 5089941.979311699,
 'Croatia': 4106384.1258594263,
 'Cyprus': 876325.4532566043,
 'Czechia': 10715633.40887005,
 'Denmark': 5804330.625414432,
 'Ecuador': 17085221.116411086,
 'Egypt': 109500000.0,
 'England': 56236982.037986316,
 'Estonia': 1323061.7540125977,
 'Faeroe Islands': 48863.2330522679,
 'Finland': 5534459.8175375145,
 'France': 67115818.70734851,
 'Germany': 83960596.53737262,
 'Gibraltar': 33688.41265732025,
 'Greece': 10452072.449501233,
 'Greenland': 56791.208791208795,
 'Guernsey': 67021.94331554

Проверим, насколько точна оценка.

In [14]:
import math

# For each individual estimate, check whether it lies within 5% (relative
# tolerance) of the mean estimate for the same country.
isclose = pd.Series(
    [
        math.isclose(estimate, population_mean[country_name], rel_tol=0.05)
        for country_name, estimate in zip(population['country'], population['population'])
    ],
    index=population.index,
)

# Share of estimates that agree with their country's mean.
isclose.sum() / len(isclose)

0.9557249432371067

In [17]:
# Every country present in the dataset must have a population estimate.
countries_without_estimate = set(vaccinations['country'].unique()) - set(population_mean)
assert not countries_without_estimate

# Total vaccinations

В таблице присутствует колонка `total_vaccinations`, которая является кумулятивной суммой `daily_vaccinations_raw` за весь период времени:

In [85]:
from datetime import date, timedelta

columns = ['country', 'date', 'total_vaccinations', 'daily_vaccinations', 'daily_vaccinations_raw']
# Keep only rows where every column needed for the check is present.
total_vaccinations = vaccinations[columns].dropna()

one_day = timedelta(days=1)

# total_vaccinations yesterday, daily_vaccinations_raw today, total_vaccinations today
tv_yda, dvr_tda, tv_tda = [], [], []
for _, country in total_vaccinations.groupby('country'):
    records = country.to_dict('records')
    # Walk over neighbouring rows and keep only pairs exactly one calendar day apart.
    for yda, tda in zip(records, records[1:]):
        gap = date.fromisoformat(tda['date']) - date.fromisoformat(yda['date'])
        if gap == one_day:
            tv_yda.append(yda['total_vaccinations'])
            dvr_tda.append(tda['daily_vaccinations_raw'])
            tv_tda.append(tda['total_vaccinations'])

tv_yda, dvr_tda, tv_tda = (np.array(values) for values in (tv_yda, dvr_tda, tv_tda))
print(f'Found {len(tv_tda)} examples')

# Hypothesis under test: today's total == yesterday's total + today's raw daily count.
tv_tda_pred = tv_yda + dvr_tda
abs_error = np.abs(tv_tda - tv_tda_pred)
if abs_error.sum() > 0:
    print(f'Mean error is {abs_error.mean()}')
else:
    print(f'There is not error')

Found 1244 examples
There is not error


Таким образом, мы можем восстановить часть значений `total_vaccinations`, если известны `total_vaccinations` для предыдущего дня и `daily_vaccinations_raw` для текущего:

In [104]:
# Spot check on a single record: is total_vaccinations missing in the row labelled 3?
row = vaccinations.loc[3]
math.isnan(row['total_vaccinations'])

True

In [111]:
# Reconstruct missing total_vaccinations values from the running-sum relation
#   total[today] = total[yesterday] + daily_vaccinations_raw[today].
# Rows are visited in index order and written back immediately, so a value
# filled for one day can serve as "yesterday" for the next missing day.
# NOTE(review): assumes the default 0..n-1 integer index with each country's
# rows contiguous and in date order - confirm if the frame is ever re-indexed.
isna = vaccinations['total_vaccinations'].isna()
isna_indices = isna[isna].index.values

for i in isna_indices:
    tda = vaccinations.loc[i]
    raw_today = tda['daily_vaccinations_raw']
    if math.isnan(raw_today):
        # Without today's raw count there is nothing to add to any base value.
        continue

    # The very first row has no predecessor; looking up label -1 would raise KeyError.
    yda = vaccinations.loc[i - 1] if i > 0 else None

    if yda is None or yda['country'] != tda['country']:
        # First record of a country: the running total starts at today's count.
        vaccinations.at[i, 'total_vaccinations'] = raw_today
        continue

    # Only chain strictly consecutive calendar days; a gap means doses are unaccounted for.
    if date.fromisoformat(yda['date']) + timedelta(days=1) != date.fromisoformat(tda['date']):
        continue

    if math.isnan(yda['total_vaccinations']):
        continue

    vaccinations.at[i, 'total_vaccinations'] = yda['total_vaccinations'] + raw_today

Однако это не сильно помогает, так как и `total_vaccinations`, и `daily_vaccinations_raw` содержат много пропусков. Попробуем восстановить `daily_vaccinations_raw` через `daily_vaccinations`:

In [112]:
# Average ratio of the raw daily count to daily_vaccinations, computed over the
# rows where both values are present. The frame bound on the first line is the
# one used in the ratio, so the cell runs on a fresh kernel.
daily_vaccinations = vaccinations[['daily_vaccinations', 'daily_vaccinations_raw']].dropna()
ratio = (
    daily_vaccinations['daily_vaccinations_raw'] / daily_vaccinations['daily_vaccinations']
).mean()
ratio

1.2118197846618897

In [113]:
# Approximate each missing daily_vaccinations_raw as ratio * daily_vaccinations,
# but only where daily_vaccinations itself is available.
raw_missing = vaccinations['daily_vaccinations_raw'].isna()
daily_known = vaccinations['daily_vaccinations'].notna()
fillable = raw_missing & daily_known

vaccinations.loc[fillable, 'daily_vaccinations_raw'] = (
    ratio * vaccinations.loc[fillable, 'daily_vaccinations']
)

Теперь мы можем снова попытаться восстановить `total_vaccinations` с помощью `daily_vaccinations_raw`:

In [114]:
# Second reconstruction pass over the still-missing total_vaccinations values,
# using the running-sum relation
#   total[today] = total[yesterday] + daily_vaccinations_raw[today].
# Rows are visited in index order and written back immediately, so a value
# filled for one day can serve as "yesterday" for the next missing day.
# NOTE(review): assumes the default 0..n-1 integer index with each country's
# rows contiguous and in date order - confirm if the frame is ever re-indexed.
isna = vaccinations['total_vaccinations'].isna()
isna_indices = isna[isna].index.values

for i in isna_indices:
    tda = vaccinations.loc[i]
    raw_today = tda['daily_vaccinations_raw']
    if math.isnan(raw_today):
        # Without today's raw count there is nothing to add to any base value.
        continue

    # The very first row has no predecessor; looking up label -1 would raise KeyError.
    yda = vaccinations.loc[i - 1] if i > 0 else None

    if yda is None or yda['country'] != tda['country']:
        # First record of a country: the running total starts at today's count.
        vaccinations.at[i, 'total_vaccinations'] = raw_today
        continue

    # Only chain strictly consecutive calendar days; a gap means doses are unaccounted for.
    if date.fromisoformat(yda['date']) + timedelta(days=1) != date.fromisoformat(tda['date']):
        continue

    if math.isnan(yda['total_vaccinations']):
        continue

    vaccinations.at[i, 'total_vaccinations'] = yda['total_vaccinations'] + raw_today

In [115]:
# Recompute the per-column share of missing values to see the effect of the fills.
flags = vaccinations.isna()
missing_per_column = flags.sum()
missing_per_column.div(len(flags))

country                                0.000000
iso_code                               0.092759
date                                   0.000000
total_vaccinations                     0.006654
people_vaccinated                      0.469667
people_fully_vaccinated                0.684932
daily_vaccinations_raw                 0.037965
daily_vaccinations                     0.037965
total_vaccinations_per_hundred         0.347554
people_vaccinated_per_hundred          0.469667
people_fully_vaccinated_per_hundred    0.684932
daily_vaccinations_per_million         0.037965
vaccines                               0.000000
source_name                            0.000000
source_website                         0.000000
dtype: float64