# We want to play with `pandas`

In [None]:
import pandas as pd
import numpy as np
# Keep DataFrame previews short: show at most two rows when a frame is displayed.
pd.set_option('display.max_rows', 2)

## So we're importing some data for starters

In [None]:
# Load the vaccination records; the 'date' column is parsed into datetimes
# so later cells can group and resample on it.
d = pd.read_csv(
    'country_vaccinations.csv',
    parse_dates=['date'],
)
d

In [None]:
# Load the region lookup table (joined later on its 'alpha-3' column).
c = pd.read_csv(
    'continents2.csv',
)
c

## We want to know how each sub-region is faring...
but to do that we will eventually need ratios

In [None]:
# Load the population table (joined later on 'Country (or dependency)').
p = pd.read_csv(
    'population_by_country_2020.csv',
)
p

## Let the show begin. How many already vaccinated?

In [None]:
# Allow longer previews from here on.
pd.options.display.max_rows = 20

# Sum people_fully_vaccinated for every (vaccines, country) pair, smallest first.
# NOTE(review): if people_fully_vaccinated is a running total per country,
# summing it across dates overcounts -- confirm against the dataset whether
# a per-group max is what is actually wanted.
(
    d
    .groupby(['vaccines', 'country'])['people_fully_vaccinated']
    .sum()
    .sort_values()
)

## By manufacturer?

In [None]:
# Total daily vaccinations per vaccine combination and date.
doses_by_vaccine = d.groupby(['vaccines', 'date'])['daily_vaccinations'].sum()

# One column (and therefore one plotted line) per vaccine combination.
axes = doses_by_vaccine.unstack(['vaccines']).plot()

# Anchor the legend to the right of the axes so it does not cover the lines.
axes.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))

One stands out, and the list is not that huge, although it is hard to tell by colour

## By country?

In [None]:
# Daily vaccinations per country and date, one plotted line per country.
(
    d
    .groupby(['country', 'date'])
    .daily_vaccinations
    .sum()
    # Mask totals that are not strictly positive and drop them before plotting.
    # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
    .where(lambda x: x > 0, np.nan)
    .dropna()
    .unstack(['country'])
    .plot()
    # Anchor the legend to the right of the axes.
    .legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
)

Even filtering out the zeroes, it is still a big list. Can't read that!

So let's enrich the dataset

In [None]:
# Attach the region columns to every vaccination row by matching the
# vaccination table's 'iso_code' against the lookup table's 'alpha-3'.
# merge() defaults to an inner join, so rows without a match are dropped.
dc = d.merge(c, left_on=['iso_code'], right_on=['alpha-3'])

# Daily vaccinations per sub-region and date, one plotted line per sub-region.
(
    dc
    .groupby(['sub-region', 'date'])
    .daily_vaccinations
    .sum()
    # Mask totals that are not strictly positive and drop them before plotting.
    # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
    .where(lambda x: x > 0, np.nan)
    .dropna()
    .unstack(['sub-region'])
    .plot()
    # Anchor the legend to the right of the axes.
    .legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
)

## I don't like the messy dates. Let's resample

In [None]:
# Daily vaccinations per sub-region, re-binned onto a regular daily grid.
(
    dc
    .groupby(['sub-region', 'date'])
    .daily_vaccinations
    .sum()
    # Mask totals that are not strictly positive and drop them.
    # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
    .where(lambda x: x > 0, np.nan)
    .dropna()
    .unstack(['sub-region'])
    # One bin per calendar day, summed within each bin.
    .resample(rule='D')
    .sum()
    .plot()
    # Anchor the legend to the right of the axes.
    .legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
)

## Cumulative?

In [None]:
# Running total of daily vaccinations per sub-region.
(
    dc
    .groupby(['sub-region', 'date'])
    .daily_vaccinations
    .sum()
    # Mask totals that are not strictly positive and drop them.
    # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
    .where(lambda x: x > 0, np.nan)
    .dropna()
    .unstack(['sub-region'])
    # One bin per calendar day, summed within each bin ...
    .resample(rule='D')
    .sum()
    # ... then accumulated down the date axis, column by column.
    .cumsum()
    .plot()
    # Anchor the legend to the right of the axes.
    .legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
)

## By population percentage?

In [None]:
# Display dc (the vaccination rows merged with the region table) for inspection.
dc

In [None]:
# Display p (the population table) for inspection.
p

In [None]:
# Add the population columns to every vaccination row, matching the
# vaccination table's 'country' against the population table's
# 'Country (or dependency)'. The inner join keeps only matching rows.
# NOTE(review): this is a match on country-name strings; names spelled
# differently in the two files would be dropped silently -- worth verifying.
dcp = dc.merge(
    p,
    left_on=['country'],
    right_on=['Country (or dependency)'],
    how='inner',
)
dcp

In [None]:
# Cumulative vaccinations per country divided by its 'Population (2020)',
# plotted as one line per country.
(
    dc
    .groupby(['country', 'date'])
    .daily_vaccinations
    .sum()
    # Mask totals that are not strictly positive and drop them.
    # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
    .where(lambda x: x > 0, np.nan)
    .dropna()
    .unstack(['country'])
    # Daily bins, summed, then accumulated into a running total per country.
    .resample(rule='D')
    .sum()
    .cumsum()
    # Back to long form (one row per date/country) so it can be merged.
    .stack(['country'])
    .to_frame('vac_cumsum')
    .reset_index()
    .merge(p, left_on=['country'], right_on=['Country (or dependency)'], how='inner')
    .assign(vac_ratio=lambda x: x.vac_cumsum / x['Population (2020)'])
    .set_index(['date', 'country'])
    # Select the ratio before pivoting so only that one column is unstacked,
    # instead of pivoting every population column and discarding the rest.
    .vac_ratio
    .unstack(['country'])
    .plot()
    # Anchor the legend to the right of the axes.
    .legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
)

In [None]:
# Among rows whose sub-region is 'Western Asia', return the country label
# of the row holding the largest daily_vaccinations value.
in_western_asia = dcp['sub-region'].isin(['Western Asia'])
dcp.loc[in_western_asia].set_index('country')['daily_vaccinations'].idxmax()

In [None]:
# Allow long previews so the per-country ranking is shown with up to 200 rows.
pd.options.display.max_rows = 200

# Highest cumulative-vaccinations-to-population ratio reached by each country,
# as a percentage, largest first.
(
    dc
    .groupby(['country', 'date'])
    .daily_vaccinations
    .sum()
    # Mask totals that are not strictly positive and drop them.
    # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
    .where(lambda x: x > 0, np.nan)
    .dropna()
    .unstack(['country'])
    # Daily bins, summed, then accumulated into a running total per country.
    .resample(rule='D')
    .sum()
    .cumsum()
    # Back to long form (one row per date/country) so it can be merged.
    .stack(['country'])
    .to_frame('vac_cumsum')
    .reset_index()
    .merge(p, left_on=['country'], right_on=['Country (or dependency)'], how='inner')
    .assign(vac_ratio=lambda x: x.vac_cumsum / x['Population (2020)'])
    .set_index(['date', 'country'])
    # Select the ratio before pivoting so only that one column is unstacked.
    .vac_ratio
    .unstack(['country'])
    # Column-wise maximum: one value per country.
    .max()
    .sort_values(ascending=False)
) * 100

In [None]:
# Look up the population-table entry for Israel by country name.
(
    p
    .set_index(['Country (or dependency)'])
    .loc['Israel']
)