## Credits
This program uses the data located on Kaggle https://www.kaggle.com/gpreda/covid-world-vaccination-progress
I got inspiration for doing this after seeing an early version by https://github.com/Ken-Freeman
## Setup
Download the data CSV file and save it as `data/vaccinations.csv` (the path this notebook reads) before running this program
## Design Considerations
This notebook stores each intermediate result in a new variable, even though that would make no sense in a large data problem.
It does this so that we can go back and look at the values from previous steps

In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
%matplotlib inline
from io import StringIO

import os
import logging, sys
# Route log records to stdout and set the threshold to INFO; the logging.debug(...)
# diagnostics used throughout this notebook stay hidden unless the level is lowered to DEBUG.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
# Log the contents of the local "data" directory so a missing csv is noticed early.
# os.listdir raises if the "data" directory itself does not exist.
logging.info("Local data directory holds: %s", os.listdir("data"))


In [None]:
# Summarises how complete the interesting fields of a vaccination dataframe currently are
def log_vaccination_stats(banner, target_frame):
    """Build a multi-line completeness report for a vaccination dataframe.

    The report starts with ``banner`` and adds one line per tracked column
    (``daily_vaccinations``, ``total_vaccinations`` and
    ``people_fully_vaccinated_per_hundred``).  Each line shows the total
    number of records, how many of them hold a value and how many are empty.

    Parameters:
        banner: text placed in front of the statistics lines.
        target_frame: dataframe holding a ``country`` column and the three
            tracked columns.

    Returns:
        The banner followed by the three statistics lines as one string.
    """
    # Every line reports the same record total: the length of the 'country' column
    record_total = target_frame['country'].size

    # (line template, column being measured)
    report_layout = (
        ("\nNumber of records: {} daily_vaccinations     with data {} empty {}",
         'daily_vaccinations'),
        ("\nNumber of records: {} total_vaccinations     with data {} empty {}",
         'total_vaccinations'),
        ("\nNumber of records: {} vaccinated_per_hundred with data {} empty {}",
         'people_fully_vaccinated_per_hundred'),
    )

    report_lines = []
    for line_template, column_name in report_layout:
        column_values = target_frame[column_name]
        # count() is the number of non-NaN entries; isna().sum() is the number of NaN entries
        report_lines.append(
            line_template.format(
                record_total,
                column_values.count(),
                column_values.isna().sum(),
            )
        )
    return banner + "".join(report_lines)

# Same completeness report as log_vaccination_stats, obtained a different way:
# rows are selected with null / not-null boolean masks and the selections are measured
def log_vaccination_stats_2(banner,target_frame):
    """Build the completeness report by filtering rows with boolean masks.

    For each tracked column a mask of the non-null rows and a mask of the
    null rows is created; the size of the column after applying each mask
    gives the "with data" and "empty" figures.

    Parameters:
        banner: text placed in front of the statistics lines.
        target_frame: dataframe holding a ``country`` column plus
            ``daily_vaccinations``, ``total_vaccinations`` and
            ``people_fully_vaccinated_per_hundred``.

    Returns:
        The banner followed by the three statistics lines as one string.
    """
    def populated_and_empty(column_name):
        # Keep only the rows where the column has a value, then only the rows where it has none
        has_value = pd.notnull(target_frame[column_name])
        lacks_value = pd.isnull(target_frame[column_name])
        return (
            target_frame[has_value][column_name].size,
            target_frame[lacks_value][column_name].size,
        )

    # (line template, column being measured)
    report_layout = (
        ("\nNumber of records: {} daily_vaccinations     with data {} empty {}",
         'daily_vaccinations'),
        ("\nNumber of records: {} total_vaccinations     with data {} empty {}",
         'total_vaccinations'),
        ("\nNumber of records: {} vaccinated_per_hundred with data {} empty {}",
         'people_fully_vaccinated_per_hundred'),
    )

    report_lines = [
        line_template.format(target_frame['country'].size, *populated_and_empty(column_name))
        for line_template, column_name in report_layout
    ]
    return banner + "".join(report_lines)

# Renders the schema (field names and types) of a dataframe as text for logging
def log_datatypes(banner, target_frame):
    """Return ``banner`` followed by the ``DataFrame.info()`` summary text.

    ``info()`` normally prints; here its output is captured in an in-memory
    buffer so the caller can hand the text to the logger.

    Parameters:
        banner: heading placed on the first line of the result.
        target_frame: dataframe whose schema summary is wanted.

    Returns:
        A string with the banner, a newline, then the captured summary.
    """
    with StringIO() as schema_buffer:
        target_frame.info(buf=schema_buffer)
        schema_text = schema_buffer.getvalue()
    return f"{banner}\n{schema_text}"

# Sorting by date alone mixes the countries together, so order on two columns: country, then date
def log_vaccination_samples(banner, target_frame):
    """Return ``banner`` followed by a printable sample of the key columns.

    The frame is sorted by ``country`` and then ``date`` (both ascending) and
    reduced to the country, date, total_vaccinations and daily_vaccinations
    columns.  The text is the dataframe's normal string representation, so
    pandas abbreviates long frames according to its display options.

    Parameters:
        banner: heading placed on the first line of the result.
        target_frame: dataframe holding the four columns listed above.

    Returns:
        A string with the banner, a space and newline, then the rendered sample.
    """
    # Each column is listed exactly once; the previous selection repeated
    # 'total_vaccinations', which duplicated that column in the output.
    sample_columns = ['country', 'date', 'total_vaccinations', 'daily_vaccinations']
    ordered_frame = target_frame.sort_values(['country', 'date'], ascending=True)
    return "{} \n{}".format(banner, ordered_frame[sample_columns])


## Load the Data
1. Load the data
1. Convert the date fields from string to date

In [None]:
# Import the data from csv.  The 'date' column is converted explicitly further down;
# read_csv's parse_dates=True would only try to parse the index, which is not a date here.
df_loaded = pd.read_csv('data/vaccinations.csv')

# I like 'country' better than 'location'.
# The copy has to happen before anything reads df_loaded['country'], and it is skipped
# when the file has no 'location' column (i.e. it already ships a 'country' column).
if 'location' in df_loaded.columns:
    df_loaded['country'] = df_loaded['location']

# Log unique country names.  The argument is evaluated even when DEBUG logging is off,
# so the 'country' column must already exist at this point.
logging.debug("Loaded data for countries\n %s", pd.unique(df_loaded["country"]))

# lets convert the dates from strings to real dates
logging.debug(log_datatypes("Schema prior to date conversion", df_loaded))
df_loaded['date'] = pd.to_datetime(df_loaded['date'])
logging.debug(log_datatypes("Schema after to date conversion", df_loaded))

In [None]:
# Log the countries with the most data.
# Count the rows (days) per country once and reuse the result for both messages.
days_per_country = df_loaded.country.value_counts()
logging.debug("the number of days for each country %s",days_per_country)
# Keep only the countries that have more than 30 rows
logging.debug("the number of countries with more than a month worth the data %s",days_per_country[days_per_country > 30])

## Countries didn't supply data every day
1. Make all the countries have rows for all days
1. Fill in missing dates.
1. Do not fill the new rows with hard-coded numerical values.  
1. We will interpolate or fill based on the specific requirements for a field

## Fill in the missing rows

In [None]:
# Baseline statistics before any rows are added
logging.info(log_vaccination_stats("Prior to filling in missing rows/days", df_loaded))

# Fill in missing dates https://stackoverflow.com/questions/44978196/pandas-filling-missing-dates-and-values-within-group
# The new rows are left as NaN (no fill_value) so that a later step can interpolate them.
# This does not "extrapolate" the missing start/end values - probably should fill with 0 from start to first value
#
# How the chain works:
#   set_index   -> index the rows by (date, country)
#   unstack     -> pivot the innermost index level (country) into the columns: one row per date
#   asfreq('D') -> conform the date index to daily frequency, adding a row for every absent day
#   stack       -> move country back from the columns into the index: one row per (date, country)
#   sort_index(level=1) / reset_index -> order by the country level, then turn date and country back into columns
# NOTE(review): unstack requires every (date, country) pair to be unique - confirm for new data files.
df_filled = df_loaded.set_index(
    ['date', 'country']
).unstack(
    # no fill_value here (e.g. fill_value=0): gaps must stay NaN
).asfreq(
    'D', # no fill_value here either: the added days must stay NaN
).stack(
    # dropna=True would drop the (date, country) rows whose values are all NaN - presumably
    # including the rows that were just added for the missing days - so it must stay False.
    # NOTE(review): the dropna argument of stack is deprecated in recent pandas releases - verify against the installed version.
    dropna=False
).sort_index(level=1).reset_index()

# Same statistics again, computed with the mask-based variant, now including the added rows
logging.info(log_vaccination_stats_2("\nAfter to adding missing rows/days",df_filled))
logging.debug(log_vaccination_samples("After fill, Before Interpolation Country ",df_filled))

## Fill in missing values for the rows we are interested in
We want to smooth out our graph and eliminate dropout days so countries always have _something_ for each day
1. This basically gets rid of the 0/null problem: it takes whatever value the column held before and carries it forward to the next date.
1. This fills every property/column which may not make sense for some columns.  
1. We should only do the fields that make sense
1. Extrapolate the missing start/end values for text fields
1. We could extrapolate the missing end data, but let's not do that for now https://stackoverflow.com/questions/22491628/extrapolate-values-in-pandas-dataframe


In [None]:
# Work on a sorted copy so each country's rows are contiguous and in date order;
# interpolation and forward/backward filling both depend on that order.
df_interpolated = df_filled.sort_values(['country','date'])

# Interpolate the numerical values separately for every country so that values
# never bleed from one country into the next.
# NOTE(review): with pandas' default settings interpolate() also carries the last valid
# value forward over trailing gaps; pass limit_area='inside' if that is not wanted.
for numeric_column in (
    'daily_vaccinations',
    'total_vaccinations',
    'people_fully_vaccinated_per_hundred',
    'people_vaccinated',
):
    df_interpolated[numeric_column] = df_interpolated.groupby('country')[numeric_column].transform(lambda x: x.interpolate())

# String values, iso code, site, etc are the same for all rows in a country so we ffill and
# bfill in case early rows are missing values.  Both fills run inside the per-country
# transform: a bfill applied after the grouped ffill would work on the whole column and
# could copy the next country's code into a country that has no code at all.
df_interpolated['iso_code'] = df_interpolated.groupby('country')['iso_code'].transform(lambda x: x.ffill().bfill())
logging.info("Interpolation finished")

# Show the number of missing values that remain after interpolation.
# count() is the number of non-nan
logging.info(log_vaccination_stats("After data interpolation", df_interpolated))
# The banner is plain text: log_vaccination_samples appends the sample itself, so a
# '%s' placeholder in it would be printed literally.
logging.debug(log_vaccination_samples("After Interpolation: Country", df_interpolated))

# Graph It!
#### TODO
Present more data as bubble sizing

## log(X),log(y) axis plot with vaccinations max value 70%

The log(y) axis stretches the early progress and compresses the later gains.
This is useful in the early stages of the pandemic, when the overall vaccination rates are low

In [None]:
import plotly.express as px

# The animation needs one label per frame; use the text form of each row's date
df_interpolated['date_str'] = df_interpolated['date'].astype(str)

# What is plotted: daily vaccinations (x) against fully vaccinated per hundred (y),
# one animation frame per date and one coloured marker per country
log_log_mapping = {
    'x': "daily_vaccinations",
    'y': "people_fully_vaccinated_per_hundred",
    'animation_frame': "date_str",
    'animation_group': "country",
    'color': 'country',
    'hover_name': "country",
}
# How it is drawn: both axes logarithmic, fixed ranges so the frames are comparable
log_log_canvas = {
    'width': 800,
    'height': 600,
    'log_x': True,
    'log_y': True,
    'range_x': [100, 5000000],
    'range_y': [0.01, 70],
}
fig = px.scatter(df_interpolated, **log_log_mapping, **log_log_canvas)

# Title and axis captions; the y axis gets a heavier white grid
fig.update_layout(
    title='Full Vaccinated vs Daily Vaccinations (LOG,LOG)',
    yaxis={
        'title': 'Fully Vaccinated per 100',
        'gridcolor': 'white',
        'gridwidth': 2,
    },
    xaxis={'title': 'Daily Vaccinations'},
)

fig.show()


## log(X),linear(y) axis plot with vaccinations max value 70%

This makes it more apparent how far we have to go

In [None]:
import plotly.express as px

# Recomputed here so this cell can run on its own: one text label per row's date for the animation
df_interpolated['date_str'] = df_interpolated['date'].astype(str)

# Same data mapping as the log/log chart: daily vaccinations (x) against fully
# vaccinated per hundred (y), animated by date, one coloured marker per country
log_linear_mapping = {
    'x': "daily_vaccinations",
    'y': "people_fully_vaccinated_per_hundred",
    'animation_frame': "date_str",
    'animation_group': "country",
    'color': 'country',
    'hover_name': "country",
}
# Only the y axis differs: it is linear here, the x axis stays logarithmic
log_linear_canvas = {
    'width': 800,
    'height': 600,
    'log_x': True,
    'log_y': False,
    'range_x': [100, 5000000],
    'range_y': [0.01, 70],
}
fig = px.scatter(df_interpolated, **log_linear_mapping, **log_linear_canvas)

# Title and axis captions; the y axis gets a heavier white grid
fig.update_layout(
    title='Full Vaccinated vs Daily Vaccinations (LOG,LINEAR)',
    yaxis={
        'title': 'Fully Vaccinated per 100',
        'gridcolor': 'white',
        'gridwidth': 2,
    },
    xaxis={'title': 'Daily Vaccinations'},
)

fig.show()
