This notebook creates a new variable at each step, even though that would make no sense in a large-data problem.
It does this so that we can go back and look at previous values.

In [None]:
import io

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px

%matplotlib inline

In [None]:
import os
import logging, sys
# Send log records to stdout so they appear in the cell output; INFO hides the logging.debug calls below.
logging.basicConfig(level=logging.INFO, stream=sys.stdout)
# List what is available in the local "data" directory before loading anything.
data_files = os.listdir("data")
print("data files in this dir", data_files)


In [None]:
# Import the raw vaccination data.
# The previous `parse_dates=True` only asks pandas to parse the *index*; no index
# column is set here, so it did nothing. The `date` column is converted explicitly
# in a later cell, which keeps the before/after schema comparison meaningful.
df1 = pd.read_csv('data/country_vaccinations.csv')


In [None]:
# Which countries appear in the data? (only emitted when the log level is DEBUG)
country_names = df1["country"].unique()
logging.debug("Let show all the countries\n %s", country_names)

In [None]:
# Show the schema, convert the `date` column to real datetimes, then show the schema again.
# DataFrame.info() prints to stdout and returns None, so passing it straight to
# logging.debug logged "None" and printed the schema even when DEBUG was disabled.
# Capture the text in a buffer instead and only build it when DEBUG is enabled.
def _schema_text(frame):
    """Return the text that `frame.info()` would print."""
    buffer = io.StringIO()
    frame.info(buf=buffer)
    return buffer.getvalue()


debug_enabled = logging.getLogger().isEnabledFor(logging.DEBUG)

if debug_enabled:
    logging.debug("Schema prior to date conversion %s", _schema_text(df1))
# lets convert the dates to real dates
df1['date'] = pd.to_datetime(df1['date'])
# df1[xxx] is a series
# type(df1['date'])
if debug_enabled:
    logging.debug("Schema after to date conversion %s", _schema_text(df1))

In [None]:
# random demo here: rows (days) recorded per country, and the subset with more than 30
rows_per_country = df1["country"].value_counts()
logging.debug("the number of days for each country %s", rows_per_country)
over_a_month = rows_per_country[rows_per_country > 30]
logging.debug("the number of countries with more than a month worth the data %s", over_a_month)

In [None]:
logging.info("prior to adding missing days")
# Number of missing values per column. count() is the number of non-NaN entries,
# so (rows - count()) is the number of NaN entries.
# Each line is now labelled with the column it actually measures; previously the
# daily_vaccinations figure was logged as "empty total_vaccinations".
num_rows = df1['total_vaccinations'].size
for column in ('daily_vaccinations', 'total_vaccinations'):
    logging.info("Number of records: %s empty %s %s", num_rows, column, num_rows - df1[column].count())

# fill in missing dates https://stackoverflow.com/questions/44978196/pandas-filling-missing-dates-and-values-within-group
# but leave new values as nan so that we can try and interpolate them later
# this does not "extrapolate" the missing end values - probably should fill with 0 from start to first value
# How it works: pivot countries into columns (unstack), insert a row for every
# calendar day (asfreq 'D'), then pivot countries back into rows keeping the NaN
# rows that were just created (stack with dropna=False).
# NOTE(review): newer pandas releases deprecate `dropna` on stack() - confirm against the installed version.
df1 = df1.set_index(
    ['date', 'country']
).unstack(
    #fill_value=0
).asfreq(
    'D', #fill_value=0
).stack(
    dropna=False
).sort_index(level=1).reset_index()

logging.info("after to adding missing days")
# Same missing-value report as above, now including the newly inserted days.
num_rows = df1['total_vaccinations'].size
for column in ('daily_vaccinations', 'total_vaccinations'):
    logging.info("Number of records: %s empty %s %s", num_rows, column, num_rows - df1[column].count())

# # create a series where the null elements are represented by true
# bool_series = pd.isnull(df1['daily_vaccinations'])
# print ("number of records: ", bool_series.size, "empty daily_vaccinations", df1[bool_series]['daily_vaccinations'].size )
# bool_series = pd.isnull(df1['total_vaccinations'])
# print ("number of records: ", bool_series.size, "empty total_vaccinations", df1[bool_series]['total_vaccinations'].size )


In [None]:
# Sorting by date alone would interleave the countries, so order by country first
# and chronologically within each country (ascending is the default).
df_date_country = df1.sort_values(by=['country', 'date'])
logging.debug("%s", df_date_country)

In [None]:
# group by country then we will operate on each country
df_group_by_country = df_date_country.groupby('country')
logging.info("Number of countries / groups %s" ,len(df_group_by_country))

# The per-group summaries below are only useful when debugging, so skip computing
# them entirely unless DEBUG logging is enabled.
if logging.getLogger().isEnabledFor(logging.DEBUG):
    # count() reports the number of non-null values in each group and column
    # (the old message called these "NaN values", which is the opposite).
    logging.debug("The number of non-NaN values for each country and column before interpolation ")
    logging.debug("%s", df_group_by_country.count())
    # Show all the groups and the index of items in each
    logging.debug("Groups with the index of all contained %s", df_group_by_country.groups)


In [None]:
# lets look at one group / country
#df_group_by_country.get_group('Andorra') 

In [None]:
# Fill the gaps created by adding the missing days, one country at a time.
# Numeric columns are interpolated between known values; descriptive columns are
# constant per country so they are forward- and back-filled.
# Only the columns listed here are filled - filling every column would not make sense.
# could extrapolate for missing end data but lets not do that for now https://stackoverflow.com/questions/22491628/extrapolate-values-in-pandas-dataframe
# df_date_country.info()

# Work on a copy: a plain assignment would alias df_date_country, and the column
# writes below would then overwrite the "previous" frame this notebook wants to keep.
df_interpolated = df_date_country.copy()

# Read from the untouched source frame, write into the copy (indexes are identical).
by_country = df_date_country.groupby('country')

interpolated_columns = (
    'total_vaccinations',
    'people_vaccinated',
    'daily_vaccinations',
    'people_fully_vaccinated_per_hundred',
)
for column in interpolated_columns:
    df_interpolated[column] = by_country[column].transform(lambda values: values.interpolate())

# These values are the same for all rows in a country, so ffill then bfill in case
# early rows are missing values. Both fills run inside transform() so they stay
# within the country; previously bfill() ran on the ungrouped result and could pull
# a value from the next country when a country had no value at all.
constant_columns = ('iso_code', 'vaccines', 'source_name', 'source_website')
for column in constant_columns:
    df_interpolated[column] = by_country[column].transform(lambda values: values.ffill().bfill())

# sort_values returns a new frame; keep it (the result used to be discarded).
df_interpolated = df_interpolated.sort_values(['country','date'])
logging.info("Interpolation finished")


In [None]:
# Show the number of values still missing after interpolation.
# count() is the number of non-NaN entries, so the number of empty entries is
# (rows - count()). The old code logged count() itself as "empty" and labelled the
# daily_vaccinations figure as total_vaccinations.
num_rows = df_interpolated['total_vaccinations'].size
for column in ('daily_vaccinations', 'total_vaccinations'):
    logging.info("Number of records: %s empty %s %s", num_rows, column, num_rows - df_interpolated[column].count())


This grouping is left over from some other experiments and is kind of interesting

In [None]:
# Group the interpolated frame by country again; the earlier group object was built
# from the pre-interpolation data.
df_interpolated_group = df_interpolated.groupby('country')
logging.debug("Number of countries / groups %s" ,len(df_interpolated_group))

# count() gives the number of non-null values in each group and column, so the
# message now says non-NaN (it used to say NaN). Only computed when DEBUG is on.
if logging.getLogger().isEnabledFor(logging.DEBUG):
    logging.debug("The number of non-NaN values for each country and column after interpolation is %s",df_interpolated_group.count())

In [None]:
# Inspect a single country's rows as a spot check of the interpolation.
andorra_rows = df_interpolated_group.get_group('Andorra')
logging.debug("Show data for Andorra %s", andorra_rows)

## Let's graph this
TODO: Add sizing bubbles when we get other data

In [None]:
# Animated scatter: one frame per day, one marker per country.
# plotly's animation_frame needs a plain label per frame, so keep a string copy of the date.
df_interpolated['date_str']=df_interpolated.date.astype(str)
# (plotly.express is already imported as px in the first cell; the duplicate import was removed.)
fig = px.scatter(df_interpolated, 
                 x="daily_vaccinations", 
                 y="people_fully_vaccinated_per_hundred", 
                 animation_frame="date_str",
                 animation_group="country", 
                 color='country',
                 hover_name="country", 
                 width=800, 
                 height=600,
                 log_x=True, 
                 log_y=True,
                 # fixed axis ranges so the axes do not rescale between frames
                 range_x=[100,5000000],
                 range_y=[0.01,100]
                )
# Title and axis labels now name the columns that are actually plotted
# (the title previously read "Total Vaccinated vs Vs daily").
fig.update_layout(
    title='People Fully Vaccinated per 100 vs Daily Vaccinations',
    yaxis=dict(
        title='People fully vaccinated per 100',
        gridcolor='white',
        gridwidth=2,
    ),
    xaxis=dict(
        title='Daily Vaccinations'
    )
)
#fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
# for i in range(1,7):
#     fig.update_xaxes(title_text="Daily Vaccinations", row=1, col=i)
fig.show()
