In [20]:
import json
import os, sys
import re
import time
import warnings
from datetime import datetime
from random import randrange

import numpy as np
import pandas as pd
import requests
from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait as wait
# Silence warnings whose message starts with the text below.
# NOTE(review): presumably the deprecation warning selenium emits when
# webdriver.Chrome is called with `chrome_options=` (as CDCSpider_2.start_driver does) — confirm.
warnings.filterwarnings("ignore", message="use options instead of chrome_options")

In [21]:
# https://medium.com/analytics-vidhya/effortlessly-automate-your-python-scripts-cd295697dff6
# where to save the csv file
# NOTE(review): these are absolute paths on one user's machine; the notebook will
# not run elsewhere without editing them.
# csv_1/: the save cell for the CDCSpider_1 table writes vaccine_<yymmdd>.csv here
_DATA_CDC_PATH_1_ = "/Users/xingruchen/Dropbox/COV/Vaccine/US-COVID-19-Vaccine/data/CDC/csv_1/"
# csv_2/: CDCSpider_2.get_df_total writes vaccine_total_<yymmdd>.csv here
_DATA_CDC_PATH_2_ = "/Users/xingruchen/Dropbox/COV/Vaccine/US-COVID-19-Vaccine/data/CDC/csv_2/"
# parent folder of csv_1/ and csv_2/; not referenced in the visible part of the notebook
_LOG_PATH_ = "/Users/xingruchen/Dropbox/COV/Vaccine/US-COVID-19-Vaccine/data/CDC/"
# NOTE(review): CDCSpider_2.start_driver references `_DATA_CDC_PATH_` (no numeric
# suffix), which this cell does not define — confirm where that name is meant to come from.

Doses distributed and people initiating vaccination (1st dose received) are for both Moderna and Pfizer BioNTech COVID-19 vaccine and reflect current data available as of 9:00am ET on the day of reporting. Data will be regularly updated on Monday, Wednesday and Friday. Updates will occur the following day when reporting coincides with a federal holiday.

Healthcare providers report doses to federal, state, territorial, and local agencies up to 72 hours after administration. There may be additional lag for data to be transmitted from the federal, state, territorial, or local agency to CDC. 

#### A large difference between the number of doses distributed and the number of people initiating vaccination is expected at this point in the COVID vaccination program due to several factors, including delays in reporting of administered doses and management of available vaccine stocks by jurisdictions and federal pharmacy partners.

Numbers reported on CDC’s website are validated through a submission process with each jurisdiction and may differ from numbers posted on other websites. Differences between reporting jurisdictions and CDC’s website may occur due to the timing of reporting and website updates. The process used for reporting doses distributed or people vaccinated displayed by other websites may differ.

When the “Rate per 100,000” metric is selected for both doses distributed and people initiating vaccination (1st dose received), federal entities will display as n/a because population-based rates are not applicable. Doses distributed and administered for federal entities will display when the “Counts” metric is selected.

#### Doses distributed are cumulative counts of COVID-19 vaccine doses recorded as shipped in the Centers for Disease Control and Prevention’s (CDC) Vaccine Tracking System (VTrckS) since December 13, 2020.

#### People initiating vaccination (1st dose received) are cumulative counts of individual COVID-19 vaccine first doses administered as reported to the CDC by state, territorial, and local public health agencies and four federal entities (Bureau of Prisons, Department of Defense, Indian Health Service, and Veterans Health Administration) since December 14, 2020. 
For information pertaining to Department of State’s COVID-19 vaccination program, please contact the Office of Press Operations at PAPressDuty@state.gov. Veterans Health Administration totals include employees, veteran patients, and other federal partners vaccinated by VHA.

#### Long-term care facility (LTCF) data is a subset of the overall national data, specific to the Federal Pharmacy Partnership for Long-term Care (LTC) Program, and primarily includes skilled nursing and assisted living facilities. Doses distributed refers to doses distributed to pharmacy partners to administer onsite at LTCFs, and people initiating vaccination (1st dose received) includes both LTCF residents and staff vaccinated through the program. 

This data does not include doses distributed and administered to LTCF residents and staff outside the Federal Pharmacy Partnership Program. Vaccine administration through the federal program launched nationally on December 21st for Pfizer vaccine and on December 28th for Moderna vaccine. As of December 29, 2020, a total of 52 jurisdictions have started the program. Program start dates vary based on the jurisdiction. A difference between the number of doses distributed to pharmacy partners and the number of people initiating vaccination is expected because jurisdictions are transferring doses to pharmacy partners in advance to prepare for vaccination clinics in subsequent weeks.

#### Rates per 100,000 population use the U.S. Census Bureau 2019 American Community Survey 1-year population estimates. 

2018 population estimates are used for American Samoa, Federated States of Micronesia, Guam, Northern Mariana Islands, Palau, Republic of Marshall Islands, and United States Virgin Islands.

In [22]:
# before January 14
class CDCSpider_1(object):
    '''
    A CDC spider for the vaccination page as it was laid out before January 14.

    Rows are accumulated in ``self.df`` (one row per jurisdiction, columns as
    listed in ``self.headers``); ``parse`` runs the whole pipeline.
    '''
    # number of rows the map hover must yield before get_df_state stops reloading the page
    _EXPECTED_MAP_ROWS = 47
    # jurisdictions that always get a NaN placeholder row (dropped again as a
    # duplicate when the hover did reach them); their numbers are added by hand later
    _PLACEHOLDER_STATES = ('Hawaii', 'Michigan', 'New Jersey', 'District of Columbia')
    # e.g. 'CDC | Updated: Dec 30 2020 As of 9:00am ET' -> ('Dec 30 2020', '9:00am')
    _UPDATE_RE = re.compile(
        r'Updated:?\s*([A-Za-z]{3}\s+\d{1,2}\s+\d{4}).*?As of\s*(\d{1,2}:\d{2}\s*[AaPp][Mm])',
        re.S)
    # e.g. 'Total Distributed: 103,375 (5,768 per 100k)' -> ('103,375', '5,768')
    _COUNT_RATE_RE = re.compile(r':\s*([\d,]+)\s*\(\s*([\d,]+)\s+per')

    def __init__(self):
        '''
        initialization
        :return: None
        '''
        self.url = 'https://covid.cdc.gov/covid-data-tracker/#vaccinations'
        # rate: per 100k
        self.headers = ['jurisdiction', 'distribution', 'initiate_1st', 
                        'distribution_rate', 'initiate_1st_rate', 'distribution_LTCF', 'initiation_1st_LTCF']
        self.df = pd.DataFrame(columns = self.headers)
    
    def start_driver(self):
        '''
        Open chromedriver (a regular browser window; no headless flag is set)
        and store it in self.driver
        '''
        options = webdriver.ChromeOptions()
        # this spider's csv folder is _DATA_CDC_PATH_1_ (the un-suffixed
        # _DATA_CDC_PATH_ used previously is not defined in the notebook)
        prefs = {"download.default_directory" : _DATA_CDC_PATH_1_}
        options.add_experimental_option("prefs", prefs)
        # `options=` is the non-deprecated spelling of `chrome_options=`
        self.driver = webdriver.Chrome(executable_path = r'/usr/local/bin/chromedriver', options = options)
        self.driver.maximize_window()
        time.sleep(3)
        print('[start_driver] driver started')
        
    def close_driver(self):
        '''
        Close chromedriver
        '''
        self.driver.quit()
        print('[close_driver] driver closed')
        
    def get_page(self, url):
        '''
        Tell the browser to get a page
        :param url: webpage url
        '''
        self.driver.get(url)
        time.sleep(randrange(6))
        print('[get_page] url: {}'.format(url))
        time.sleep(randrange(3))
        
    def get_headers(self):
        '''
        Read and print the four banner titles shown on the page
        :return: list of the four title strings
        '''
        title_xpaths = [
            # Total Doses Distributed
            '//*[@id="vaccinations-banner-wrapper"]/div[1]/div/div[1]/div/div/h4',
            # Total Number of People Initiating Vaccination (1st Dose Received)
            '//*[@id="vaccinations-banner-wrapper"]/div[1]/div/div[2]/div[1]/div/h4',
            # Doses Distributed for Use in Long-Term Care Facilities
            '//*[@id="vaccinations-banner-wrapper"]/div[2]/div/div[1]/div/div/h4',
            # Number of People Initiating Vaccination (1st Dose Received) in Long-Term Care Facilities
            '//*[@id="vaccinations-banner-wrapper"]/div[2]/div/div[2]/div[1]/div/h4',
        ]
        cols = [self.driver.find_element_by_xpath(xpath).text for xpath in title_xpaths]
        for col in cols:
            print('[get_headers] headers: {0}'.format(col))
        
        time.sleep(randrange(3))
        return cols
    
    @classmethod
    def _parse_update_text(cls, update_text):
        '''
        Turn the banner's update note into a datetime
        :param update_text: e.g. 'CDC | Updated: Dec 30 2020 As of 9:00am ET'
        :return: datetime, e.g. datetime.datetime(2020, 12, 30, 9, 0)
        :raises ValueError: if no date / time can be found in the text
        '''
        match = cls._UPDATE_RE.search(update_text)
        if match is None:
            raise ValueError('unrecognised update text: {0!r}'.format(update_text))
        date_part, clock_part = match.groups()
        # strptime treats any run of whitespace as one separator, so only the
        # optional blank between the clock time and am/pm has to be removed
        stamp = '{0} {1}'.format(date_part, clock_part.replace(' ', ''))
        return datetime.strptime(stamp, '%b %d %Y %I:%M%p')

    def get_update_time(self):
        '''
        Get update time and store it as self.update_time (datetime)
        and self.update_time_str ('yymmdd', e.g. '201230')
        '''
        # CDC | Updated: Dec 30 2020 As of 9:00am ET
        update_text = self.driver.find_element_by_xpath('//*[@id="vaccinations-banner-wrapper"]/div[1]/div/div[2]/div[2]/small').text
        
        self.update_time = self._parse_update_text(update_text)
        self.update_time_str = self.update_time.strftime("%y%m%d")
        time.sleep(randrange(3))
        
        print('[get_update_time] update time: {0}'.format(self.update_time))
        
    def _read_banner_count(self, xpath):
        '''
        Read one banner number
        :param xpath: xpath of the element holding a number such as '12,409,050'
        :return: the number as int, e.g. 12409050
        '''
        return int(self.driver.find_element_by_xpath(xpath).text.replace(',', ''))

    def get_df_total(self):
        '''
        Append the 'Total' row built from the banner numbers and
        write self.df to vaccine_<yymmdd>.csv (needs get_update_time to have run)
        '''
        # Total Doses Distributed 
        distribution = self._read_banner_count('//*[@id="vaccinations-banner-wrapper"]/div[1]/div/div[1]/div/div/div')
        # Total Number of People Initiating Vaccination (1st Dose Received)
        initiate_1st = self._read_banner_count('//*[@id="vaccinations-banner-wrapper"]/div[1]/div/div[2]/div[1]/div/div')
        # Doses Distributed for Use in Long-Term Care Facilities
        distribution_LTCF = self._read_banner_count('//*[@id="vaccinations-banner-wrapper"]/div[2]/div/div[1]/div/div/div')
        # Number of People Initiating Vaccination (1st Dose Received) in Long-Term Care Facilities
        initiation_1st_LTCF = self._read_banner_count('//*[@id="vaccinations-banner-wrapper"]/div[2]/div/div[2]/div[1]/div/div')
        
        self.df.loc[len(self.df)] = ['Total', distribution, initiate_1st, np.nan, np.nan, distribution_LTCF, initiation_1st_LTCF]
        
        self.df.to_csv(_DATA_CDC_PATH_1_ + 'vaccine_{0}.csv'.format(self.update_time_str), index = False)
        print('[get_df_total] distribution: {0}, initiate_1st: {1}'.format(distribution, initiate_1st))
        
        time.sleep(randrange(3))
        
    @classmethod
    def _parse_count_and_rate(cls, line):
        '''
        Parse one tooltip line of the form '<label>: <count> (<rate> per 100k)'
        :param line: e.g. 'Total Distributed: 103,375 (5,768 per 100k)'
        :return: (count, rate) as ints, e.g. (103375, 5768)
        :raises ValueError: if the line does not have that form
        '''
        match = cls._COUNT_RATE_RE.search(line)
        if match is None:
            raise ValueError('unrecognised tooltip line: {0!r}'.format(line))
        count, rate = (int(group.replace(',', '')) for group in match.groups())
        return count, rate

    @classmethod
    def _parse_state_tooltip(cls, lines):
        '''
        Parse the text lines of a map tooltip
        :param lines: [jurisdiction, 'Total Distributed: ...', 'People Initiating Vaccination (1st Dose Received): ...']
        :return: (jurisdiction, distribution, initiate_1st, distribution_rate, initiate_1st_rate)
        :raises ValueError: if there are fewer than three lines or a line cannot be parsed
        '''
        if len(lines) < 3:
            raise ValueError('tooltip has fewer than 3 lines: {0!r}'.format(lines))
        # 'Total Distributed: 103,375 (5,768 per 100k)'
        distribution, distribution_rate = cls._parse_count_and_rate(lines[1])
        # 'People Initiating Vaccination (1st Dose Received): 49,827 (2,780 per 100k)'
        initiate_1st, initiate_1st_rate = cls._parse_count_and_rate(lines[2])
        return lines[0], distribution, initiate_1st, distribution_rate, initiate_1st_rate

    def _add_state_row(self, lines):
        '''
        Parse tooltip lines and append the row unless the jurisdiction is already in self.df
        :param lines: tooltip text split on newlines
        :return: True if a row was appended, False if the jurisdiction was already present
        '''
        jurisdiction, distribution, initiate_1st, distribution_rate, initiate_1st_rate = self._parse_state_tooltip(lines)
        if jurisdiction in self.df.jurisdiction.values:
            return False
        self.df.loc[len(self.df)] = [jurisdiction, distribution, initiate_1st, distribution_rate, initiate_1st_rate, 
                                     np.nan, np.nan]
        print('[get_df] {0} jurisdiction: {1}, distribution: {2}, initiate_1st: {3}'.format(len(self.df), jurisdiction, distribution, initiate_1st))
        return True

    def _hover_and_record(self, el, pause, empty_label):
        '''
        Hover one map shape, read the tooltip and record its row
        :param el: the map element to hover
        :param pause: seconds to sleep after hovering before reading the tooltip
        :param empty_label: suffix printed when the tooltip is empty ('A' first pass, 'B' retry pass)
        :return: True if a new row was appended
        '''
        ActionChains(self.driver).move_to_element(el).perform()
        time.sleep(pause)
        # only the explicit wait is used; a bare find_element could raise before
        # the tooltip node exists
        tooltip = wait(self.driver, 2).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'body > div.tooltip')))
        lines = tooltip.text.split('\n')
        if lines == ['']:
            print('[get_df] empty {0}'.format(empty_label))
            return False
        return self._add_state_row(lines)

    def get_df_state(self, max_reloads = 20):
        '''
        Total Number of People Initiating Vaccination (1st Dose Received) Reported to the CDC by State/Territory 
        and for Selected Federal Entities per 100,000

        Hovers every state shape on the map and records the tooltip numbers, reloading
        the page and retrying while fewer than 47 jurisdictions have been collected.
        :param max_reloads: upper bound on page reloads in the retry phase
                            (None = keep retrying without limit, the former behaviour)
        '''
        ################ States ################
        map_xpath = '//*[@id="svg#vaccinations-map-wrapper"]/*[name()="g" and @class="state"]/*[name()="path"]'
        wait(self.driver, 20).until(EC.presence_of_element_located((By.CLASS_NAME, "state")))
        elements = self.driver.find_elements_by_xpath(map_xpath)
        print('[get_df] number of elements: {0}'.format(len(elements)))
        for el in elements:
            self._hover_and_record(el, 2, 'A')
        
        reloads = 0
        # some states are too random to reach b/c their locations
        while len(self.df) < self._EXPECTED_MAP_ROWS and (max_reloads is None or reloads < max_reloads):
            reloads += 1
            self.driver.get(self.url)
            time.sleep(randrange(6))
            wait(self.driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "state")))
            elements = self.driver.find_elements_by_xpath(map_xpath)
            for j in range(len(elements)):
                # raw string: the backslash escapes '#' for the CSS selector, not for Python
                el = wait(self.driver, 2).until(EC.presence_of_element_located((By.CSS_SELECTOR, r'#svg\#vaccinations-map-wrapper > g > path:nth-child({0})'.format(j + 1))))
                self._hover_and_record(el, 8, 'B')
        if len(self.df) < self._EXPECTED_MAP_ROWS:
            print('[get_df] stopped after {0} reloads with {1} jurisdictions'.format(reloads, len(self.df)))
        
        # Hawaii, Michigan, New Jersey, District of Columbia
        for state in self._PLACEHOLDER_STATES:
            self.df.loc[len(self.df)] = [state, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan]
        self.df = self.df.drop_duplicates(subset=['jurisdiction'])
        self.df = self.df.sort_values(by = 'jurisdiction')
        self.df = self.df.reset_index(drop = True)
        
    def _append_paired_rows(self, list_xpath):
        '''
        Append one row per <li> of a list whose items carry a JSON 'data-paired' attribute
        :param list_xpath: xpath of the <ul> element
        '''
        element = self.driver.find_element_by_xpath(list_xpath)
        for ele in element.find_elements_by_tag_name("li"):
            # '{"Date":"2021-01-02","MMWR_week":53,"Location":"GU","ShortName":"GUA","LongName":"Guam",
            # "Doses_Distributed":19300,"Doses_Administered":3791,"Dist_Per_100K":11643,"Admin_Per_100K":2287, ...}'
            dictionary = json.loads(ele.get_attribute('data-paired'))
            jurisdiction = dictionary['LongName']
            distribution = dictionary['Doses_Distributed']
            distribution_rate = dictionary['Dist_Per_100K']
            initiate_1st = dictionary['Doses_Administered']
            initiate_1st_rate = dictionary['Admin_Per_100K']
            print('[get_df] jurisdiction: {0}, distribution: {1}, initiate_1st: {2}'.format(jurisdiction, distribution, initiate_1st))
            self.df.loc[len(self.df)] = [jurisdiction, distribution, initiate_1st, distribution_rate, initiate_1st_rate, 
                                         np.nan, np.nan]

    def get_df_territory(self):
        '''
        Append the rows of the territories list
        '''
        ################ Territories ################
        self._append_paired_rows('//*[@id="vaccinations-territories-list"]/ul')
        
    def get_df_entity(self):
        '''
        Append the rows of the federal entities list
        '''
        ################ Federal Entities ################
        self._append_paired_rows('//*[@id="vaccinations-federal-list"]/ul')
        
        print('[get_df] completed')


    def parse(self):
        '''
        Pipeline
        '''
        
        self.start_driver()
        try:
            self.get_page(self.url)

            self.get_headers()
            self.get_update_time()
            self.get_df_state()
        
            self.get_df_territory()
            self.get_df_entity()
            self.get_df_total()
        finally:
            # always release the browser, also when a scraping step raises
            self.close_driver()

In [15]:
# Create the pre-January-14 spider. The following cells read its results through
# this module-level name.
# NOTE(review): `self` is an unusual name for a notebook-level variable; renaming it
# would require touching every later cell that uses it.
self = CDCSpider_1()

In [16]:
# Run the whole scraping pipeline (starts Chrome, loads the CDC page, fills self.df).
self.parse()

[start_driver] driver started
[get_page] url: https://covid.cdc.gov/covid-data-tracker/#vaccinations
[get_headers] headers: Total Doses Distributed
[get_headers] headers: Total Number of People Initiating Vaccination (1st Dose Received)
[get_headers] headers: Doses Distributed for Use in Long-Term Care Facilities
[get_headers] headers: Number of People Initiating Vaccination (1st Dose Received) in Long-Term Care Facilities
[get_update_time] update time: 2021-01-14 09:00:00
[get_df] number of elements: 52
[get_df] 1 jurisdiction: Montana, distribution: 98500, initiate_1st: 46620
[get_df] 2 jurisdiction: West Virginia, distribution: 160975, initiate_1st: 118660
[get_df] 3 jurisdiction: North Carolina, distribution: 969700, initiate_1st: 268212
[get_df] 4 jurisdiction: New York, distribution: 1872625, initiate_1st: 688576
[get_df] 5 jurisdiction: Utah, distribution: 267975, initiate_1st: 115384
[get_df] 6 jurisdiction: Maryland, distribution: 564625, initiate_1st: 178056
[get_df] empty A


In [17]:
# Hawaii, Michigan, New Jersey, District of Columbia
# lift the row-display limit just above the table length so every row is rendered
pd.set_option('display.max_rows', len(self.df.index) + 1)
self.df

Unnamed: 0,jurisdiction,distribution,initiate_1st,distribution_rate,initiate_1st_rate,distribution_LTCF,initiation_1st_LTCF
0,Alabama,435350.0,92300.0,8879.0,1882.0,,
1,Alaska,150450.0,42596.0,20566.0,5823.0,,
2,Arizona,570625.0,173665.0,7840.0,2386.0,,
3,Arkansas,297800.0,108316.0,9868.0,3589.0,,
4,California,3540175.0,975293.0,8960.0,2468.0,,
5,Colorado,531775.0,254706.0,9234.0,4423.0,,
6,Connecticut,328675.0,171897.0,9219.0,4821.0,,
7,Delaware,91250.0,31090.0,9371.0,3193.0,,
8,District of Columbia,,,,,,
9,Florida,1970875.0,774485.0,9176.0,3606.0,,


In [18]:
# Hawaii, Michigan, New Jersey, District of Columbia:
# the data of which are not accessible by the spider
# need to be added manually
# each list: [distribution, initiate_1st, distribution_rate, initiate_1st_rate]
data_added = {'Hawaii': [154150, 36605, 10887, 2585],
              'Michigan': [772150, 291519, 7732, 2919],
              'New Jersey': [654900, 262982, 7373, 2961],
              'District of Columbia': [68325, 33049, 9681, 4683]}
patched_columns = ['distribution', 'initiate_1st', 'distribution_rate', 'initiate_1st_rate']
for state in ['Hawaii', 'Michigan', 'New Jersey', 'District of Columbia']:
    doses, people, doses_rate, people_rate = data_added[state]
    # consistency check: both rates are per 100k of the same population,
    # so this ratio should print as roughly 1
    print(round(doses*people_rate/(people*doses_rate), 4))
    # label of the (first) row holding this jurisdiction
    row_label = self.df.loc[self.df['jurisdiction'] == state].index[0]
    for column, value in zip(patched_columns, data_added[state]):
        self.df.at[row_label, column] = value

0.9999
0.9999
1.0001
1.0001


In [19]:
# write the patched table to csv_1/ as vaccine_<update date>.csv, without the index column
csv_name = 'vaccine_{0}.csv'.format(self.update_time_str)
self.df.to_csv(_DATA_CDC_PATH_1_ + csv_name, index = False)

In [None]:
# Hand-collected numbers for the four jurisdictions the spider cannot read,
# keyed by date. Each list is
# [distribution, initiate_1st, distribution_rate, initiate_1st_rate].
# Kept in one dict so earlier days are no longer overwritten by later assignments.
data_added_by_date = {
    '2021-01-05': {'Hawaii': [82500, 20893, 5827, 1476],
                   'Michigan': [478800, 99040, 4794, 992],
                   'New Jersey': [390900, 120947, 4401, 1362],
                   'District of Columbia': [35475, 15853, 5027, 2246]},

    '2021-01-06': {'Hawaii': [82500, 22896, 5827, 1617],
                   'Michigan': [480750, 124689, 4814, 1249],
                   'New Jersey': [406500, 137586, 4577, 1549],
                   'District of Columbia': [35475, 17475, 5027, 2476]},

    '2021-01-07': {'Hawaii': [95200, 24558, 6724, 1734],
                   'Michigan': [662450, 137887, 6633, 1381],
                   'New Jersey': [572250, 155458, 6443, 1750],
                   'District of Columbia': [45950, 19366, 6511, 2744]},

    '2021-01-08': {'Hawaii': [95700, 27375, 6759, 1933],
                   'Michigan': [662550, 156251, 6634, 1565],
                   'New Jersey': [572250, 156021, 6443, 1757],
                   'District of Columbia': [49250, 21681, 6978, 3072]},

    '2021-01-11': {'Hawaii': [115250, 31462, 8140, 2222],
                   'Michigan': [765900, 222379, 7669, 2227],
                   'New Jersey': [651000, 220828, 7329, 2486],
                   'District of Columbia': [60775, 29228, 8611, 4141]},

    # 2021-01-12 Updated: Dec 20 2001 As of 9:00am ET
    '2021-01-12': {'Hawaii': [141750, 33483, 10011, 2365],
                   'Michigan': [768150, 227685, 7692, 2280],
                   'New Jersey': [654900, 221045, 7373, 2489],
                   'District of Columbia': [62725, 29812, 8888, 4224]},

    '2021-01-13': {'Hawaii': [154150, 36605, 10887, 2585],
                   'Michigan': [772150, 291519, 7732, 2919],
                   'New Jersey': [654900, 262982, 7373, 2961],
                   'District of Columbia': [68325, 33049, 9681, 4683]},

    # NOTE(review): Michigan's distribution (658800) is identical to New Jersey's and
    # lower than the previous day's 772150; distribution*initiate_1st_rate /
    # (initiate_1st*distribution_rate) is about 0.85 instead of ~1 — looks like a
    # copy/paste slip, confirm against the source before using.
    '2021-01-14': {'Hawaii': [154150, 47985, 10887, 3389],
                   'Michigan': [658800, 327235, 7747, 3277],
                   'New Jersey': [658800, 263422, 7417, 2966],
                   'District of Columbia': [68325, 36679, 9681, 5197]},
}

# same end state as before: `data_added` holds the most recent (2021-01-14) entry
data_added = data_added_by_date['2021-01-14']




In [122]:
# after January 14
class CDCSpider_2(object):
    '''
    A CDC spider 
    '''
    def __init__(self):
        '''
        initialization
        :return: None
        '''
        self.url = 'https://covid.cdc.gov/covid-data-tracker/#vaccinations'
        # rate: per 100k
        self.headers = ['jurisdiction', 'distribution', 'administration', 'initiation', 'completion', 
                        'distribution_rate', 'administration_rate', 'initiation_rate', 'completion_rate']
        self.df = pd.DataFrame(columns = self.headers)
    
    def start_driver(self):
        '''
        Open headless chromedriver
        '''
        options = webdriver.ChromeOptions()
        prefs = {"download.default_directory" : _DATA_CDC_PATH_}
        options.add_experimental_option("prefs", prefs)
        self.driver = webdriver.Chrome(executable_path = r'/usr/local/bin/chromedriver', chrome_options = options)
        self.driver.maximize_window()
        time.sleep(3)
        print('[start_driver] driver started')
        
    def close_driver(self):
        '''
        Close chromedriver
        '''
        self.driver.quit()
        print('[close_driver] driver closed')
        
    def get_page(self, url):
        '''
        Tell the browser to get a page
        :param url: webpage url
        '''
        self.driver.get(url)
        time.sleep(randrange(6))
        print('[get_page] url: {}'.format(url))
        time.sleep(randrange(3))
        
    def get_headers(self):
        '''
        Get column names of df
        :return: list of column names
        '''
        # Total Doses Distributed
        col_1 = self.driver.find_element_by_xpath('//*[@id="vaccinations-banner-wrapper"]/div/div/div[1]/div/div/h4').text
        # Total Doses Administered
        col_2 = self.driver.find_element_by_xpath('//*[@id="vaccinations-banner-wrapper"]/div/div/div[2]/div/div/h4').text
        # Number of People Receiving 1 or More Doses
        col_3 = self.driver.find_element_by_xpath('//*[@id="vaccinations-banner-wrapper"]/div/div/div[3]/div/div/h4').text
        # Number of People Receiving 2 Doses
        col_4 = self.driver.find_element_by_xpath('//*[@id="vaccinations-banner-wrapper"]/div/div/div[4]/div[1]/div/h4').text
        # Doses Administered in Long-Term Care Facilities
        col_5 = self.driver.find_element_by_xpath('//*[@id="vaccinations-banner-wrapper2"]/div[2]/div/div/div[1]/div/h4').text
        # Pfizer-BioNTech
        col_6 = wait(self.driver, 2).until(EC.presence_of_element_located((By.CSS_SELECTOR, '#vaccinationsBarChart > svg > g > g:nth-child(2) > g:nth-child(1) > text')))
        col_6 = col_6.text
        # Moderna
        col_7 = wait(self.driver, 2).until(EC.presence_of_element_located((By.CSS_SELECTOR, '#vaccinationsBarChart > svg > g > g:nth-child(2) > g:nth-child(2) > text')))
        col_7 = col_7.text
        # Not Identified
        col_8 = wait(self.driver, 2).until(EC.presence_of_element_located((By.CSS_SELECTOR, '#vaccinationsBarChart > svg > g > g:nth-child(2) > g:nth-child(3) > text')))
        col_8 = col_8.text
        cols = [col_1, col_2, col_3, col_4, col_5, col_6, col_7, col_8]
        for col in cols:
            print('[get_headers] headers: {0}'.format(col))
        
        self.df_total = pd.DataFrame(columns = ['update_time'] + cols)
        
        time.sleep(randrange(3))
        
    
    def get_update_time(self):
        '''
        Get update time
        :return: date and time
        '''
        # CDC | Updated: Jan 14 2021 As of 6:00am ET
        update_text = self.driver.find_element_by_xpath('//*[@id="vaccinations-banner-wrapper"]/div/div/div[4]/div[2]/small').text
        # Jan 14 2021
        update_date = update_text[update_text.find('Updated') + 9: update_text.find('Updated') + 20]
        # 6:00am
        update_time = update_text[update_text.find('As of') + 6: -3]
        # datetime.datetime(2021, 1, 14, 6, 0))
        update_time = datetime.strptime(update_date + ' ' + update_time, '%b %d %Y %I:%M%p')
        # 210114
        update_time_str = update_time.strftime("%y%m%d")
    
        self.update_time = update_time
        self.update_time_str = update_time_str
        time.sleep(randrange(3))
        
        print('[get_update_time] update time: {0}'.format(self.update_time))
        
    def get_df_total(self):
        '''
        Get the first df
        '''
        # Total Doses Distributed
        distribution = self.driver.find_element_by_xpath('//*[@id="vaccinations-banner-wrapper"]/div/div/div[1]/div/div/div').text
        distribution = int(distribution.replace(',', ''))
        # Total Doses Administered
        administration = self.driver.find_element_by_xpath('//*[@id="vaccinations-banner-wrapper"]/div/div/div[2]/div/div/div').text
        administration = int(administration.replace(',', ''))
        # Number of People Receiving 1 or More Doses
        doses_1 = self.driver.find_element_by_xpath('//*[@id="vaccinations-banner-wrapper"]/div/div/div[3]/div/div/div').text
        doses_1 = int(doses_1.replace(',', ''))
        # Number of People Receiving 2 Doses
        doses_2 = self.driver.find_element_by_xpath('//*[@id="vaccinations-banner-wrapper"]/div/div/div[4]/div[1]/div/div').text
        doses_2 = int(doses_2.replace(',', ''))
        # Doses Administered in Long-Term Care Facilities
        # //*[@id="vaccinations-banner-wrapper2"]/div[2]/div/div/div[1]/div/div
        administration_LTCF = self.driver.find_element_by_xpath('//*[@id="vaccinations-banner-wrapper2"]/div[2]/div/div/div[1]/div/div').text
        administration_LTCF = int(administration_LTCF.replace(',', ''))
        # Pfizer # #vaccinationsBarChart > svg > g > text:nth-child(8)
        pfizer = wait(self.driver, 2).until(EC.presence_of_element_located((By.CSS_SELECTOR, '#vaccinationsBarChart > svg > g > text:nth-child(8)'))).text
        pfizer = int(pfizer.replace(',', ''))
        # Moderna
        moderna = wait(self.driver, 2).until(EC.presence_of_element_located((By.CSS_SELECTOR, '#vaccinationsBarChart > svg > g > text:nth-child(9)'))).text
        moderna = int(moderna.replace(',', ''))
        # Not Identified
        unknown = wait(self.driver, 2).until(EC.presence_of_element_located((By.CSS_SELECTOR, '#vaccinationsBarChart > svg > g > text:nth-child(10)'))).text
        unknown = int(unknown.replace(',', ''))
        
        self.df_total.loc[len(self.df_total)] = [self.update_time, distribution, administration, 
                                           doses_1, doses_2, administration_LTCF,
                                           pfizer, moderna, unknown]
        
        self.df_total.to_csv(_DATA_CDC_PATH_2_ + 'vaccine_total_{0}.csv'.format(self.update_time_str), index = False)
        
        print('[get_df_total] Total Doses Distributed: {0}, Total Doses Administered: {1}'.format(distribution, administration))
        print('[get_df_total] Number of People Receiving 1 or More Doses: {0}, Number of People Receiving 2 Doses: {1}'.format(doses_1, doses_2))
        print('[get_df_total] Doses Administered in Long-Term Care Facilities: {0}'.format(administration_LTCF))
        print('[get_df_total] Pfizer-BioNTech: {0}, Moderna: {1}, Not Identified: {2}'.format(pfizer, moderna, unknown))
        
        
        
        time.sleep(randrange(3))
        
    def get_df_state(self, min_states=48, max_reloads=None):
        '''
        Get the second df: one row per state, read from the map's hover tooltips.

        Every state path of the vaccination map is hovered once. While fewer than
        `min_states` jurisdictions have been collected, the page is reloaded and the
        paths are hovered again. Finally NaN placeholder rows are appended for
        Hawaii, Michigan, New Jersey and District of Columbia, duplicates are
        dropped (first occurrence wins, so scraped rows beat placeholders) and
        self.df is sorted by jurisdiction with a fresh index.

        Parameters
        ----------
        min_states : int, default 48
            Number of rows self.df must hold before the reload loop stops.
        max_reloads : int or None, default None
            Upper bound on page reloads. None keeps the original behaviour of
            retrying until `min_states` is reached, however long that takes.
        '''
        state_xpath = '//*[@id="svg#vaccinations-map-wrapper"]/*[name()="g" and @class="state"]/*[name()="path"]'

        ################ States ################
        wait(self.driver, 20).until(EC.presence_of_element_located((By.CLASS_NAME, "state")))
        elements = self.driver.find_elements_by_xpath(state_xpath)
        print('[get_df] number of elements: {0}'.format(len(elements)))
        for el in elements:
            self._record_state_tooltip(el)

        # some states are not reliably reachable by hovering because of their
        # location on the map, so reload the page and try again
        reloads = 0
        while len(self.df) < min_states:
            if max_reloads is not None and reloads >= max_reloads:
                print('[get_df] stopped after {0} reloads with {1} jurisdictions'.format(reloads, len(self.df)))
                break
            reloads += 1
            self.driver.get(self.url)
            time.sleep(randrange(10))
            wait(self.driver, 20).until(EC.presence_of_element_located((By.CLASS_NAME, "state")))
            elements = self.driver.find_elements_by_xpath(state_xpath)

            for j in range(len(elements)):
                # raw string: the backslash escapes '#' for the CSS engine and must
                # reach Selenium unchanged ('\#' is an invalid Python escape)
                selector = r'#svg\#vaccinations-map-wrapper > g > path:nth-child({0})'.format(j + 1)
                el = wait(self.driver, 3).until(EC.presence_of_element_located((By.CSS_SELECTOR, selector)))
                self._record_state_tooltip(el)

        # Hawaii, Michigan, New Jersey, District of Columbia
        for name in ['Hawaii', 'Michigan', 'New Jersey', 'District of Columbia']:
            self.df.loc[len(self.df)] = [name] + [np.nan] * 8
        self.df = self.df.drop_duplicates(subset=['jurisdiction'])
        self.df = self.df.sort_values(by = 'jurisdiction')
        self.df = self.df.reset_index(drop = True)

    def _record_state_tooltip(self, el):
        '''
        Hover over one map path and append its tooltip figures to self.df,
        unless the tooltip is empty or the jurisdiction is already recorded.
        '''
        ActionChains(self.driver).move_to_element(el).perform()
        time.sleep(3)  # presumably lets the tooltip catch up with the hovered path
        # Only the explicit wait is used: a plain find_element here would raise
        # immediately instead of giving the tooltip up to 3 seconds to appear.
        tooltip = wait(self.driver, 3).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'body > div.tooltip')))
        text = tooltip.text.split('\n')
        if text == ['']:
            print('[get_df] empty')
            return
        row = self._parse_state_tooltip(text)
        if row[0] not in self.df.jurisdiction.values:
            self.df.loc[len(self.df)] = row
            print('[get_df] {0} jurisdiction: {1}, distribution: {2}, administration: {3}'.format(len(self.df), row[0], row[1], row[2]))

    @staticmethod
    def _parse_state_tooltip(text):
        '''
        Turn the lines of a state tooltip into a row for self.df.

        `text` is the tooltip split on newlines: line 0 is the jurisdiction name,
        lines 1-4 hold distribution, administration, initiation and completion.
        Returns [jurisdiction, distribution, administration, initiation, completion,
        distribution_rate, administration_rate, initiation_rate, completion_rate],
        with the figures kept as the digit strings found in the text.
        Raises IndexError when a line is missing or holds too few numbers.
        '''
        # `re` is not in the notebook's top import cell; importing it here keeps the
        # method independent of whatever else happens to be loaded in the kernel.
        import re

        def integers(line):
            # thousands separators are removed so '1,234' is read as one number
            return re.findall(r'\d+', line.replace(',', ''))

        jurisdiction = text[0]
        distribution = integers(text[1])
        administration = integers(text[2])
        initiation = integers(text[3])
        completion = integers(text[4])

        # For initiation/completion the first number is skipped, presumably because
        # the label itself contains a digit ("... 1 or More Doses", "... 2 Doses").
        return [jurisdiction, distribution[0], administration[0], initiation[1], completion[1],
                distribution[1], administration[1], initiation[2], completion[2]]
        
    def get_df_territory(self):
        '''
        Append one row per territory to self.df.

        Every <li> below the element with id "vaccinations-territories-list" carries
        a JSON object in its `data-paired` attribute. The keys read from it are
        LongName, Doses_Distributed, Doses_Administered, Administered_Dose1,
        Administered_Dose2, Dist_Per_100K, Admin_Per_100K,
        Administered_Dose1_Per_100K and Administered_Dose2_Per_100K; a missing key
        raises KeyError.

        A jurisdiction that is already present in self.df is not appended again
        (same guard as in get_df_state), so re-running the method does not
        duplicate rows.

        Sample payload seen on 2021-01-02 (it does not show the Administered_Dose1 /
        Administered_Dose2 keys read here):
        {"Date":"2021-01-02","MMWR_week":53,"Location":"GU","ShortName":"GUA","LongName":"Guam",
         "Doses_Distributed":19300,"Doses_Administered":3791,"Dist_Per_100K":11643,"Admin_Per_100K":2287,
         "Census2019":165768,"dense_rank":50,"ntile":6,"bin":1,"class":1,
         "avg":2507.5714285714284,"pavg":2507.5714285714284,"navg":1003.5}
        '''
        ################ Territories ################
        territory_list = self.driver.find_element_by_xpath('//*[@id="vaccinations-territories-list"]/ul')
        for item in territory_list.find_elements_by_tag_name("li"):
            record = json.loads(item.get_attribute('data-paired'))
            jurisdiction = record['LongName']
            distribution = record['Doses_Distributed']
            administration = record['Doses_Administered']
            initiation = record['Administered_Dose1']
            completion = record['Administered_Dose2']

            distribution_rate = record['Dist_Per_100K']
            administration_rate = record['Admin_Per_100K']
            initiation_rate = record['Administered_Dose1_Per_100K']
            completion_rate = record['Administered_Dose2_Per_100K']
            print('[get_df] jurisdiction: {0}, distribution: {1}, administration: {2}'.format(jurisdiction, distribution, administration))

            # keep the method idempotent: skip jurisdictions that are already recorded
            if jurisdiction in self.df['jurisdiction'].values:
                continue
            self.df.loc[len(self.df)] = [jurisdiction, distribution, administration, initiation, completion,
                                         distribution_rate, administration_rate, initiation_rate, completion_rate]

        ################ Territories ################
        
    def get_df_entity(self):
        '''
        Append one row per federal entity to self.df.

        Every <li> below the element with id "vaccinations-federal-list" carries a
        JSON object in its `data-paired` attribute. The keys read from it are
        LongName, Doses_Distributed, Doses_Administered, Administered_Dose1,
        Administered_Dose2, Dist_Per_100K, Admin_Per_100K,
        Administered_Dose1_Per_100K and Administered_Dose2_Per_100K; a missing key
        raises KeyError.

        An entity that is already present in self.df is not appended again (same
        guard as in get_df_state), so re-running the method does not duplicate rows.
        '''
        ################ Federal Entities ################
        entity_list = self.driver.find_element_by_xpath('//*[@id="vaccinations-federal-list"]/ul')
        for item in entity_list.find_elements_by_tag_name("li"):
            record = json.loads(item.get_attribute('data-paired'))
            jurisdiction = record['LongName']
            distribution = record['Doses_Distributed']
            administration = record['Doses_Administered']
            initiation = record['Administered_Dose1']
            completion = record['Administered_Dose2']

            distribution_rate = record['Dist_Per_100K']
            administration_rate = record['Admin_Per_100K']
            initiation_rate = record['Administered_Dose1_Per_100K']
            completion_rate = record['Administered_Dose2_Per_100K']
            print('[get_df] jurisdiction: {0}, distribution: {1}, administration: {2}'.format(jurisdiction, distribution, administration))

            # keep the method idempotent: skip entities that are already recorded
            if jurisdiction in self.df['jurisdiction'].values:
                continue
            self.df.loc[len(self.df)] = [jurisdiction, distribution, administration, initiation, completion,
                                         distribution_rate, administration_rate, initiation_rate, completion_rate]

        ################ Federal Entities ################


        print('[get_df] completed')


    def parse(self):
        '''
        Pipeline

        Starts the driver, loads self.url, then runs the scraping steps in order:
        get_headers, get_update_time, get_df_total, get_df_state,
        get_df_territory, get_df_entity.

        The driver is closed even when one of the steps raises, so a failed run
        does not leave a browser behind; the exception itself still propagates.
        '''

        # Started outside the try block: if start-up itself fails there is no
        # driver to close.
        self.start_driver()
        try:
            self.get_page(self.url)

            self.get_headers()
            self.get_update_time()
            self.get_df_total()

            self.get_df_state()
            self.get_df_territory()
            self.get_df_entity()
        finally:
            self.close_driver()

In [118]:
# Create the spider instance that the cells below operate on.
# NOTE(review): the instance is bound to the module-level name `self`; the later
# cells read `self.df`, `self.df_total` and `self.update_time_str`, so renaming it
# means updating those cells too.
self = CDCSpider_2()

In [119]:
# Run the whole pipeline: start the driver, load the page, scrape headers, update
# time, totals, states, territories and federal entities, then close the driver.
self.parse()

[start_driver] driver started
[get_page] url: https://covid.cdc.gov/covid-data-tracker/#vaccinations
[get_headers] headers: Total Doses Distributed
[get_headers] headers: Total Doses Administered
[get_headers] headers: Number of People Receiving 1 or More Doses
[get_headers] headers: Number of People Receiving 2 Doses
[get_headers] headers: Doses Administered in Long-Term Care Facilities
[get_headers] headers: Pfizer-BioNTech
[get_headers] headers: Moderna
[get_headers] headers: Not Identified
[get_update_time] update time: 2021-01-14 06:00:00
[get_df_total] Total Doses Distributed: 30628175, Total Doses Administered: 11148991
[get_df_total] Number of People Receiving 1 or More Doses: 9690757, Number of People Receiving 2 Doses: 1342086
[get_df_total] Doses Administered in Long-Term Care Facilities: 1225493
[get_df_total] Pfizer-BioNTech: 6525252, Moderna: 4620722, Not Identified: 3017
[get_df] number of elements: 52
[get_df] 1 jurisdiction: Montana, distribution: 98500, administration

In [120]:
# Show the totals frame that get_df_total also wrote to vaccine_total_<update time>.csv.
self.df_total

Unnamed: 0,update_time,Total Doses Distributed,Total Doses Administered,Number of People Receiving 1 or More Doses,Number of People Receiving 2 Doses,Doses Administered in Long-Term Care Facilities,Pfizer-BioNTech,Moderna,Not Identified
0,2021-01-14 06:00:00,30628175,11148991,9690757,1342086,1225493,6525252,4620722,3017


In [121]:
# Show the per-jurisdiction frame filled by get_df_state, get_df_territory and get_df_entity.
self.df

Unnamed: 0,jurisdiction,distribution,administration,initiation,completion,distribution_rate,administration_rate,initiation_rate,completion_rate
0,Alabama,435350.0,92300.0,80480.0,11699.0,8879.0,1882.0,1641.0,239.0
1,Alaska,150450.0,42596.0,35508.0,7013.0,20566.0,5823.0,4854.0,959.0
2,Arizona,570625.0,173665.0,157397.0,16234.0,7840.0,2386.0,2162.0,223.0
3,Arkansas,297800.0,108316.0,90339.0,16711.0,9868.0,3589.0,2994.0,554.0
4,California,3540175.0,975293.0,801998.0,170245.0,8960.0,2468.0,2030.0,431.0
5,Colorado,531775.0,254706.0,209300.0,44087.0,9234.0,4423.0,3634.0,766.0
6,Connecticut,328675.0,171897.0,156366.0,14052.0,9219.0,4821.0,4386.0,394.0
7,Delaware,91250.0,31090.0,25649.0,5354.0,9371.0,3193.0,2634.0,550.0
8,District of Columbia,68325.0,36679.0,29927.0,6741.0,9681.0,5197.0,4240.0,955.0
9,Florida,1970875.0,774485.0,709002.0,63802.0,9176.0,3606.0,3301.0,297.0


In [126]:
# Hawaii, Michigan, New Jersey:
# their figures are not accessible by the spider and need to be added manually.
# Each list holds the values for the columns named in `value_columns`, in that order.
value_columns = ['distribution', 'administration', 'initiation', 'completion',
                 'distribution_rate', 'administration_rate', 'initiation_rate', 'completion_rate']
data_added = {'Hawaii': [154150, 47985, 0, 0, 10887, 3389, 0, 0],
              'Michigan': [773650, 327235, 286349, 37918, 7747, 3277, 2867, 380],
              'New Jersey': [658800, 263422, 235152, 26939, 7417, 2966, 2647, 303]}

for state, values in data_added.items():
    (distribution, administration, initiation, completion,
     distribution_rate, administration_rate, initiation_rate, completion_rate) = values

    # Presumably a sanity check on the hand-typed numbers: count / rate should give
    # the same value for both columns of a pair, so each ratio should be close to 1.
    # A ratio is reported as 0 whenever its denominator is 0 (e.g. no completions yet)
    # instead of raising ZeroDivisionError.
    denominator_1 = administration * distribution_rate
    ratio_1 = round(distribution * administration_rate / denominator_1, 4) if denominator_1 != 0 else 0
    denominator_2 = completion * initiation_rate
    ratio_2 = round(initiation * completion_rate / denominator_2, 4) if denominator_2 != 0 else 0
    print(ratio_1, ratio_2)

    # Look the row up once; raises IndexError if the state is missing from self.df.
    row = self.df.loc[self.df['jurisdiction'] == state].index[0]
    for column, value in zip(value_columns, values):
        self.df.at[row, column] = value

1.0 0
1.0001 1.0009
1.0001 0.9992


In [128]:
# Write the per-jurisdiction frame to the csv_2 folder; the file name carries
# self.update_time_str, like the totals file written by get_df_total.
vaccine_csv_path = _DATA_CDC_PATH_2_ + 'vaccine_{0}.csv'.format(self.update_time_str)
self.df.to_csv(vaccine_csv_path, index=False)

In [None]:
# Manually collected figures for 2021-01-14 06:00:00.
# Per state, in the order used by the patch cell above: distribution, administration,
# initiation, completion, distribution_rate, administration_rate, initiation_rate,
# completion_rate.
data_added = dict([
    ('Hawaii', [154150, 47985, 0, 0, 10887, 3389, 0, 0]),
    ('Michigan', [773650, 327235, 286349, 37918, 7747, 3277, 2867, 380]),
    ('New Jersey', [658800, 263422, 235152, 26939, 7417, 2966, 2647, 303]),
])