In [1]:
from bs4 import BeautifulSoup
import requests
from urllib.parse import urljoin
import pandas as pd
import datetime as dt

# Folder where the compiled csv files are saved. It must end with a slash
# because the output file names are appended to it directly.
local_folder = "C:/Data/SDCovid/Dailies/"
# Web folder that holds our source data files.
folder_url = 'http://shulok.com/SD_Covid_data/'

# Download the folder listing. The timeout keeps the script from hanging on an
# unresponsive server, and raise_for_status() stops on an HTTP error instead of
# parsing an error page as if it were the listing.
response = requests.get(folder_url, timeout=30)
response.raise_for_status()
url = response.text  # HTML text of the listing (original variable name kept)

# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed, and so bs4 does not warn about guessing one.
soup = BeautifulSoup(url, "html.parser")

# strptime format of the date embedded in the file names.
dateformat = "%m-%d-%Y"

# Date to switch to skipping 2 rows in the race csv file due to data changes.
switch_date = dt.datetime(2020, 6, 25)

# Data frames collected per source file, one list per data set.
raceframes = []
hospframes = []
deathframes = []

def DateFromFileName(link, pos, date_format=None):
    """Return the data date encoded in the file name of a csv link.

    The link is split on the URL-encoded space ("%20"). The piece at index
    ``pos`` is expected to be the file date followed by the file extension.
    The file includes data up to the day before the file date, so one day is
    subtracted from the parsed date.

    Args:
        link: href of the file, with spaces encoded as "%20".
        pos: index of the "%20"-separated piece that holds the date.
        date_format: strptime format of the date. Defaults to the
            module-level ``dateformat``.

    Returns:
        A datetime.datetime one day earlier than the date in the file name.

    Raises:
        IndexError: if the link has no piece at index ``pos``.
        ValueError: if that piece does not match the date format.
    """
    if date_format is None:
        date_format = dateformat

    pieces = link.split("%20")
    try:
        piece = pieces[pos]
    except IndexError as err:
        # Same exception type as before, but say which link was at fault.
        raise IndexError(
            "no date field at position {} in link {!r}".format(pos, link)
        ) from err

    # The date sits right before the file extension.
    datestr = piece.split('.')[0]
    try:
        file_date = dt.datetime.strptime(datestr, date_format)
    except ValueError as err:
        raise ValueError(
            "cannot parse date {!r} from link {!r} with format {!r}".format(
                datestr, link, date_format
            )
        ) from err

    return file_date - dt.timedelta(days=1)
        
def ProcessRaceFile(link):
    """Load one race/ethnicity csv file and tag every row with its data date.

    Args:
        link: href of the csv file as found in the folder listing. The date
            is taken from piece 5 of the "%20"-separated file name.

    Returns:
        A DataFrame with the columns 'Race and Ethnicity', 'Count',
        '% with known Race/Ethnicity', 'per 100k' and 'Date'.

    Raises:
        ValueError: if the file does not have exactly four data columns, so
            the five column names cannot be assigned.
    """
    date = DateFromFileName(link, 5)

    # Files dated after switch_date need 2 leading rows skipped instead of 1.
    skip_nrows = 2 if date > switch_date else 1

    # urljoin resolves relative, root-relative and absolute hrefs alike,
    # whereas plain string concatenation only works for bare file names.
    df = pd.read_csv(urljoin(folder_url, link), skiprows=skip_nrows)

    # The date derived from the file name applies to every row of this file.
    df['Date'] = date
    # Replace the headers from the file with one fixed set of names.
    df.columns = ['Race and Ethnicity', 'Count', '% with known Race/Ethnicity', 'per 100k', 'Date']

    return df
    
def ProcessFile(link, pos):
    """Load one csv file, transpose it and tag it with its data date.

    Args:
        link: href of the csv file as found in the folder listing.
        pos: index of the "%20"-separated piece of the file name that holds
            the file date (passed on to DateFromFileName).

    Returns:
        The transposed DataFrame with an added 'date' column.
    """
    date = DateFromFileName(link, pos)

    # Read the source file, using the first column as the index.
    # urljoin resolves relative, root-relative and absolute hrefs alike,
    # whereas plain string concatenation only works for bare file names.
    df = pd.read_csv(urljoin(folder_url, link), index_col=0)

    # Transpose the frame, which turns the row labels into column labels.
    df = df.T

    # The date derived from the file name applies to every row of this file.
    df['date'] = date

    return df
        
def CombineRaceFiles():
    """Combine the collected race/ethnicity frames and write them to one csv.

    Reads the module-level ``raceframes`` list and writes the combined data,
    without the index, to ``local_folder``. Rows are ordered by 'Date'; rows
    that share a date keep the order they had in the concatenated input.

    Raises:
        ValueError: from pandas.concat if ``raceframes`` is empty.
    """
    # Concatenate all of the per-file frames.
    df = pd.concat(raceframes, sort=True)

    # concat(sort=True) may reorder the columns, so put them back in the
    # order wanted in the output file.
    df = df.reindex(columns=['Date', 'Race and Ethnicity', 'Count', '% with known Race/Ethnicity', 'per 100k'])

    # Every date has several rows. The default sort algorithm is not stable
    # and could shuffle them, so use mergesort to keep their order.
    df.sort_values(by=['Date'], inplace=True, ascending=True, kind='mergesort')

    filename = 'COVID_19_Race_and_Ethnicity_Statistics_San_Diego_County.csv'
    print(filename)

    df.to_csv(local_folder + filename, index=False)
    
def CombineFiles(frames, filename):
    """Combine transposed per-file frames and write them to one csv file.

    Args:
        frames: list of DataFrames, each with a 'date' column (as returned
            by ProcessFile).
        filename: name of the csv file to create inside ``local_folder``.

    Raises:
        ValueError: from pandas.concat if ``frames`` is empty.
    """
    # Concatenate all of the transposed frames.
    df = pd.concat(frames, sort=True)

    # Move the date column to the front.
    date_col = df.pop('date')
    df.insert(0, 'date', date_col)

    # mergesort is stable, so rows that share a date keep their input order.
    df.sort_values(by=['date'], inplace=True, ascending=True, kind='mergesort')

    print(filename)

    # The index column is useless, so drop it.
    df.to_csv(local_folder + filename, index=False)
    
# Walk every link in the folder listing and load the csv files we know about.
# href=True skips anchors that have no href attribute; link.get("href") would
# return None for those and crash on .endswith().
for link in soup.find_all("a", href=True):
    current_link = link["href"]
    if not current_link.endswith('csv'):
        continue

    # The second argument to ProcessFile is the index of the "%20"-separated
    # piece of the file name that holds the file date.
    if current_link.count('Ethnicity') == 1:
        raceframes.append(ProcessRaceFile(current_link))
    elif current_link.count('Hospitalizations') == 1:
        hospframes.append(ProcessFile(current_link, 3))
    elif current_link.count('Deaths') == 1:
        deathframes.append(ProcessFile(current_link, 4))

# Write one combined csv file per data set.
CombineRaceFiles()
CombineFiles(hospframes, 'COVID_19_Hospitalization_Statistics_San_Diego_County.csv')
CombineFiles(deathframes, 'COVID_19_Death_Statistics_San_Diego_County.csv')

COVID_19_Race_and_Ethnicity_Statistics_San_Diego_County.csv
COVID_19_Hospitalization_Statistics_San_Diego_County.csv
COVID_19_Death_Statistics_San_Diego_County.csv
