# Daily CoV-2 Update
## Run for Updating Existing Database
### Import Dependencies

In [1]:
# basic stuff
import os
from pprint import pprint

import gmaps
import numpy
import pandas as pd
import psycopg2
import psycopg2.extras
import requests
from ipywidgets.embed import embed_minimal_html

# Imports the method used to connect to DBs
from sqlalchemy import create_engine

# wraps raw SQL strings so they can be executed on a connection
from sqlalchemy import text

# function to establish a session with a connected database
from sqlalchemy.orm import Session

# database compliant datatypes
from sqlalchemy import Column, Integer, String, Float

# project-local API keys
from config import (census_key, gkey)

### PostgreSQL Setup (must be running)

In [2]:
# Connection URL for the covid_db PostgreSQL database.
# When the COVID_DB_URL environment variable is set it overrides the built-in
# default, so real credentials do not have to be stored in the notebook.
# The fallback is the original local URL (user "postgres", password
# "postgres", localhost:5432), so existing setups keep working unchanged.
db_url = os.environ.get(
    'COVID_DB_URL',
    'postgresql://postgres:postgres@localhost:5432/covid_db',
)
engine = create_engine(db_url)

### Truncate staging tables

In [21]:
# Tables emptied before each load: the three staging tables plus
# confirmed_cases and county_deaths, which cells further down append to.
tables_to_truncate = [
    'staging_confirmed_cases',
    'staging_county_deaths',
    'staging_test_dates',
    'confirmed_cases',
    'county_deaths',
]

# Run all TRUNCATEs in a single transaction so a failure part-way through
# rolls everything back instead of leaving only some tables emptied.
# engine.begin() + text() is used instead of engine.execute(<str>) because
# the latter is removed in SQLAlchemy 2.0; this form works on older
# releases as well.
with engine.begin() as conn:
    for table_name in tables_to_truncate:
        # table names come from the fixed list above, never from user input
        conn.execute(text('TRUNCATE TABLE ' + table_name))


<sqlalchemy.engine.result.ResultProxy at 0x292af781b08>

### Load Confirmed Cases

In [22]:
# Path of the confirmed-cases CSV, relative to the notebook's working directory
path_parts = ["..", "data", "raw", "covid_confirmed_usafacts.csv"]
raw_file = os.path.join(*path_parts)

# Load the CSV into a DataFrame, decoding the bytes as ISO-8859-1
raw_df = pd.read_csv(filepath_or_buffer=raw_file, encoding="ISO-8859-1")

# Show the first rows as a quick sanity check
raw_df.head()

Unnamed: 0,countyFIPS,County Name,State,stateFIPS,1/22/2020,1/23/2020,1/24/2020,1/25/2020,1/26/2020,1/27/2020,...,6/4/2020,6/5/2020,6/6/2020,6/7/2020,6/8/2020,6/9/2020,6/10/2020,6/11/2020,6/12/2020,6/13/2020
0,0,Statewide Unallocated,AL,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1001,Autauga County,AL,1,0,0,0,0,0,0,...,241,248,259,265,272,282,295,312,323,331
2,1003,Baldwin County,AL,1,0,0,0,0,0,0,...,293,296,304,313,320,325,331,343,353,361
3,1005,Barbour County,AL,1,0,0,0,0,0,0,...,177,183,190,193,197,199,208,214,221,226
4,1007,Bibb County,AL,1,0,0,0,0,0,0,...,76,76,77,77,79,85,89,93,97,100


### Rename some columns

In [23]:
# Source-file headers -> the column names used by the database tables
column_names = {
    'countyFIPS': 'county_fips',
    'County Name': 'county_name',
    'State': 'state_id',
    'stateFIPS': 'state_fips',
}
raw_df = raw_df.rename(columns=column_names)

# confirm the identifying columns now carry the new names
raw_df.head()

Unnamed: 0,county_fips,county_name,state_id,state_fips,1/22/2020,1/23/2020,1/24/2020,1/25/2020,1/26/2020,1/27/2020,...,6/4/2020,6/5/2020,6/6/2020,6/7/2020,6/8/2020,6/9/2020,6/10/2020,6/11/2020,6/12/2020,6/13/2020
0,0,Statewide Unallocated,AL,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1001,Autauga County,AL,1,0,0,0,0,0,0,...,241,248,259,265,272,282,295,312,323,331
2,1003,Baldwin County,AL,1,0,0,0,0,0,0,...,293,296,304,313,320,325,331,343,353,361
3,1005,Barbour County,AL,1,0,0,0,0,0,0,...,177,183,190,193,197,199,208,214,221,226
4,1007,Bibb County,AL,1,0,0,0,0,0,0,...,76,76,77,77,79,85,89,93,97,100


### Load to Staging table
This turns each date column into rows, creating one row per county/date intersection (normalizing the wide table into a long one).

In [24]:
# The first four columns identify the county/state; every column from the
# 5th position onward is a fact column holding the counts for one date.
# The labels are sliced straight from the column index because
# DataFrame.iteritems(), used here before, no longer exists in pandas 2.x.
testDates = list(raw_df.columns[4:])

# Nothing to stage when the file has no date columns.
if testDates:
    # Reshape wide -> long: one row per county per date. Rows come out
    # grouped by date in column order, the same order the former
    # per-column loop produced.
    new_df = raw_df.melt(
        id_vars=['county_fips', 'county_name', 'state_id', 'state_fips'],
        value_vars=testDates,
        var_name='test_date',
        value_name='confirmed_cases',
    )

    # One append call (sent in batches of 10,000 rows) replaces one
    # to_sql() round trip per date column.
    new_df.to_sql(
        'staging_confirmed_cases',
        con=engine,
        if_exists='append',
        index=False,
        chunksize=10000,
    )


### Add new dates to DB

In [25]:
# One row per distinct test_date in the staging_confirmed_cases table
# (the GROUP BY collapses the repeated dates)
dates_query = 'select test_date from "staging_confirmed_cases" GROUP BY test_date'
dates_df = pd.read_sql_query(sql=dates_query, con=engine)

dates_df.head()

Unnamed: 0,test_date
0,2020-01-22
1,2020-01-23
2,2020-01-24
3,2020-01-25
4,2020-01-26


In [26]:
# Store the distinct dates in staging_test_dates.
# if_exists='replace' makes pandas drop and recreate the table on each run.
dates_df.to_sql(
    name='staging_test_dates',
    con=engine,
    index=False,
    if_exists='replace',
)

In [27]:
# Copy into test_dates every staged date that is not already there: the
# LEFT OUTER JOIN keeps all staged dates and the IS NULL filter keeps only
# those without a match in test_dates.
update_sql = 'INSERT INTO test_dates SELECT s.test_date FROM staging_test_dates s LEFT OUTER JOIN test_dates td ON s.test_date = td.test_date WHERE td.test_date IS NULL'

# Execute inside an explicit transaction that commits on success.
# engine.execute(<str>) is removed in SQLAlchemy 2.0; engine.begin() with
# text() works on older releases as well.
with engine.begin() as conn:
    conn.execute(text(update_sql))

<sqlalchemy.engine.result.ResultProxy at 0x292a82829c8>

### Load confirmed cases to confirmed_cases
#### Dataframe of data

In [28]:
# Read everything exposed by staging_cases_list into a DataFrame.
# NOTE(review): the earlier comment said this view excludes 0's, yet the
# previewed rows contain confirmed_cases = 0 -- confirm the view definition.
cases_query = 'select * from "staging_cases_list"'
cases_df = pd.read_sql_query(sql=cases_query, con=engine)

cases_df.head()

Unnamed: 0,county_fips,test_date,confirmed_cases
0,1001,2020-01-22,0
1,1003,2020-01-22,0
2,1005,2020-01-22,0
3,1007,2020-01-22,0
4,1009,2020-01-22,0


### Move to final table

In [29]:
# Append the staged rows to the final confirmed_cases table
cases_df.to_sql(
    name='confirmed_cases',
    con=engine,
    index=False,
    if_exists='append',
)

### Deaths by County
#### Load the file

In [30]:
# Path of the deaths CSV, relative to the notebook's working directory
path_parts = ["..", "data", "raw", "covid_deaths_usafacts.csv"]
raw_file = os.path.join(*path_parts)

# Load the CSV into a DataFrame, decoding the bytes as ISO-8859-1
raw_df = pd.read_csv(filepath_or_buffer=raw_file, encoding="ISO-8859-1")

# Show the first rows as a quick sanity check
raw_df.head()

Unnamed: 0,countyFIPS,County Name,State,stateFIPS,1/22/2020,1/23/2020,1/24/2020,1/25/2020,1/26/2020,1/27/2020,...,6/4/2020,6/5/2020,6/6/2020,6/7/2020,6/8/2020,6/9/2020,6/10/2020,6/11/2020,6/12/2020,6/13/2020
0,0,Statewide Unallocated,AL,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1001,Autauga County,AL,1,0,0,0,0,0,0,...,5,5,5,5,5,5,6,6,6,6
2,1003,Baldwin County,AL,1,0,0,0,0,0,0,...,9,9,9,9,9,9,9,9,9,9
3,1005,Barbour County,AL,1,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
4,1007,Bibb County,AL,1,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1


### Rename columns

In [31]:
# Source-file headers -> the column names used by the database tables
column_names = {
    'countyFIPS': 'county_fips',
    'County Name': 'county_name',
    'State': 'state_id',
    'stateFIPS': 'state_fips',
}
raw_df = raw_df.rename(columns=column_names)

# confirm the identifying columns now carry the new names
raw_df.head()

Unnamed: 0,county_fips,county_name,state_id,state_fips,1/22/2020,1/23/2020,1/24/2020,1/25/2020,1/26/2020,1/27/2020,...,6/4/2020,6/5/2020,6/6/2020,6/7/2020,6/8/2020,6/9/2020,6/10/2020,6/11/2020,6/12/2020,6/13/2020
0,0,Statewide Unallocated,AL,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1001,Autauga County,AL,1,0,0,0,0,0,0,...,5,5,5,5,5,5,6,6,6,6
2,1003,Baldwin County,AL,1,0,0,0,0,0,0,...,9,9,9,9,9,9,9,9,9,9
3,1005,Barbour County,AL,1,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
4,1007,Bibb County,AL,1,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1


### Loop through columns, normalizing & Load to Staging

In [32]:
# The first four columns identify the county/state; every column from the
# 5th position onward is a fact column holding the death counts for one date.
# The labels are sliced straight from the column index because
# DataFrame.iteritems(), used here before, no longer exists in pandas 2.x.
testDates = list(raw_df.columns[4:])

# Nothing to stage when the file has no date columns.
if testDates:
    # Reshape wide -> long: one row per county per date, keeping only the
    # county key alongside the date and the death count. Rows come out
    # grouped by date in column order, the same order the former
    # per-column loop produced.
    new_df = raw_df.melt(
        id_vars=['county_fips'],
        value_vars=testDates,
        var_name='test_date',
        value_name='covid_deaths',
    )

    # One append call (sent in batches of 10,000 rows) replaces one
    # to_sql() round trip per date column.
    new_df.to_sql(
        'staging_county_deaths',
        con=engine,
        if_exists='append',
        index=False,
        chunksize=10000,
    )

### Pull in staging data

In [33]:
# Read everything exposed by staging_deaths into a DataFrame
# (presumably a view over staging_county_deaths -- verify in the database)
deaths_query = 'select * from "staging_deaths"'
deaths_df = pd.read_sql_query(sql=deaths_query, con=engine)

deaths_df.head()

Unnamed: 0,county_fips,test_date,covid_deaths
0,1001,2020-01-22,0
1,1003,2020-01-22,0
2,1005,2020-01-22,0
3,1007,2020-01-22,0
4,1009,2020-01-22,0


### Write to final table

In [34]:
# Append the staged rows to the final county_deaths table
deaths_df.to_sql(
    name='county_deaths',
    con=engine,
    index=False,
    if_exists='append',
)

### Texas Moving Avg

In [35]:
# Pull the texas_moving_average results; they are exported for Tableau below
texas_query = 'select * from "texas_moving_average"'
report_df = pd.read_sql_query(sql=texas_query, con=engine)

report_df.head()

Unnamed: 0,county_fips,county_name,county_desc,daily_rank,latitude,longitude,state_id,population,test_date,confirmed_cases,daily_change,covid_deaths,observations,avg_start_date,avg_end_date,avg_daily_change,sum_daily_change,avg_daily_change_percent,rolling_sum_avg
0,48169,Garza County,Garza County (TX),35,33.19559,-101.252379,TX,6229,2020-02-26,0,0,0,4,2020-02-23,2020-02-26,0.0,0.0,,0.0
1,48199,Hardin County,Hardin County (TX),111,37.49951,-88.362785,TX,57602,2020-05-12,115,0,3,4,2020-05-09,2020-05-12,0.5,2.0,0.004425,0.5
2,48099,Coryell County,Coryell County (TX),122,31.477436,-97.87216,TX,75951,2020-05-23,226,0,2,4,2020-05-20,2020-05-23,0.25,1.0,0.001111,0.25
3,48397,Rockwall County,Rockwall County (TX),68,32.872165,-96.365157,TX,104915,2020-03-30,4,0,0,4,2020-03-27,2020-03-30,0.5,2.0,0.208333,0.5
4,48083,Coleman County,Coleman County (TX),82,31.732022,-99.456155,TX,8175,2020-04-13,0,0,0,4,2020-04-10,2020-04-13,0.0,0.0,,0.0


### Write to file for Tableau

In [36]:
# Destination: ../data/clean/texas_cases_daily.csv
clean_dir = os.path.join("..", "data", "clean")
full_file = os.path.join(clean_dir, "texas_cases_daily.csv")

# Write the report with a header row and without the DataFrame index
report_df.to_csv(path_or_buf=full_file, header=True, index=False)

### File to import into SQL

In [39]:
# Pull the confirmed_cases_daily results; per the notebook headings this
# extract is written to a file for import into SQL Server
daily_query = 'select * from "confirmed_cases_daily"'
report_df = pd.read_sql_query(sql=daily_query, con=engine)

report_df.head()

Unnamed: 0,county_fips,county_name,county_desc,latitude,longitude,state_id,population,test_date,confirmed_cases,covid_deaths,previous_date,previous_confirmed_cases,cases_daily_change,cases_daily_change_percent,previous_covid_deaths,deaths_daily_change,deaths_daily_change_percent
0,1001,Autauga County,Autauga County (AL),32.579182,-86.499655,AL,55869,2020-01-23,0,0,2020-01-22,0,0,,0,0,
1,1001,Autauga County,Autauga County (AL),32.579182,-86.499655,AL,55869,2020-01-24,0,0,2020-01-23,0,0,,0,0,
2,1001,Autauga County,Autauga County (AL),32.579182,-86.499655,AL,55869,2020-01-25,0,0,2020-01-24,0,0,,0,0,
3,1001,Autauga County,Autauga County (AL),32.579182,-86.499655,AL,55869,2020-01-26,0,0,2020-01-25,0,0,,0,0,
4,1001,Autauga County,Autauga County (AL),32.579182,-86.499655,AL,55869,2020-01-27,0,0,2020-01-26,0,0,,0,0,


### Write to file for import to SQL Server

In [40]:
# Destination: ../data/clean/confirmed_cases_daily.csv
clean_dir = os.path.join("..", "data", "clean")
full_file = os.path.join(clean_dir, "confirmed_cases_daily.csv")

# Write the report with a header row and without the DataFrame index
report_df.to_csv(path_or_buf=full_file, header=True, index=False)