# Exploratory Stuff

In [1]:
# init

import pandas as pd
import pandas.io.sql as psql

from covid19stats import *

conn = get_db_conn()

In [2]:
# sanity check: is this thing on?
# of course confirmed cases will be highly correlated with deaths, duh

sql = """
SELECT
    Confirmed,
    Deaths
FROM fact_counties_ranked cr
WHERE
    Date = '20200501';
"""

df = psql.read_sql(sql, conn)
print(f"N={df.size}")

df.corr()

N=5798


Unnamed: 0,Confirmed,Deaths
Confirmed,1.0,0.961202
Deaths,0.961202,1.0


In [3]:
# is there a correlation with population size?
# we'd expect so, since outbreaks began in large cities.
# correlation probably decreases over time

sql = """
SELECT
    Confirmed,
    Population
FROM fact_counties_ranked cr
JOIN dim_county c
    ON cr.FIPS = c.FIPS
WHERE
    Date = '20200501'
    AND Population > 0
    AND Confirmed >= 0;
"""

df = psql.read_sql(sql, conn)
print(f"N={df.size}")

df.corr()

N=5688


Unnamed: 0,Confirmed,Population
Confirmed,1.0,0.6319
Population,0.6319,1.0


In [4]:
# no correlation with median age, which is odd

sql = """
SELECT
    Deaths,
    MedianAge
FROM fact_counties_ranked cr
JOIN dim_county c
    ON cr.FIPS = c.FIPS
WHERE
    Date = '20200501'
    AND MedianAge > 0
    AND Deaths >= 0;
"""

df = psql.read_sql(sql, conn)
print(f"N={df.size}")

df.corr()

N=5386


Unnamed: 0,Deaths,MedianAge
Deaths,1.0,-0.030504
MedianAge,-0.030504,1.0


In [5]:
# no correlation with median income

sql = """
SELECT
    Confirmed,
    MedianIncome
FROM fact_counties_ranked cr
JOIN dim_county c
    ON cr.FIPS = c.FIPS
WHERE
    Date = '20200501'
    AND MedianIncome > 0
    AND Confirmed >= 0;
"""

df = psql.read_sql(sql, conn)
print(f"N={df.size}")

df.corr()

N=5384


Unnamed: 0,Confirmed,MedianIncome
Confirmed,1.0,0.158785
MedianIncome,0.158785,1.0


In [6]:
# try median income again, but with ConfirmedPer1M this time.
# it's not clear to me whether you can use counts adjusted to the pop in calculating correlation coefficient

sql = """
SELECT
    ConfirmedPer1M,
    MedianIncome
FROM fact_counties_ranked cr
JOIN dim_county c
    ON cr.FIPS = c.FIPS
WHERE
    Date = '20200501'
    AND MedianIncome > 0
    AND ConfirmedPer1M >= 0;
"""

df = psql.read_sql(sql, conn)
print(f"N={df.size}")

df.corr()

N=5384


Unnamed: 0,ConfirmedPer1M,MedianIncome
ConfirmedPer1M,1.0,0.095941
MedianIncome,0.095941,1.0
