# TODO:

Aggregate daily cases and deaths into monthly to be compatible with housing \
Filter housing data to only period of interest (start in 2019 or 2020?) \
First covid cases in US were confirmed in January 2020 \
Months should be rows (observations) and reported cases, deaths, county, and median value should be features \

For weekly assignment:
At a minimum, the file should include a summary containing:

Number of records\
Number of columns\
Statistical summary of response variable\
Statistical summary of potential predictor variables (if there are a large number of predictors, select the top 10)\
Note: Summarize categorical variables with counts and percentages for each level and summarize numerical variables with mean/quantiles/standard deviation.\
Include up to five helpful graphs

In [3]:
import pandas as pd
import numpy as np
import os

data_dir = '..\\data\\' # Windows, reverse slashes for MacOS

# Read in data

In [12]:
# fips to county read in and add columns to be compatible with other data

cols = ['state', 'state_fips','county_fips','county_name','H']
fips2county = pd.read_csv(f"{data_dir}\\fips2county.txt",dtype='str',names=cols)
fips2county['full_fips'] = fips2county.state_fips+fips2county.county_fips
fips2county['full_county'] = fips2county.county_name + ", " + fips2county.state
fips2county

Unnamed: 0,state,state_fips,county_fips,county_name,H,full_fips,full_county
0,AL,01,001,Autauga County,H1,01001,"Autauga County, AL"
1,AL,01,003,Baldwin County,H1,01003,"Baldwin County, AL"
2,AL,01,005,Barbour County,H1,01005,"Barbour County, AL"
3,AL,01,007,Bibb County,H1,01007,"Bibb County, AL"
4,AL,01,009,Blount County,H1,01009,"Blount County, AL"
...,...,...,...,...,...,...,...
3230,PR,72,153,Yauco Municipio,H1,72153,"Yauco Municipio, PR"
3231,UM,74,300,Midway Islands,H4,74300,"Midway Islands, UM"
3232,VI,78,010,St. Croix Island,H4,78010,"St. Croix Island, VI"
3233,VI,78,020,St. John Island,H4,78020,"St. John Island, VI"


In [102]:
# read in housing data, pull out counties and time during the pandemic

house = pd.read_csv(f"{data_dir}\\med_sale_price_counties.csv", header=1)
house.shape # 1860 counties, 120 months, only want last 24 (2020-2021)
house_county = pd.DataFrame(house.iloc[:,0])
house_pand = house.iloc[:, -24:-1]
new_house = house_county.join(house_pand)
new_house

Unnamed: 0,Region,Jan-20,Feb-20,Mar-20,Apr-20,May-20,Jun-20,Jul-20,Aug-20,Sep-20,...,Feb-21,Mar-21,Apr-21,May-21,Jun-21,Jul-21,Aug-21,Sep-21,Oct-21,Nov-21
0,"Abbeville County, SC",$200K,$229K,$216K,$103K,$159K,$168K,$183K,$193K,$160K,...,$222K,$125K,$225K,$173K,$280K,$352K,$271K,$111K,$169K,$220K
1,"Ada County, ID",$360K,$360K,$365K,$366K,$360K,$372K,$385K,$400K,$404K,...,$452K,$463K,$480K,$510K,$517K,$538K,$525K,$525K,$526K,$530K
2,"Adair County, IA",$62K,$127K,$60K,$130K,$110K,$90K,$95K,$75K,$60K,...,,$89K,$130K,$118K,$132K,$137K,$138K,$95K,$80K,$240K
3,"Adair County, OK",$92K,$99K,$169K,$124K,$98K,$135K,$90K,$88K,$103K,...,$222K,$115K,$283K,$79K,$95K,$103K,$136K,$226K,$157K,$145K
4,"Adams County, CO",$372K,$375K,$393K,$390K,$386K,$383K,$395K,$395K,$400K,...,$415K,$435K,$450K,$465K,$465K,$470K,$465K,$470K,$470K,$475K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1855,"York County, VA",$315K,$305K,$301K,$285K,$325K,$339K,$328K,$320K,$308K,...,$333K,$329K,$325K,$321K,$346K,$334K,$336K,$336K,$330K,$347K
1856,"Young County, TX",$113K,$123K,$116K,$130K,$93K,$145K,$145K,$163K,$205K,...,$170K,$197K,$200K,$150K,$175K,$249K,$305K,$139K,$177K,$190K
1857,"Yuba County, CA",$331K,$325K,$319K,$330K,$295K,$337K,$333K,$340K,$315K,...,$383K,$375K,$386K,$400K,$420K,$420K,$381K,$404K,$400K,$415K
1858,"Yuma County, CO",$180K,$160K,$180K,$244K,$175K,$130K,$181K,$200K,$180K,...,$137K,$311K,$205K,$188K,$213K,$293K,$158K,$180K,$215K,$120K


In [87]:
# read in covid death and positive case data

case_data = pd.read_csv(f'{data_dir}\\time_series_covid19_confirmed_US.csv')
death_data = pd.read_csv(f'{data_dir}\\time_series_covid19_deaths_US.csv')

In [100]:
# convert fips codes to strings
case_data['UID'] = case_data.UID.astype('str')
fips_str = []
for i in range(0, len(case_data)):
    fips_str.append(case_data.iloc[i].UID[3:8])
case_data['FIPS'] = fips_str

death_data['UID'] = death_data.UID.astype('str')
fips_str = []
for i in range(0, len(death_data)):
    fips_str.append(death_data.iloc[i].UID[3:8])
death_data['FIPS'] = fips_str
death_data

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,...,2/8/22,2/9/22,2/10/22,2/11/22,2/12/22,2/13/22,2/14/22,2/15/22,2/16/22,2/17/22
0,84001001,US,USA,840,01001,Autauga,Alabama,US,32.539527,-86.644082,...,171,173,173,173,176,176,176,177,181,181
1,84001003,US,USA,840,01003,Baldwin,Alabama,US,30.727750,-87.722071,...,624,626,626,626,626,626,626,629,630,631
2,84001005,US,USA,840,01005,Barbour,Alabama,US,31.868263,-85.387129,...,85,85,87,87,89,89,89,90,91,91
3,84001007,US,USA,840,01007,Bibb,Alabama,US,32.996421,-87.125115,...,96,96,97,97,97,97,97,98,98,98
4,84001009,US,USA,840,01009,Blount,Alabama,US,33.982109,-86.567906,...,211,212,212,212,213,213,213,213,214,215
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3337,84056039,US,USA,840,56039,Teton,Wyoming,US,43.935225,-110.589080,...,15,15,15,15,15,15,15,15,15,15
3338,84056041,US,USA,840,56041,Uinta,Wyoming,US,41.287818,-110.547578,...,36,36,36,36,36,36,36,36,36,36
3339,84090056,US,USA,840,90056,Unassigned,Wyoming,US,0.000000,0.000000,...,0,0,0,0,0,0,0,0,0,0
3340,84056043,US,USA,840,56043,Washakie,Wyoming,US,43.904516,-107.680187,...,42,42,42,42,42,42,42,42,42,42
