In [1]:
from io import StringIO

import pandas as pd
import requests
import mysql.connector as connection
from sqlalchemy import create_engine

In [2]:
def cleanup_county(c):
    if c is None:
        return None
    
    c = c.upper()
    
    to_remove = [
        "COUNTY",
        "CITY",
        "TOWN",
        "'",
        ",",
        " OF"
    ]
    
    for tr in to_remove:
        c = c.replace(tr, "")
        
    c = c.strip()
    
    return c
    
def fix_fips(fips):
    if type(fips) == int:
        fips = str(fips)
        
    if fips.startswith("<a") and fips.endswith("</a>"):
        fips = fips.split(">")[1].split("<")[0]
        
    if len(fips) == 4:
        fips = '0' + fips
    
    return fips
    
resp = requests.get("https://www.openintro.org/data/csv/county_complete.csv", 
                    headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36"})

buf = StringIO(resp.text)

df_counties = pd.read_csv(buf, dtype={'fips': str})
df_counties = df_counties[["fips", "state", "name", "area_2010"]]
df_counties = df_counties.rename(columns={'name': 'county'})
df_counties['county'] = df_counties['county'].apply(cleanup_county)
df_counties['fips'] = df_counties['fips'].apply(fix_fips)
df_counties

Unnamed: 0,fips,state,county,area_2010
0,01001,Alabama,AUTAUGA,594.44
1,01003,Alabama,BALDWIN,1589.78
2,01005,Alabama,BARBOUR,884.88
3,01007,Alabama,BIBB,622.58
4,01009,Alabama,BLOUNT,644.78
...,...,...,...,...
3137,56037,Wyoming,SWEETWATER,10426.65
3138,56039,Wyoming,TETON,3995.38
3139,56041,Wyoming,UINTA,2081.26
3140,56043,Wyoming,WASHAKIE,2238.55


In [3]:
db_connection_str = 'mysql+mysqlconnector://rl:trustno1@localhost/us_housing_prices_v2'
db_connection = create_engine(db_connection_str)

query = "SELECT * FROM `states`;"
states_df = pd.read_sql(query, db_connection)
states_df = states_df.rename(columns={'name': 'state'})

df_counties = pd.merge(df_counties, states_df, on='state')
df_counties['county_state'] = df_counties['county'] + ', ' + df_counties['code'] 
del df_counties['county']
del df_counties['code']
del df_counties['state']

df_counties.loc[df_counties['fips'] == "51600", "county_state"] = "FAIRFAX CITY, VA"
df_counties.loc[df_counties['fips'] == "24510", "county_state"] = "BALTIMORE CITY, MD"
df_counties

Unnamed: 0,fips,area_2010,county_state
0,01001,594.44,"AUTAUGA, AL"
1,01003,1589.78,"BALDWIN, AL"
2,01005,884.88,"BARBOUR, AL"
3,01007,622.58,"BIBB, AL"
4,01009,644.78,"BLOUNT, AL"
...,...,...,...
3137,56037,10426.65,"SWEETWATER, WY"
3138,56039,3995.38,"TETON, WY"
3139,56041,2081.26,"UINTA, WY"
3140,56043,2238.55,"WASHAKIE, WY"


In [4]:
df_proximity = pd.read_csv("http://proximityone.com/countytrends/cb_2015_us_county_500k_bp_1415_annual_table.csv",
                          dtype={'GEOID': str})
df_proximity

Unnamed: 0,NAMELSAD,STUSPS,LSAD,GEOID,CBSAFP,POP2014,POP2015,HU2014,HU2015,BLDG_14,...,V1_15,B2_15,U2_15,V2_15,B34_15,U34_15,V34_15,B5_15,U5_15,V5_15
0,Autauga County,AL,1,01001,33860.0,55290,55347,22751,22847,131,...,39749354.0,0,0,0.0,0,0,0.0,0,0,0.0
1,Baldwin County,AL,1,<a href=http://proximityone.com/rdems/1/rdems0...,19300.0,199713,203709,107368,108564,1373,...,302576607.0,11,22,2232258.0,29,109,12724884.0,31,450,43856188.0
2,Barbour County,AL,1,01005,,26815,26489,11799,11789,7,...,3292300.0,0,0,0.0,0,0,0.0,0,0,0.0
3,Bibb County,AL,1,01007,13820.0,22549,22583,8977,8986,19,...,2222180.0,0,0,0.0,0,0,0.0,0,0,0.0
4,Blount County,AL,1,01009,13820.0,57658,57673,23826,23817,3,...,1573173.0,0,0,0.0,0,0,0.0,2,40,3831302.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3137,Sweetwater County,WY,56,56037,40540.0,44925,44626,19077,19245,120,...,20852748.0,0,0,0.0,0,0,0.0,0,0,0.0
3138,Teton County,WY,56,56039,27220.0,22905,23125,13269,13395,137,...,234204021.0,10,20,6177535.0,8,30,1164993.0,1,12,1296756.0
3139,Uinta County,WY,56,56041,21740.0,20903,20822,8773,8788,35,...,7856953.0,0,0,0.0,0,0,0.0,0,0,0.0
3140,Washakie County,WY,56,56043,,8316,8328,3810,3804,4,...,970000.0,0,0,0.0,0,0,0.0,0,0,0.0


In [5]:
df_proximity = df_proximity.rename(columns={
    'NAMELSAD': 'county',
    'STUSPS': 'code',
    'GEOID': 'fips'
})
df_xref = pd.DataFrame(df_proximity)
df_xref['county'] = df_xref['county'].apply(cleanup_county)
df_xref['county_state'] = df_xref['county'] + ', ' + df_xref['code']
df_xref['population'] = (df_xref['POP2014'] + df_xref['POP2015']) / 2
df_xref['housing'] = (df_xref['HU2014'] + df_xref['HU2015']) / 2
df_xref['per_capita'] = df_proximity['housing'] / df_proximity['population']
df_xref = df_xref[["county_state", "population", "housing", "per_capita"]]
df_xref

Unnamed: 0,county_state,population,housing,per_capita
0,"AUTAUGA, AL",55318.5,22799.0,0.412141
1,"BALDWIN, AL",201711.0,107966.0,0.535251
2,"BARBOUR, AL",26652.0,11794.0,0.442518
3,"BIBB, AL",22566.0,8981.5,0.398010
4,"BLOUNT, AL",57665.5,23821.5,0.413098
...,...,...,...,...
3137,"SWEETWATER, WY",44775.5,19161.0,0.427935
3138,"TETON, WY",23015.0,13332.0,0.579274
3139,"UINTA, WY",20862.5,8780.5,0.420875
3140,"WASHAKIE, WY",8322.0,3807.0,0.457462


In [6]:
df_xref = pd.merge(df_counties, df_xref, on='county_state')
df_xref['per_area'] = df_xref['housing'] / df_xref['area_2010']
df_xref

Unnamed: 0,fips,area_2010,county_state,population,housing,per_capita,per_area
0,01001,594.44,"AUTAUGA, AL",55318.5,22799.0,0.412141,38.353745
1,01003,1589.78,"BALDWIN, AL",201711.0,107966.0,0.535251,67.912541
2,01005,884.88,"BARBOUR, AL",26652.0,11794.0,0.442518,13.328361
3,01007,622.58,"BIBB, AL",22566.0,8981.5,0.398010,14.426258
4,01009,644.78,"BLOUNT, AL",57665.5,23821.5,0.413098,36.945160
...,...,...,...,...,...,...,...
3141,56037,10426.65,"SWEETWATER, WY",44775.5,19161.0,0.427935,1.837695
3142,56039,3995.38,"TETON, WY",23015.0,13332.0,0.579274,3.336854
3143,56041,2081.26,"UINTA, WY",20862.5,8780.5,0.420875,4.218839
3144,56043,2238.55,"WASHAKIE, WY",8322.0,3807.0,0.457462,1.700654


## NOTE: the following requires the CSV file from the main notebook

In [7]:
df_counts_by_county = pd.read_csv("~/counts.csv")
df_counts_by_county

Unnamed: 0.1,Unnamed: 0,county_state,n,fips,population,area_2010,density,per_capita,per_area,per_capita_stdevs_from_mean,per_area_stdevs_from_mean
0,55,"CAPE MAY, NJ",76871,34009,94680.1,251.43,376.566440,0.811902,305.735195,5.137887,0.586059
1,218,"GRAND, CO",11284,8049,14910.4,1846.33,8.075696,0.756787,6.111584,4.678022,-0.287329
2,388,"HAMILTON, NY",2860,36041,4657.5,1717.37,2.711996,0.614063,1.665337,3.487175,-0.300290
3,322,"CRAWFORD, IN",5975,18025,10612.7,305.64,34.722877,0.563005,19.549143,3.061156,-0.248159
4,337,"SWITZERLAND, IN",5365,18155,10650.3,220.63,48.272220,0.503742,24.316729,2.566683,-0.234262
...,...,...,...,...,...,...,...,...,...,...,...
410,413,"EAGLE, CO",51,8037,53629.9,1684.53,31.836714,0.000951,0.030276,-1.628458,-0.305056
411,409,"POLK, FL",580,12105,651910.4,1797.84,362.607574,0.000890,0.322609,-1.628969,-0.304204
412,410,"INDIAN RIVER, FL",91,12061,147425.7,502.87,293.168612,0.000617,0.180961,-1.631242,-0.304616
413,412,"HENDERSON, NC",53,37089,111636.5,373.07,299.237409,0.000475,0.142064,-1.632431,-0.304730


In [8]:
df_xref = pd.merge(df_xref, df_counts_by_county, on='county_state')
df_xref = df_xref[["county_state", "per_capita_x", "per_capita_y", "per_area_x", "per_area_y"]]
df_xref

Unnamed: 0,county_state,per_capita_x,per_capita_y,per_area_x,per_area_y
0,"PINAL, AZ",0.412744,0.226846,30.889032,17.264579
1,"BUTTE, CA",0.435035,0.044836,59.739926,6.122362
2,"LOS ANGELES, CA",0.344813,0.021458,861.621709,52.872682
3,"RIVERSIDE, CA",0.351140,0.007643,114.245859,2.475689
4,"SAN FRANCISCO, CA",0.452894,0.040715,8297.194367,739.364199
...,...,...,...,...,...
412,"WAUKESHA, WI",0.411668,0.105881,296.566406,76.428844
413,"WAUPACA, WI",0.489614,0.163931,34.061334,11.326584
414,"WAUSHARA, WI",0.618284,0.231045,23.800208,8.959514
415,"WINNEBAGO, WI",0.438507,0.112160,171.160441,43.743239


In [9]:
df_xref.to_csv("~/xref.csv")

In [10]:
df_xref.sort_values('per_capita_y', ascending=False).head(10)

Unnamed: 0,county_state,per_capita_x,per_capita_y,per_area_x,per_area_y
199,"CAPE MAY, NJ",1.041181,0.811902,393.576741,305.735195
14,"GRAND, CO",1.118933,0.756787,8.815326,6.111584
236,"HAMILTON, NY",1.861754,0.614063,5.104899,1.665337
66,"CRAWFORD, IN",0.516248,0.563005,17.828164,19.549143
130,"SWITZERLAND, IN",0.485533,0.503742,23.122422,24.316729
135,"VERMILLION, IN",0.475977,0.490465,29.07778,30.037372
20,"OURAY, CO",0.676793,0.477264,5.820824,4.104581
128,"STEUBEN, IN",0.568467,0.460429,63.277659,51.246197
283,"CHEROKEE, NC",0.652778,0.448744,38.889621,27.196276
13,"GILPIN, CO",0.620779,0.447169,23.915944,17.224817


In [11]:
df_xref.sort_values('per_capita_y', ascending=True).head(10)

Unnamed: 0,county_state,per_capita_x,per_capita_y,per_area_x,per_area_y
336,"ARLINGTON, VA",0.489526,4e-06,4295.725838,0.038506
291,"HENDERSON, NC",0.497066,0.000475,148.95462,0.142064
34,"INDIAN RIVER, FL",0.530378,0.000617,154.351025,0.180961
45,"POLK, FL",0.441172,0.00089,157.706748,0.322609
11,"EAGLE, CO",0.592989,0.000951,18.768143,0.030276
341,"JEFFERSON, WA",0.592827,0.002512,9.970616,0.04269
27,"HARTFORD, CT",0.418149,0.004877,509.876887,5.940688
3,"RIVERSIDE, CA",0.35114,0.007643,114.245859,2.475689
49,"CLAYTON, GA",0.385985,0.009524,739.044289,18.407855
338,"FAIRFAX, VA",0.360467,0.010429,1051.647185,30.186459


In [12]:
df_xref.sort_values('per_area_y', ascending=False).head(10)

Unnamed: 0,county_state,per_capita_x,per_capita_y,per_area_x,per_area_y
246,"NEW YORK, NY",0.529122,0.068944,38026.828734,4908.409987
203,"HUDSON, NJ",0.411383,0.179057,5986.555532,2559.926391
330,"PHILADELPHIA, PA",0.429204,0.1928,5007.38255,2249.552573
239,"KINGS, NY",0.389524,0.055019,14458.521604,2001.440271
182,"SUFFOLK, MA",0.420828,0.130305,5601.134996,1734.153052
256,"QUEENS, NY",0.364098,0.057816,7819.400166,1213.673639
201,"ESSEX, NJ",0.395016,0.171214,2491.965771,1073.55994
214,"UNION, NJ",0.363034,0.191182,1956.761618,1019.463348
196,"BERGEN, NJ",0.379372,0.213033,1523.398137,843.920862
101,"MARION, IN",0.447889,0.342362,1058.721928,808.846833
