# NIMBY vs. YIMBY: geospatial analysis of new construction projects in USA

TODO something like:
* Establish connection between pandas and dolt running in server mode.
* For each state/county/ZIP code (across some timeframe):
    * Compute a number of first property sales.
        * Absolute number
        * Per unit of area
        * Per unit of population
    * Compute percentage deviation from the mean.
        * Per unit of area
        * Per unit of population
* Do some dataviz to generate a nice, big chart and write it up.

In [6]:
import mysql.connector as connection
import pandas as pd
from sqlalchemy import create_engine

db_connection_str = 'mysql+mysqlconnector://rl:trustno1@localhost/us_housing_prices_v2'
db_connection = create_engine(db_connection_str)

query = """
SELECT a.*
FROM `sales` a
INNER JOIN
(
    SELECT   `property_id`, `state`, `property_zip5`, `property_county`, MIN(`sale_datetime`) AS first_sale_datetime
    FROM     `sales`
    WHERE    `sale_datetime` LIKE "2021%"
    GROUP BY `property_id`
) b ON a.property_id = b.property_id AND a.sale_datetime = b.first_sale_datetime;
"""

result_df = pd.read_sql(query, db_connection)
result_df = result_df[['state', 'property_zip5', 'property_county', 'sale_datetime', 'property_id']]
result_df

Unnamed: 0,state,property_zip5,property_county,sale_datetime,property_id
0,AZ,85122,PINAL,2021-02-19,506060030
1,AZ,85128,PINAL,2021-12-01,509545320
2,AZ,85194,PINAL,2021-10-01,505386150
3,AZ,85194,PINAL,2021-05-01,505386200
4,AZ,85131,PINAL,2021-11-05,405063260
...,...,...,...,...,...
4610814,WI,,VILAS,2021-09-29,PRCL10-3542
4610815,WI,,RACINE,2021-07-01,PRCL010-04-20-04-161-000
4610816,WI,,WALWORTH,2021-11-08,PRCLFD 2900001B
4610817,WI,,ONEIDA,2021-02-24,PRCLLY-536-7


https://stackoverflow.com/questions/11683712/sql-group-by-and-min-mysql 

In [27]:
counts_by_state = result_df['state'].value_counts()
df_counts_by_state = pd.DataFrame.from_records([counts_by_state.to_dict()]).transpose()
df_counts_by_state.reset_index(inplace=True)
df_counts_by_state = df_counts_by_state.rename(columns={'index': 'code', 0: 'n'})
df_counts_by_state

Unnamed: 0,code,n
0,FL,1617210
1,MD,438105
2,NY,313077
3,NC,261735
4,NJ,234884
5,WI,212881
6,CA,197501
7,IN,180955
8,CO,156033
9,MA,112573


In [28]:
query = "SELECT * FROM `states`;"
states_df = pd.read_sql(query, db_connection)
states_df

Unnamed: 0,code,name
0,GA,Georgia
1,GU,Guam
2,HI,Hawaii
3,CO,Colorado
4,CT,Connecticut
5,AK,Alaska
6,DC,District of Columbia
7,AL,Alabama
8,AR,Arkansas
9,AS,American Samoa


In [29]:
df_counts_by_state = pd.merge(df_counts_by_state, states_df, on='code')
df_counts_by_state

Unnamed: 0,code,n,name
0,FL,1617210,Florida
1,MD,438105,Maryland
2,NY,313077,New York
3,NC,261735,North Carolina
4,NJ,234884,New Jersey
5,WI,212881,Wisconsin
6,CA,197501,California
7,IN,180955,Indiana
8,CO,156033,Colorado
9,MA,112573,Massachusetts


In [32]:
df_state_area = pd.read_html("https://en.wikipedia.org/wiki/List_of_U.S._states_and_territories_by_area")[0]
df_state_area.columns = df_state_area.columns.to_flat_index()
df_state_area = df_state_area.rename(columns={('State', 'State'): 'name', ('Land area[2]', 'km2'): 'land_area_km2'})
df_state_area = df_state_area[['name', 'land_area_km2']]
df_state_area

Unnamed: 0,name,land_area_km2
0,Alaska,1477953
1,Texas,676587
2,California,403466
3,Montana,376962
4,New Mexico,314161
5,Arizona,294207
6,Nevada,284332
7,Colorado,268431
8,Oregon,248608
9,Wyoming,251470


In [33]:
df_counts_by_state = pd.merge(df_counts_by_state, df_state_area, on='name')
df_counts_by_state

Unnamed: 0,code,n,name,land_area_km2
0,FL,1617210,Florida,138887
1,MD,438105,Maryland,25142
2,NY,313077,New York,122057
3,NC,261735,North Carolina,125920
4,NJ,234884,New Jersey,19047
5,WI,212881,Wisconsin,140268
6,CA,197501,California,403466
7,IN,180955,Indiana,92789
8,CO,156033,Colorado,268431
9,MA,112573,Massachusetts,20202


In [39]:
df_state_pop = pd.read_html('https://en.wikipedia.org/wiki/List_of_U.S._states_and_territories_by_population')[0]
df_state_pop.columns = df_state_pop.columns.to_flat_index()
df_state_pop = df_state_pop.rename(columns={('State or territory', 'State or territory'): 'name', 
                                            ('Census population[8][a]', 'July 1, 2021 (est.)'): 'population'})
df_state_pop = df_state_pop[['name', 'population']]
df_state_pop

Unnamed: 0,name,population
0,California,39237836.0
1,Texas,29527941.0
2,Florida,21781128.0
3,New York,19835913.0
4,Pennsylvania,12964056.0
5,Illinois,12671469.0
6,Ohio,11780017.0
7,Georgia,10799566.0
8,North Carolina,10551162.0
9,Michigan,10050811.0


In [40]:
df_counts_by_state = pd.merge(df_counts_by_state, df_state_pop, on='name')
df_counts_by_state

Unnamed: 0,code,n,name,land_area_km2,population
0,FL,1617210,Florida,138887,21781128.0
1,MD,438105,Maryland,25142,6165129.0
2,NY,313077,New York,122057,19835913.0
3,NC,261735,North Carolina,125920,10551162.0
4,NJ,234884,New Jersey,19047,9267130.0
5,WI,212881,Wisconsin,140268,5895908.0
6,CA,197501,California,403466,39237836.0
7,IN,180955,Indiana,92789,6805985.0
8,CO,156033,Colorado,268431,5812069.0
9,MA,112573,Massachusetts,20202,6984723.0


In [42]:
df_counts_by_state['per_capita'] = df_counts_by_state['n'] / df_counts_by_state['population']
df_counts_by_state['per_land_km2'] = df_counts_by_state['n'] / df_counts_by_state['land_area_km2']
df_counts_by_state

Unnamed: 0,code,n,name,land_area_km2,population,per_capita,per_land_km2
0,FL,1617210,Florida,138887,21781128.0,0.074248,11.64407
1,MD,438105,Maryland,25142,6165129.0,0.071062,17.425225
2,NY,313077,New York,122057,19835913.0,0.015783,2.565007
3,NC,261735,North Carolina,125920,10551162.0,0.024806,2.078582
4,NJ,234884,New Jersey,19047,9267130.0,0.025346,12.331811
5,WI,212881,Wisconsin,140268,5895908.0,0.036107,1.517673
6,CA,197501,California,403466,39237836.0,0.005033,0.489511
7,IN,180955,Indiana,92789,6805985.0,0.026588,1.950177
8,CO,156033,Colorado,268431,5812069.0,0.026846,0.581278
9,MA,112573,Massachusetts,20202,6984723.0,0.016117,5.572369


In [66]:
per_capita_mean = float(df_counts_by_state[['per_capita']].mean()[0])
per_capita_stdev = float(df_counts_by_state[['per_capita']].std()[0])

df_counts_by_state['per_capita_stdevs_from_mean'] = (df_counts_by_state['per_capita'] - per_capita_mean) / per_capita_stdev

per_land_km2_mean = float(df_counts_by_state['per_land_km2'].mean())
per_land_km2_stdev = float(df_counts_by_state['per_land_km2'].std())

df_counts_by_state['per_land_km2_stdevs_from_mean'] = (df_counts_by_state['per_land_km2'] - per_land_km2_mean) / per_land_km2_stdev

df_counts_by_state['stdev_diff'] = abs(df_counts_by_state['per_capita_stdevs_from_mean'] - df_counts_by_state['per_land_km2_stdevs_from_mean'])

df_counts_by_state

Unnamed: 0,code,n,name,land_area_km2,population,per_capita,per_land_km2,per_capita_stdevs_from_mean,per_land_km2_stdevs_from_mean,stdev_diff
0,FL,1617210,Florida,138887,21781128.0,0.074248,11.64407,3.27864,0.586409,2.692232
1,MD,438105,Maryland,25142,6165129.0,0.071062,17.425225,3.103115,1.049088,2.054027
2,NY,313077,New York,122057,19835913.0,0.015783,2.565007,0.058099,-0.14021,0.198309
3,NC,261735,North Carolina,125920,10551162.0,0.024806,2.078582,0.555128,-0.179139,0.734267
4,NJ,234884,New Jersey,19047,9267130.0,0.025346,12.331811,0.584855,0.64145,0.056595
5,WI,212881,Wisconsin,140268,5895908.0,0.036107,1.517673,1.177605,-0.22403,1.401635
6,CA,197501,California,403466,39237836.0,0.005033,0.489511,-0.53406,-0.306316,0.227744
7,IN,180955,Indiana,92789,6805985.0,0.026588,1.950177,0.653254,-0.189416,0.84267
8,CO,156033,Colorado,268431,5812069.0,0.026846,0.581278,0.667507,-0.298972,0.966479
9,MA,112573,Massachusetts,20202,6984723.0,0.016117,5.572369,0.076481,0.100477,0.023996


In [27]:
counts_by_county = result_df['property_county'].value_counts()
df_counts_by_county = pd.DataFrame.from_records([counts_by_county.to_dict()]).transpose()
df_counts_by_county

Unnamed: 0,0
LEE,155915
LOS ANGELES,116226
PALM BEACH,103406
COOK COUNTY,98094
MIAMI-DADE,88775
...,...
DUBOIS,1
Halifax,1
NANTUCKET,1
OWEN,1


In [29]:
counts_by_zip = result_df['property_zip5'].value_counts()
df_counts_by_zip = pd.DataFrame.from_records([counts_by_zip.to_dict()]).transpose()
df_counts_by_zip

Unnamed: 0,0
34953,19619
33993,15350
33909,11059
21842,9590
33974,9345
...,...
78418,1
64081,1
32279,1
00859,1
