In [None]:
import pandas as pd
import json
import requests
import math
import numpy as np

In [None]:
from API_keys import myCenAPI
import utilcalcs as calc
import geo_agg as geo
from muni_geo_id import PUMA_2018,cousub_2018,placeLI_2018,\
                        cousub_2010,placeLI_2010,stco,state

# TODO: add a geography CSV with land areas so that population density can be calculated.

## User-defined parameters

In [None]:
# Census API request settings shared by the start-year and end-year downloads.

# Dataset path segment inserted into every request URL
source = 'acs/acs5'

# Detailed-table request: the whole B23001 group
col_b = 'group(B23001)'

# Data-profile request: individual DP05/DP02/DP03 columns plus the whole DP04 group
col_d = ','.join([
    'DP05_0001E', 'DP05_0001M',
    'DP02_0092E', 'DP02_0092M',
    'DP03_0002E', 'DP03_0002M',
    'group(DP04)',
])

# API vintages: year0 is the initial (start) year, year1 the current (end) year
year0 = '2010'
year1 = '2018'

### Data download functions

In [None]:
def get_cousub(year,col_b,col_d,geo_code):
    """Download county-subdivision data for every state/county pair in ``stco``.

    One request per county is sent to the detailed-table endpoint (``col_b``)
    and one to the ``/profile`` endpoint (``col_d``). The two stacked results
    are left-joined on ``GEO_ID`` and filtered to the ids in ``geo_code``.

    Parameters
    ----------
    year : str
        Vintage inserted into the API URL.
    col_b : str
        ``get=`` clause for the detailed-table endpoint.
    col_d : str
        ``get=`` clause for the data-profile endpoint.
    geo_code : list-like
        ``GEO_ID`` values to keep.

    Returns
    -------
    pandas.DataFrame
        Merged rows whose ``GEO_ID`` is in ``geo_code``. Values are left
        exactly as parsed from the JSON payload.

    Raises
    ------
    requests.HTTPError
        If any request returns a 4xx/5xx status.
    """
    def _fetch_all(endpoint, cols):
        # Stack one frame per county for the given endpoint / column request.
        frames = []
        for st, counties in stco.items():
            for co in counties:
                # The key imported by this notebook is `myCenAPI`; the former
                # `myAPI` name is never defined and raised NameError.
                url = f'https://api.census.gov/data/{year}/{endpoint}?get={cols}&for=county%20subdivision:*&in=state:{st}%20county:{co}&key={myCenAPI}'
                resp = requests.get(url)
                resp.raise_for_status()
                # Parse once: first row is the header, the rest are records.
                rows = json.loads(resp.content)
                # Passing columns= also copes with a header-only payload.
                frames.append(pd.DataFrame(rows[1:], columns=rows[0]))
        return pd.concat(frames)

    # Same request order as before: all detailed-table calls, then all profile calls.
    detailed = _fetch_all(source, col_b)
    profile = _fetch_all(f'{source}/profile', col_d)
    df_sub = pd.merge(detailed, profile, how='left', on='GEO_ID')
    return df_sub[df_sub['GEO_ID'].isin(geo_code)]

def get_place(year,col_b,col_d,geo_code):
    """Download place-level data for state code 36 and keep the requested places.

    The detailed-table endpoint (``col_b``) and the ``/profile`` endpoint
    (``col_d``) are each queried once for all places in the state. The results
    are left-joined on ``GEO_ID`` and filtered to the ids in ``geo_code``.

    Parameters
    ----------
    year : str
        Vintage inserted into the API URL.
    col_b : str
        ``get=`` clause for the detailed-table endpoint.
    col_d : str
        ``get=`` clause for the data-profile endpoint.
    geo_code : list-like
        ``GEO_ID`` values to keep.

    Returns
    -------
    pandas.DataFrame
        Merged rows whose ``GEO_ID`` is in ``geo_code``.

    Raises
    ------
    requests.HTTPError
        If either request returns a 4xx/5xx status.
    """
    def _fetch(endpoint, cols):
        # The key imported by this notebook is `myCenAPI`; the former `myAPI`
        # name is never defined and raised NameError.
        url = f'https://api.census.gov/data/{year}/{endpoint}?get={cols}&for=place:*&in=state:36&key={myCenAPI}'
        resp = requests.get(url)
        resp.raise_for_status()
        # Parse once: first row is the header, the rest are records.
        rows = json.loads(resp.content)
        return pd.DataFrame(rows[1:], columns=rows[0])

    df_pl = pd.merge(_fetch(source, col_b), _fetch(f'{source}/profile', col_d),
                     how='left', on='GEO_ID')
    return df_pl[df_pl['GEO_ID'].isin(geo_code)]

def get_puma(year,col_b,col_d,geo_code):
    """Download public-use-microdata-area data for state code 36.

    The detailed-table endpoint (``col_b``) and the ``/profile`` endpoint
    (``col_d``) are each queried once for all PUMAs in the state. The results
    are left-joined on ``GEO_ID`` and filtered to the ids in ``geo_code``.

    Parameters
    ----------
    year : str
        Vintage inserted into the API URL.
    col_b : str
        ``get=`` clause for the detailed-table endpoint.
    col_d : str
        ``get=`` clause for the data-profile endpoint.
    geo_code : list-like
        ``GEO_ID`` values to keep.

    Returns
    -------
    pandas.DataFrame
        Merged rows whose ``GEO_ID`` is in ``geo_code``.

    Raises
    ------
    requests.HTTPError
        If either request returns a 4xx/5xx status.
    """
    def _fetch(endpoint, cols):
        # The key imported by this notebook is `myCenAPI`; the former `myAPI`
        # name is never defined and raised NameError.
        url = f'https://api.census.gov/data/{year}/{endpoint}?get={cols}&for=public%20use%20microdata%20area:*&in=state:36&key={myCenAPI}'
        resp = requests.get(url)
        resp.raise_for_status()
        # Parse once: first row is the header, the rest are records.
        rows = json.loads(resp.content)
        return pd.DataFrame(rows[1:], columns=rows[0])

    df_puma = pd.merge(_fetch(source, col_b), _fetch(f'{source}/profile', col_d),
                       how='left', on='GEO_ID')
    return df_puma[df_puma['GEO_ID'].isin(geo_code)]

def clean_data(df,var):
    """Return a numeric copy of the columns in ``var`` with sentinel codes blanked.

    ``var[0]`` is kept as-is (the callers pass ``GEO_ID`` first); every other
    listed column is cast to float. The nine-digit repeated-digit codes
    (positive and negative) are then replaced with NaN. ``df`` is not modified.
    """
    numeric_cols = var[1:]
    subset = df[var].copy().astype({name: float for name in numeric_cols})

    # Repeated-digit placeholder codes, matched with either sign
    magnitudes = (999999999, 555555555, 333333333, 222222222, 666666666, 888888888)
    sentinels = [sign * code for sign in (1, -1) for code in magnitudes]
    return subset.replace(sentinels, np.nan)


### Variables for table calculations

In [None]:
# Column-name lists for the table calculations.
# Each measure pairs an ...E column with its ...M column (used below as estimate and MOE).

# Total population - renamed only  (can also use B23001?)
PopTot = ['DP05_0001E', 'DP05_0001M']

# TODO: population density - placeholder, not calculated yet
PopDen = ''

# Foreign-born population - used for the share calculation
PopFB = ['DP02_0092E', 'DP02_0092M']

# Total labor force - renamed only
LFTot = ['DP03_0002E', 'DP03_0002M']

# Age 25 to 54 in labor force - to calculate (sum of B23001 cells)
_lf2554_cells = (25, 32, 111, 118, 39, 125, 46, 132)
LF2554E = [f'B23001_{cell:03d}E' for cell in _lf2554_cells]
LF2554M = [f'B23001_{cell:03d}M' for cell in _lf2554_cells]

# Age 65+ in labor force - to calculate (sum of B23001 cells)
_lfo65_cells = (74, 79, 84, 160, 165, 170)
LFO65E = [f'B23001_{cell:03d}E' for cell in _lfo65_cells]
LFO65M = [f'B23001_{cell:03d}M' for cell in _lfo65_cells]

# Total housing units - renamed only
HouTot = ['DP04_0001E', 'DP04_0001M']

# Housing units by owner (HouO) / renter (HouR), plus HouV - renamed only
HouO = ['DP04_0046E', 'DP04_0046M']
HouR = ['DP04_0047E', 'DP04_0047M']
HouV = ['DP04_0003E', 'DP04_0003M']

# Housing units by building size - to calculate (sum of DP04 cells)
_hou1u_cells, _hou24u_cells, _hou5u_cells = (7, 8), (9, 10), (11, 12, 13)
Hou1UE = [f'DP04_{cell:04d}E' for cell in _hou1u_cells]
Hou1UM = [f'DP04_{cell:04d}M' for cell in _hou1u_cells]
Hou24UE = [f'DP04_{cell:04d}E' for cell in _hou24u_cells]
Hou24UM = [f'DP04_{cell:04d}M' for cell in _hou24u_cells]
Hou5UE = [f'DP04_{cell:04d}E' for cell in _hou5u_cells]
Hou5UM = [f'DP04_{cell:04d}M' for cell in _hou5u_cells]

HouU = Hou1UE + Hou1UM + Hou24UE + Hou24UM + Hou5UE + Hou5UM


# Every column kept by clean_data for each year; GEO_ID first, numeric columns after.
# Year 0 carries only the population and labor-force columns.
var_data_Y1 = (['GEO_ID'] + PopTot + PopFB + LFTot + LF2554E + LF2554M
               + LFO65E + LFO65M + HouTot + HouO + HouR + HouV + HouU)
var_data_Y0 = ['GEO_ID'] + PopTot + LFTot + LF2554E + LF2554M + LFO65E + LFO65M

## Pop & Housing for Current [End] Year (Year 1)

#### Subdivisions in NY-NJ-CT - Places in LI

In [None]:
# Download end-year county subdivisions and places, stack them, then keep
# only the Year 1 columns as floats.
# `col_d` is the profile request defined in the parameters cell; the former
# `col_d1` name is never defined and raised NameError on a fresh kernel.
dfY1_sub = get_cousub(year1,col_b,col_d,cousub_2018)
dfY1_pl = get_place(year1,col_b,col_d,placeLI_2018)
dfY1 = pd.concat([dfY1_sub,dfY1_pl],sort=True)
dfY1 = clean_data(dfY1,var_data_Y1)
#dfY1.head()

#### PUMAS for NYC Only - to calculate as Sub-borough Areas

In [None]:
# Download end-year PUMA data for the ids in PUMA_2018 and keep the Year 1 columns as floats.
# `col_d` is the profile request defined in the parameters cell; the former
# `col_d1` name is never defined and raised NameError on a fresh kernel.
dfY1_nyc = get_puma(year1,col_b,col_d,PUMA_2018)
dfY1_nyc = clean_data(dfY1_nyc,var_data_Y1)

In [None]:
# Load the PUMA -> sub-borough crosswalk (Excel workbook) and join it to the PUMA rows.
# The PUMA-level GEO_ID is dropped after the join; the next cell aggregates on SB_ID.
geo_xwalk = pd.read_excel('../data/2018_PUMAxSubBor.xlsx')
dfY1_nyc = pd.merge(geo_xwalk, dfY1_nyc, on='GEO_ID')
dfY1_nyc = dfY1_nyc.drop(columns=['GEO_ID'])

In [None]:
# Aggregate PUMAs to sub-borough areas (MOEs are handled inside geo.calc_muni_agg),
# then expose the sub-borough id under the common GEO_ID column name.
dfY1_nyc = (
    geo.calc_muni_agg(dfY1_nyc, 'SB_ID')
       .rename(columns={'SB_ID': 'GEO_ID'})
)
dfY1_nyc.head()

#### Combine SubPlace and NYCPUMA Table into Municipality Table for 2018

In [None]:
# Stack the subdivision/place rows with the aggregated NYC sub-borough rows.
# NOTE: dfY1 is overwritten here, so re-running this cell appends the NYC rows again.
dfY1 = pd.concat([dfY1,dfY1_nyc],sort=True)
dfY1.head()

### Calculate Variables for Current Year (Y1)

In [None]:
#Year 1 (Current Year) ACS calculations - FINAL TABLE
# Each measure yields three columns: <stem>_Y1E (estimate), <stem>_Y1M (MOE),
# <stem>_Y1C (CV from calc.get_cv). Measures are processed in output-column order.

# Total population and foreign-born population - straight renames
# (TODO carried over: check that DP02_0092 is total foreign-born and not total pop)
for stem, est_src, moe_src in [
    ('PopTot', 'DP05_0001E', 'DP05_0001M'),
    ('PopFB', 'DP02_0092E', 'DP02_0092M'),
]:
    est_col, moe_col = f'{stem}_Y1E', f'{stem}_Y1M'
    dfY1[est_col] = dfY1[est_src]
    dfY1[moe_col] = dfY1[moe_src]
    dfY1[f'{stem}_Y1C'] = dfY1.apply(lambda row: calc.get_cv(row[est_col], row[moe_col]), axis=1)

# TODO: population density (PopDen_Y1E = PopTot_Y1E / land area) is not
# calculated yet - it needs a land-area source, and its MOE is still undecided.

# Foreign-born share of total population, with its MOE and CV
dfY1['PopFBP_Y1E'] = dfY1.apply(
    lambda row: calc.get_pct(row['PopFB_Y1E'], row['PopTot_Y1E']), axis=1)
dfY1['PopFBP_Y1M'] = dfY1.apply(
    lambda row: calc.get_pctmoe(row['PopFB_Y1E'], row['PopFB_Y1M'],
                                row['PopTot_Y1E'], row['PopTot_Y1M']), axis=1)
dfY1['PopFBP_Y1C'] = dfY1.apply(
    lambda row: calc.get_cv(row['PopFBP_Y1E'], row['PopFBP_Y1M']), axis=1)

# Labor force and housing measures.
# A string source is a straight rename; a list source is summed row-wise for
# the estimate and passed to calc.get_moe for the MOE.
for stem, est_src, moe_src in [
    ('LFTot', 'DP03_0002E', 'DP03_0002M'),    # total labor force
    ('LF2554', LF2554E, LF2554M),             # prime-age (25-54) labor force
    ('LFO65', LFO65E, LFO65M),                # age 65+ labor force
    ('HouTot', 'DP04_0001E', 'DP04_0001M'),   # total housing units
    ('HouO', 'DP04_0046E', 'DP04_0046M'),     # housing units by tenure
    ('HouR', 'DP04_0047E', 'DP04_0047M'),
    ('HouV', 'DP04_0003E', 'DP04_0003M'),
    ('Hou1U', Hou1UE, Hou1UM),                # housing units by building size
    ('Hou24U', Hou24UE, Hou24UM),
    ('Hou5U', Hou5UE, Hou5UM),
]:
    est_col, moe_col = f'{stem}_Y1E', f'{stem}_Y1M'
    if isinstance(est_src, list):
        dfY1[est_col] = dfY1.loc[:, est_src].sum(axis=1)
        dfY1[moe_col] = dfY1.apply(lambda row: calc.get_moe(row[moe_src]), axis=1)
    else:
        dfY1[est_col] = dfY1[est_src]
        dfY1[moe_col] = dfY1[moe_src]
    dfY1[f'{stem}_Y1C'] = dfY1.apply(lambda row: calc.get_cv(row[est_col], row[moe_col]), axis=1)

In [None]:
# Keep GEO_ID and the derived *_Y1* columns; drop the raw Census input columns
munY1 = dfY1.drop(columns=var_data_Y1[1:])
munY1.head()

In [None]:
# (rows, columns) of the Year 1 municipality table
munY1.shape

In [None]:
#munY1.to_csv('munY1_test.csv')

## Pop & Labor Force Init Year (Year 0)

#### Subdivisions & Places 2010 (with geography corrections)

In [None]:
# Download start-year county subdivisions and places, stack them, and keep
# only the Year 0 columns as floats
dfY0_sub = get_cousub(year0, col_b, col_d, cousub_2010)
dfY0_pl = get_place(year0, col_b, col_d, placeLI_2010)
dfY0 = clean_data(pd.concat([dfY0_sub, dfY0_pl], sort=True), var_data_Y0)
#dfY0.head()

In [None]:
# Recode 2010 geo ids to the current GEO_IDs using the municipalities crosswalk.
# TODO: check the 2018 data for any remaining differences from the 2010 recode,
# and clean up the crosswalk CSV so the stray 'Unnamed' columns disappear.
xwalk_10 = pd.read_csv('../data/municipalities_10.csv')
crosswalk_leftovers = ['GEO_ID', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4',
                       'Unnamed: 5', 'Unnamed: 6']
dfY0 = (
    xwalk_10.merge(dfY0, on='GEO_ID')
            .drop(columns=crosswalk_leftovers)
            .rename(columns={'GEOID18': 'GEO_ID'})   # GEOID18 becomes the new GEO_ID
)
dfY0.head()

In [None]:
# Aggregate rows on the recoded GEO_ID via geo.calc_muni_agg
# (presumably combines 2010 geographies that now share an id and recomputes MOEs - verify in geo_agg)
dfY0 = geo.calc_muni_agg(dfY0,'GEO_ID')

In [None]:
# (rows, columns) of the aggregated Year 0 subdivision/place table
dfY0.shape

#### NYC PUMAS

In [None]:
# Download start-year PUMA data and keep the Year 0 columns as floats.
# NOTE(review): the 2018 PUMA id list is reused for the year0 request - confirm the
# PUMA ids/boundaries are the same in that vintage.
dfY0_nyc = clean_data(get_puma(year0, col_b, col_d, PUMA_2018), var_data_Y0)

In [None]:
# Load the PUMA -> sub-borough crosswalk (Excel workbook) and join it to the
# Year 0 PUMA rows; the PUMA-level GEO_ID is dropped after the join
geo_xwalk = pd.read_excel('../data/2018_PUMAxSubBor.xlsx')
dfY0_nyc = pd.merge(geo_xwalk, dfY0_nyc, on='GEO_ID')
dfY0_nyc = dfY0_nyc.drop(columns=['GEO_ID'])
dfY0_nyc.head()

In [None]:
# Aggregate PUMAs to sub-borough areas, then expose the sub-borough id under
# the common GEO_ID column name
dfY0_nyc = (
    geo.calc_muni_agg(dfY0_nyc, 'SB_ID')
       .rename(columns={'SB_ID': 'GEO_ID'})
)
dfY0_nyc.head()

#### Join Sub-Place-PUMA

In [None]:
# Stack the subdivision/place rows with the aggregated NYC sub-borough rows.
# NOTE: dfY0 is overwritten here, so re-running this cell appends the NYC rows again.
dfY0 = pd.concat([dfY0,dfY0_nyc],sort=True)
dfY0.head()

## Calculate Year 0 [Init Year] Variables

In [None]:
#2010 Population & Labor Force calculations - FINAL TABLE
# Each measure yields <stem>_Y0E (estimate), <stem>_Y0M (MOE) and <stem>_Y0C (CV).
# A string source is a straight rename; a list source is summed row-wise for
# the estimate and passed to calc.get_moe for the MOE.
for stem, est_src, moe_src in [
    ('PopTot', 'DP05_0001E', 'DP05_0001M'),   # total population
    ('LFTot', 'DP03_0002E', 'DP03_0002M'),    # total labor force
    ('LF2554', LF2554E, LF2554M),             # prime-age (25-54) labor force
    ('LFO65', LFO65E, LFO65M),                # age 65+ labor force
]:
    est_col, moe_col = f'{stem}_Y0E', f'{stem}_Y0M'
    if isinstance(est_src, list):
        dfY0[est_col] = dfY0.loc[:, est_src].sum(axis=1)
        dfY0[moe_col] = dfY0.apply(lambda row: calc.get_moe(row[moe_src]), axis=1)
    else:
        dfY0[est_col] = dfY0[est_src]
        dfY0[moe_col] = dfY0[moe_src]
    dfY0[f'{stem}_Y0C'] = dfY0.apply(lambda row: calc.get_cv(row[est_col], row[moe_col]), axis=1)

In [None]:
# Keep GEO_ID and the derived *_Y0* columns; drop the raw Census input columns
muniY0 = dfY0.drop(columns=var_data_Y0[1:])
muniY0.head()

## Change between 2006-2010 5YR (Year 0) and 2014-2018 5YR (Year 1)

In [None]:
# Join the Year 1 columns onto the Year 0 table by GEO_ID.
# Left join: every Year 0 row is kept; Year 1 rows without a Year 0 match are not.
muni = muniY0.merge(munY1, how='left', on='GEO_ID')

In [None]:
# Change between Year 0 and Year 1 for each measure:
#   <stem>_Y0Y1E = Year 1 estimate minus Year 0 estimate
#   <stem>_Y0Y1M = calc.get_moe of the two yearly MOEs
#   <stem>_Y0Y1C = calc.get_cv of the change estimate and its MOE
# Measures: total population, total labor force, prime-age (25-54) labor force,
# age 65+ labor force.
for stem in ['PopTot', 'LFTot', 'LF2554', 'LFO65']:
    y0_moe, y1_moe = f'{stem}_Y0M', f'{stem}_Y1M'
    chg_est, chg_moe = f'{stem}_Y0Y1E', f'{stem}_Y0Y1M'
    muni[chg_est] = muni[f'{stem}_Y1E'] - muni[f'{stem}_Y0E']
    muni[chg_moe] = muni.apply(lambda row: calc.get_moe([row[y0_moe], row[y1_moe]]), axis=1)
    muni[f'{stem}_Y0Y1C'] = muni.apply(lambda row: calc.get_cv(row[chg_est], row[chg_moe]), axis=1)

In [None]:
# Preview the merged table with the change columns added
muni.head()

# Save as intermediate csv for later join w/ other muni data

In [None]:
# Index by GEO_ID, turn missing values into 0, and swap the Y0/Y1 tokens in the
# column names for the two-digit years (e.g. with year0='2010', 'Y0' -> '10').
yy0, yy1 = year0[2:], year1[2:]
muni = muni.set_index('GEO_ID').replace(np.nan, 0)
muni = muni.rename(columns=lambda name: name.replace('Y0', yy0).replace('Y1', yy1))

In [None]:
# Preview the table after re-indexing and column renaming
muni.head()

In [None]:
# Write the table to the working directory; the index is written as well (to_csv default)
muni.to_csv('region_muni_intermediate.csv')