# to download population info

In [29]:
from census import Census
import pickle
import pandas as pd
from os.path import exists
graph_obj_path = 'graph_checkpoints/nyc_metro/checkpoint_12.pkl'
pop_file_path = 'data/nyc_metro/population.pkl'
with open(graph_obj_path, 'rb') as f:
    ct = pickle.load(f)

In [30]:
with open('../api_keys/census_key.txt', 'r') as f:
    key = f.readline()
c = Census(key)

# normalize by population of area
def geoid_to_cty_fps(geoid):
    if len(geoid) != 11:
        print('GEOID not correct length.')
        return
    countyfps = geoid[2:5]
    return countyfps

def geoid_to_st_fps(geoid):
    if len(geoid) != 11:
        print('GEOID not correct length.')
        return
    statefps = geoid[0:2]
    return statefps

def geoid_to_tract_fps(geoid):
    if len(geoid) != 11:
        print('GEOID not correct length.')
        return
    trfps = geoid[5:]
    return trfps


def get_population(statefps, countyfps, var='B01001_001E', varname='total_pop'):
    
    census = c.acs5.state_county_tract(fields = var,
                                        state_fips = statefps,
                                        county_fips = countyfps,
                                        tract = '*',
                                        year = 2019)
    df = pd.DataFrame(census)
    df.rename(columns={var: varname}, inplace=True)
    return df

def get_population_for_tract(statefps, countyfps, tract, var='B01001_001E', varname='total_pop'):
    
    census = c.acs5.state_county_tract(fields = var,
                                        state_fips = statefps,
                                        county_fips = countyfps,
                                        tract = tract,
                                        year = 2019)
    df = pd.DataFrame(census)
    df.rename(columns={var: varname}, inplace=True)
    return df

In [31]:
# ct = CensusTractMobility(tract_data_dir='Tracts/nyc_metro_boundaries/nyc_metro_boundaries.shp')
# checkpoint_version = -1
# checkpoint_path = f'safegraph/compute_graph_checkpoints/denver/checkpoint_{checkpoint_version}.pkl'
# # with open(checkpoint_path, 'rb') as f:
# #     ct = pickle.load(f)

# get population of each tract
df = pd.DataFrame()
pop_file_path = 'data/nyc_metro/population.pkl'
if not exists(pop_file_path):
    df['state_fips'] = ct.tract_data.GEOID.apply(geoid_to_st_fps)
    df['county_fips'] = ct.tract_data.GEOID.apply(geoid_to_cty_fps)
    df.drop_duplicates(inplace=True)

    pop = pd.DataFrame()
    for i in range(0, df.shape[0]):
        x = df.iloc[i]
        pop = pd.concat([pop, get_population(x['state_fips'], x['county_fips'])])

    pop['GEOID'] = pop['state'] + pop['county'] + pop['tract']
    pop_dict = dict(zip(pop['GEOID'], pop['total_pop']))

    f = open(pop_file_path, 'wb')
    pickle.dump(pop_dict, f)
    f.close()
else:
    with open(pop_file_path, 'rb') as f:
        pop_dict = pickle.load(f)

In [32]:
len(pop_dict.keys())

7595

In [33]:
all_geoids = list(ct.idx_node_map.values())
print(f'Num geoids in the graph: {len(all_geoids)}')

missed = []
[(missed.append(i) if i not in pop_dict.keys() else None) for i in all_geoids]

print(f'Num geoids that are not in the pop dict: {len(missed)}')

Num geoids in the graph: 8231
Num geoids that are not in the pop dict: 1335


In [34]:
# check if any are not strings
any([not isinstance(s, str) for s in missed])

False

In [35]:
any([i in pop_dict.keys() for i in missed])

False

In [36]:
missed

['09013533103',
 '09013533104',
 '09013535102',
 '09013535101',
 '09009184101',
 '09009167101',
 '09009180101',
 '09009350101',
 '09009166003',
 '09009142605',
 '09009175502',
 '09009180102',
 '09009130202',
 '09009140102',
 '09009141301',
 '09009167102',
 '09009166004',
 '09009154102',
 '09009171202',
 '09009167301',
 '09009175501',
 '09009194101',
 '09009154101',
 '09009194102',
 '09009171201',
 '09009130201',
 '09009140101',
 '09013890204',
 '09013890203',
 '09001022201',
 '09001022202',
 '09001022101',
 '09001022102',
 '09001020302',
 '09001020301',
 '09001021402',
 '09001021601',
 '09001020101',
 '09001021702',
 '09001021501',
 '09001211201',
 '09001210101',
 '09001020102',
 '09001210501',
 '09001210402',
 '09001050301',
 '09003430604',
 '09003430501',
 '09003430502',
 '09003980002',
 '09003980003',
 '09003980100',
 '09003980001',
 '09003490304',
 '09003490303',
 '09003400102',
 '09003400101',
 '09003420602',
 '09003420601',
 '09003514104',
 '09003405801',
 '09003430603',
 '090034

In [40]:
missed_df

Unnamed: 0,GEOID,geometry,state_fips,county_fips,tract_fips
9,09013533103,"POLYGON ((-72.43131 41.91505, -72.43095 41.915...",09,013,533103
10,09013533104,"POLYGON ((-72.39185 41.89844, -72.39182 41.898...",09,013,533104
11,09013535102,"POLYGON ((-72.51513 41.90598, -72.51513 41.906...",09,013,535102
12,09013535101,"POLYGON ((-72.51758 41.86665, -72.51733 41.869...",09,013,535101
13,09009184101,"POLYGON ((-72.83023 41.27524, -72.82956 41.275...",09,009,184101
...,...,...,...,...,...
8052,34001011703,"POLYGON ((-74.63574 39.42552, -74.63523 39.425...",34,001,011703
8053,34001011704,"POLYGON ((-74.61933 39.43987, -74.61745 39.441...",34,001,011704
8054,34001011808,"POLYGON ((-74.66878 39.41076, -74.66859 39.410...",34,001,011808
8055,34001011807,"POLYGON ((-74.63880 39.39040, -74.63859 39.390...",34,001,011807


In [41]:
missed_df = ct.tract_data.loc[ct.tract_data.GEOID.isin(missed)]

missed_df['state_fips'] = missed_df.GEOID.apply(geoid_to_st_fps)
missed_df['county_fips'] = missed_df.GEOID.apply(geoid_to_cty_fps)
missed_df['tract_fips'] = missed_df.GEOID.apply(geoid_to_tract_fps)
df = missed_df

pop = pd.DataFrame()
for i in range(0, df.shape[0]):
    x = df.iloc[i]
    pop = pd.concat([pop, get_population_for_tract(x['state_fips'], x['county_fips'], x['tract_fips'])])

pop['GEOID'] = pop['state'] + pop['county'] + pop['tract']
pop_dict2 = dict(zip(pop['GEOID'], pop['total_pop']))

# pop_dict = pop_dict | pop_dict3

# f = open(pop_file_path, 'wb')
# pickle.dump(pop_dict, f)
# f.close()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))

In [59]:
len(pop_dict3.keys())

1173

In [60]:
len(pop_dict.keys())

7595

In [61]:
pop_dict.update(pop_dict3)

In [62]:
len(pop_dict.keys())

7595

In [13]:
ct.edge_mat

array([[ 7366.,  1408.,  1506., ...,     0.,     0.,     0.],
       [ 5730.,  4018.,  2104., ...,     0.,     0.,     0.],
       [ 5090.,  2166.,  6035., ...,     0.,     0.,     0.],
       ...,
       [    0.,     0.,     0., ...,  5568.,   785.,   593.],
       [    0.,     0.,     0., ...,  3642.,  8171.,  7336.],
       [    0.,     0.,     0., ...,  2812.,  6640., 14368.]])

In [8]:
ct.check_and_normalize_edges(pop_dict)

KeyError: '09013533103'