### Import

In [43]:
import os
import sys
import pandas as pd
import numpy as np
import shapefile
import re
import pickle
import networkx as nx
import matplotlib.pyplot as plt
from scipy.spatial import KDTree
from tqdm import tqdm
from collections import Counter
from scipy.stats.mstats import gmean
from scipy.spatial.distance import cdist
from scipy.spatial.distance import pdist

### Find Available files for tracts

In [13]:
myfiles = []
for root, dirs, files in os.walk("../data/acs/nyc_tracts/nyc_tracts_cleaned", topdown=False):
    for name in files:
        if 'X' in name and 'csvt' not in name:
            myfiles.append(os.path.join(root, name))
myfiles = np.sort(myfiles)

### Remove Margin of Error Columns

In [14]:
def removeMarginOfErrorColumns(data):

    #Remove Columns
    columns = data.columns
    keep = []
    for column in columns:
        if '(margin_of_error)' not in column:
            keep.append(column)
    data = data[keep]
    
    #Return
    return data

### Remove Empty Columns

In [15]:
def removeEmptyColumns(data):

    #Remove Columns
    columns = data.columns
    keepcolumns = []
    for column in columns:
        if (data[column].isnull().all() == False):
            keepcolumns.append(column)
    data = data[keepcolumns]
    
    #Return
    return data

### Read Data

In [16]:
dataframes = []
for m,myfile in tqdm(enumerate(myfiles)):

    #Read
    dataframe = pd.read_csv(myfile)

    #Remove OBJECTID and GEOID in all other tables
    if (m > 0): dataframe = dataframe.drop(['OBJECTID','GEOID'],axis=1)

    #Remove Margin of Error Columns
    dataframe = removeMarginOfErrorColumns(dataframe)
    
    #Remove Empty Columns
    dataframe = removeEmptyColumns(dataframe)
    
    #Add to bag
    dataframes.append(dataframe)

#Combine
data = pd.concat(dataframes,axis=1)

31it [00:17,  1.81it/s]


### Build dictionary of column occurences

In [17]:
C = {}
columns = data.columns
for c,column in enumerate(columns):
    if column in C:
        values = C[column]
        values.append(values[-1]+1)
        C[column] = values
    else:
        C[column] = [0]

### Drop Columns that occur more than once

In [18]:
newcolumns = []
dropcolumns =[]
columns = data.columns
for column in columns:
    occ = C[column]
    index = occ[0]
    newcolumn = column + '_' + str(index)
    C[column] = occ[1:]
    newcolumns.append(newcolumn)
    if (index > 0):
        dropcolumns.append(newcolumn)
data.columns = newcolumns
data = data.drop(dropcolumns,axis=1)

### Drop Columns

In [19]:
toberemoved = []
toberemoved.append('nativity_and_citizenship_status')
toberemoved.append('foreign-born_population')
toberemoved.append('place_of_birth')
toberemoved.append('by_nativity')
toberemoved.append('in_the_united_states_born')
toberemoved.append('sex_of_workers_by_time_leaving_home')
toberemoved.append('sex_of_workers_by_vehicles_available')
toberemoved.append('place_of_work_for_workers_')
toberemoved.append('aggregate_travel_time_to_work_')
toberemoved.append('number_of_workers_in_household_by_vehicles')
toberemoved.append('own_children_under_')
toberemoved.append('household_type_(including_living_alone)')
toberemoved.append('living_arrangements_of_adults')
toberemoved.append('grandparents_living')
toberemoved.append('household_type_by_relatives')
toberemoved.append('family_type_by_')
toberemoved.append('by_marital_status_')
toberemoved.append('ratio_of_income_to_poverty_level')
toberemoved.append('household_income_in_the_past_12_months_(in_2018_inflation-adjusted_dollars)_(')
toberemoved.append('age_of_householder_by_household_income_')
toberemoved.append('quintile_')
toberemoved.append('shares_of_aggregate_')
toberemoved.append('family_income_in_the_past_')
toberemoved.append('family_size_by_')
toberemoved.append('aggregate_nonfamily_')
toberemoved.append('sex_by_work_experience')
toberemoved.append('sex_by_earnings_')
toberemoved.append('veteran_status')
toberemoved.append('receipt_of_food_stamps')
toberemoved.append('sex_by_age_by_employment_')
toberemoved.append('presence_of_own_c')
toberemoved.append('sex_by_work_status_in_the_past')
toberemoved.append('full-time_year-round_work_status')
toberemoved.append('tenure_by_')
toberemoved.append('mortgage_status_')
toberemoved.append('total_population_in_occupied_housing_units_by_tenure')
toberemoved.append('plumbing_facilities_')
toberemoved.append('age_of_householder_by_gross')
toberemoved.append('aggregate_value_')
toberemoved.append('price_asked_total')
toberemoved.append('age_of_householder_by_selected_monthly_owner_costs')
toberemoved.append('household_income_in_the_past_12_months_(in_2018_inflation-adjusted_dollars)_by_value_')
toberemoved.append('allocation_of_')
toberemoved.append('military_health')
toberemoved.append('means-tested_')
toberemoved.append('sex_of_workers_')
toberemoved.append('unweighted_sample')
toberemoved.append('hispanic_or_latino_origin_by_')
toberemoved.append('marital_status_')
toberemoved.append('total_fields_of_')
toberemoved.append('house_heating_fuel_')
toberemoved.append('year_structure_built_')
toberemoved.append('kitchen_facilities_for_all')
toberemoved.append('selected_monthly_owner_costs')
toberemoved.append('median_value_by_year_structure')
toberemoved.append('disability_status')
toberemoved.append('field_of_bachelor')
toberemoved.append('_who_had_a_birth_in_the_past_')
toberemoved.append('language_spoken_at_')
toberemoved.append('median_age_')
toberemoved.append('nativity_in_the_united_states_')
toberemoved.append('sex_by_age_by_independent_living_difficulty')
toberemoved.append('_difficulty_')
toberemoved.append('number_of_earners_in_family_')
toberemoved.append('disabilities_')
toberemoved.append('aggregate_income_')
toberemoved.append('median_earnings_')
toberemoved.append('period_of_military_service')
toberemoved.append('_disability')
toberemoved.append('aggregate_number_of_vehicles')
toberemoved.append('median')
toberemoved.append('usual_hours_worked_')
toberemoved.append('kitchen_facilities')
toberemoved.append('quartile_')
toberemoved.append('group_quarters_population_total_population_in_group_quarters_0')
toberemoved.append('aggregate_price_asked')
toberemoved.append('aggregate_number_of_rooms_')
toberemoved.append('vacant_')
toberemoved.append('tenure_total_')
toberemoved.append('mean_usual_hours_')
toberemoved.append('aggregate_usual_hours_')

### Find Columns to separate of remove

In [20]:
columns = data.columns
ancestrycol = []
transportationcol = []
dropcol = []
schoolcol = []
povertycol = []
rentcol = []
healthinsurancecol = []
techcol = []
householdcol = []
commutecol = []
for column in columns:
    if 'ancestry' in column:
        ancestrycol.append(column)
    if 'means_of_transportation' in column:
        transportationcol.append(column)
    if 'school_enrollment_' in column:
        schoolcol.append(column)
    if '_rent_' in column:        
        rentcol.append(column)
    if 'poverty_status_' in column:
        povertycol.append(column)
    if ('health_insurance_' in column \
        or 'health_care_' in column \
        or 'medicare' in column):
        healthinsurancecol.append(column)
    if ('travel_' in column or 'time_leaving' in column):
        commutecol.append(column)
    if ('computer' in column or 'internet' in column):
        techcol.append(column)
    if ('household' in column):
        householdcol.append(column)
    for item in toberemoved:
        if item in column:
            dropcol.append(column)

### Remove/Separate Columns

In [21]:
newdata = data.copy()
rent = data[rentcol]
poverty = data[povertycol]
ancestry = data[ancestrycol]
transport = data[transportationcol]
education = data[schoolcol]
commute = data[commutecol]
tech = data[techcol]
hpusehold = data[householdcol]
healthinsurance = data[healthinsurancecol]
combined = dropcol + householdcol + rentcol + povertycol + ancestrycol + commutecol
combined = combined + transportationcol + schoolcol + techcol + healthinsurancecol
combined = np.unique(combined).tolist()
newdata = newdata.drop(combined,axis=1)

### GEO

In [22]:
geo = newdata[['OBJECTID_0','GEOID_0']]
geo.columns = ['OBJECTID','GEOID']

### Race Dictionary

In [23]:
R = {}
Rc = {}
R['white'] = 'white'
R['black_or_african_american'] = 'black'
R['american_indian_and_alaska_native'] = 'native'
R['asian'] = 'asian'
R['native_hawaiian_and_other_pacific_islander'] = 'islander'
R['two_or_more_races'] = 'two_or_more_races'
R['some_other_race'] = 'some_other_race'
R['white_not_hispanic_or_latino'] = 'not_hispanic'
R['hispanic_or_latino'] = 'hispanic'
Rc['white'] = 1
Rc['black_or_african_american'] = 2
Rc['american_indian_and_alaska_native'] = 3
Rc['asian'] = 4
Rc['native_hawaiian_and_other_pacific_islander'] = 5
Rc['two_or_more_races'] = 6
Rc['some_other_race'] = 7
Rc['white_not_hispanic_or_latino'] = 8
Rc['hispanic_or_latino'] = 9

### Education Levels

In [24]:
E1 = {}
E2 = {}
E1['no_schooling_completed'] = 1
E1['nursery_to_4th_grade'] = 2
E1['5th_and_6th_grade'] = 3
E1['7th_and_8th_grade'] = 4
E1['9th_grade'] = 5
E1['10th_grade'] = 6
E1['11th_grade'] = 7
E1['12th_grade_no_diploma'] = 8
E1['high_school_graduate'] = 9
E1['some_college_less_than_1_year'] = 10
E1['some_college_1_or_more_years_no_degree'] = 11
E1['associates_degree'] = 12
E1['bachelors_degree'] = 13
E1['masters_degree'] = 14
E1['professional_school_degree'] = 15
E1['doctorate_degree'] = 16
E2['less_than_high_school_diploma'] = 17
E2['high_school_graduate'] = 18
E2['some_college_or_associates_degree'] = 19
E2['bachelors_degree_or_higher'] = 20

### Renaming of Columns (Demo and Education)

In [25]:
keepcolumns = []
newcolumnnames = []
genderlevels = []
racelevels = []
educationlevels = []
for c,column in enumerate(newdata.columns):
    
    #Sex by age
    newname = column.replace('sex_by_age_','')
    newname = newname.replace('_(includes_equivalency)_','')
    newname = newname.replace('_(in_2018_inflation-adjusted_dollars)_','')
    allelements = newname.split('_')
    if ('sex_by_educational_attainment' not in column):
        if ('sex_by_age_male' in column or 'sex_by_age_female' in column):
            newname = newname.replace('_total_population_0','')
            newname = newname.replace('years','')
            elements = newname.split('_')

            if (len(elements) == 1): newname = elements[0] + '_total'
            else: newname = elements[0] + '_' + ''.join(elements[1:])
            newname = newname.replace('85andover','over85')
            if ('total' not in newname):
                newcolumnnames.append(newname)
                keepcolumns.append(column)
                if (newname.index('male') == 0):
                    genderlevels.append(0)
                    racelevels.append(0)
                    educationlevels.append(0)
                else:
                    genderlevels.append(1)
                    racelevels.append(0)
                    educationlevels.append(0)

        #Race
        races = re.findall('\(.*?\)',newname)
        if (len(races) > 0):
            therest = newname[len(races[0])+1:]
            race = races[0][1:-1].replace('_alone','')
            race2 = R[race]
            if 'years' in therest:
                elements = therest.split('_')
                index = elements.index('years')
                newname = '_'.join(elements[:index])
                newname = race2 + '_' + newname.replace('_85','_over_85')
                if ('total' not in newname):
                    newcolumnnames.append(newname)
                    keepcolumns.append(column)
                    if ('_male_' in newname):
                        racelevels.append(Rc[race])
                        genderlevels.append(0)
                        educationlevels.append(0)
                    else:
                        racelevels.append(Rc[race])
                        genderlevels.append(1)
                        educationlevels.append(0)
                    
    else:
        newname = newname.replace('_25_years_and_over','')
        newname = newname.replace('_population_0','')
        races = re.findall('\(.*?\)',newname)
        if (len(races) > 0):
            newname = newname.replace('sex_by_educational_attainment_','')
            races = re.findall('\(.*?\)',newname)
            remove = races[0] + '_'
            newname = newname.replace(remove,'').replace('_alone','')
            race = races[0][1:-1].replace('_alone','')
            race2 = R[race]
            newname = newname.replace('_white','')
            newname = newname.replace('_black_or_african_american','')
            newname = newname.replace('_american_indian_and_alaska_native','')
            newname = newname.replace('_asian','')
            newname = newname.replace('_native_hawaiian_and_other_pacific_islander','')
            newname = newname.replace('_two_or_more_races','')
            newname = newname.replace('_some_other_race','')
            newname = newname.replace('white','')
            newname = newname.replace('black_or_african_american','')
            newname = newname.replace('american_indian_and_alaska_native','')
            newname = newname.replace('asian','')
            newname = newname.replace('native_hawaiian_and_other_pacific_islander','')
            newname = newname.replace('some_other_race','')
            newname = newname.replace('two_or_more_races','')
            newname = newname.replace('_not_hispanic_or_latino','')
            newname = newname.replace('_hispanic_or_latino','')
            newname = newname.replace('not_hispanic_or_latino','')
            newname = newname.replace('hispanic_or_latino','')
            newname = newname.replace("'",'')
            elements = newname.split('_')
            if (len(elements) > 1):
                newname = race2 + '_' + newname
                newcolumnnames.append(newname)
                keepcolumns.append(column)
                schooling = '_'.join(elements[1:])
                schooling = E2[schooling]
                if ('_male_' in newname):
                    genderlevels.append(0)
                    racelevels.append(Rc[race])
                    educationlevels.append(schooling)
                else:
                    genderlevels.append(1)
                    racelevels.append(Rc[race])
                    educationlevels.append(schooling)
        else:
            newname = newname.replace("'",'')
            newname = newname.replace('sex_by_educational_attainment_','')
            newname = newname.replace('population_0','')
            elements = newname.split('_')
            if (len(elements) > 1):
                newcolumnnames.append(newname)
                keepcolumns.append(column)
                schooling = '_'.join(elements[1:])
                schooling = E1[schooling]
                if ('_male_' in newname):
                    genderlevels.append(0)
                    racelevels.append(0)
                    educationlevels.append(schooling)
                else:
                    genderlevels.append(1)
                    racelevels.append(0)
                    educationlevels.append(schooling)               

### Restrict and rename

In [26]:
newdata = newdata[keepcolumns]
newdata.columns = newcolumnnames

### Split into demo and edu

In [27]:
columns = newdata.columns
education = newdata[columns[np.where(np.array(educationlevels) > 0)]]
demographics = newdata[columns[np.where(np.array(educationlevels) == 0)]]

### Rent

In [28]:
keepcolumns = []
newcolumnnames = []
for column in rent.columns:
    if 'household_income_in_the_past_12_months_(in_' not in column:
        if 'bedrooms_by_gross_rent_' in column and '_with_cash_rent_' not in column:
            newname = column.replace('_renter-occupied_housing_units_0','')
            newcolumnnames.append(newname)
            keepcolumns.append(column)
        if 'gross_rent_with_cash_rent_' in column and 'bedrooms_by_gross_rent_with_cash_ren' not in column:
            newname = column.replace('_renter-occupied_housing_units_0','')
            newname = newname.replace('_with_cash_rent','')
            newname = newname.replace('$1_','$1')
            newname = newname.replace('$2_','$2')
            newname = newname.replace('$3_','$3')
            newname = newname.replace('$4_','$4')
            if (newname == 'gross_rent'):
                newname = 'gross_rent_total'
            newcolumnnames.append(newname)
            keepcolumns.append(column)
        if 'gross_rent_as_a_percentage_of_household_income_' in column:
            if 'age_of_householder_by_gross_rent' not in column:
                if 'median' not in column:
                    newname = column.replace('household_income_by_gross_rent_as_a_percentage_of_household_income_in_the_past_12_months','')
                    newname = newname.replace('_renter-occupied_housing_units_0','')
                    newname = newname.replace('_income_in_the_past_12_months','')
                    if (newname != '_total'):
                        if 'total' not in newname:
                            newname = newname + '_total'
                        if 'gross_rent_as_a_percentage_of_household' not in newname:
                            newname = 'gross_rent_as_a_percentage_of_household' + newname
                        newname = newname.replace('gross_rent_as_a_percentage_of_household','rent_as_percent_income')
                        newname = newname.replace('_percent_total','_percent')
                        newname = newname.replace('_000','000')
                        newname = newname.replace('_999','999')
                        newname = newname.replace('.0','')
                        newname = newname.replace('.9','')                        
                        newname = newname.replace('_to_','_')   
                        newcolumnnames.append(newname)
                        keepcolumns.append(column)
                        
rent = rent[keepcolumns]
rent.columns = newcolumnnames                        

### Poverty

In [29]:
keepcolumns = []
newcolumnnames = []
for column in poverty.columns:
    if 'poverty_status_in_the_past_12_months_by_sex_by_age' in column:
        index = column.index('poverty_status_in_the_past_12_months_by_sex_by_age')
        if (index == 0 and '.1_0' not in column and 'income_in_the_past' not in column):
            newname = column.replace('_population_for_whom_poverty_status_is_determined_0','')
            races = re.findall('\(.*?\)',newname)
            if (len(races) > 0):
                remove = races[0] + '_'
                newname = newname.replace(remove,'').replace('_alone','')
                newname = newname.replace('poverty_status_in_the_past_12_months_by_sex_by_age','poverty_status')
                newname = newname.replace('_white','')
                newname = newname.replace('_black_or_african_american','')
                newname = newname.replace('_american_indian_and_alaska_native','')
                newname = newname.replace('_asian','')
                newname = newname.replace('_native_hawaiian_and_other_pacific_islander','')
                newname = newname.replace('_two_or_more_races','')
                newname = newname.replace('_some_other_race','')
                race = races[0][1:-1].replace('_alone','')
                race2 = R[race]
                newname = race2 + '_' + newname
                newcolumnnames.append(newname)
                keepcolumns.append(column)
            else:
                newname = newname.replace('poverty_status_in_the_past_12_months_by_sex_by_age','poverty_status')
                newcolumnnames.append(newname)
                keepcolumns.append(column)

poverty = poverty[keepcolumns]
poverty.columns = newcolumnnames                        

### Ancestry

In [30]:
removelist = ['ancestry_west_indian','ancestry_subsaharan_african',\
              'ancestry_arab','ancestry_total','ancestry_total_afghan',\
              'ancestry_total_albanian','ancestry_total_alsatian',\
              'ancestry_total_american']
keepcolumns = []
newcolumnnames = []
for column in ancestry.columns:
    if 'people_reporting_ancestry_' in column:
        newname = column.replace('_total_population_0','')
        newname = newname.replace('_(except_hispanic_groups)','')
        newname = newname.replace('people_reporting_ancestry_','ancestry_')
        newname = newname.replace('/','_')
        found = 0
        for item in removelist:
            if (newname == item):
                found = 1
        if (found == 0):
            newcolumnnames.append(newname)
            keepcolumns.append(column)

ancestry = ancestry[keepcolumns]
ancestry.columns = newcolumnnames

### Transport

In [31]:
keepcolumns = []
newcolumnnames = []
excludelist = []
excludelist.append('.1_0')
excludelist.append('.2_0')
excludelist.append('.3_0')
excludelist.append('_citizenship_status_')
excludelist.append('_language_spoken_at_home')
excludelist.append("by_workers'_earnings")
excludelist.append('_by_occupation')
excludelist.append('_by_poverty_status')
excludelist.append('_by_industry')
excludelist.append('_by_time_leaving_home')
excludelist.append('_by_class_of')
excludelist.append('_by_travel_time')
excludelist.append('_by_tenure')
excludelist.append('_by_place_of_')
excludelist.append('_by_vehicles_available_')
for column in transport.columns[:117]:
    if 'means_of_transportation_to_work_' in column:
        if 'means_of_transportation_to_work_by_industry_' not in columns:
            index = column.index('means_of_transportation_to_work_')
            if (index == 0 and 'means_of_transportation_to_work_by_age_total_' not in column):
                newname = column.replace('means_of_transportation_to_work_by_age_','transportation_')
                found = 0
                for item in excludelist:
                    if item in newname:
                        found = 1
                if (found == 0):
                    newname = newname.replace('means_of_transportation_to_work_by_age_','transportation_')
                    newname = newname.replace('_-_drove_alone','_drove_alone')
                    newname = newname.replace('_-_carpooled','_carpooled')
                    newname = newname.replace('_workers_16_years_and_over_0','')
                    newname = newname.replace('_(excluding_taxicab)','')
                    if ('years' in newname):
                        newcolumnnames.append(newname)
                        keepcolumns.append(column)

transport = transport[keepcolumns]
transport.columns = newcolumnnames

### Commute

In [32]:
keepcolumns = []
newcolumnnames = []
for column in commute.columns:
    if 'aggregate_travel_time' not in column and 'means_of_transportation' not in column:
        if 'allocation_of_time' not in column and 'sex_of_workers' in column:
            newname = column
            newname = newname.replace('sex_of_workers_by_travel_time_to_work_','travel_time_')
            newname = newname.replace('sex_of_workers_by_time_leaving_home_to_go_to_work','time_leaving')
            newname = newname.replace('_who_did_not_work_at_home_0','')
            newname = newname.replace('_workers_16_years_and_over','')
            for i in range(10):
                for j in range(10):
                    repstr1 = str(i) + '_' + str(j)
                    repstr2 = str(i) + ':' + str(j)
                    newname = newname.replace(repstr1,repstr2)
                    newname = newname.replace('_a.m.','AM')
                    newname = newname.replace('_p.m.','PM')
            newcolumnnames.append(newname)
            keepcolumns.append(column)

commute = commute[keepcolumns]
commute.columns = newcolumnnames

### Tech

In [33]:
keepcolumns = []
newcolumnnames = []
for column in tech.columns:
    if 'presence_and_types_of_internet_subscriptions_in_household' in column:
        newname = column.replace('presence_and_types_of_internet_subscriptions_in_household_total_','internet_')
        newname = newname.replace('_households_0','')
        newname = newname.replace('_with_no_other_type_of_internet_subscription','_only')
    else:
        if 'allocation_of_household' not in column and 'degree' not in column and 'presence' not in column and 'types_of_computers_in_household' not in column:
            newname = column.replace('_households_0','')
            newname = newname.replace('has_one_or_more_types_of_computing_devices_','')
            newname = newname.replace('computers_in_household_','computers_')
            newname = newname.replace('internet_subscriptions_in_household','internet')
            newname = newname.replace('or_other_portable_wireless_computer_or_other_computer_','')
            newcolumnnames.append(newname)
            keepcolumns.append(column)

tech = tech[keepcolumns]
tech.columns = newcolumnnames

### Health insurance

In [34]:
keepcolumns = []
newcolumnnames = []
for column in healthinsurance.columns:
    if 'health_insurance_coverage_status_by_sex_by_age' in column:
        if ('.1_0' not in column):
            newname = column.replace('_population_0','')
            newname = newname.replace('health_insurance_coverage_status_by_sex_by_age','health_insurance')
            newname = newname.replace('_civilian_noninstitutionalized','')
            elements= newname.split('_')
            if (len(elements) > 3 and 'coverage' in newname):
                newname = newname.replace('_with_health_insurance_coverage','_with_insurance')
                newname = newname.replace('no_health_insurance_coverage','no_insurance')     
                newcolumnnames.append(newname)
                keepcolumns.append(column)

healthinsurance = healthinsurance[keepcolumns]
healthinsurance.columns = newcolumnnames

### Store all

In [35]:
dataframes = []
dataframes.append(geo)
dataframes.append(demographics)
dataframes.append(education)
dataframes.append(rent)
dataframes.append(poverty)
dataframes.append(ancestry)
dataframes.append(transport)
dataframes.append(transport)
dataframes.append(transport)
dataframes.append(commute)
dataframes.append(tech)
dataframes.append(healthinsurance)
data = pd.concat(dataframes,axis=1)
data = data.fillna(0)
data.to_csv('../data/acs/nyc_tracts/nyc_tracts_census.csv',index=False)

### Create CSVT file

In [36]:
with open('../data/acs/nyc_tracts/nyc_tracts_census.csvt','w') as f:
    for c,column in enumerate(data.columns):
        if (c == 1):
            f.write('"String(11)",')
        else:
            f.write('"Integer",')
    f.write('"Integer"\n')

### Load Shapefile

In [37]:
sf = shapefile.Reader('../data/acs/nyc_tracts/nyc_tracts_cleaned.shp')
streetsShapeRecs = sf.shapeRecords()

### Join

In [54]:
outdata = []
geoids = data['GEOID'].astype(str).values
for item in tqdm(streetsShapeRecs):
    shape = item.shape
    record = item.record
    points = shape.points
    mpoint = np.mean(points,axis=0)
    geoid1 = record[4]
    lat = float(record[11])
    lng = float(record[12])
    lat = mpoint[1]
    lng = mpoint[0]    
    index = np.where(geoid1 == geoids)[0][0]
    datapoint = data.iloc[index].tolist()
    outdata.append([geoid1,lat,lng] + datapoint[2:])
outdata = pd.DataFrame(outdata)
outdata.columns = ['geoid','lat','lng'] + data.columns[2:].tolist()
outdata.to_csv('../data/acs/nyc_tracts/nyc_tracts_census_geo.csv',index=False)

100%|██████████| 2165/2165 [00:00<00:00, 2617.90it/s]


### Graph

In [55]:
X = pickle.load(open("../data/graphs/nyc.p","rb"))
O = X['O']
N = X['N']
edges = X['edges']
nodes = X['nodes']
nodenames = X['nodenames']
edgedatabase = X['edgedatabase']
G = X['G']
Nlist = X['Nlist']

### Get LatLngs

In [56]:
Olatlng = outdata[['lat','lng']].values
nodelatlng1 = edgedatabase[['lat1','lng1']].values
nodelatlng2 = edgedatabase[['lat2','lng2']].values

### Build Trees

In [57]:
kdtree1 = KDTree(nodelatlng1)
kdtree2 = KDTree(nodelatlng2)

### Query Trees

In [58]:
cindex = []
cname = []
for i in tqdm(range(Olatlng.shape[0])):
    ind1 = kdtree1.query(Olatlng[i,:],k=1)[1]
    ind2 = kdtree2.query(Olatlng[i,:],k=1)[1]
    dp1 = edgedatabase.iloc[ind1]
    dp2 = edgedatabase.iloc[ind2]
    cindex.append(dp1.index1)
    cname.append(dp1.nodename1)
outdata['node_index'] = cindex    
outdata['node_name'] = cname
outdata.to_csv('../data/acs/nyc_tracts/nyc_tracts_census_geo_node.csv',index=False)

100%|██████████| 2165/2165 [00:02<00:00, 890.31it/s]
