In [1]:
import geopandas as gpd
from shapely.geometry import Point
import os

# Read the census tract layer from the 'tl_rd22_48_tract' shapefile.
tracts = gpd.read_file('tl_rd22_48_tract')

# Sample location to look up. shapely's Point takes (x, y), i.e.
# (longitude, latitude) -- longitude goes first.
point = Point(-98.49460900270131, 29.4192557501264)

# Keep only the tract polygon(s) that contain the point.
tract = tracts.loc[tracts.contains(point)]

# Report the NAME of the first matching tract, if there is one.
if tract.empty:
    print("Point is not within any census tract.")
else:
    tract_data = tract.iloc[0]
    print("Point is within census tract:", tract_data['NAME'])


Point is within census tract: 1101


In [2]:
# I took a sample point from Google Maps, chosen using a map of census
# tract codes. At first the lookup said the point was not in any census
# tract, but after I reversed the coordinates it returned the correct
# tract code.
# NOTE: Google Maps returns (latitude, longitude), while Point expects
# (longitude, latitude), so the order has to be swapped. It may be possible
# to handle this where the Places API output is processed.

In [3]:
# Show the matching row(s) so every attribute of the tract can be inspected.
tract

Unnamed: 0,STATEFP,COUNTYFP,TRACTCE,GEOID,NAME,NAMELSAD,MTFCC,FUNCSTAT,ALAND,AWATER,INTPTLAT,INTPTLON,geometry
1190,48,29,110100,48029110100,1101,Census Tract 1101,G5020,S,3428908,13720,29.4261655,-98.4905264,"POLYGON ((-98.50167 29.42272, -98.50164 29.423..."


In [4]:
# Preview the first rows of the tract table, transposed so each attribute is a row.
tracts.head().T

Unnamed: 0,0,1,2,3,4
STATEFP,48,48,48,48,48
COUNTYFP,189,219,219,219,219
TRACTCE,950200,950400,950300,950100,950600
GEOID,48189950200,48219950400,48219950300,48219950100,48219950600
NAME,9502,9504,9503,9501,9506
NAMELSAD,Census Tract 9502,Census Tract 9504,Census Tract 9503,Census Tract 9501,Census Tract 9506
MTFCC,G5020,G5020,G5020,G5020,G5020
FUNCSTAT,S,S,S,S,S
ALAND,6306913,12691656,12186639,214157569,358638163
AWATER,0,5302,0,0,0


In [5]:
# List the columns available in the tract table.
tracts.columns

Index(['STATEFP', 'COUNTYFP', 'TRACTCE', 'GEOID', 'NAME', 'NAMELSAD', 'MTFCC',
       'FUNCSTAT', 'ALAND', 'AWATER', 'INTPTLAT', 'INTPTLON', 'geometry'],
      dtype='object')

In [6]:
# NOTE(review): same transposed preview as the earlier tracts.head().T cell;
# candidate for removal.
tracts.head().T

Unnamed: 0,0,1,2,3,4
STATEFP,48,48,48,48,48
COUNTYFP,189,219,219,219,219
TRACTCE,950200,950400,950300,950100,950600
GEOID,48189950200,48219950400,48219950300,48219950100,48219950600
NAME,9502,9504,9503,9501,9506
NAMELSAD,Census Tract 9502,Census Tract 9504,Census Tract 9503,Census Tract 9501,Census Tract 9506
MTFCC,G5020,G5020,G5020,G5020,G5020
FUNCSTAT,S,S,S,S,S
ALAND,6306913,12691656,12186639,214157569,358638163
AWATER,0,5302,0,0,0


In [7]:
# Check the dtype of the tract NAME column.
tracts.NAME.dtype

dtype('O')

In [8]:
# Size (rows, columns) of the tract table before filtering by county.
tracts.shape

(6896, 13)

In [9]:
# Restrict the statewide (Texas) tract table to Bexar County.
# COUNTYFP is compared as a string, so the code is written as '029'.
is_bexar = tracts['COUNTYFP'] == '029'
tracts = tracts.loc[is_bexar]
tracts.shape

(375, 13)

In [10]:
# Let's open a census table for tracts in Bexar County
# and see what it looks like.
# NOTE(review): these imports would normally sit with the others in the first cell.
import numpy as np
import pandas as pd

# Load 'sex_and_age.csv' with the default header handling, so the first
# line of the file supplies the column names.
s0101 = pd.read_csv('sex_and_age.csv')
s0101

Unnamed: 0,GEO_ID,NAME,S0101_C01_001E,S0101_C01_001M,S0101_C01_001MA,S0101_C01_001EA,S0101_C01_002E,S0101_C01_002EA,S0101_C01_002M,S0101_C01_002MA,...,S0101_C06_036MA,S0101_C06_037E,S0101_C06_037EA,S0101_C06_037M,S0101_C06_037MA,S0101_C06_038E,S0101_C06_038EA,S0101_C06_038M,S0101_C06_038MA,Unnamed: 914
0,Geography,Geographic Area Name,Estimate!!Total!!Total population,Margin of Error!!Total!!Total population,Annotation of Margin of Error!!Total!!Total po...,Annotation of Estimate!!Total!!Total population,Estimate!!Total!!Total population!!AGE!!Under ...,Annotation of Estimate!!Total!!Total populatio...,Margin of Error!!Total!!Total population!!AGE!...,Annotation of Margin of Error!!Total!!Total po...,...,Annotation of Margin of Error!!Percent Female!...,Estimate!!Percent Female!!Total population!!PE...,Annotation of Estimate!!Percent Female!!Total ...,Margin of Error!!Percent Female!!Total populat...,Annotation of Margin of Error!!Percent Female!...,Estimate!!Percent Female!!Total population!!PE...,Annotation of Estimate!!Percent Female!!Total ...,Margin of Error!!Percent Female!!Total populat...,Annotation of Margin of Error!!Percent Female!...,
1,0500000US48029,"Bexar County, Texas",1978826,*****,*****,,139109,,67,,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),
2,1400000US48029110100,"Census Tract 1101, Bexar County, Texas",3153,504,,,70,,74,,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),
3,1400000US48029110300,"Census Tract 1103, Bexar County, Texas",3114,638,,,276,,180,,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),
4,1400000US48029110500,"Census Tract 1105, Bexar County, Texas",2430,392,,,540,,207,,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
372,1400000US48029980002,"Census Tract 9800.02, Bexar County, Texas",0,14,,,0,,14,,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),
373,1400000US48029980003,"Census Tract 9800.03, Bexar County, Texas",1059,286,,,84,,31,,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),
374,1400000US48029980004,"Census Tract 9800.04, Bexar County, Texas",0,14,,,0,,14,,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),
375,1400000US48029980005,"Census Tract 9800.05, Bexar County, Texas",0,14,,,0,,14,,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),


In [11]:
# The table is messy. As a first clean-up step, put every column name in
# lower case to make the names more pythonic, then display the frame again.
s0101.columns = s0101.columns.str.lower()
s0101

Unnamed: 0,geo_id,name,s0101_c01_001e,s0101_c01_001m,s0101_c01_001ma,s0101_c01_001ea,s0101_c01_002e,s0101_c01_002ea,s0101_c01_002m,s0101_c01_002ma,...,s0101_c06_036ma,s0101_c06_037e,s0101_c06_037ea,s0101_c06_037m,s0101_c06_037ma,s0101_c06_038e,s0101_c06_038ea,s0101_c06_038m,s0101_c06_038ma,unnamed: 914
0,Geography,Geographic Area Name,Estimate!!Total!!Total population,Margin of Error!!Total!!Total population,Annotation of Margin of Error!!Total!!Total po...,Annotation of Estimate!!Total!!Total population,Estimate!!Total!!Total population!!AGE!!Under ...,Annotation of Estimate!!Total!!Total populatio...,Margin of Error!!Total!!Total population!!AGE!...,Annotation of Margin of Error!!Total!!Total po...,...,Annotation of Margin of Error!!Percent Female!...,Estimate!!Percent Female!!Total population!!PE...,Annotation of Estimate!!Percent Female!!Total ...,Margin of Error!!Percent Female!!Total populat...,Annotation of Margin of Error!!Percent Female!...,Estimate!!Percent Female!!Total population!!PE...,Annotation of Estimate!!Percent Female!!Total ...,Margin of Error!!Percent Female!!Total populat...,Annotation of Margin of Error!!Percent Female!...,
1,0500000US48029,"Bexar County, Texas",1978826,*****,*****,,139109,,67,,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),
2,1400000US48029110100,"Census Tract 1101, Bexar County, Texas",3153,504,,,70,,74,,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),
3,1400000US48029110300,"Census Tract 1103, Bexar County, Texas",3114,638,,,276,,180,,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),
4,1400000US48029110500,"Census Tract 1105, Bexar County, Texas",2430,392,,,540,,207,,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
372,1400000US48029980002,"Census Tract 9800.02, Bexar County, Texas",0,14,,,0,,14,,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),
373,1400000US48029980003,"Census Tract 9800.03, Bexar County, Texas",1059,286,,,84,,31,,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),
374,1400000US48029980004,"Census Tract 9800.04, Bexar County, Texas",0,14,,,0,,14,,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),
375,1400000US48029980005,"Census Tract 9800.05, Bexar County, Texas",0,14,,,0,,14,,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),


In [12]:
# That looks better

In [13]:
# Next, I think the last 6 characters of GEO_ID are the best way to get the
# census tract code, since that field is more standardized than the name
# field. After that I'll drop the NAME column.
# Check the column dtypes first.
s0101.dtypes

geo_id              object
name                object
s0101_c01_001e      object
s0101_c01_001m      object
s0101_c01_001ma     object
                    ...   
s0101_c06_038e      object
s0101_c06_038ea     object
s0101_c06_038m      object
s0101_c06_038ma     object
unnamed: 914       float64
Length: 915, dtype: object

In [14]:
# geo_id has object dtype, so the .str accessor can slice it:
# keep only the last six characters of every value.
s0101['geo_id'] = s0101['geo_id'].str.slice(start=-6)
s0101.head()

Unnamed: 0,geo_id,name,s0101_c01_001e,s0101_c01_001m,s0101_c01_001ma,s0101_c01_001ea,s0101_c01_002e,s0101_c01_002ea,s0101_c01_002m,s0101_c01_002ma,...,s0101_c06_036ma,s0101_c06_037e,s0101_c06_037ea,s0101_c06_037m,s0101_c06_037ma,s0101_c06_038e,s0101_c06_038ea,s0101_c06_038m,s0101_c06_038ma,unnamed: 914
0,graphy,Geographic Area Name,Estimate!!Total!!Total population,Margin of Error!!Total!!Total population,Annotation of Margin of Error!!Total!!Total po...,Annotation of Estimate!!Total!!Total population,Estimate!!Total!!Total population!!AGE!!Under ...,Annotation of Estimate!!Total!!Total populatio...,Margin of Error!!Total!!Total population!!AGE!...,Annotation of Margin of Error!!Total!!Total po...,...,Annotation of Margin of Error!!Percent Female!...,Estimate!!Percent Female!!Total population!!PE...,Annotation of Estimate!!Percent Female!!Total ...,Margin of Error!!Percent Female!!Total populat...,Annotation of Margin of Error!!Percent Female!...,Estimate!!Percent Female!!Total population!!PE...,Annotation of Estimate!!Percent Female!!Total ...,Margin of Error!!Percent Female!!Total populat...,Annotation of Margin of Error!!Percent Female!...,
1,S48029,"Bexar County, Texas",1978826,*****,*****,,139109,,67,,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),
2,110100,"Census Tract 1101, Bexar County, Texas",3153,504,,,70,,74,,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),
3,110300,"Census Tract 1103, Bexar County, Texas",3114,638,,,276,,180,,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),
4,110500,"Census Tract 1105, Bexar County, Texas",2430,392,,,540,,207,,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),


In [15]:
# That's better, but I accidentally truncated the first row (index 0), which
# holds the human-readable column labels. Let's look at the list of coded
# column names and see if there's any reason to keep them.
print(s0101.columns.to_list())

['geo_id', 'name', 's0101_c01_001e', 's0101_c01_001m', 's0101_c01_001ma', 's0101_c01_001ea', 's0101_c01_002e', 's0101_c01_002ea', 's0101_c01_002m', 's0101_c01_002ma', 's0101_c01_003e', 's0101_c01_003ea', 's0101_c01_003m', 's0101_c01_003ma', 's0101_c01_004e', 's0101_c01_004m', 's0101_c01_004ma', 's0101_c01_004ea', 's0101_c01_005e', 's0101_c01_005ea', 's0101_c01_005m', 's0101_c01_005ma', 's0101_c01_006e', 's0101_c01_006ea', 's0101_c01_006m', 's0101_c01_006ma', 's0101_c01_007e', 's0101_c01_007m', 's0101_c01_007ma', 's0101_c01_007ea', 's0101_c01_008e', 's0101_c01_008m', 's0101_c01_008ma', 's0101_c01_008ea', 's0101_c01_009e', 's0101_c01_009m', 's0101_c01_009ma', 's0101_c01_009ea', 's0101_c01_010e', 's0101_c01_010ea', 's0101_c01_010m', 's0101_c01_010ma', 's0101_c01_011e', 's0101_c01_011ea', 's0101_c01_011m', 's0101_c01_011ma', 's0101_c01_012e', 's0101_c01_012m', 's0101_c01_012ma', 's0101_c01_012ea', 's0101_c01_013e', 's0101_c01_013ea', 's0101_c01_013m', 's0101_c01_013ma', 's0101_c01_014e', '

In [16]:
# Besides geo_id and name, the coded column names are actually pretty useless
# for the scope of this project, so I'll drop that line and use the
# human-readable labels as the column names instead.
# My first thought was to re-import the .csv with header=1 and skiprows=[0].
s0101 = pd.read_csv('sex_and_age.csv', skiprows=[0])
# The skip is applied before the header is picked, so skiprows alone is enough.
s0101

Unnamed: 0,Geography,Geographic Area Name,Estimate!!Total!!Total population,Margin of Error!!Total!!Total population,Annotation of Margin of Error!!Total!!Total population,Annotation of Estimate!!Total!!Total population,Estimate!!Total!!Total population!!AGE!!Under 5 years,Annotation of Estimate!!Total!!Total population!!AGE!!Under 5 years,Margin of Error!!Total!!Total population!!AGE!!Under 5 years,Annotation of Margin of Error!!Total!!Total population!!AGE!!Under 5 years,...,Annotation of Margin of Error!!Percent Female!!Total population!!SUMMARY INDICATORS!!Child dependency ratio,Estimate!!Percent Female!!Total population!!PERCENT ALLOCATED!!Sex,Annotation of Estimate!!Percent Female!!Total population!!PERCENT ALLOCATED!!Sex,Margin of Error!!Percent Female!!Total population!!PERCENT ALLOCATED!!Sex,Annotation of Margin of Error!!Percent Female!!Total population!!PERCENT ALLOCATED!!Sex,Estimate!!Percent Female!!Total population!!PERCENT ALLOCATED!!Age,Annotation of Estimate!!Percent Female!!Total population!!PERCENT ALLOCATED!!Age,Margin of Error!!Percent Female!!Total population!!PERCENT ALLOCATED!!Age,Annotation of Margin of Error!!Percent Female!!Total population!!PERCENT ALLOCATED!!Age,Unnamed: 914
0,0500000US48029,"Bexar County, Texas",1978826,*****,*****,,139109,,67,,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),
1,1400000US48029110100,"Census Tract 1101, Bexar County, Texas",3153,504,,,70,,74,,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),
2,1400000US48029110300,"Census Tract 1103, Bexar County, Texas",3114,638,,,276,,180,,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),
3,1400000US48029110500,"Census Tract 1105, Bexar County, Texas",2430,392,,,540,,207,,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),
4,1400000US48029110600,"Census Tract 1106, Bexar County, Texas",5645,1041,,,207,,141,,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
371,1400000US48029980002,"Census Tract 9800.02, Bexar County, Texas",0,14,,,0,,14,,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),
372,1400000US48029980003,"Census Tract 9800.03, Bexar County, Texas",1059,286,,,84,,31,,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),
373,1400000US48029980004,"Census Tract 9800.04, Bexar County, Texas",0,14,,,0,,14,,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),
374,1400000US48029980005,"Census Tract 9800.05, Bexar County, Texas",0,14,,,0,,14,,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),


In [17]:
# But now row 0 is the aggregate data for the whole county. That is not likely
# to be useful and may create confusion for modeling, so I'll drop it at
# import as well. It is line index 2 (zero-based) in the original .csv:
s0101 = pd.read_csv('sex_and_age.csv', skiprows=[0,2])

s0101

Unnamed: 0,Geography,Geographic Area Name,Estimate!!Total!!Total population,Margin of Error!!Total!!Total population,Annotation of Margin of Error!!Total!!Total population,Annotation of Estimate!!Total!!Total population,Estimate!!Total!!Total population!!AGE!!Under 5 years,Annotation of Estimate!!Total!!Total population!!AGE!!Under 5 years,Margin of Error!!Total!!Total population!!AGE!!Under 5 years,Annotation of Margin of Error!!Total!!Total population!!AGE!!Under 5 years,...,Annotation of Margin of Error!!Percent Female!!Total population!!SUMMARY INDICATORS!!Child dependency ratio,Estimate!!Percent Female!!Total population!!PERCENT ALLOCATED!!Sex,Annotation of Estimate!!Percent Female!!Total population!!PERCENT ALLOCATED!!Sex,Margin of Error!!Percent Female!!Total population!!PERCENT ALLOCATED!!Sex,Annotation of Margin of Error!!Percent Female!!Total population!!PERCENT ALLOCATED!!Sex,Estimate!!Percent Female!!Total population!!PERCENT ALLOCATED!!Age,Annotation of Estimate!!Percent Female!!Total population!!PERCENT ALLOCATED!!Age,Margin of Error!!Percent Female!!Total population!!PERCENT ALLOCATED!!Age,Annotation of Margin of Error!!Percent Female!!Total population!!PERCENT ALLOCATED!!Age,Unnamed: 914
0,1400000US48029110100,"Census Tract 1101, Bexar County, Texas",3153,504,,,70,,74,,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),
1,1400000US48029110300,"Census Tract 1103, Bexar County, Texas",3114,638,,,276,,180,,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),
2,1400000US48029110500,"Census Tract 1105, Bexar County, Texas",2430,392,,,540,,207,,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),
3,1400000US48029110600,"Census Tract 1106, Bexar County, Texas",5645,1041,,,207,,141,,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),
4,1400000US48029110700,"Census Tract 1107, Bexar County, Texas",1079,207,,,43,,35,,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
370,1400000US48029980002,"Census Tract 9800.02, Bexar County, Texas",0,14,,,0,,14,,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),
371,1400000US48029980003,"Census Tract 9800.03, Bexar County, Texas",1059,286,,,84,,31,,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),
372,1400000US48029980004,"Census Tract 9800.04, Bexar County, Texas",0,14,,,0,,14,,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),
373,1400000US48029980005,"Census Tract 9800.05, Bexar County, Texas",0,14,,,0,,14,,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),


In [18]:
# Boom -- now reduce Geography to its last six characters (the tract code).
s0101['Geography'] = s0101['Geography'].str.slice(start=-6)
s0101.head()

Unnamed: 0,Geography,Geographic Area Name,Estimate!!Total!!Total population,Margin of Error!!Total!!Total population,Annotation of Margin of Error!!Total!!Total population,Annotation of Estimate!!Total!!Total population,Estimate!!Total!!Total population!!AGE!!Under 5 years,Annotation of Estimate!!Total!!Total population!!AGE!!Under 5 years,Margin of Error!!Total!!Total population!!AGE!!Under 5 years,Annotation of Margin of Error!!Total!!Total population!!AGE!!Under 5 years,...,Annotation of Margin of Error!!Percent Female!!Total population!!SUMMARY INDICATORS!!Child dependency ratio,Estimate!!Percent Female!!Total population!!PERCENT ALLOCATED!!Sex,Annotation of Estimate!!Percent Female!!Total population!!PERCENT ALLOCATED!!Sex,Margin of Error!!Percent Female!!Total population!!PERCENT ALLOCATED!!Sex,Annotation of Margin of Error!!Percent Female!!Total population!!PERCENT ALLOCATED!!Sex,Estimate!!Percent Female!!Total population!!PERCENT ALLOCATED!!Age,Annotation of Estimate!!Percent Female!!Total population!!PERCENT ALLOCATED!!Age,Margin of Error!!Percent Female!!Total population!!PERCENT ALLOCATED!!Age,Annotation of Margin of Error!!Percent Female!!Total population!!PERCENT ALLOCATED!!Age,Unnamed: 914
0,110100,"Census Tract 1101, Bexar County, Texas",3153,504,,,70,,74,,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),
1,110300,"Census Tract 1103, Bexar County, Texas",3114,638,,,276,,180,,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),
2,110500,"Census Tract 1105, Bexar County, Texas",2430,392,,,540,,207,,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),
3,110600,"Census Tract 1106, Bexar County, Texas",5645,1041,,,207,,141,,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),
4,110700,"Census Tract 1107, Bexar County, Texas",1079,207,,,43,,35,,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),


In [19]:
# Boolean mask over the columns: True where the name contains 'Margin' or 'Annotation'.
s0101.columns.str.contains('Margin|Annotation')

array([False, False, False,  True,  True,  True, False,  True,  True,
        True, False,  True,  True,  True, False,  True,  True,  True,
       False,  True,  True,  True, False,  True,  True,  True, False,
        True,  True,  True, False,  True,  True,  True, False,  True,
        True,  True, False,  True,  True,  True, False,  True,  True,
        True, False,  True,  True,  True, False,  True,  True,  True,
       False,  True,  True,  True, False,  True,  True,  True, False,
        True,  True,  True, False,  True,  True,  True, False,  True,
        True,  True, False,  True,  True,  True, False,  True,  True,
        True, False,  True,  True,  True, False,  True,  True,  True,
       False,  True,  True,  True, False,  True,  True,  True, False,
        True,  True,  True, False,  True,  True,  True, False,  True,
        True,  True, False,  True,  True,  True, False,  True,  True,
        True, False,  True,  True,  True, False,  True,  True,  True,
       False,  True,

In [20]:
# I'm going to drop most of these features, but there are so many that I will
# start by dropping all of the margin-of-error and annotation columns.
# filtered_cols is a boolean mask over the column names.
filtered_cols = s0101.columns.str.contains('Margin|Annotation')
cols_to_drop = s0101.columns[filtered_cols].to_list()
s0101 = s0101.drop(columns=cols_to_drop)
# NOTE(review): 'Unnamed: 914' does not match the pattern, so it is kept.
s0101

Unnamed: 0,Geography,Geographic Area Name,Estimate!!Total!!Total population,Estimate!!Total!!Total population!!AGE!!Under 5 years,Estimate!!Total!!Total population!!AGE!!5 to 9 years,Estimate!!Total!!Total population!!AGE!!10 to 14 years,Estimate!!Total!!Total population!!AGE!!15 to 19 years,Estimate!!Total!!Total population!!AGE!!20 to 24 years,Estimate!!Total!!Total population!!AGE!!25 to 29 years,Estimate!!Total!!Total population!!AGE!!30 to 34 years,...,Estimate!!Percent Female!!Total population!!SELECTED AGE CATEGORIES!!65 years and over,Estimate!!Percent Female!!Total population!!SELECTED AGE CATEGORIES!!75 years and over,Estimate!!Percent Female!!Total population!!SUMMARY INDICATORS!!Median age (years),Estimate!!Percent Female!!Total population!!SUMMARY INDICATORS!!Sex ratio (males per 100 females),Estimate!!Percent Female!!Total population!!SUMMARY INDICATORS!!Age dependency ratio,Estimate!!Percent Female!!Total population!!SUMMARY INDICATORS!!Old-age dependency ratio,Estimate!!Percent Female!!Total population!!SUMMARY INDICATORS!!Child dependency ratio,Estimate!!Percent Female!!Total population!!PERCENT ALLOCATED!!Sex,Estimate!!Percent Female!!Total population!!PERCENT ALLOCATED!!Age,Unnamed: 914
0,110100,"Census Tract 1101, Bexar County, Texas",3153,70,45,35,33,267,564,251,...,19.6,10.1,(X),(X),(X),(X),(X),(X),(X),
1,110300,"Census Tract 1103, Bexar County, Texas",3114,276,267,128,209,170,208,437,...,8.5,0.6,(X),(X),(X),(X),(X),(X),(X),
2,110500,"Census Tract 1105, Bexar County, Texas",2430,540,258,136,115,156,204,174,...,11.1,4.7,(X),(X),(X),(X),(X),(X),(X),
3,110600,"Census Tract 1106, Bexar County, Texas",5645,207,86,114,256,476,726,603,...,25.1,12.9,(X),(X),(X),(X),(X),(X),(X),
4,110700,"Census Tract 1107, Bexar County, Texas",1079,43,73,60,36,76,26,46,...,27.0,14.0,(X),(X),(X),(X),(X),(X),(X),
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
370,980002,"Census Tract 9800.02, Bexar County, Texas",0,0,0,0,0,0,0,0,...,-,-,(X),(X),(X),(X),(X),(X),(X),
371,980003,"Census Tract 9800.03, Bexar County, Texas",1059,84,72,40,225,244,150,77,...,0.0,0.0,(X),(X),(X),(X),(X),(X),(X),
372,980004,"Census Tract 9800.04, Bexar County, Texas",0,0,0,0,0,0,0,0,...,-,-,(X),(X),(X),(X),(X),(X),(X),
373,980005,"Census Tract 9800.05, Bexar County, Texas",0,0,0,0,0,0,0,0,...,-,-,(X),(X),(X),(X),(X),(X),(X),


In [21]:
# OK, that dropped from 915 columns to 231. Now I'm just going to look at the
# list for some features I may want.
cols = s0101.columns.to_list()
cols

['Geography',
 'Geographic Area Name',
 'Estimate!!Total!!Total population',
 'Estimate!!Total!!Total population!!AGE!!Under 5 years',
 'Estimate!!Total!!Total population!!AGE!!5 to 9 years',
 'Estimate!!Total!!Total population!!AGE!!10 to 14 years',
 'Estimate!!Total!!Total population!!AGE!!15 to 19 years',
 'Estimate!!Total!!Total population!!AGE!!20 to 24 years',
 'Estimate!!Total!!Total population!!AGE!!25 to 29 years',
 'Estimate!!Total!!Total population!!AGE!!30 to 34 years',
 'Estimate!!Total!!Total population!!AGE!!35 to 39 years',
 'Estimate!!Total!!Total population!!AGE!!40 to 44 years',
 'Estimate!!Total!!Total population!!AGE!!45 to 49 years',
 'Estimate!!Total!!Total population!!AGE!!50 to 54 years',
 'Estimate!!Total!!Total population!!AGE!!55 to 59 years',
 'Estimate!!Total!!Total population!!AGE!!60 to 64 years',
 'Estimate!!Total!!Total population!!AGE!!65 to 69 years',
 'Estimate!!Total!!Total population!!AGE!!70 to 74 years',
 'Estimate!!Total!!Total population!!AGE!

In [22]:
# Columns to keep from the S0101 table: the tract code, the total population
# and three summary indicators.
s0101_cols = ['Geography', 'Estimate!!Total!!Total population',
              'Estimate!!Total!!Total population!!SUMMARY INDICATORS!!Sex ratio (males per 100 females)',
              'Estimate!!Total!!Total population!!SUMMARY INDICATORS!!Old-age dependency ratio',
              'Estimate!!Total!!Total population!!SUMMARY INDICATORS!!Child dependency ratio']
# Take an explicit copy: df is modified in later cells, and working on an
# independent frame (rather than a selection of s0101) avoids pandas'
# SettingWithCopyWarning when columns are assigned on it.
df = s0101[s0101_cols].copy()
df.head()

Unnamed: 0,Geography,Estimate!!Total!!Total population,Estimate!!Total!!Total population!!SUMMARY INDICATORS!!Sex ratio (males per 100 females),Estimate!!Total!!Total population!!SUMMARY INDICATORS!!Old-age dependency ratio,Estimate!!Total!!Total population!!SUMMARY INDICATORS!!Child dependency ratio
0,110100,3153,161.7,29.0,6.4
1,110300,3114,99.9,13.0,43.4
2,110500,2430,100.3,25.4,89.8
3,110600,5645,275.1,15.0,11.6
4,110700,1079,129.1,49.2,35.8


In [23]:
def _simplify_label(label):
    """Return a short lower-case name for one census column label.

    The label is lower-cased, its '!!' separators become '_', and then the
    'estimate_total_', 'summary indicators_' and 'total population_'
    fragments are removed, in that order.
    """
    label = label.lower().replace('!!', '_')
    for fragment in ('estimate_total_', 'summary indicators_', 'total population_'):
        label = label.replace(fragment, '')
    return label


df.columns = [_simplify_label(col) for col in df.columns]

In [24]:
# Preview the frame with its shortened column names.
df.head()

Unnamed: 0,geography,total population,sex ratio (males per 100 females),old-age dependency ratio,child dependency ratio
0,110100,3153,161.7,29.0,6.4
1,110300,3114,99.9,13.0,43.4
2,110500,2430,100.3,25.4,89.8
3,110600,5645,275.1,15.0,11.6
4,110700,1079,129.1,49.2,35.8


In [25]:
# Size (rows, columns) of the reduced frame.
# NOTE(review): the s0101 previews after the skiprows=[0,2] import end at
# index 373 (374 rows), yet the recorded output here shows 375 rows --
# re-run the notebook top to bottom to confirm which is current.
df.shape

(375, 5)

In [26]:
# ok -- df now holds the tract code plus four S0101 estimate columns.