# Data Exploration

Goals: 
1. Learn and understand the structure of PUMAS and how to join data to them
2. Create a GeoJSON file with the most common educational attainment level by PUMA, County, State.

In [8]:
import gzip
import pandas as pd
import geopandas as gpd
import json
from zipfile import ZipFile
import io
import shapefile

In [2]:
with gzip.open("../data/raw/usa_00041.dta.gz", "rb") as file:
    df = pd.read_stata(file)

In [3]:
print(df.shape)
df.head()

(1839285, 21)


Unnamed: 0,year,multyear,datanum,serial,cbserial,hhwt,statefip,countyfips,puma,gq,...,perwt,sex,age,bpl,bpld,citizen,language,languaged,educ,educd
0,2016,2012,5,3,2012000000000.0,22,georgia,0,100,households under 1970 definition,...,33,female,34,georgia,georgia,,english,english,4 years of college,bachelor's degree
1,2016,2012,5,13,2012000000000.0,8,georgia,0,1007,households under 1970 definition,...,8,female,33,georgia,georgia,,english,english,grade 12,regular high school diploma
2,2016,2012,5,14,2012000000000.0,2,california,29,2901,households under 1970 definition,...,2,female,30,california,california,,english,english,grade 12,ged or alternative credential
3,2016,2012,5,26,2012000000000.0,12,georgia,0,2002,households under 1970 definition,...,13,female,34,colorado,colorado,,spanish,spanish,1 year of college,"1 or more years of college credit, no degree"
4,2016,2012,5,28,2012000000000.0,3,california,73,7313,other group quarters,...,3,male,28,california,california,,english,english,grade 12,ged or alternative credential


In [4]:
zipfile = ZipFile("../data/raw/cb_2017_us_county_500k.zip")

In [10]:
zipfile.extractall("../data/interim/county_data/")

In [11]:
gdf = gpd.read_file("../data/interim/county_data/cb_2017_us_county_500k.shp")

In [12]:
gdf.head()

Unnamed: 0,STATEFP,COUNTYFP,COUNTYNS,AFFGEOID,GEOID,NAME,LSAD,ALAND,AWATER,geometry
0,1,5,161528,0500000US01005,1005,Barbour,6,2292144656,50538698,"POLYGON ((-85.74803199999999 31.619181, -85.74..."
1,1,23,161537,0500000US01023,1023,Choctaw,6,2365869837,19144469,"POLYGON ((-88.47322699999999 31.893856, -88.46..."
2,1,35,161543,0500000US01035,1035,Conecuh,6,2201948618,6643480,"POLYGON ((-87.42720399999999 31.26436, -87.425..."
3,1,51,161551,0500000US01051,1051,Elmore,6,1601762124,99965171,"POLYGON ((-86.41333499999999 32.750591, -86.37..."
4,1,65,161558,0500000US01065,1065,Hale,6,1667907107,32423356,"POLYGON ((-87.870464 32.762442, -87.868184 32...."


In [13]:
gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 3233 entries, 0 to 3232
Data columns (total 10 columns):
STATEFP     3233 non-null object
COUNTYFP    3233 non-null object
COUNTYNS    3233 non-null object
AFFGEOID    3233 non-null object
GEOID       3233 non-null object
NAME        3233 non-null object
LSAD        3233 non-null object
ALAND       3233 non-null int64
AWATER      3233 non-null int64
geometry    3233 non-null object
dtypes: int64(2), object(8)
memory usage: 252.7+ KB


In [15]:
df['statefips_code'] = df['statefip'].cat.codes.astype(str).str.rjust(2, "0")

df['countyfips'] = df['countyfips'].astype(str).str.rjust(3, "0")

df['GEOID'] = df['statefips_code'] + df['countyfips']

In [16]:
df.head()

Unnamed: 0,year,multyear,datanum,serial,cbserial,hhwt,statefip,countyfips,puma,gq,...,age,bpl,bpld,citizen,language,languaged,educ,educd,statefips_code,GEOID
0,2016,2012,5,3,2012000000000.0,22,georgia,0,100,households under 1970 definition,...,34,georgia,georgia,,english,english,4 years of college,bachelor's degree,10,10000
1,2016,2012,5,13,2012000000000.0,8,georgia,0,1007,households under 1970 definition,...,33,georgia,georgia,,english,english,grade 12,regular high school diploma,10,10000
2,2016,2012,5,14,2012000000000.0,2,california,29,2901,households under 1970 definition,...,30,california,california,,english,english,grade 12,ged or alternative credential,4,4029
3,2016,2012,5,26,2012000000000.0,12,georgia,0,2002,households under 1970 definition,...,34,colorado,colorado,,spanish,spanish,1 year of college,"1 or more years of college credit, no degree",10,10000
4,2016,2012,5,28,2012000000000.0,3,california,73,7313,other group quarters,...,28,california,california,,english,english,grade 12,ged or alternative credential,4,4073


Creating an education variable

In [17]:
df.loc[df['educd'] < 'regular high school diploma', 'education_level'] = 'No HS'

df.loc[((df['educd'] == 'regular high school diploma') | (df['educd'] == 'ged or alternative credential')), 'education_level'] = 'HS'

df.loc[(df['educd'] >= 'some college, but less than 1 year') & (df['educd'] <= "associate's degree, type not specified"), 'education_level'] = 'Some college/AA'

df.loc[df['educd'] == "bachelor's degree", 'education_level'] = 'B.A.'

df.loc[df['educd'] > "bachelor's degree", 'education_level'] = 'Advanced degree'


In [18]:
df['education_level'].isnull().sum()

0

In [19]:
education_by_puma = df.groupby(['GEOID', 'education_level'])['perwt'].sum().to_frame()

In [20]:
shares = education_by_puma.groupby(level=0).apply(lambda x: x / float(x.sum()))

In [21]:
most_common_educ = shares.groupby(level = 0)['perwt'].apply(lambda x: x.argmax()[1]).to_frame()
numbers = shares.groupby(level = 0)['perwt'].max()

will be corrected to return the positional maximum in the future.
Use 'series.values.argmax' to get the position of the maximum now.
  """Entry point for launching an IPython kernel.


In [22]:
most_common_educ['share'] = numbers

most_common_educ.columns = ['education_level', 'share']
most_common_educ.reset_index(inplace = True)
most_common_educ.head()

Unnamed: 0,GEOID,education_level,share
0,0,Some college/AA,0.344062
1,3,Some college/AA,0.333148
2,15,Some college/AA,0.375766
3,55,Some college/AA,0.367613
4,73,Some college/AA,0.309765


In [23]:
gdf['centroid_lon'] = gdf['geometry'].centroid.x
gdf['centroid_lat'] = gdf['geometry'].centroid.y

In [24]:
ba_shares = shares.xs('B.A.', level = 1).reset_index()

ba_shares.columns = ['GEOID', 'Shares of BA']

In [25]:
geo_data = gdf.merge(most_common_educ, how = 'left', on = 'GEOID', )

In [38]:
## We'll focus on BAs
geo_ba = gdf.merge(ba_shares, how = 'left', on = 'GEOID')
geo_ba['Shares of BA'].fillna(0, inplace = True)


In [41]:
geo_ba.columns

Index(['STATEFP', 'COUNTYFP', 'COUNTYNS', 'AFFGEOID', 'GEOID', 'NAME', 'LSAD',
       'ALAND', 'AWATER', 'geometry', 'centroid_lon', 'centroid_lat',
       'Shares of BA'],
      dtype='object')

In [42]:
ca_ba = geo_ba[geo_ba['STATEFP'] == '06']

In [43]:
ca_ba.shape

(58, 13)

Now we can create a GeoJSON for the choropleth.

In [44]:
# choro_data = json.loads(geo_ba.to_json())

In [45]:
ca_data = json.loads(ca_ba.to_json())

In [46]:
with open("../data/interim/ca_ba.geojson", "w") as file:
    json.dump(ca_data, file, indent=4)

In [73]:
# with open("../data/interim/ba_pumas.geojson", "w") as file:
#     json.dump(choro_data, file, indent=4)

***
Just to check we can try to visualize it right now,

In [59]:
ca_ba.head()

Unnamed: 0,STATEFP,COUNTYFP,COUNTYNS,AFFGEOID,GEOID,NAME,LSAD,ALAND,AWATER,geometry,centroid_lon,centroid_lat,Shares of BA
18,6,1,1675839,0500000US06001,6001,Alameda,6,1909616630,216916717,"POLYGON ((-122.342253 37.805558, -122.33411840...",-121.88887,37.646895,0.292831
19,6,5,1675841,0500000US06005,6005,Amador,6,1539933576,29470568,"POLYGON ((-121.027406 38.50354, -121.027472 38...",-120.65109,38.446392,0.285289
20,6,13,1675903,0500000US06013,6013,Contra Costa,6,1857310903,225193562,"POLYGON ((-122.42976 37.965405, -122.418592 37...",-121.927786,37.919123,0.290183
21,6,23,1681908,0500000US06023,6023,Humboldt,6,9241251740,1254039383,"POLYGON ((-124.408601 40.44320099999999, -124....",-123.875629,40.699297,0.0
22,6,37,277283,0500000US06037,6037,Los Angeles,6,10510588451,1794793532,"(POLYGON ((-118.604415 33.478552, -118.598783 ...",-118.224817,34.320751,0.0


In [48]:
import altair as alt

data = alt.Data(values = ca_data['features'])

alt.Chart(data).mark_geoshape().encode(
    color = 'properties.Shares of BA:Q'
)

In [62]:
ca_ba.plot(column = 'Shares of BA', cmap='OrRd', scheme='quantiles');

  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


In [63]:
alt.Chart(data).mark_

STATEFP          object
COUNTYFP         object
COUNTYNS         object
AFFGEOID         object
GEOID            object
NAME             object
LSAD             object
ALAND             int64
AWATER            int64
geometry         object
centroid_lon    float64
centroid_lat    float64
Shares of BA    float64
dtype: object