# How many institutions are found in counties with majority underrepresented groups?

## What are the characteristics of those institutions?

In [1]:
import pandas as pd
import geopandas as gpd
import geoplot as gplt
import geoplot.crs as gcrs
import matplotlib.pyplot as plt
from pathlib import Path

from tools import tree
from datetime import datetime as dt
# Run date as day-abbreviated month-two-digit year, e.g. 21-Apr-20.
today = format(dt.today(), "%d-%b-%y")

today

'21-Apr-20'

In [2]:
# Every data directory hangs off the same root, one level above the notebook.
_DATA_ROOT = Path("..") / "data"

RAW_DATA = _DATA_ROOT / "raw"
INTERIM_DATA = _DATA_ROOT / "interim"
PROCESSED_DATA = _DATA_ROOT / "processed"
FINAL_DATA = _DATA_ROOT / "final"
EXTERNAL_DATA = _DATA_ROOT / "external"

In [3]:
# List what is available under the processed-data directory (project helper from `tools`).
tree(PROCESSED_DATA)

+ ../data/processed
    + counties.csv
    + geodata
        + tl_2019_us_county.cpg
        + tl_2019_us_county.dbf
        + tl_2019_us_county.prj
        + tl_2019_us_county.shp
        + tl_2019_us_county.shp.ea.iso.xml
        + tl_2019_us_county.shp.iso.xml
        + tl_2019_us_county.shx
    + institutional_characteristics.csv
    + processed_data.csv


In [4]:
county_data = pd.read_csv(PROCESSED_DATA / 'counties.csv')

In [None]:
county_data.head().T

In [None]:
institutions_data = pd.read_csv(PROCESSED_DATA / 'institutional_characteristics.csv')

In [None]:
institutions_data.head().T

In [None]:
county_shapes = gpd.read_file(PROCESSED_DATA / 'geodata' / 'tl_2019_us_county.shp')

In [None]:
county_shapes.head().T

In [None]:
import us

In [None]:
# FIPS codes of the contiguous states, as reported by the `us` package.
# NOTE(review): whether DC is included in STATES_CONTIGUOUS depends on the
# `us` version/configuration — confirm it is handled as intended.
contiguous_fips = [contiguous_state.fips for contiguous_state in us.STATES_CONTIGUOUS]

# Keep only the county shapes whose state FIPS code is in that list.
mask_contiguous_fips = county_shapes['STATEFP'].isin(contiguous_fips)
county_shapes = county_shapes.loc[mask_contiguous_fips]

In [None]:
# Names of the contiguous states, as reported by the `us` package.
contiguous_states = [contiguous_state.name for contiguous_state in us.STATES_CONTIGUOUS]

# NOTE(review): despite its name, 'fips_state_code' is matched against state
# *names* here — confirm the column holds names rather than numeric FIPS codes.
mask_contiguous_states = institutions_data['fips_state_code'].isin(contiguous_states)
institutions_data = institutions_data.loc[mask_contiguous_states]

### Which counties have a majority-underrepresented population?

In [None]:
county_data.head().T

In [None]:
county_data['share_underrepresented'] = (county_data['black_alone'] + 
    county_data['american_indian_and_alaska_native'] + 
    county_data['native_hawaiian_and_pacific_islander'] + 
    county_data['latino_alone']) / county_data['universe']

In [None]:
county_data.head().T

In [None]:
subset_county_data = county_data[['geoid', 'name', 'share_underrepresented']].copy()
subset_county_shapes = county_shapes[['GEOID', 'NAME', 'geometry']].copy()

In [None]:
subset_county_data['geoid'] = subset_county_data['geoid'].astype(str).str.zfill(5)

In [None]:
subset_county_data.set_index('geoid', inplace = True)
subset_county_shapes.set_index('GEOID', inplace = True)

In [None]:
working_gdf = subset_county_shapes.join(subset_county_data)

In [None]:
# Choropleth of the underrepresented share for every county in working_gdf.
gplt.choropleth(
    working_gdf,
    hue='share_underrepresented',
    projection=gcrs.WebMercator(),
);

In [None]:
# Point geometry for every institution, built from its longitude/latitude columns.
# A copy is passed so that adding the geometry column cannot alias institutions_data.
# The points are given an explicit CRS and reprojected to the county layer's CRS so
# the later spatial join compares geometries in the same reference system.
# NOTE(review): assumes the coordinates are WGS84 (EPSG:4326) — confirm against the data source.
geo_institutions = gpd.GeoDataFrame(
    institutions_data.copy(),
    geometry=gpd.points_from_xy(institutions_data['longitude'], institutions_data['latitude']),
    crs='EPSG:4326',
).to_crs(county_shapes.crs)

In [None]:
# Same choropleth, larger, with every institution overlaid as a small red dot.
ax = gplt.choropleth(
    working_gdf,
    hue='share_underrepresented',
    projection=gcrs.WebMercator(),
    figsize=(12, 12),
)
gplt.pointplot(
    geo_institutions,
    ax=ax,
    color='red',
    alpha=.3,
    s=2,
    zorder=3,  # draw the points above the county polygons
);

1. Build a mask that keeps only the majority-underrepresented counties.
2. Spatially join the institutions located within those counties.

In [None]:
mask_majority_underrepresented = working_gdf['share_underrepresented'] > 0.50

In [None]:
working_gdf[mask_majority_underrepresented].plot();

In [None]:
majority_underrepresented = working_gdf[mask_majority_underrepresented].copy()

In [None]:
gpd.sjoin(geo_institutions, majority_underrepresented, how='inner', op='intersects').plot();

In [None]:
institutions_in_majority_underrepresented_counties = gpd.sjoin(geo_institutions, majority_underrepresented, how='inner', op='intersects')

In [None]:
# Choropleth of all counties, overlaid with only the institutions that fall in
# majority-underrepresented counties.
ax = gplt.choropleth(
    working_gdf,
    hue='share_underrepresented',
    projection=gcrs.WebMercator(),
    figsize=(12, 12),
)
gplt.pointplot(
    institutions_in_majority_underrepresented_counties,
    ax=ax,
    color='red',
    alpha=.5,
    s=3,
    zorder=3,  # draw the points above the county polygons
);

In [None]:
# (rows, columns) of the majority-underrepresented county table.
majority_underrepresented.shape

In [None]:
# (rows, columns) of the full institution table, for comparison.
geo_institutions.shape

In [None]:
# (rows, columns) of the institutions matched to majority-underrepresented counties.
institutions_in_majority_underrepresented_counties.shape

In [32]:
# Proportion of institutions by 'control' within majority-underrepresented counties.
institutions_in_majority_underrepresented_counties['control'].value_counts(normalize=True)

Public                    0.384501
Private not-for-profit    0.323398
Private for-profit        0.292101
Name: control, dtype: float64

In [33]:
# Baseline: proportion of institutions by 'control' across all institutions in geo_institutions.
geo_institutions['control'].value_counts(normalize = True)

Public                    0.428878
Private not-for-profit    0.381911
Private for-profit        0.189211
Name: control, dtype: float64

In [34]:
# Proportion of institutions by 'level' within majority-underrepresented counties.
institutions_in_majority_underrepresented_counties['level'].value_counts(normalize=True)

 4-year or above    0.590164
 2-year             0.409836
Name: level, dtype: float64

In [35]:
# Baseline: proportion of institutions by 'level' across all institutions in geo_institutions.
geo_institutions['level'].value_counts(normalize = True)

 4-year or above    0.643317
 2-year             0.356683
Name: level, dtype: float64

In [36]:
# Baseline: mean of 'total_enrollment' across all institutions in geo_institutions.
geo_institutions['total_enrollment'].mean()

5211.410800644815

In [37]:
# Mean of 'total_enrollment' for institutions in majority-underrepresented counties.
institutions_in_majority_underrepresented_counties['total_enrollment'].mean()

5497.3288490284

In [38]:
def _share_full_time(frame):
    """Return full-time enrollment divided by total enrollment, row by row."""
    return frame['full-time_enrollment'] / frame['total_enrollment']


# Add the same derived column to both tables so their means can be compared.
geo_institutions['share_full-time'] = _share_full_time(geo_institutions)
institutions_in_majority_underrepresented_counties['share_full-time'] = _share_full_time(
    institutions_in_majority_underrepresented_counties
)

In [39]:
geo_institutions['share_full-time'].mean()

0.6744567999987309

In [40]:
institutions_in_majority_underrepresented_counties['share_full-time'].mean()

0.6911874144129944