# RQ1: How many higher education institutions are found in counties with majority underrepresented groups?
Answer: 659

## What are the characteristics of those institutions?
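- A larger share are private for-profit than among all institutions analyzed
- A larger share are 2-year (vs 4-year or above) than among all institutions analyzed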

In [None]:
import pandas as pd
import geopandas as gpd
import geoplot as gplt
import geoplot.crs as gcrs
import matplotlib.pyplot as plt

from tools import tree
from pathlib import Path
from datetime import datetime as dt
# Date stamp for this run in day-month-year form (e.g. "05-Mar-21").
today = f"{dt.today():%d-%b-%y}"

# Echo the stamp so it is recorded in the notebook output.
today

In [None]:
# Data folders, all relative to the notebook's working directory.
RAW_DATA = Path("..", "data", "raw")
INTERIM_DATA = Path("..", "data", "interim")
PROCESSED_DATA = Path("..", "data", "processed")
FINAL_DATA = Path("..", "data", "final")
EXTERNAL_DATA = Path("..", "data", "external")

In [None]:
# Inspect the processed-data folder before loading from it.
# (`tree` comes from the local `tools` module; its exact output is not visible from this file.)
tree(PROCESSED_DATA)

In [None]:
# Load the three processed inputs: county-level table, institution-level table,
# and the 2019 county shapefile.
counties_data = pd.read_csv(PROCESSED_DATA.joinpath('counties.csv'))
institutions_data = pd.read_csv(PROCESSED_DATA.joinpath('institutions_data.csv'))
counties_shapes = gpd.read_file(
    PROCESSED_DATA.joinpath('geodata', 'tl_2019_us_county.shp')
)

In [None]:
# Preview the first rows of the county table, transposed so each column reads as a row.
counties_data.head().transpose()

In [None]:
# Preview the first rows of the institution table, transposed so each column reads as a row.
institutions_data.head().transpose()

In [None]:
# Preview the first rows of the county shapes, transposed so each column reads as a row.
counties_shapes.head().transpose()

In [None]:
# Look at five randomly chosen county records (a different draw on every run).
counties_shapes.sample(n=5)

In [None]:
# Quick visual check of every county geometry that was loaded.
# The trailing semicolon keeps the notebook from echoing the returned object.
counties_shapes.plot();

In [None]:
import us

In [None]:
# FIPS codes of the contiguous states, as listed by the `us` package.
# NOTE(review): confirm whether us.STATES_CONTIGUOUS includes DC in the installed
# version; if it does not, DC is dropped by every filter built from this list.
contiguous_fips = [
    contiguous_state.fips for contiguous_state in us.STATES_CONTIGUOUS
]

In [None]:
# True for each county whose state FIPS code is in the contiguous list.
county_state_fips = counties_shapes['STATEFP']
mask_contiguous_fips = county_state_fips.isin(contiguous_fips)

In [None]:
# Display the boolean county mask for inspection.
mask_contiguous_fips

In [None]:
# Keep only the counties flagged as belonging to a contiguous state.
counties_shapes = counties_shapes.loc[mask_contiguous_fips]

In [None]:
# Lookup from state name to state FIPS code, built by the `us` package.
name_to_fips_map = us.states.mapping("name", "fips")

In [None]:
# Convert the state names stored in `fips_state_code` into state FIPS codes.
# The column is overwritten in place, so a plain .map() is not re-runnable: on a
# second execution the values are already FIPS codes, none of them match a state
# name, and the whole column becomes NaN (which then empties the contiguous-state
# filter). Falling back to the existing value for anything the map does not know
# keeps already-converted codes intact and makes the cell idempotent.
state_values = institutions_data['fips_state_code']
institutions_data['fips_state_code'] = (
    state_values.map(name_to_fips_map).fillna(state_values)
)

In [None]:
# True for each institution whose state FIPS code is in the contiguous list.
institution_state_fips = institutions_data['fips_state_code']
mask_contiguous_fips_institutions = institution_state_fips.isin(contiguous_fips)

In [None]:
# Keep only the institutions flagged as located in a contiguous state.
institutions_data = institutions_data.loc[mask_contiguous_fips_institutions]

In [None]:
# Preview the first rows of the county table before deriving the share column.
counties_data.head()

In [None]:
# Share of each county's `universe` count made up of the four groups treated
# here as underrepresented. Groups are summed in the same order as before, so
# missing values propagate exactly as with the `+` operator.
underrepresented_total = (
    counties_data['black_alone']
    .add(counties_data['latino_alone'])
    .add(counties_data['american_indian_and_alaska_native'])
    .add(counties_data['native_hawaiian_and_pacific_islander'])
)
counties_data['share_underrepresented'] = underrepresented_total.div(
    counties_data['universe']
)

In [None]:
# Check that the new share_underrepresented column is present (transposed preview).
counties_data.head().T

In [None]:
# Trim both tables to the join key, the county name, and the one payload column
# each contributes. Copies are taken so later edits do not touch the originals.
data_columns = ['geoid', 'name', 'share_underrepresented']
shape_columns = ['GEOID', 'NAME', 'geometry']
subset_counties_data = counties_data[data_columns].copy()
subset_counties_shapes = counties_shapes[shape_columns].copy()

In [None]:
# Render geoid as text, left-padded with zeros to five characters.
# (Presumably this is to match the shapefile's GEOID strings, which become the join key.)
geoid_as_text = subset_counties_data['geoid'].astype(str)
subset_counties_data['geoid'] = geoid_as_text.str.zfill(5)

In [None]:
# Index both tables by county identifier so they can be joined on the index.
subset_counties_shapes = subset_counties_shapes.set_index(keys='GEOID')
subset_counties_data = subset_counties_data.set_index(keys='geoid')

In [None]:
# Attach the county statistics to the county shapes by index. This is a left
# join: every shape is kept, and shapes with no matching statistics get NaN.
working_gdf = subset_counties_shapes.join(subset_counties_data, how='left')

In [None]:
# Colour each county by its share of underrepresented residents (default geopandas plot).
working_gdf.plot(column = 'share_underrepresented');

In [None]:
# Same map drawn with geoplot, using a Web Mercator projection.
gplt.choropleth(working_gdf, projection=gcrs.WebMercator(), hue = 'share_underrepresented'); 

In [None]:
# Turn the institution table into a GeoDataFrame with one point per institution,
# built from its longitude (x) and latitude (y) columns.
institution_points = gpd.points_from_xy(
    x=institutions_data['longitude'],
    y=institutions_data['latitude'],
)
geo_institutions = gpd.GeoDataFrame(institutions_data, geometry=institution_points)

In [None]:
# Show the coordinate reference system carried by the county layer.
working_gdf.crs

In [None]:
# Label the institution points with the county layer's CRS so the two layers
# can be overlaid and spatially joined.
# NOTE(review): this only tags the points, it does not reproject any coordinates,
# so it assumes the longitude/latitude columns are already expressed in that
# CRS. Confirm against the institutions data source.
geo_institutions.crs = working_gdf.crs

In [None]:
# Confirm the institution layer now reports the same CRS as the county layer.
geo_institutions.crs

In [None]:
# Plot the institution points on their own as a sanity check of the coordinates.
gplt.pointplot(geo_institutions)

In [None]:
# County choropleth of the underrepresented share, with every institution
# overlaid as a small translucent red dot drawn above the polygons.
ax = gplt.choropleth(
    working_gdf,
    hue='share_underrepresented',
    projection=gcrs.WebMercator(),
    figsize=(12, 12),
)
gplt.pointplot(
    geo_institutions,
    ax=ax,
    color="red",
    alpha=0.3,
    s=2,
    zorder=3,
)

In [None]:
# True where underrepresented groups make up strictly more than half of the county.
# Counties with a missing share compare as False.
mask_majority_underrepresented = working_gdf['share_underrepresented'].gt(0.50)

In [None]:
# Map only the counties where the underrepresented share exceeds 50%.
working_gdf[mask_majority_underrepresented].plot();

In [None]:
# Map the complement: every county not flagged as majority underrepresented.
# This includes any county whose share is missing, since the mask is False there.
working_gdf[~mask_majority_underrepresented].plot();

In [None]:
# Independent copy of the majority-underrepresented counties for the spatial join.
majority_underrepresented = working_gdf.loc[mask_majority_underrepresented].copy()

In [None]:
# Preview the first majority-underrepresented counties.
majority_underrepresented.head()

In [None]:
# Spatially join each institution point to the majority-underrepresented county
# it falls in. The inner join keeps only institutions that match such a county.
# "intersects" is sjoin's default spatial test, so it is left implicit here:
# the keyword naming it was renamed from `op` to `predicate` in geopandas 0.10
# and `op` was later removed, so spelling it out breaks on one side of that
# change or the other.
institutions_in_majority_underrepresented = gpd.sjoin(
    geo_institutions,
    majority_underrepresented,
    how="inner",
)

In [None]:
# Same county choropleth as before, but overlaying only the institutions located
# in majority-underrepresented counties.
ax = gplt.choropleth(
    working_gdf,
    hue='share_underrepresented',
    projection=gcrs.WebMercator(),
    figsize=(12, 12),
)
gplt.pointplot(
    institutions_in_majority_underrepresented,
    ax=ax,
    color="red",
    alpha=0.3,
    s=2,
    zorder=3,
)

In [None]:
# (rows, columns): the row count is the number of majority-underrepresented counties.
majority_underrepresented.shape

In [None]:
# (rows, columns): the row count is the number of counties kept after the contiguous-state filter.
counties_shapes.shape

In [None]:
# (rows, columns): the row count is the number of institutions kept after the contiguous-state filter.
geo_institutions.shape

In [None]:
# (rows, columns): the row count is the number of institution/county matches from the spatial join.
institutions_in_majority_underrepresented.shape

In [None]:
# Preview the joined records, transposed so each column reads as a row.
institutions_in_majority_underrepresented.head().transpose()

In [None]:
# Proportion of institutions by `control` value within majority-underrepresented counties.
institutions_in_majority_underrepresented['control'].value_counts(normalize = True)

In [None]:
# Baseline: proportion of institutions by `control` value across all institutions kept in the analysis.
geo_institutions['control'].value_counts(normalize = True)

In counties where underrepresented groups make up the majority, 29% of higher ed institutions are private for-profits, whereas across the contiguous United States that number drops to 19%.

In [None]:
# Proportion of institutions by `level` value within majority-underrepresented counties.
institutions_in_majority_underrepresented['level'].value_counts(normalize = True)

In [None]:
# Baseline: proportion of institutions by `level` value across all institutions kept in the analysis.
geo_institutions['level'].value_counts(normalize = True)

# Checkpoint

In [None]:
# Inspect the processed-data folder before creating the output directories.
# (`tree` comes from the local `tools` module; its exact output is not visible from this file.)
tree(PROCESSED_DATA)

In [None]:
# Create the output folder for the processed county geodata.
# exist_ok=True keeps the cell re-runnable: a bare mkdir() raises FileExistsError
# as soon as the folder exists from a previous run.
PROCESSED_DATA.joinpath("processed_geodata").mkdir(exist_ok=True)

In [None]:
# Create the output folder for the processed institution points.
# exist_ok=True keeps the cell re-runnable: a bare mkdir() raises FileExistsError
# as soon as the folder exists from a previous run.
PROCESSED_DATA.joinpath("processed_institutions").mkdir(exist_ok=True)

In [None]:
# Inspect the processed-data folder again to confirm the new output directories.
# (`tree` comes from the local `tools` module; its exact output is not visible from this file.)
tree(PROCESSED_DATA)

In [None]:
# Persist both layers as shapefiles so later notebooks can start from this checkpoint.
# NOTE(review): the shapefile format limits field names to 10 characters, so
# `share_underrepresented` will presumably come back under a truncated name.
# Check the column names when reloading.
geodata_path = PROCESSED_DATA / 'processed_geodata' / 'contiguous_us.shp'
institutions_path = PROCESSED_DATA / 'processed_institutions' / 'geo_institutions.shp'
working_gdf.to_file(geodata_path)
geo_institutions.to_file(institutions_path)