# Transform

Clean up the raw data from the state.

In [85]:
import os
import warnings

import geopandas as gpd
import pandas as pd
from shapely.geometry import Point

In [69]:
warnings.simplefilter("ignore")

In [70]:
# Absolute path of the "input" folder, resolved against the current working directory.
input_dir = os.path.abspath("input")

In [71]:
# Absolute path of the "output" folder, resolved against the current working directory.
output_dir = os.path.abspath("output")

In [72]:
# Create the output directory on first run. When it already exists the `or`
# short-circuits, os.mkdir is never called, and the cell displays True.
os.path.exists(output_dir) or os.mkdir(output_dir)

True

Read in the school roster

In [73]:
# Load the tab-delimited roster file from the input directory.
# CDSCode is forced to str so pandas does not infer a numeric dtype for it.
# pandas is called directly rather than through geopandas' namespace (gpd.pd).
roster_df = pd.read_csv(
    os.path.join(input_dir, "pubschls.txt"),
    dtype={"CDSCode": str},
    delimiter="\t",
    encoding="latin-1",
)

In [74]:
# Number of rows in the raw roster.
roster_df.shape[0]

17816

Drop non-schools

In [75]:
# Keep only the rows that have a value in the School column.
schools_df = roster_df[roster_df["School"].notnull()]

In [76]:
# Number of rows left after dropping records with no school name.
schools_df.shape[0]

16442

Filter it down to active schools

In [77]:
# Keep only the rows whose StatusType is exactly 'Active'.
active_df = schools_df.loc[schools_df.StatusType == 'Active']

In [78]:
# Number of active schools.
active_df.shape[0]

10611

In [79]:
# Keep only the columns used downstream. The explicit .copy() makes trimmed_df
# an independent frame, so the columns added to it in later cells are not
# assignments on a selection taken from active_df (the pattern pandas flags
# with SettingWithCopyWarning).
trimmed_df = active_df[[
    'CDSCode',
    'School',
    'District',
    'StreetAbr',
    'City',
    'County',
    'Zip',
    'Charter',
    'FundingType',
    'Latitude',
    'Longitude',
    'SOCType',
    'EILCode',
    'GSserved',
]].copy()

Split grades

In [80]:
# The piece of GSserved that precedes the first '-'.
trimmed_df['low_grade_served'] = trimmed_df['GSserved'].str.split('-').str[0]

In [81]:
# The piece of GSserved that follows the first '-'; missing (NaN) when the value has no '-'.
trimmed_df['high_grade_served'] = trimmed_df['GSserved'].str.split('-').str[1]

Clean up header names

In [82]:
# Swap the source file's column headers for snake_case names.
# Columns not listed here keep their current names.
cleaned_df = trimmed_df.rename(columns=dict(
    CDSCode="cds_code",
    School="name",
    District="district",
    StreetAbr="street",
    City="city",
    County="county",
    Zip="zipcode",
    Charter="is_charter",
    FundingType="funding_type",
    Latitude="latitude",
    Longitude="longitude",
    SOCType="ownership",
    EILCode="instructional_level",
    GSserved="grades_served",
))

Write it out as CSVs

In [92]:
def write_csv(df, path):
    """
    Write a DataFrame to a UTF-8 encoded CSV in the output directory.

    `path` is the file name, joined onto `output_dir`. The index is not written.
    Returns whatever `DataFrame.to_csv` returns.
    """
    return df.to_csv(os.path.join(output_dir, path), encoding="utf-8", index=False)

In [93]:
# Every active school.
write_csv(df=cleaned_df, path="public_schools.csv")

In [94]:
# Only the rows whose is_charter value is 'Y'.
write_csv(df=cleaned_df.loc[cleaned_df["is_charter"] == 'Y'], path="charter_schools.csv")

In [100]:
# Only the rows whose county is 'Los Angeles'.
write_csv(df=cleaned_df.loc[cleaned_df["county"] == 'Los Angeles'], path='public_schools_in_la_county.csv')

Write out maps

In [120]:
def df_to_gdf(input_df, crs={'init': u'epsg:4326'}):
    """
    Accepts a DataFrame with longitude and latitude columns. Returns a GeoDataFrame.

    The input frame is copied, so the caller's DataFrame is left untouched.
    One Point is built per row from (longitude, latitude) and used as the
    geometry column. `crs` is handed to the GeoDataFrame constructor and
    defaults to EPSG:4326.
    """
    df = input_df.copy()
    # The default crs dict is created once at definition time. Pass a copy so
    # GeoDataFrames built by separate calls never share one mutable mapping.
    if isinstance(crs, dict):
        crs = dict(crs)
    geometry = [Point(lon, lat) for lon, lat in zip(df.longitude, df.latitude)]
    return gpd.GeoDataFrame(df, crs=crs, geometry=geometry)

In [121]:
# Geocoded version of the cleaned roster, using the default CRS.
cleaned_gdf = df_to_gdf(input_df=cleaned_df)

In [126]:
def write_maps(df, path):
    """
    Write a GeoDataFrame to the output directory as a shapefile and as GeoJSON.

    `path` is the file name without an extension; ".shp" and ".geojson" are
    appended and the results are joined onto `output_dir`.

    NOTE(review): the saved run of this notebook shows a call to this function
    failing with "TypeError: open() got multiple values for keyword argument
    'crs'". Presumably a geopandas/fiona version mismatch -- confirm before
    relying on the map outputs.
    """
    shp_path = os.path.join(output_dir, "{}.shp".format(path))
    df.to_file(shp_path)

    json_path = os.path.join(output_dir, "{}.geojson".format(path))
    # Remove any GeoJSON left by an earlier run before writing a fresh one.
    if os.path.exists(json_path):
        os.remove(json_path)
    df.to_file(json_path, driver='GeoJSON')

In [127]:
# Every active school.
write_maps(df=cleaned_gdf, path='public_schools')

TypeError: open() got multiple values for keyword argument 'crs'

In [None]:
# Only the rows whose is_charter value is 'Y'.
write_maps(df=cleaned_gdf.loc[cleaned_gdf["is_charter"] == 'Y'], path='charter_schools')

In [None]:
# Only the rows whose county is 'Los Angeles'.
write_maps(df=cleaned_gdf.loc[cleaned_gdf["county"] == 'Los Angeles'], path='public_schools_in_la_county')