# Process Dataset With School Ratings
### Data Source: https://texasschoolguide.org/school-rankings/

In [1]:
import pandas as pd
import gmaps
import requests
import json
from config import gkey
import numpy as np

In [2]:
# configure the google maps key for api requests
gmaps.configure(api_key=gkey)

In [3]:
# read csv file
file = pd.read_csv("Resources/tsg_final_11.17_From_Children_at_Risk.csv")

# make the file into a DataFrame
schools = pd.DataFrame(file)

In [4]:
# drop rows without school rating or school name
schools.dropna(how='any',subset=['TEA Grade','School.Name'], inplace=True)

In [5]:
# filter the data to show only schools in Harris County, TX
schools_harris = schools.loc[schools["County"]=="HARRIS",:]

In [6]:
# filter the data to remove school type listed as unavailable
schools_harris = schools_harris.loc[schools['School.Type']!="Unavailable"]
schools_harris.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 972 entries, 23 to 9643
Data columns (total 51 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   School.ID                 972 non-null    int64  
 1   School.Name               972 non-null    object 
 2   Street.Address            972 non-null    object 
 3   CITY                      972 non-null    object 
 4   ZIP                       972 non-null    float64
 5   County                    972 non-null    object 
 6   District.Name             972 non-null    object 
 7   School.Type               972 non-null    object 
 8   Charter.Trad              972 non-null    object 
 9   Grade.Current             972 non-null    object 
 10  TEA Grade                 972 non-null    object 
 11  GoldRibbon                68 non-null     float64
 12  PEG                       972 non-null    object 
 13  County_Ranking            972 non-null    object 
 14  Asian.pc

In [7]:
# locate rows with missing coordinates
missing_info = schools_harris.loc[schools['latitude'].isna()]
missing_info

Unnamed: 0,School.ID,School.Name,Street.Address,CITY,ZIP,County,District.Name,School.Type,Charter.Trad,Grade.Current,...,Math.Current-2,Math.Change,Student.Growth.Reading,Student.Growth.Math,School.Program,Community.Resources,longitude,latitude,Student.Teacher.Ratio,Grade.Range
4848,227820213,KIPP PEACE ELEMENTARY,5400 MLK JR BLVD,HOUSTON,77021.0,HARRIS,KIPP TEXAS PUBLIC SCHOOLS,Elementary,Charter,C,...,,,HIGH,HIGH,,,,,,PK—04
4854,227820060,KIPP PRIME COLLEGE PREPARATORY MIDDLE,8805 FERNDALE,HOUSTON,77017.0,HARRIS,KIPP TEXAS PUBLIC SCHOOLS,Middle,Charter,C,...,,,LOW,LOW,,,,,,05—07


In [8]:
# geolocate schools without lat/lng information
base_url = "https://maps.googleapis.com/maps/api/geocode/json?"
addresses = missing_info["Street.Address"].to_list()
params = {"key": gkey}
google_info = []
g_lat = []
g_lng = []
for address in addresses:
    params['address'] = address
    response = requests.get(base_url,params=params).json()
    google_info.append(response)
    lat = response['results'][0]['geometry']['location']['lat']
    g_lat.append(lat)
    lng = response['results'][0]['geometry']['location']['lng']
    g_lng.append(lng)

In [9]:
# add the coordinates to the schools DataFrame
for x in range(0,len(addresses)):    
    schools_harris.loc[schools_harris['Street.Address'] == addresses[x], ['latitude']] = g_lat[x]
    schools_harris.loc[schools_harris['Street.Address'] == addresses[x], ['longitude']] = g_lng[x]
    
schools_harris.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 972 entries, 23 to 9643
Data columns (total 51 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   School.ID                 972 non-null    int64  
 1   School.Name               972 non-null    object 
 2   Street.Address            972 non-null    object 
 3   CITY                      972 non-null    object 
 4   ZIP                       972 non-null    float64
 5   County                    972 non-null    object 
 6   District.Name             972 non-null    object 
 7   School.Type               972 non-null    object 
 8   Charter.Trad              972 non-null    object 
 9   Grade.Current             972 non-null    object 
 10  TEA Grade                 972 non-null    object 
 11  GoldRibbon                68 non-null     float64
 12  PEG                       972 non-null    object 
 13  County_Ranking            972 non-null    object 
 14  Asian.pc

In [10]:
# Limit the data to only the columns necessary
schools_harris = schools_harris[['School.ID','School.Name','Street.Address','CITY','ZIP','District.Name',
                                 'School.Type','Charter.Trad','TEA Grade','latitude','longitude']]

In [11]:
# Remove charter schools, as they will not be used in the final analysis
schools_harris = schools_harris.loc[schools_harris['Charter.Trad']== "Traditional Public School", :]

In [12]:
# Drop the charter_trad column
schools_harris = schools_harris.drop(columns=['Charter.Trad'])

In [13]:
# Rename columns
schools_harris = schools_harris.rename(columns={'School.ID': 'school_id',
                                               'School.Name':'name',
                                               'Street.Address': 'address',
                                               'CITY': 'city',
                                               'ZIP': 'zip_code',
                                               'District.Name': 'district_name',
                                               'School.Type': 'school_type',
                                               'TEA Grade':'TEA grade'})
schools_harris.head()

Unnamed: 0,school_id,name,address,city,zip_code,district_name,school_type,TEA grade,latitude,longitude
56,101907107,ADAM ELEMENTARY,11303 HONEYGROVE LN,HOUSTON,77065.0,CYPRESS-FAIRBANKS ISD,Elementary,B,29.926556,-95.603242
81,101905043,AGUIRRE JUNIOR HIGH,15726 WALLISVILLE RD,HOUSTON,77049.0,CHANNELVIEW ISD,Middle,B,29.809586,-95.156563
93,101911101,ALAMO ELEMENTARY,6100 N MAIN,BAYTOWN,77521.0,GOOSE CREEK CISD,Elementary,A,29.79278,-94.963885
109,101903045,ALBRIGHT MIDDLE,6315 WINKLEMAN,HOUSTON,77083.0,ALIEF ISD,Middle,A,29.709561,-95.654675
110,101912102,ALCOTT ELEMENTARY,5859 BELLFORT,HOUSTON,77033.0,HOUSTON ISD,Elementary,B,29.667765,-95.329295


In [14]:
# convert TEA grades to numbers, A=95, B=85, C=75, D=65, F=55
schools_harris.loc[schools_harris["TEA grade"]=="A", "school_rating"]=95
schools_harris.loc[schools_harris["TEA grade"]=="B", "school_rating"]=85
schools_harris.loc[schools_harris["TEA grade"]=="C", "school_rating"]=75
schools_harris.loc[schools_harris["TEA grade"]=="D", "school_rating"]=65
schools_harris.loc[schools_harris["TEA grade"]=="F", "school_rating"]=55

In [15]:
del schools_harris['TEA grade']
schools_harris.head()

Unnamed: 0,school_id,name,address,city,zip_code,district_name,school_type,latitude,longitude,school_rating
56,101907107,ADAM ELEMENTARY,11303 HONEYGROVE LN,HOUSTON,77065.0,CYPRESS-FAIRBANKS ISD,Elementary,29.926556,-95.603242,85.0
81,101905043,AGUIRRE JUNIOR HIGH,15726 WALLISVILLE RD,HOUSTON,77049.0,CHANNELVIEW ISD,Middle,29.809586,-95.156563,85.0
93,101911101,ALAMO ELEMENTARY,6100 N MAIN,BAYTOWN,77521.0,GOOSE CREEK CISD,Elementary,29.79278,-94.963885,95.0
109,101903045,ALBRIGHT MIDDLE,6315 WINKLEMAN,HOUSTON,77083.0,ALIEF ISD,Middle,29.709561,-95.654675,95.0
110,101912102,ALCOTT ELEMENTARY,5859 BELLFORT,HOUSTON,77033.0,HOUSTON ISD,Elementary,29.667765,-95.329295,85.0


In [16]:
# Convert zip_code to int to remove the decimals
schools_harris['zip_code'] = schools_harris['zip_code'].astype('int64')
schools_harris.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 822 entries, 56 to 9643
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   school_id      822 non-null    int64  
 1   name           822 non-null    object 
 2   address        822 non-null    object 
 3   city           822 non-null    object 
 4   zip_code       822 non-null    int64  
 5   district_name  822 non-null    object 
 6   school_type    822 non-null    object 
 7   latitude       822 non-null    float64
 8   longitude      822 non-null    float64
 9   school_rating  822 non-null    float64
dtypes: float64(3), int64(2), object(5)
memory usage: 70.6+ KB


In [17]:
# Convert zip_code to string
schools_harris['zip_code'] = schools_harris['zip_code'].astype('str')
schools_harris.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 822 entries, 56 to 9643
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   school_id      822 non-null    int64  
 1   name           822 non-null    object 
 2   address        822 non-null    object 
 3   city           822 non-null    object 
 4   zip_code       822 non-null    object 
 5   district_name  822 non-null    object 
 6   school_type    822 non-null    object 
 7   latitude       822 non-null    float64
 8   longitude      822 non-null    float64
 9   school_rating  822 non-null    float64
dtypes: float64(3), int64(1), object(6)
memory usage: 70.6+ KB


In [18]:
# create a unique id per district
district_list = schools_harris['district_name'].unique()
for x in range (0,len(district_list)):
    schools_harris.loc[schools_harris["district_name"]==district_list[x], "district_id"]=x+1

In [19]:
district_list

array(['CYPRESS-FAIRBANKS ISD', 'CHANNELVIEW ISD', 'GOOSE CREEK CISD',
       'ALIEF ISD', 'HOUSTON ISD', 'ALDINE ISD', 'SPRING ISD',
       'HUMBLE ISD', 'PASADENA ISD', 'CROSBY ISD', 'LA PORTE ISD',
       'KATY ISD', 'KLEIN ISD', 'DEER PARK ISD', 'SPRING BRANCH ISD',
       'SHELDON ISD', 'TOMBALL ISD', 'GALENA PARK ISD', 'HUFFMAN ISD'],
      dtype=object)

In [20]:
schools_harris['district_id'].unique()

array([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13.,
       14., 15., 16., 17., 18., 19.])

In [24]:
# create a separate dataframe for school_districts
school_districts = schools_harris[['district_id','district_name']]

In [29]:
# reduce the district table to unique value
school_districts = school_districts.drop_duplicates(ignore_index=True)
school_districts.head()

Unnamed: 0,district_id,district_name
0,1.0,CYPRESS-FAIRBANKS ISD
1,2.0,CHANNELVIEW ISD
2,3.0,GOOSE CREEK CISD
3,4.0,ALIEF ISD
4,5.0,HOUSTON ISD


In [30]:
# create a separa dataframe for schools
school_ratings = schools_harris.drop(columns=['district_name'])
school_ratings.head()

Unnamed: 0,school_id,name,address,city,zip_code,school_type,latitude,longitude,school_rating,district_id
56,101907107,ADAM ELEMENTARY,11303 HONEYGROVE LN,HOUSTON,77065,Elementary,29.926556,-95.603242,85.0,1.0
81,101905043,AGUIRRE JUNIOR HIGH,15726 WALLISVILLE RD,HOUSTON,77049,Middle,29.809586,-95.156563,85.0,2.0
93,101911101,ALAMO ELEMENTARY,6100 N MAIN,BAYTOWN,77521,Elementary,29.79278,-94.963885,95.0,3.0
109,101903045,ALBRIGHT MIDDLE,6315 WINKLEMAN,HOUSTON,77083,Middle,29.709561,-95.654675,95.0,4.0
110,101912102,ALCOTT ELEMENTARY,5859 BELLFORT,HOUSTON,77033,Elementary,29.667765,-95.329295,85.0,5.0


In [31]:
# export to csv files
school_districts.to_csv("Output/school_districts.csv", index=False)
school_ratings.to_csv("Output/school_ratings.csv", index=False)