# Data Cleaning Walkthrough




In [11]:
import pandas as pd
data_files = [
    "ap_2010.csv",
    "class_size.csv",
    "demographics.csv",
    "graduation.csv",
    "hs_directory.csv",
    "sat_results.csv",
]

# Read every CSV listed above from the schools/ folder into `data`.
# Keys are the file names without their extension; values are the DataFrames.
data = {}
for file in data_files:
    name = file.partition('.')[0]
    data[name] = pd.read_csv('schools/' + file)
    # Show the first three rows of each dataset as soon as it is loaded.
    print('\n {} \n {}'.format(name, data[name].head(3)))



 ap_2010 
       DBN                    SchoolName AP Test Takers  Total Exams Taken  \
0  01M448  UNIVERSITY NEIGHBORHOOD H.S.              39                49   
1  01M450        EAST SIDE COMMUNITY HS              19                21   
2  01M515           LOWER EASTSIDE PREP              24                26   

  Number of Exams with scores 3 4 or 5  
0                                   10  
1                                    s  
2                                   24  

 class_size 
    CSD BOROUGH SCHOOL CODE                SCHOOL NAME GRADE  PROGRAM TYPE  \
0    1       M        M015  P.S. 015 Roberto Clemente     0K       GEN ED   
1    1       M        M015  P.S. 015 Roberto Clemente     0K          CTT   
2    1       M        M015  P.S. 015 Roberto Clemente     01       GEN ED   

  CORE SUBJECT (MS CORE and 9-12 ONLY) CORE COURSE (MS CORE and 9-12 ONLY)  \
0                                    -                                   -   
1                                  

## Reading Survey Files (specifying data file encoding)

The combined survey data has thousands of columns, so let us keep only the handful of columns we need in the survey dataframe:

https://data.cityofnewyork.us/Education/2010-2011-NYC-School-Survey/mnz3-dyi8

To check data file encoding: http://encodingchecker.codeplex.com

Indexing and Selecting Data: which choice is better for indexing?

http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [2]:
# Both survey exports are tab-delimited text files read with the Windows-1252 encoding.
all_survey = pd.read_csv('schools/survey_all.txt', delimiter='\t', encoding='windows-1252')
d75_survey = pd.read_csv('schools/survey_d75.txt', delimiter='\t', encoding='windows-1252')

# Stack the two surveys row-wise (axis=0 is also the default for concat).
survey = pd.concat([all_survey, d75_survey], axis=0)
print(survey.head(3))

# Expose the identifier under the upper-case name 'DBN', copied from 'dbn'.
survey["DBN"] = survey["dbn"]

# Keep only the identifier plus the response-rate, respondent-count and
# *_11 score columns listed below.
survey = survey[[
    "DBN", "rr_s", "rr_t", "rr_p", "N_s", "N_t", "N_p",
    "saf_p_11", "com_p_11", "eng_p_11", "aca_p_11",
    "saf_t_11", "com_t_11", "eng_t_11", "aca_t_11",
    "saf_s_11", "com_s_11", "eng_s_11", "aca_s_11",
    "saf_tot_11", "com_tot_11", "eng_tot_11", "aca_tot_11",
]]
data["survey"] = survey

     N_p  N_s   N_t  aca_p_11  aca_s_11  aca_t_11  aca_tot_11    bn  com_p_11  \
0   90.0  NaN  22.0       7.8       NaN       7.9         7.9  M015       7.6   
1  161.0  NaN  34.0       7.8       NaN       9.1         8.4  M019       7.6   
2  367.0  NaN  42.0       8.6       NaN       7.5         8.0  M020       8.3   

   com_s_11   ...    t_q8c_1  t_q8c_2  t_q8c_3 t_q8c_4  t_q9  t_q9_1  t_q9_2  \
0       NaN   ...       29.0     67.0      5.0     0.0   NaN     5.0    14.0   
1       NaN   ...       74.0     21.0      6.0     0.0   NaN     3.0     6.0   
2       NaN   ...       33.0     35.0     20.0    13.0   NaN     3.0     5.0   

   t_q9_3  t_q9_4  t_q9_5  
0    52.0    24.0     5.0  
1     3.0    78.0     9.0  
2    16.0    70.0     5.0  

[3 rows x 2773 columns]


## Building a 'DBN' column in 'class_size' dataframe from 'CSD' and 'SCHOOL CODE' columns

In [3]:
def add_0(num):
    """Return ``num`` as a string, left-padded with zeros to at least two characters.

    Values whose string form already has two or more characters are returned
    unchanged (as a string).
    """
    return str(num).zfill(2)

class_size = data['class_size']
hs_directory = data['hs_directory']

# hs_directory stores the identifier in a lower-case 'dbn' column; copy it to 'DBN'.
hs_directory['DBN'] = hs_directory['dbn']

# class_size has no such column: build 'DBN' by concatenating the CSD,
# zero-padded to two characters, with the school code.
class_size['padded_csd'] = class_size['CSD'].map(add_0)
class_size['DBN'] = class_size['padded_csd'] + class_size['SCHOOL CODE']
print(class_size.head(3))

   CSD BOROUGH SCHOOL CODE                SCHOOL NAME GRADE  PROGRAM TYPE  \
0    1       M        M015  P.S. 015 Roberto Clemente     0K       GEN ED   
1    1       M        M015  P.S. 015 Roberto Clemente     0K          CTT   
2    1       M        M015  P.S. 015 Roberto Clemente     01       GEN ED   

  CORE SUBJECT (MS CORE and 9-12 ONLY) CORE COURSE (MS CORE and 9-12 ONLY)  \
0                                    -                                   -   
1                                    -                                   -   
2                                    -                                   -   

  SERVICE CATEGORY(K-9* ONLY)  NUMBER OF STUDENTS / SEATS FILLED  \
0                           -                               19.0   
1                           -                               21.0   
2                           -                               17.0   

   NUMBER OF SECTIONS  AVERAGE CLASS SIZE  SIZE OF SMALLEST CLASS  \
0                 1.0               

In [4]:
import numpy as np

sat_results = data['sat_results']
columns = ['SAT Math Avg. Score', 'SAT Critical Reading Avg. Score', 'SAT Writing Avg. Score']

# Coerce each section score to a numeric dtype; entries that cannot be parsed
# as numbers become NaN (errors='coerce') instead of raising.
for column in columns:
    sat_results[column] = pd.to_numeric(sat_results[column], errors='coerce')

# Total SAT score per row. skipna=False keeps the total NaN whenever any
# section score is missing; summing with NaN skipped would instead report a
# partial total, or 0 for a row with no valid scores at all.
sat_results['sat_score'] = sat_results[columns].sum(axis=1, skipna=False)

## Retrieving latitudes and longitudes from the 'Location 1' column:

In [7]:
import re
# Look at a few raw 'Location 1' values to see how the coordinates are stored.
hs_directory = data['hs_directory']
print(hs_directory.loc[:, ['Location 1']].head(3))

                                          Location 1
0  883 Classon Avenue\nBrooklyn, NY 11225\n(40.67...
1  1110 Boston Road\nBronx, NY 10456\n(40.8276026...
2  1501 Jerome Avenue\nBronx, NY 10452\n(40.84241...


Notice that the latitude and longitude are enclosed in parentheses at the end of each value.

In [8]:
# Use a regular expression to find the content between parentheses:
def get_lat(string):
    """Extract the latitude text from a location string.

    The coordinates are expected as a parenthesised, comma-separated pair
    such as ``(40.67, -73.96)`` somewhere in ``string``.

    Parameters
    ----------
    string : str
        Location text to search.

    Returns
    -------
    str or None
        The text between the opening parenthesis and the first comma, exactly
        as it appears in ``string``. ``None`` if ``string`` is not a str
        (e.g. a missing value) or contains no parenthesised text.
    """
    # Missing values reach this function as non-str objects (e.g. float NaN).
    if not isinstance(string, str):
        return None
    # Raw string so that "\(" is a regex escape, not a string escape.
    coord = re.findall(r"\(.+\)", string)
    if not coord:
        return None
    return coord[0].split(',')[0].replace('(', '')

# Build the 'lat' column by running get_lat over every 'Location 1' value.
hs_directory['lat'] = hs_directory['Location 1'].map(get_lat)
print(hs_directory.loc[:, 'lat'].head(3))

0     40.67029890700047
1      40.8276026690005
2    40.842414068000494
Name: lat, dtype: object


In [10]:
def get_lon(string):
    """Extract the longitude text from a location string.

    The coordinates are expected as a parenthesised, comma-separated pair
    such as ``(40.67, -73.96)`` somewhere in ``string``.

    Parameters
    ----------
    string : str
        Location text to search.

    Returns
    -------
    str or None
        The text after the first comma inside the parentheses with any ``)``
        removed, exactly as it appears in ``string`` (surrounding whitespace
        is kept). ``None`` if ``string`` is not a str (e.g. a missing value),
        contains no parenthesised text, or that text has no comma.
    """
    # Missing values reach this function as non-str objects (e.g. float NaN).
    if not isinstance(string, str):
        return None
    # Raw string so that "\(" is a regex escape, not a string escape.
    coord = re.findall(r"\(.+\)", string)
    if not coord:
        return None
    parts = coord[0].split(',')
    # Without a comma there is no second component to return.
    if len(parts) < 2:
        return None
    return parts[1].replace(')', '')
# Extract the longitude text from every address and convert it to a number
# in one step; values that cannot be parsed become NaN.
hs_directory['lon'] = pd.to_numeric(
    hs_directory['Location 1'].map(get_lon), errors='coerce'
)
# The latitude text extracted earlier is converted the same way.
hs_directory['lat'] = pd.to_numeric(hs_directory['lat'], errors='coerce')
print(hs_directory.loc[:, ['school_name', 'lat', 'lon']].head(3))

                                   school_name        lat        lon
0          Brooklyn School for Music & Theatre  40.670299 -73.961648
1             High School for Violin and Dance  40.827603 -73.904475
2  Comprehensive Model School Project M.S. 327  40.842414 -73.916162


# Filtering

Let us filter the `class_size` dataframe so that only the rows whose GRADE contains '09-12' and whose PROGRAM TYPE contains 'GEN ED' remain.

In [7]:
class_size = data['class_size']

# Keep a row only when both conditions hold; na=False treats missing values
# as non-matching. Note that the 'GRADE ' column name ends with a space.
class_size = class_size[
    class_size['GRADE '].str.contains('09-12', na=False)
    & class_size['PROGRAM TYPE'].str.contains('GEN ED', na=False)
]
print(class_size.head(3))

     CSD BOROUGH SCHOOL CODE                                    SCHOOL NAME  \
225    1       M        M292  Henry Street School for International Studies   
226    1       M        M292  Henry Street School for International Studies   
227    1       M        M292  Henry Street School for International Studies   

    GRADE  PROGRAM TYPE CORE SUBJECT (MS CORE and 9-12 ONLY)  \
225  09-12       GEN ED                              ENGLISH   
226  09-12       GEN ED                              ENGLISH   
227  09-12       GEN ED                              ENGLISH   

    CORE COURSE (MS CORE and 9-12 ONLY) SERVICE CATEGORY(K-9* ONLY)  \
225                           English 9                           -   
226                          English 10                           -   
227                          English 11                           -   

     NUMBER OF STUDENTS / SEATS FILLED  NUMBER OF SECTIONS  \
225                               63.0                 3.0   
226              