In [1]:
# Import Dependencies
import pandas as pd
import numpy as np
import os

In [33]:
# Read the four source CSV files from the Resources folder, one dataframe per file
contact_df, ratio_df, exp_df, test_df = (
    pd.read_csv(os.path.join("Resources", csv_name))
    for csv_name in (
        "school_contact.csv",
        "school_ratio.csv",
        "school_exp.csv",
        "school_test.csv",
    )
)

#### Clean contact dataframe

In [3]:
# Keep only the identifying columns. The explicit .copy() makes contact_df an
# independent frame, so adding DS_CODE below does not assign into a slice of the
# originally loaded data (which raises SettingWithCopyWarning in pandas).
contact_df = contact_df[["DISTRICT_CODE", "DISTRICT_NAME", "SCHOOL_CODE", "SCHOOL_NAME", "GRADESPAN"]].copy()

# NOTE: only the last expression of a cell is rendered; the intermediate checks
# below are evaluated but produce no visible output unless run on their own.

# Verify no missing data
contact_df.isnull().sum()

# Verify data are of correct type
contact_df.dtypes

# Create unique key column from district and school codes ("<district>-<school>")
contact_df["DS_CODE"] = contact_df["DISTRICT_CODE"].map(str) + "-" + contact_df["SCHOOL_CODE"].map(str)

# Verify no duplicate district-school codes (an empty frame means the key is unique)
contact_df[contact_df.duplicated(['DS_CODE'])]

# Verify no incorrect data in grade span
contact_df.GRADESPAN.unique()

array(['05-08', 'PK-04', '09-12', 'PK-08', 'KG-08', 'PK-PK', 'PK-05',
       '06-08', 'KG-02', 'PK-02', '03-05', 'PK-03', '04-08', '04-05',
       'KG-06', '07-08', 'PK-06', 'PK-01', 'KG-05', 'PK-12', 'KG-12',
       '07-12', '01-06', '01-08', '02-04', '03-06', '04-06', 'PK-KG',
       '01-03', '03-08', 'KG-03', '06-12', 'KG-04', '03-03', '01-04',
       '01-12', '02-06', 'KG-01', '06-07', '02-03', '05-06', '06-06',
       '01-02', 'KG-10', 'PK-09', '03-04', '09-10', '05-05', '08-12',
       '05-12', 'PK-07', '08-09', '02-12', '02-05', '01-05', '09-09',
       '10-12', '11-12', '07-09', 'KG-KG', '02-08', '06-09', '03-07',
       '02-02', '12-12', 'PK-10', 'KG-07', 'KG-11', 'KG-09'], dtype=object)

#### Clean ratio dataframe

In [4]:
# Drop and rename columns so the names line up with the contact dataframe
ratio_df = ratio_df[["DistrictCode", "SchoolCode", "Student_Teacher_School", "Student_Teacher_District", "SchoolName"]]
ratio_df = ratio_df.rename(columns={"DistrictCode": "DISTRICT_CODE", "SchoolCode": "SCHOOL_CODE",
                                    "Student_Teacher_School": "SCH_RAT", "Student_Teacher_District": "DIST_RAT"})

# Verify no missing data
ratio_df.isnull().sum()

# Verify data are of correct type
ratio_df.dtypes

# Create unique key column from district and school codes ("<district>-<school>")
ratio_df["DS_CODE"] = ratio_df["DISTRICT_CODE"].map(str) + "-" + ratio_df["SCHOOL_CODE"].map(str)

# Verify no duplicate district-school codes (an empty frame means the key is unique)
ratio_df[ratio_df.duplicated(['DS_CODE'])]

# Review ratios to verify no junk data
ratio_df.SCH_RAT.unique()

# Based on research, we decided probable inaccurate data would be ratios of 50:1 or higher.
# So as to provide as little inaccurate data as possible, these values are replaced with
# "Not Available", as is the "N" placeholder used in the source file.
suspect_ratios = ["148:1", "53:1", "56:1", "50:1", "152:1", "66:1", "106:1", "74:1",
                  "245:1", "268:1", "520:1", "327:1"]

# Restrict the replacement to the ratio columns so that codes and school names
# that happen to equal one of these strings (e.g. "N") are left untouched.
ratio_columns = ["SCH_RAT", "DIST_RAT"]
ratio_df[ratio_columns] = ratio_df[ratio_columns].replace(suspect_ratios + ["N"], "Not Available")

#### Clean expense dataframe

In [5]:
# Drop and rename columns so the names line up with the other dataframes
exp_df = exp_df[["DistrictCode", "Total"]]
exp_df = exp_df.rename(columns={"DistrictCode": "DISTRICT_CODE", "Total": "EXPENSE"})

# Verify no null values
exp_df.isnull().sum()

# Verify no duplicate districts
exp_df[exp_df.duplicated(["DISTRICT_CODE"])]

# Inspect expense values
exp_df.EXPENSE.unique()

# Clean and format the expense column only (DISTRICT_CODE is never touched).
# "N" is the source file's missing-value placeholder: turn it into NaN, parse the
# rest as numbers (any other non-numeric value raises, as the original cast did),
# then render amounts as "$1,234" and missing entries as "Not Available".
# Unlike routing missing values through a "0"/"$0" sentinel, a genuine zero
# expense stays "$0" instead of being reported as missing.
expense_num = pd.to_numeric(exp_df["EXPENSE"].replace("N", np.nan))
exp_df["EXPENSE"] = expense_num.map(
    lambda amount: "Not Available" if pd.isna(amount) else "${:,.0f}".format(amount)
)

#### Clean test dataframe

In [35]:
# Drop and rename columns
test_df = test_df[["DistrictCode", "SchoolCode", "SchoolName", "Test", "Subject", "School_Avg", "State_avg"]]
test_df = test_df.rename(columns={"DistrictCode": "DISTRICT_CODE", "SchoolCode": "SCHOOL_CODE",\
                                  "SchoolCode": "SCHOOL_CODE", "Test": "TEST", "School_Avg": "SCH_AVG", \
                                  "State_avg": "STATE_AVG"})

# Verify no missing data
test_df.isnull().sum()

# Drop any duplicate rows
test_df.drop_duplicates()

# Create unique key column from district and school codes
test_df["DS_CODE"] = test_df["DISTRICT_CODE"].map(str) + "-" + test_df["SCHOOL_CODE"].map(str)

# Combine test and subject columns
test_df["TEST"] = test_df["TEST"] + ": " + test_df["Subject"]
test_df = test_df.drop("Subject", axis = 1)

In [50]:
# Inspect the distinct ACT averages (rows whose TEST label contains "ACT")
ACT_df = test_df.loc[test_df["TEST"].str.contains("ACT")]
ACT_df["SCH_AVG"].unique()
ACT_df["STATE_AVG"].unique()

# Inspect the distinct SAT averages. The match is a substring test, so labels
# such as "PSAT: Math" are selected here as well.
SAT_df = test_df.loc[test_df["TEST"].str.contains("SAT")]
SAT_df["SCH_AVG"].unique()
SAT_df["STATE_AVG"].unique()

array([478, 543, 542])

In [51]:
# Swap the "N" and "*" placeholder values for a readable "Not Available" label
test_df = test_df.replace(
    to_replace=["N", "*"],
    value="Not Available",
)

In [54]:
# Rebuild the SAT subset from the current test_df and list its distinct school averages
SAT_df = test_df.loc[test_df["TEST"].str.contains("SAT")]
SAT_df["SCH_AVG"].unique()

array(['428', '422', '509', '512', '451', '446', '505', '515', '469',
       '464', '527', '462', '468', '540', '546', '461', '524', '523',
       '435', '482', '497', '476', '557', '474', '455', '543', '534',
       '494', '495', '558', '384', '382', '443', '701', '666', '728',
       '692', '631', '600', '699', '662', '504', 'Not Available', '467',
       '447', '460', '485', '449', '522', '529', '453', '513', '562',
       '555', '607', '597', '489', '498', '549', '444', '488', '506',
       '491', '563', '570', '475', '561', '559', '511', '593', '581',
       '518', '520', '584', '439', '438', '554', '619', '611', '441',
       '436', '507', '490', '487', '583', '566', '516', '514', '599',
       '587', '437', '457', '448', '526', '539', '616', '594', '533',
       '560', '473', '481', '465', '585', '567', '643', '625', '596',
       '642', '624', '551', '612', '578', '499', '574', '537', '528',
       '582', '588', '577', '544', '586', '519', '532', '590', '480',
       '471', '54

array([24, 23])

In [29]:
# Verify data are of correct type by listing the dtype of every test_df column.
# NOTE(review): a column holding "Not Available" strings cannot be a numeric
# dtype, so SCH_AVG is presumably reported as object here; confirm before any
# numeric comparison or aggregation on the score columns.
test_df.dtypes

Unnamed: 0,DISTRICT_CODE,SCHOOL_CODE,SchoolName,TEST,SCH_AVG,STATE_AVG,DS_CODE
0,110,10,ATLANTIC CITY HIGH SCHOOL,ACT: English,19,24,110-10
1,110,10,ATLANTIC CITY HIGH SCHOOL,ACT: Math,21,24,110-10
2,110,10,ATLANTIC CITY HIGH SCHOOL,ACT: Reading,21,24,110-10
3,110,10,ATLANTIC CITY HIGH SCHOOL,ACT: Science,20,23,110-10
4,110,10,ATLANTIC CITY HIGH SCHOOL,PSAT: Math,428,478,110-10


Unnamed: 0,DISTRICT_CODE,SCHOOL_CODE,SchoolName,TEST,Subject,SCH_AVG,STATE_AVG,DS_CODE
0,110,10,ATLANTIC CITY HIGH SCHOOL,ACT: English,English,19,24,110-10
1,110,10,ATLANTIC CITY HIGH SCHOOL,ACT: Math,Math,21,24,110-10
2,110,10,ATLANTIC CITY HIGH SCHOOL,ACT: Reading,Reading,21,24,110-10
3,110,10,ATLANTIC CITY HIGH SCHOOL,ACT: Science,Science,20,23,110-10
4,110,10,ATLANTIC CITY HIGH SCHOOL,PSAT: Math,Math,428,478,110-10
