In [1]:
# Import Dependencies
import pandas as pd
import numpy as np
import os

In [11]:
# Read each source CSV from the Resources folder into its own dataframe.
# The file names are listed in the same order as the target variables.
contact_df, ratio_df, exp_df, SAT_df = (
    pd.read_csv(os.path.join("Resources", csv_name))
    for csv_name in ("contact.csv", "studentratio.csv", "expenditure.csv", "SAT.csv")
)

#### Clean contact dataframe

In [3]:
# Keep only the district code and grade span columns
contact_df = contact_df.loc[:, ["DISTRICT_CODE", "GRADESPAN"]]

# Per-column count of missing values.
# NOTE: only the final expression of a cell is rendered, so this and the next
# two checks show nothing unless they are run on their own.
contact_df.isna().sum()

# Distinct grade span labels, to eyeball for invalid entries
contact_df["GRADESPAN"].unique()

# Rows whose district code repeats an earlier row
contact_df.loc[contact_df.duplicated(subset=["DISTRICT_CODE"])]

# Column dtypes (this is the cell's displayed output)
contact_df.dtypes

DISTRICT_CODE     int64
GRADESPAN        object
dtype: object

#### Clean ratio dataframe

In [12]:
# Drop and rename columns
ratio_df = ratio_df[["DistrictCode", "Student_Teacher_District"]].rename(
    columns={"DistrictCode": "DISTRICT_CODE", "Student_Teacher_District": "RATIO"}
)

# Verify no null values
# (inspection only: mid-cell expressions are not rendered by the notebook)
ratio_df.isnull().sum()

# Clean district column, verify no duplicate data, and cast as integer.
# Rows whose code is the literal 'State' are dropped. .copy() detaches the
# filtered frame so the column assignments below operate on an independent
# object instead of raising SettingWithCopyWarning.
ratio_df = ratio_df[ratio_df.DISTRICT_CODE != 'State'].copy()
ratio_df[ratio_df.duplicated(["DISTRICT_CODE"])]
ratio_df["DISTRICT_CODE"] = ratio_df["DISTRICT_CODE"].astype('int64')

# Inspect ratio values to verify data are valid; replace junk values with null
ratio_df.RATIO.unique()
# Use mask() rather than replace(value, None): with a scalar to_replace and
# value=None, older pandas releases forward-fill the previous row's value
# instead of writing a null. The change is also limited to the RATIO column.
ratio_df["RATIO"] = ratio_df["RATIO"].mask(ratio_df["RATIO"].isin(["139:1", "N"]))

# Verify data are of correct type
ratio_df.dtypes

DISTRICT_CODE     int64
RATIO            object
dtype: object

#### Clean expense dataframe

In [5]:
# Drop and rename columns
exp_df = exp_df[["DistrictCode", "Total"]].rename(
    columns={"DistrictCode": "DISTRICT_CODE", "Total": "EXPENSE"}
)

# Verify no null values
# (inspection only: mid-cell expressions are not rendered by the notebook)
exp_df.isnull().sum()

# Clean district column: drop rows whose code is the literal 'State', check for
# repeated codes, and cast to integer. .copy() detaches the filtered frame so
# the column assignments below do not raise SettingWithCopyWarning.
exp_df = exp_df[exp_df.DISTRICT_CODE != 'State'].copy()
exp_df[exp_df.duplicated(["DISTRICT_CODE"])]
exp_df["DISTRICT_CODE"] = exp_df["DISTRICT_CODE"].astype('int64')

# Inspect expense values; clean and format expense column; replace missing values with null
exp_df.EXPENSE.unique()
# "N" is nulled out *before* the numeric conversion, and only real amounts are
# formatted (na_action="ignore" leaves the nulls untouched). This avoids the
# "N" -> "0" -> "$0" -> null round-trip, which also erased genuine zero
# amounts (they now stay "$0") and relied on replace(value, None), whose
# result differs between pandas versions (older releases forward-fill).
exp_df["EXPENSE"] = (
    pd.to_numeric(exp_df["EXPENSE"].mask(exp_df["EXPENSE"] == "N"))
    .map("${:,.0f}".format, na_action="ignore")
)

# Verify data are of correct type
exp_df.dtypes

DISTRICT_CODE     int64
EXPENSE          object
dtype: object

#### Clean SAT dataframe

In [14]:
# Drop and rename columns
SAT_df = SAT_df[["DistrictCode", "Test", "Subject", "District_Avg", "State_avg"]].rename(
    columns={"DistrictCode": "DISTRICT_CODE", "District_Avg": "DIST_AVG", "State_avg": "STATE_AVG"}
)

# Count missing values per column. This is the cell's displayed output; it is
# an inspection rather than an assertion, since missing district averages
# may be present in the source file.
SAT_df.isnull().sum()

DISTRICT_CODE    0
Test             0
Subject          0
DIST_AVG         8
STATE_AVG        0
dtype: int64