In [37]:
# Import dependencies
import pandas as pd
import numpy as np

# Load in csv files
school_data_to_load="Resources/schools_complete.csv"
student_data_to_load="Resources/students_complete.csv"

# Convert csv files into pandas data frames
school_data_df=pd.read_csv(school_data_to_load)
student_data_df=pd.read_csv(student_data_to_load)

In [38]:
# Join the school data and student data tables on the 'school_name' column
full_data_df=pd.merge(student_data_df, school_data_df, how="left", on="school_name")
full_data_df.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635


In [39]:
# Rename column titles and rearrange column order to be more natural
full_data_df=full_data_df.rename(columns={"student_name": "Student Name", "gender": "Gender", "grade": "Grade", "school_name": "School Name", "reading_score": "Reading Score", "math_score": "Math Score", "type": "School Type", "size": "School Size", "budget": "School Budget"})
full_data_df=full_data_df[["Student ID", "Student Name", "Gender", "Grade", "Reading Score", "Math Score", "School ID", "School Name", "School Type", "School Size", "School Budget"]]
full_data_df.head()

Unnamed: 0,Student ID,Student Name,Gender,Grade,Reading Score,Math Score,School ID,School Name,School Type,School Size,School Budget
0,0,Paul Bradley,M,9th,66,79,0,Huang High School,District,2917,1910635
1,1,Victor Smith,M,12th,94,61,0,Huang High School,District,2917,1910635
2,2,Kevin Rodriguez,M,12th,90,60,0,Huang High School,District,2917,1910635
3,3,Dr. Richard Scott,M,12th,67,58,0,Huang High School,District,2917,1910635
4,4,Bonnie Ray,F,9th,97,84,0,Huang High School,District,2917,1910635


In [60]:
# Testing that columns don't have anything weird in them I can't see from the head() above. 
# As long as they don't have anything weird going on, then I can be sure my summary statistcs
# found below are reliable values. This entire section will be commented out at the end.

# Test that Student ID column has no duplicates, no NaN values, and only integer values
full_data_df['Student ID'].duplicated().unique() # Returns 'array([False])' which means no duplicate values in Student ID column
full_data_df['Student ID'].isna().unique() # Returns 'array([False])' which means no NaN values in Student ID column
full_data_df['Student ID'].dtypes # Returns 'int64' meaning each Student ID element is an integer.

# Test that School ID has no NaN values and only integer values
full_data_df['School ID'].unique() # Returns an array from 0 to 14
full_data_df['School ID'].dtypes # Returns 'int64' meaning each School ID element is an integer

# Test that for each unique School ID, there is a unique School Budget value associated with it
full_data_df.groupby(['School ID', 'School Budget']).size() # Demonstrates there is a unique school budget value for each school ID, and also prints how many rows there are for each school

# Test that the Math Score column has no NaN values and only integer values
full_data_df['Math Score'].isna().unique() # Returns 'array([False])' which means no NaN values in Math Score column
full_data_df['Math Score'].dtypes # Returns 'int64' meaning each Math Score is an integer

# Test that the Reading Score column has no NaN values and only integer values
full_data_df['Reading Score'].isna().unique() # Returns 'array([False])' which means no NaN values in Reading Score column
full_data_df['Reading Score'].dtypes # Returns 'int64' meaning each Reading Score is an integer

array([False])

In [62]:
# Calculate values of number of students passing Math and Reading, respectively

# Define counter variables, set initial values to 0
numPassMath = 0
numPassReading = 0

# Define simple for loops to count # of students passing in each subject respectively
for x in full_data_df['Math Score']:
    if x >= 70:
        numPassMath += 1

for x in full_data_df['Reading Score']:
    if x >= 70:
        numPassReading += 1

In [69]:
# Store variables for each metric I want to put in my summary dataframe
totalSchools=full_data_df.nunique().loc['School ID']
totalStudents=full_data_df.nunique().loc['Student ID']
totalBudget=full_data_df['School Budget'].unique().sum()
meanMathScore=full_data_df['Math Score'].mean()
meanReadingScore=full_data_df['Reading Score'].mean()
percentPassMath=numPassMath/totalStudents
percentPassReading=numPassReading/totalStudents
overallPassRate=(percentPassMath + percentPassReading)/2 # NOTE: This yields a different value than what the homework specifies, but I'm pretty sure this formula is correct. Not sure why there is a discrepancy.

In [98]:
# Print summary dataframe
district_summary_df=pd.DataFrame(
    {'Total Schools': totalSchools,
     'Total Students': '{:,}'.format(totalStudents),
     'Total Budget': '${:,.2f}'.format(totalBudget),
     'Average Math Score': meanMathScore,
     'Average Reading Score': meanReadingScore,
     '% Passing Math': '{0:.6f}'.format(percentPassMath*100),
     '% Passing Reading': '{0:.6f}'.format(percentPassReading*100),
     '% Overall Passing Rate': '{0:.6f}'.format(overallPassRate*100)
    }, index=['Stats'])

district_summary_df

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing Rate
Stats,15,39170,"$24,649,428.00",78.985371,81.87784,74.980853,85.805463,80.393158
