# Analysis of School Data - Performance Indicators

In [90]:
# Dependencies and Setup
import pandas as pd

# Render floats in displayed frames with thousands separators and one decimal place
pd.set_option("display.float_format", "{:,.1f}".format)

In [91]:
# Input files (paths are relative to the notebook's working directory)
school_data_to_load = "Resources/schools_complete.csv"
student_data_to_load = "Resources/students_complete.csv"

# Read the school and student CSV files into pandas DataFrames
school_data = pd.read_csv(school_data_to_load)
student_data = pd.read_csv(student_data_to_load)

# Attach the school columns to every student row. The left join keeps all
# student rows; the join key "school_name" only needs to be named once.
school_data_complete = pd.merge(student_data, school_data, how="left", on="school_name")

## District Summary

In [92]:
# Number of distinct school names in the merged data
total_schools = school_data_complete.loc[:, "school_name"].nunique()
total_schools

15

In [93]:
# District head count: number of rows with a non-null student name
total_students = school_data_complete["student_name"].notna().sum()
total_students

39170

In [94]:
# District budget: every school's budget counted exactly once.
# Budget values repeat across rows of the merged frame, so collapse to unique
# (school, budget) pairs first. Only the numeric column is summed; summing the
# whole frame would also "add up" (concatenate) the school-name strings.
unique_school_budgets = school_data_complete.loc[:, ["school_name", "budget"]].drop_duplicates()
total_budget = unique_school_budgets["budget"].sum()
total_budget

24649428

In [95]:
# District-wide mean math score over all student rows
math_mean = school_data_complete.loc[:, 'math_score'].mean()
math_mean

78.98537145774827

In [96]:
# District-wide mean reading score over all student rows
read_mean = school_data_complete.loc[:, 'reading_score'].mean()
read_mean

81.87784018381414

In [97]:
# Simple (unweighted) average of the two district mean scores.
# NOTE(review): this is an average *score*, yet the summary table reports it
# under 'Overall % Pass' -- confirm that label/definition is intended.
pass_mean = 0.5 * (math_mean + read_mean)
pass_mean

80.43160582078121

In [98]:
# Percentage of students with a math score of 70 or above (70 itself passes)
math_passers = school_data_complete['math_score'] >= 70
pass_math = math_passers.sum() / total_students * 100
pass_math

74.9808526933878

In [99]:
# Percentage of students with a reading score of 70 or above (70 itself passes)
reading_passers = school_data_complete['reading_score'] >= 70
pass_read = reading_passers.sum() / total_students * 100
pass_read

85.80546336482001

In [100]:
# Assemble the district-level scalars into a one-row summary frame
summary_columns = [
    'Total Schools',
    'Total Students',
    'Total Budget',
    'Average Math Score',
    'Average Reading Score',
    'Overall % Pass',
    '% Math Pass',
    '% Read Pass',
]
summary_values = [
    total_schools,
    total_students,
    total_budget,
    math_mean,
    read_mean,
    pass_mean,
    pass_math,
    pass_read,
]
data = dict(zip(summary_columns, summary_values))

# A dict of scalars must be wrapped in a list to become a single row
summary_df = pd.DataFrame([data], index=[0])
summary_df.head()

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,Overall % Pass,% Math Pass,% Read Pass
0,15,39170,24649428,79.0,81.9,80.4,75.0,85.8


In [101]:
# Per-column display formats applied through the Styler, which leaves the
# underlying values numeric (unlike `.map`, which would convert them to strings).
# Columns without an entry here are rendered with the Styler's own defaults,
# not with the float format configured at the top of the notebook.
column_formats = {'Total Students': "{0:,.0f}", 'Total Budget': "${0:,.2f}"}
summary_df.style.format(column_formats)

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,Overall % Pass,% Math Pass,% Read Pass
0,15,39170,"$24,649,428.00",78.985371,81.87784,80.431606,74.980853,85.805463


## School Summary

In [117]:
# One row per school, indexed by school name, carrying the school type
main_df = (
    school_data_complete[["school_name", "type"]]
    .sort_values(by="school_name")
    .drop_duplicates()
    .set_index("school_name")
    .rename(columns={"type": "School Type"})
)

In [118]:
# Students per school (non-null student names), added as "Total Students"
count_values = (
    school_data_complete.groupby("school_name")["student_name"]
    .count()
    .to_frame("Total Students")
)
main_df = main_df.merge(count_values, on=['school_name'])

In [119]:
# Per-school budget, indexed by school name, added as "Total School Budget".
# A dedicated name is used here so the district-level scalar `total_budget`
# is not overwritten with a DataFrame (which would break re-running the
# district summary cells).
school_budget_df = (
    school_data_complete.loc[:, ["school_name", "budget"]]
    .drop_duplicates()
    .rename(columns={"budget": "Total School Budget"})
    .set_index("school_name")
)
main_df = main_df.merge(school_budget_df, on=['school_name'])

In [120]:
# School budget divided by the school's student count
main_df['Per Student Budget'] = main_df['Total School Budget'].div(main_df['Total Students'])

In [121]:
# Mean math and reading score per school, merged into the school summary.
# Only the two numeric score columns are aggregated: including the string
# column student_name makes groupby(...).mean() raise TypeError on
# pandas >= 2.0 (older versions silently dropped it).
mean_values = (
    school_data_complete.groupby("school_name")[["math_score", "reading_score"]]
    .mean()
    .rename(columns={"math_score": "Average Math Score", "reading_score": "Average Reading Score"})
)
main_df = main_df.merge(mean_values, on=['school_name'])

In [122]:
# Percentage of each school's students with a math score of 70 or above.
# Averaging the boolean pass flag per school gives the pass fraction directly
# (denominator = number of student rows for that school), yields 0 rather than
# NaN for a school with no passers, and leaves the district-level scalar
# `pass_math` untouched.
math_pass_flag = school_data_complete['math_score'] >= 70
pass_percent_math = math_pass_flag.groupby(school_data_complete['school_name']).mean() * 100
pass_percent_math = pass_percent_math.to_frame("% Passing Math")
main_df = main_df.merge(pass_percent_math, on=['school_name'])

In [123]:
# Percentage of each school's students with a reading score of 70 or above.
# Averaging the boolean pass flag per school gives the pass fraction directly
# (denominator = number of student rows for that school), yields 0 rather than
# NaN for a school with no passers, and leaves the district-level scalar
# `pass_read` untouched.
reading_pass_flag = school_data_complete['reading_score'] >= 70
pass_percent_read = reading_pass_flag.groupby(school_data_complete['school_name']).mean() * 100
pass_percent_read = pass_percent_read.to_frame("% Passing Reading")
main_df = main_df.merge(pass_percent_read, on=['school_name'])

In [124]:
# Share of each school's students who scored 70 or above in BOTH subjects.
# This measures "passed everything" directly instead of averaging two rates.
passed_both = (school_data_complete['reading_score'] >= 70) & (school_data_complete['math_score'] >= 70)
pass_read_and_math = school_data_complete[passed_both].groupby('school_name').count()

# Division aligns on the school_name index shared with main_df
percent_pass_read_and_math = pass_read_and_math['student_name'] / main_df['Total Students'] * 100


In [126]:
# Unweighted mean of the two subject pass rates. The result always lies
# between the math and reading rates; it is not the share of students who
# passed both subjects.
main_df['Overall % Passing Rate'] = main_df['% Passing Math'].add(main_df['% Passing Reading']).div(2)

# Share of students who passed both subjects, for comparison
main_df['Better Overall % Passsing'] = percent_pass_read_and_math

main_df.head()

Unnamed: 0_level_0,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,Overall % Passing Rate,Better Overall % Passsing
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Bailey High School,District,4976,3124928,628.0,77.0,81.0,66.7,81.9,74.3,54.6
Cabrera High School,Charter,1858,1081356,582.0,83.1,84.0,94.1,97.0,95.6,91.3
Figueroa High School,District,2949,1884411,639.0,76.7,81.2,66.0,80.7,73.4,53.2
Ford High School,District,2739,1763916,644.0,77.1,80.7,68.3,79.3,73.8,54.3
Griffin High School,Charter,1468,917500,625.0,83.4,83.8,93.4,97.1,95.3,90.6


## Top Performing Schools (By Passing Rate)

In [127]:
# Five schools with the highest overall passing rate
main_df.sort_values(by='Overall % Passing Rate', ascending=False).head(5)

Unnamed: 0_level_0,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,Overall % Passing Rate,Better Overall % Passsing
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Cabrera High School,Charter,1858,1081356,582.0,83.1,84.0,94.1,97.0,95.6,91.3
Thomas High School,Charter,1635,1043130,638.0,83.4,83.8,93.3,97.3,95.3,90.9
Pena High School,Charter,962,585858,609.0,83.8,84.0,94.6,95.9,95.3,90.5
Griffin High School,Charter,1468,917500,625.0,83.4,83.8,93.4,97.1,95.3,90.6
Wilson High School,Charter,2283,1319574,578.0,83.3,84.0,93.9,96.5,95.2,90.6


## Bottom Performing Schools (By Passing Rate)

In [128]:
# Five schools with the lowest overall passing rate
main_df.sort_values(by='Overall % Passing Rate', ascending=True).head(5)

Unnamed: 0_level_0,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,Overall % Passing Rate,Better Overall % Passsing
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Rodriguez High School,District,3999,2547363,637.0,76.8,80.7,66.4,80.2,73.3,53.0
Figueroa High School,District,2949,1884411,639.0,76.7,81.2,66.0,80.7,73.4,53.2
Huang High School,District,2917,1910635,655.0,76.6,81.2,65.7,81.3,73.5,53.5
Johnson High School,District,4761,3094650,650.0,77.1,81.0,66.1,81.2,73.6,53.5
Ford High School,District,2739,1763916,644.0,77.1,80.7,68.3,79.3,73.8,54.3


## Math Scores by Grade

In [137]:
# Mean math score per school, one column per grade.
# A single groupby/unstack replaces four per-grade frames joined by a chain of
# merges and suffix renames. It also avoids selecting the grouping key
# ('school_name') as a data column, which can make .mean() fail on
# pandas >= 2.0.
grade_order = ['9th', '10th', '11th', '12th']
df_1 = (
    school_data_complete[school_data_complete['grade'].isin(grade_order)]
    .groupby(['school_name', 'grade'])['math_score']
    .mean()
    .unstack('grade')
    .reindex(columns=grade_order)   # fix the column order 9th -> 12th
    .dropna()                       # keep only schools with a value for every grade (as the inner merges did)
    .rename_axis(columns=None)      # drop the 'grade' label from the column header
)

df_1

Unnamed: 0_level_0,9th,10th,11th,12th
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bailey High School,77.1,77.0,77.5,76.5
Cabrera High School,83.1,83.2,82.8,83.3
Figueroa High School,76.4,76.5,76.9,77.2
Ford High School,77.4,77.7,76.9,76.2
Griffin High School,82.0,84.2,83.8,83.4
Hernandez High School,77.4,77.3,77.1,77.2
Holden High School,83.8,83.4,85.0,82.9
Huang High School,77.0,75.9,76.4,77.2
Johnson High School,77.2,76.7,77.5,76.9
Pena High School,83.6,83.4,84.3,84.1


## Reading Score by Grade 

In [130]:
# Mean reading score per school, one column per grade.
# A single groupby/unstack replaces four per-grade frames joined by a chain of
# merges and suffix renames. It also avoids selecting the grouping key
# ('school_name') as a data column, which can make .mean() fail on
# pandas >= 2.0.
grade_order = ['9th', '10th', '11th', '12th']
df_1 = (
    school_data_complete[school_data_complete['grade'].isin(grade_order)]
    .groupby(['school_name', 'grade'])['reading_score']
    .mean()
    .unstack('grade')
    .reindex(columns=grade_order)   # fix the column order 9th -> 12th
    .dropna()                       # keep only schools with a value for every grade (as the inner merges did)
    .rename_axis(columns=None)      # drop the 'grade' label from the column header
)

df_1.head()

Unnamed: 0_level_0,9th,10th,11th,12th
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bailey High School,81.3,80.9,80.9,80.9
Cabrera High School,83.7,84.3,83.8,84.3
Figueroa High School,81.2,81.4,80.6,81.4
Ford High School,80.6,81.3,80.4,80.7
Griffin High School,83.4,83.7,84.3,84.0


## Scores by School Spending

In [131]:
# Per-student spending bin edges; the labels are derived from the edges so the
# two lists cannot drift apart.
spending_bins = [0, 585, 615, 645, 675]
group_names = ["<$585"] + [
    f"${low}-{high}" for low, high in zip(spending_bins[1:-1], spending_bins[2:])
]

In [133]:
# Average school-level metrics within each per-student spending range.
# This is an unweighted mean over schools: each school counts once,
# regardless of its size.
# pd.cut uses right-closed intervals by default, so a school spending exactly
# 585 per student lands in the "<$585" bin.
spending_metric_cols = [
    'Average Math Score', 'Average Reading Score',
    '% Passing Math', '% Passing Reading', 'Overall % Passing Rate',
]
spending_range = pd.cut(main_df['Per Student Budget'], spending_bins, labels=group_names)

# observed=False is spelled out because the grouping key is categorical: it
# keeps every bin in the result (the long-standing default) and avoids the
# deprecation warning newer pandas emits when it is left implicit.
filter_df = (
    main_df.loc[:, spending_metric_cols]
    .assign(**{"Spending Ranges (Per Student)": spending_range})
    .groupby(['Spending Ranges (Per Student)'], observed=False)
)
filter_df.mean()

Unnamed: 0_level_0,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,Overall % Passing Rate
Spending Ranges (Per Student),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
<$585,83.5,83.9,93.5,96.6,95.0
$585-615,83.6,83.9,94.2,95.9,95.1
$615-645,79.1,81.9,75.7,86.1,80.9
$645-675,77.0,81.0,66.2,81.1,73.6


## Scores by School Size

In [134]:
# School-size bin edges (student counts); labels are built from the edges
size_bins = [0, 1000, 2000, 5000]
small_max, medium_max, large_max = size_bins[1:]
group_names = [
    f"Small (<{small_max})",
    f"Medium ({small_max}-{medium_max})",
    f"Large ({medium_max}-{large_max})",
]

In [135]:
# Average school-level metrics within each school-size band.
# This is an unweighted mean over schools: each school counts once.
# pd.cut uses right-closed intervals by default, so a school with exactly
# 1000 students lands in the "Small (<1000)" bin.
size_metric_cols = [
    'Average Math Score', 'Average Reading Score',
    '% Passing Math', '% Passing Reading', 'Overall % Passing Rate',
]
school_size = pd.cut(main_df['Total Students'], size_bins, labels=group_names)

# observed=False is spelled out because the grouping key is categorical: it
# keeps every bin in the result (the long-standing default) and avoids the
# deprecation warning newer pandas emits when it is left implicit.
filter2_df = (
    main_df.loc[:, size_metric_cols]
    .assign(**{"School Size": school_size})
    .groupby(['School Size'], observed=False)
)
filter2_df.mean()

Unnamed: 0_level_0,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,Overall % Passing Rate
School Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Small (<1000),83.8,83.9,93.6,96.1,94.8
Medium (1000-2000),83.4,83.9,93.6,96.8,95.2
Large (2000-5000),77.7,81.3,70.0,82.8,76.4


## Scores by School Type

In [136]:
# Average school-level metrics by school type. The type is already a
# category-like column, so no binning step is needed here.
# As with the other summaries, this is an unweighted mean over schools.
type_metric_cols = [
    'Average Math Score', 'Average Reading Score',
    '% Passing Math', '% Passing Reading', 'Overall % Passing Rate',
]
filter3_df = main_df[['School Type'] + type_metric_cols].groupby(['School Type'])
filter3_df.mean()

Unnamed: 0_level_0,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,Overall % Passing Rate
School Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Charter,83.5,83.9,93.6,96.6,95.1
District,77.0,81.0,66.5,80.8,73.7
