In [1]:
#import dependencies
import pandas as pd
import numpy as np

In [2]:
#read in dataset to use and preview
student_data = pd.read_csv('students_complete.csv')
school_data = pd.read_csv('schools_complete.csv')
student_data.head()

Unnamed: 0,Student ID,name,gender,grade,school,reading_score,math_score
0,0,Paul Bradley,M,9th,Huang High School,66,79
1,1,Victor Smith,M,12th,Huang High School,94,61
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58
4,4,Bonnie Ray,F,9th,Huang High School,97,84


In [3]:
school_data.head()

Unnamed: 0,School ID,name,type,size,budget
0,0,Huang High School,District,2917,1910635
1,1,Figueroa High School,District,2949,1884411
2,2,Shelton High School,Charter,1761,1056600
3,3,Hernandez High School,District,4635,3022020
4,4,Griffin High School,Charter,1468,917500


In [36]:
#analysis to get high level snapshot of all schools in the district
total_schools = student_data['school'].nunique()

total_students = student_data['Student ID'].nunique()

total_budget = school_data['budget'].sum()

avg_math_score = student_data['math_score'].mean()

avg_reading_score = student_data['reading_score'].mean()

math_passing = (student_data.loc[student_data['math_score'] > 70].count()[0])/total_students * 100

reading_passing = (student_data.loc[student_data['reading_score'] > 70].count()[0])/total_students * 100

overall_passing = np.mean([reading_passing,math_passing])
#labels for the above analysis
labels = ['Total Schools', 'Total Students', 'Total Budget','Average Math Score','Average Reading Score','% Passing Math','% Passing Reading','Overall Passing']

total_data = [total_schools,total_students,total_budget,avg_math_score,avg_reading_score,math_passing,reading_passing,overall_passing]
#create dataframe to display summary of the data
summary = pd.DataFrame(total_data,index=labels)
summary = summary.rename(columns={0:'Summary Data'})
summary.round(2)

Unnamed: 0,Summary Data
Total Schools,15.0
Total Students,39170.0
Total Budget,24649428.0
Average Math Score,78.99
Average Reading Score,81.88
% Passing Math,72.39
% Passing Reading,82.97
Overall Passing,77.68


In [37]:
#do some analysis to display key metrics of each school
#sort the data by school name
sorted_schools = school_data.sort_values('name')
sorted_students = student_data.sort_values('school')
#store data for students with passing grade, 70 in this case
passed_reading = sorted_students.loc[sorted_students['reading_score'] > 70]
passed_math = sorted_students.loc[sorted_students['math_score'] > 70]

school_groups = sorted_students.groupby('school')

school_names = sorted_schools['name'].unique()

school_type = sorted_schools['type'].values

students_per_school = school_groups['Student ID'].count().values

budget_per_school = sorted_schools['budget'].values

avgmath_per_school = school_groups['math_score'].mean().values

avgreading_per_school = school_groups['reading_score'].mean().values

mathpassing_per_school = (passed_math.groupby('school').count()['name'].values)/students_per_school * 100

readingpassing_per_school = (passed_reading.groupby('school').count()['name'].values)/students_per_school * 100

overallpassing_per_school = (mathpassing_per_school + readingpassing_per_school) / 2

In [38]:
#create dataframe to hold school metrics calculated
data_per_school = {'Shool Type':school_type, 'Number of Students':students_per_school,
                    'Budget':budget_per_school,'Average Math Score':avgmath_per_school,
                    'Average Reading Score':avgreading_per_school,'% Passing Math':mathpassing_per_school,
                    '% Passing Reading':readingpassing_per_school,'Overall Passing':overallpassing_per_school}
summary_per_school = pd.DataFrame(data_per_school,index=school_names)
summary_per_school

Unnamed: 0,% Passing Math,% Passing Reading,Average Math Score,Average Reading Score,Budget,Number of Students,Overall Passing,Shool Type
Bailey High School,64.630225,79.300643,77.048432,81.033963,3124928,4976,71.965434,District
Cabrera High School,89.558665,93.86437,83.061895,83.97578,1081356,1858,91.711518,Charter
Figueroa High School,63.750424,78.433367,76.711767,81.15802,1884411,2949,71.091896,District
Ford High School,65.753925,77.51004,77.102592,80.746258,1763916,2739,71.631982,District
Griffin High School,89.713896,93.392371,83.351499,83.816757,917500,1468,91.553134,Charter
Hernandez High School,64.746494,78.187702,77.289752,80.934412,3022020,4635,71.467098,District
Holden High School,90.632319,92.740047,83.803279,83.814988,248087,427,91.686183,Charter
Huang High School,63.318478,78.81385,76.629414,81.182722,1910635,2917,71.066164,District
Johnson High School,63.852132,78.281874,77.072464,80.966394,3094650,4761,71.067003,District
Pena High School,91.683992,92.203742,83.839917,84.044699,585858,962,91.943867,Charter


In [7]:
#data on the top five performing schools based on overall passing
top_five_performing = summary_per_school.sort_values('Overall Passing',ascending=False).iloc[0:5]
top_five_performing

Unnamed: 0,% Passing Math,% Passing Reading,Average Math Score,Average Reading Score,Budget,Number of Students,Overall Passing,Shool Type
Wilson High School,90.932983,93.25449,83.274201,83.989488,1319574,2283,92.093736,Charter
Pena High School,91.683992,92.203742,83.839917,84.044699,585858,962,91.943867,Charter
Wright High School,90.277778,93.444444,83.682222,83.955,1049400,1800,91.861111,Charter
Cabrera High School,89.558665,93.86437,83.061895,83.97578,1081356,1858,91.711518,Charter
Holden High School,90.632319,92.740047,83.803279,83.814988,248087,427,91.686183,Charter


In [8]:
#data on the bottom five performing schools based on overall passing
bottom_five_performing = summary_per_school.sort_values('Overall Passing').iloc[0:5]
bottom_five_performing

Unnamed: 0,% Passing Math,% Passing Reading,Average Math Score,Average Reading Score,Budget,Number of Students,Overall Passing,Shool Type
Rodriguez High School,64.066017,77.744436,76.842711,80.744686,2547363,3999,70.905226,District
Huang High School,63.318478,78.81385,76.629414,81.182722,1910635,2917,71.066164,District
Johnson High School,63.852132,78.281874,77.072464,80.966394,3094650,4761,71.067003,District
Figueroa High School,63.750424,78.433367,76.711767,81.15802,1884411,2949,71.091896,District
Hernandez High School,64.746494,78.187702,77.289752,80.934412,3022020,4635,71.467098,District


In [11]:
#average math scores by grade for each school
by_grade = student_data.groupby(['school','grade'])
math_per_grade = pd.DataFrame(by_grade['math_score'].mean())
math_per_grade.round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,math_score
school,grade,Unnamed: 2_level_1
Bailey High School,10th,77.0
Bailey High School,11th,77.52
Bailey High School,12th,76.49
Bailey High School,9th,77.08
Cabrera High School,10th,83.15
Cabrera High School,11th,82.77
Cabrera High School,12th,83.28
Cabrera High School,9th,83.09
Figueroa High School,10th,76.54
Figueroa High School,11th,76.88


In [12]:
#average reading scores by grade for each school
reading_per_grade = pd.DataFrame(by_grade['reading_score'].mean())
reading_per_grade.round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,reading_score
school,grade,Unnamed: 2_level_1
Bailey High School,10th,80.91
Bailey High School,11th,80.95
Bailey High School,12th,80.91
Bailey High School,9th,81.3
Cabrera High School,10th,84.25
Cabrera High School,11th,83.79
Cabrera High School,12th,84.29
Cabrera High School,9th,83.68
Figueroa High School,10th,81.41
Figueroa High School,11th,80.64


In [33]:
#scores by school spending per student,grouped
summary_per_school['Spending per Student'] = summary_per_school['Budget']/summary_per_school['Number of Students']
bins = [0,600,620,640,660]
bin_labels = ['<600','600-619','620-639','>640']
summary_per_school['Spending Range Per Student'] = pd.cut(summary_per_school['Spending per Student'],bins=4,labels=bin_labels)
summary_per_school.groupby('Spending Range Per Student')['% Passing Math','% Passing Reading','Average Math Score','Average Reading Score','Overall Passing'].mean().round(2)

Unnamed: 0_level_0,% Passing Math,% Passing Reading,Average Math Score,Average Reading Score,Overall Passing
Spending Range Per Student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
<600,90.35,93.33,83.46,83.93,91.84
600-619,90.79,92.41,83.6,83.89,91.6
620-639,77.17,86.35,80.2,82.43,81.76
>640,67.96,80.27,77.87,81.37,74.11


In [34]:
#scores by school size,grouped
summary_per_school['School Size Range'] = pd.cut(summary_per_school['Number of Students'],bins=3,labels=['Small','Medium','Large'])
summary_per_school.groupby('School Size Range')['% Passing Math','% Passing Reading','Average Math Score','Average Reading Score','Overall Passing'].mean().round(2)

Unnamed: 0_level_0,% Passing Math,% Passing Reading,Average Math Score,Average Reading Score,Overall Passing
School Size Range,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Small,90.28,93.02,83.5,83.88,91.65
Medium,70.94,82.0,78.43,81.77,76.47
Large,64.32,78.38,77.06,80.92,71.35


In [35]:
#scores by school type
summary_per_school.groupby('Shool Type')['% Passing Math','% Passing Reading','Average Math Score','Average Reading Score','Overall Passing'].mean().round(2)

Unnamed: 0_level_0,% Passing Math,% Passing Reading,Average Math Score,Average Reading Score,Overall Passing
Shool Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Charter,90.36,93.05,83.47,83.9,91.71
District,64.3,78.32,76.96,80.97,71.31
