In [1]:
# Dependencies and Setup
import pandas as pd
import numpy as np

# File to Load (Remember to Change These)
school_data_to_load = "../Resources/schools_complete.csv"
student_data_to_load = "../Resources/students_complete.csv"

# Read School and Student Data File and store into Pandas DataFrames
school_data = pd.read_csv(school_data_to_load)
student_data = pd.read_csv(student_data_to_load)

# Combine the data into a single dataset.
# Each student row is matched to its school through the single shared key
# "school_name". validate="many_to_one" makes the merge fail loudly if a school
# name appears more than once in school_data, because such a duplicate would
# silently multiply student rows and distort every count computed downstream.
school_data_complete = pd.merge(
    student_data,
    school_data,
    how="left",
    on="school_name",
    validate="many_to_one",
)
school_data_complete.head(100)

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635
...,...,...,...,...,...,...,...,...,...,...,...
95,95,Kevin Martinez,M,11th,Huang High School,89,58,0,District,2917,1910635
96,96,Jessica Smith,F,9th,Huang High School,77,72,0,District,2917,1910635
97,97,Dawn Trujillo,F,11th,Huang High School,82,87,0,District,2917,1910635
98,98,Virginia Ramirez MD,F,10th,Huang High School,99,89,0,District,2917,1910635


In [2]:
#Calculate the total number of schools
# Work on an independent copy of the raw school frame, so that columns added
# to school_data_df in later cells cannot alter school_data itself.
school_data_df = school_data.copy()
# Count distinct school names rather than rows, so a repeated row in the CSV
# cannot inflate the total.
totalschools = school_data_df["school_name"].nunique()
totalschools

15

In [3]:
# Total number of students: the row count of the merged student/school frame
totalstudents = school_data_complete["student_name"].shape[0]
totalstudents

39170

In [4]:
# Preview the first five rows of the school-level frame
school_data_df.iloc[:5]

Unnamed: 0,School ID,school_name,type,size,budget
0,0,Huang High School,District,2917,1910635
1,1,Figueroa High School,District,2949,1884411
2,2,Shelton High School,Charter,1761,1056600
3,3,Hernandez High School,District,4635,3022020
4,4,Griffin High School,Charter,1468,917500


In [5]:
# District-wide budget: the sum of the "budget" column over all schools
budgets = school_data_df.loc[:, "budget"].sum()
budgets

24649428

In [6]:
# District-wide mean of the math scores across all student rows
avg_math = school_data_complete.loc[:, "math_score"].mean()
avg_math

78.98537145774827

In [7]:
# District-wide mean of the reading scores across all student rows
avg_read = school_data_complete.loc[:, "reading_score"].mean()
avg_read

81.87784018381414

In [8]:
# Percentage of students with a passing math score (70 or greater).
# Summing the boolean mask counts the rows that satisfy the threshold.
passing_math = int((school_data_complete["math_score"] >= 70).sum())
total_pass_math_percent = passing_math / totalstudents * 100
# Show the percentage, the number of passers and the student total, one per line
print(total_pass_math_percent, passing_math, totalstudents, sep="\n")

74.9808526933878
29370
39170


In [9]:
# Percentage of students with a passing reading score (70 or greater).
# Summing the boolean mask counts the rows that satisfy the threshold.
passing_read = int((school_data_complete["reading_score"] >= 70).sum())
total_read_pass_percent = passing_read / totalstudents * 100
# Show the percentage, the number of passers and the student total, one per line
print(total_read_pass_percent, passing_read, totalstudents, sep="\n")

85.80546336482001
33610
39170


In [10]:
# Percentage of students who passed both math and reading (% Overall Passing).
# First keep the rows that pass reading, then count those that also pass math.
reading_pass_mask = school_data_complete["reading_score"] >= 70
passing_df = school_data_complete.loc[reading_pass_mask]
overall_pass = int((passing_df["math_score"] >= 70).sum())
overall_pass_percent = overall_pass / totalstudents * 100
overall_pass_percent

65.17232575950983

In [11]:
# Collect the district-level metrics computed above into a one-row summary table
district_record = {
    "Total Schools": totalschools,
    "Total Students": totalstudents,
    "Total Budget": budgets,
    "Average Math Score": avg_math,
    "Average Reading Score": avg_read,
    "% Passing Math": total_pass_math_percent,
    "% Passing Reading": total_read_pass_percent,
    "% Overall Passing": overall_pass_percent,
}
# A single-record list yields one row; the columns follow the key order above
District_Data = pd.DataFrame([district_record])

In [12]:
#Optional: give the displayed data cleaner formatting
# Format a copy instead of overwriting District_Data: assign() returns a new
# frame, so District_Data keeps its numeric values and this cell can be re-run
# safely. Overwriting in place would make a second run fail, because the
# already-formatted text "39,170" cannot be converted back with astype(float).
District_Data_display = District_Data.assign(**{
    # Thousands separators, no decimals
    "Total Students": District_Data["Total Students"].astype(float).map("{:,.0f}".format),
    # Currency with thousands separators and two decimals
    "Total Budget": District_Data["Total Budget"].astype(float).map("${:,.2f}".format),
})
District_Data_display

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,15,39170,"$24,649,428.00",78.985371,81.87784,74.980853,85.805463,65.172326


In [13]:
# Per-school overview: display the school-level frame, which carries the
# school name, school type, total students per school ("size") and total
# school budget ("budget").
school_data_df

Unnamed: 0,School ID,school_name,type,size,budget
0,0,Huang High School,District,2917,1910635
1,1,Figueroa High School,District,2949,1884411
2,2,Shelton High School,Charter,1761,1056600
3,3,Hernandez High School,District,4635,3022020
4,4,Griffin High School,Charter,1468,917500
5,5,Wilson High School,Charter,2283,1319574
6,6,Cabrera High School,Charter,1858,1081356
7,7,Bailey High School,District,4976,3124928
8,8,Holden High School,Charter,427,248087
9,9,Pena High School,Charter,962,585858


In [14]:
# Per Student Budget: each school's budget divided by its enrollment ("size")
dollars_per_student = school_data['budget'].div(school_data['size'])
dollars_per_student

0     655.0
1     639.0
2     600.0
3     652.0
4     625.0
5     578.0
6     582.0
7     628.0
8     581.0
9     609.0
10    583.0
11    637.0
12    650.0
13    644.0
14    638.0
dtype: float64

In [15]:
# Average Math Score per school: mean math score grouped by 'School ID'
math_scores = school_data_complete['math_score'].groupby(school_data_complete['School ID']).mean()
math_scores

School ID
0     76.629414
1     76.711767
2     83.359455
3     77.289752
4     83.351499
5     83.274201
6     83.061895
7     77.048432
8     83.803279
9     83.839917
10    83.682222
11    76.842711
12    77.072464
13    77.102592
14    83.418349
Name: math_score, dtype: float64

In [16]:
# Average Reading Score per school: mean reading score grouped by 'School ID'
reading_scores = school_data_complete['reading_score'].groupby(school_data_complete['School ID']).mean()
reading_scores

School ID
0     81.182722
1     81.158020
2     83.725724
3     80.934412
4     83.816757
5     83.989488
6     83.975780
7     81.033963
8     83.814988
9     84.044699
10    83.955000
11    80.744686
12    80.966394
13    80.746258
14    83.848930
Name: reading_score, dtype: float64

In [17]:
#% Passing Math
# Enrollment keyed by 'School ID', so the division below aligns on the school
# identifier. Dividing by school_data['size'] directly would align the
# 'School ID' labels against row positions, which is only correct while every
# ID happens to equal its row number.
size_by_school_id = school_data.set_index('School ID')['size']
schools_passing_math = school_data_complete.loc[school_data_complete['math_score'] >= 70]
math_by_school = schools_passing_math.groupby('School ID')['math_score'].count()
# A school with no passing students is absent from the grouped count; reindex
# with fill_value=0 so it reports 0% instead of NaN.
math_by_school_percent = (
    math_by_school.reindex(size_by_school_id.index, fill_value=0) / size_by_school_id * 100
)
math_by_school_percent

School ID
0     65.683922
1     65.988471
2     93.867121
3     66.752967
4     93.392371
5     93.867718
6     94.133477
7     66.680064
8     92.505855
9     94.594595
10    93.333333
11    66.366592
12    66.057551
13    68.309602
14    93.272171
dtype: float64

In [18]:
#% Passing Reading
# Enrollment keyed by 'School ID', so the division below aligns on the school
# identifier. Dividing by school_data['size'] directly would align the
# 'School ID' labels against row positions, which is only correct while every
# ID happens to equal its row number.
size_by_school_id = school_data.set_index('School ID')['size']
schools_passing_reading = school_data_complete.loc[school_data_complete['reading_score'] >= 70]
reading_by_school = schools_passing_reading.groupby('School ID')['reading_score'].count()
# A school with no passing students is absent from the grouped count; reindex
# with fill_value=0 so it reports 0% instead of NaN.
reading_by_school_percent = (
    reading_by_school.reindex(size_by_school_id.index, fill_value=0) / size_by_school_id * 100
)
reading_by_school_percent

School ID
0     81.316421
1     80.739234
2     95.854628
3     80.862999
4     97.138965
5     96.539641
6     97.039828
7     81.933280
8     96.252927
9     95.945946
10    96.611111
11    80.220055
12    81.222432
13    79.299014
14    97.308869
dtype: float64

In [19]:
#% Overall Passing (The percentage of students that passed math and reading.)
# Enrollment keyed by 'School ID', so the division below aligns on the school
# identifier. Dividing by school_data['size'] directly would align the
# 'School ID' labels against row positions, which is only correct while every
# ID happens to equal its row number.
size_by_school_id = school_data.set_index('School ID')['size']
# Keep the rows that pass math, then of those the rows that also pass reading
overall_by_school = school_data_complete.loc[school_data_complete['math_score'] >= 70]
overall_by_school = overall_by_school.loc[overall_by_school['reading_score'] >= 70]
# Number of students per school who passed both subjects
overall_by_school = overall_by_school.groupby('School ID')['reading_score'].count()
# A school with no student passing both is absent from the grouped count;
# reindex with fill_value=0 so it reports 0% instead of NaN.
overall_by_school_percent = (
    overall_by_school.reindex(size_by_school_id.index, fill_value=0) / size_by_school_id * 100
)
overall_by_school_percent

School ID
0     53.513884
1     53.204476
2     89.892107
3     53.527508
4     90.599455
5     90.582567
6     91.334769
7     54.642283
8     89.227166
9     90.540541
10    90.333333
11    52.988247
12    53.539172
13    54.289887
14    90.948012
dtype: float64

In [26]:
#Create a dataframe to hold the above results
# The grouped metrics are indexed by 'School ID', while school_data_df has a
# positional row index. Map each metric through the 'School ID' column so every
# value lands on its own school, regardless of row order or ID numbering.
school_ids = school_data_df['School ID']
# dollars_per_student was computed row-wise from the school frame, so it already
# shares the positional row index and can be assigned directly.
school_data_df['Per Student Spending'] = dollars_per_student
school_data_df['Average Reading Score'] = school_ids.map(reading_scores)
school_data_df['Average Math Score'] = school_ids.map(math_scores)
school_data_df['Percent Passing Math'] = school_ids.map(math_by_school_percent)
school_data_df['Percent Passing Reading'] = school_ids.map(reading_by_school_percent)
school_data_df['Percent Overall Passing'] = school_ids.map(overall_by_school_percent)
# Hide 'School ID' only in the displayed view: drop() returns a new frame, so
# school_data_df keeps the column and this cell stays safe to re-run.
school_data_df.drop(columns='School ID').sort_values('school_name').set_index('school_name')


Unnamed: 0_level_0,type,size,budget,Per Student Spending,Average Reading Score,Average Math Score,Percent Passing Math,Percent Passing Reading,Percent Overall Passing
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Bailey High School,District,4976,3124928,628.0,81.033963,77.048432,66.680064,81.93328,54.642283
Cabrera High School,Charter,1858,1081356,582.0,83.97578,83.061895,94.133477,97.039828,91.334769
Figueroa High School,District,2949,1884411,639.0,81.15802,76.711767,65.988471,80.739234,53.204476
Ford High School,District,2739,1763916,644.0,80.746258,77.102592,68.309602,79.299014,54.289887
Griffin High School,Charter,1468,917500,625.0,83.816757,83.351499,93.392371,97.138965,90.599455
Hernandez High School,District,4635,3022020,652.0,80.934412,77.289752,66.752967,80.862999,53.527508
Holden High School,Charter,427,248087,581.0,83.814988,83.803279,92.505855,96.252927,89.227166
Huang High School,District,2917,1910635,655.0,81.182722,76.629414,65.683922,81.316421,53.513884
Johnson High School,District,4761,3094650,650.0,80.966394,77.072464,66.057551,81.222432,53.539172
Pena High School,Charter,962,585858,609.0,84.044699,83.839917,94.594595,95.945946,90.540541


In [30]:
# Top Performing Schools (By % Overall Passing):
# rank schools from highest to lowest overall passing rate and keep the first five
ranked_high_to_low = school_data_df.sort_values(by='Percent Overall Passing', ascending=False)
top_5_df = ranked_high_to_low.iloc[:5].set_index('school_name')
top_5_df


Unnamed: 0_level_0,type,size,budget,Per Student Spending,Average Reading Score,Average Math Score,Percent Passing Math,Percent Passing Reading,Percent Overall Passing
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Cabrera High School,Charter,1858,1081356,582.0,83.97578,83.061895,94.133477,97.039828,91.334769
Thomas High School,Charter,1635,1043130,638.0,83.84893,83.418349,93.272171,97.308869,90.948012
Griffin High School,Charter,1468,917500,625.0,83.816757,83.351499,93.392371,97.138965,90.599455
Wilson High School,Charter,2283,1319574,578.0,83.989488,83.274201,93.867718,96.539641,90.582567
Pena High School,Charter,962,585858,609.0,84.044699,83.839917,94.594595,95.945946,90.540541


In [33]:
# Bottom Performing Schools (By % Overall Passing):
# rank schools from lowest to highest overall passing rate and keep the first five
ranked_low_to_high = school_data_df.sort_values(by='Percent Overall Passing', ascending=True)
bottom_5_df = ranked_low_to_high.iloc[:5].set_index('school_name')
bottom_5_df

Unnamed: 0_level_0,type,size,budget,Per Student Spending,Average Reading Score,Average Math Score,Percent Passing Math,Percent Passing Reading,Percent Overall Passing
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Rodriguez High School,District,3999,2547363,637.0,80.744686,76.842711,66.366592,80.220055,52.988247
Figueroa High School,District,2949,1884411,639.0,81.15802,76.711767,65.988471,80.739234,53.204476
Huang High School,District,2917,1910635,655.0,81.182722,76.629414,65.683922,81.316421,53.513884
Hernandez High School,District,4635,3022020,652.0,80.934412,77.289752,66.752967,80.862999,53.527508
Johnson High School,District,4761,3094650,650.0,80.966394,77.072464,66.057551,81.222432,53.539172
