# **PyCity Schools Analysis** #

Analysis here

In [1]:
# Dependencies and Setup

import pandas as pd

# Files to load

schools_complete_file = "Resources/schools_complete.csv"
students_complete_file = "Resources/students_complete.csv"

# read files and store into Pandas DataFrame 

school_data = pd.read_csv(schools_complete_file)
student_data = pd.read_csv(students_complete_file)

# combine the data into a single dataset

school_data_complete = pd.merge(student_data, school_data, how='left', on='school_name')
school_data_complete.head()



Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635


# **District Summary** #

In [2]:
# Calculate the total number of unique schools

school_count_df = school_data_complete["school_name"].unique()
total_schools = len(school_count_df)
total_schools

15

In [3]:
# Total students

student_count = school_data_complete["student_name"].count() 
student_count

39170

In [4]:
# Total budget

total_budget = school_data_complete["budget"].unique()
grand_total = total_budget.sum()
grand_total

24649428

In [5]:
# Total budget (alternative solution)

total_budget2 = school_data_complete[["school_name","budget"]]
grouped_budget = total_budget2.groupby(["school_name"])
grand_total2 = grouped_budget.mean().sum()
grand_total2

budget    24649428
dtype: int64

In [6]:
# Average math score
average_math_score = school_data_complete["math_score"].mean()
average_math_score

78.98537145774827

In [7]:
# Average reading score

average_reading_score = school_data_complete["reading_score"].mean()
average_reading_score

81.87784018381414

In [8]:
# % passing math (the percentage of students who passed math)

passing_math_count = school_data_complete[(school_data_complete["math_score"] >= 70)].count()["student_name"]
passing_math_percentage = passing_math_count / float(student_count) * 100
passing_math_percentage


74.9808526933878

In [9]:
# % passing reading (the percentage of students who passed reading)

passing_reading_count = school_data_complete[(school_data_complete["reading_score"] >= 70)].count()["student_name"]
passing_reading_percentage = passing_reading_count / float(student_count) * 100
passing_reading_percentage

85.80546336482001

In [10]:
# % overall passing (the percentage of students who passed math AND reading)

passing_math_reading_count = school_data_complete[(school_data_complete["math_score"] >= 70) \
                                                    & (school_data_complete["reading_score"] >= 70)].count()["student_name"]

overall_passing_rate = passing_math_reading_count /  float(student_count) * 100
overall_passing_rate

65.17232575950983

In [11]:
# snapshot of the district's key metrics in a DataFrame

summary_dict = {"Total schools": [total_schools], \
                "Total students": [student_count], \
                "Total Budget": [grand_total], \
                "Average Math Score": [average_math_score], \
                "Average Reading Score": [average_reading_score], \
                "% Passing Math": [passing_math_percentage],\
                "% Passing Reading": [passing_reading_percentage], \
                "% Overall Passing": [overall_passing_rate]}

district_summary = pd.DataFrame(data=summary_dict)

# Formatting
district_summary["Total students"] = district_summary["Total students"].map("{:,}".format)
district_summary["Total Budget"] = district_summary["Total Budget"].map("${:,.2f}".format)

# Display the DataFrame

district_summary

Unnamed: 0,Total schools,Total students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,15,39170,"$24,649,428.00",78.985371,81.87784,74.980853,85.805463,65.172326


# **School Summary** #

In [12]:
# Use the code provided to select the school type
school_types = school_data.set_index(["school_name"])["type"]
school_types

school_name
Huang High School        District
Figueroa High School     District
Shelton High School       Charter
Hernandez High School    District
Griffin High School       Charter
Wilson High School        Charter
Cabrera High School       Charter
Bailey High School       District
Holden High School        Charter
Pena High School          Charter
Wright High School        Charter
Rodriguez High School    District
Johnson High School      District
Ford High School         District
Thomas High School        Charter
Name: type, dtype: object

In [13]:
# Calculate the total student count
per_school_counts = school_data["size"]
per_school_counts

0     2917
1     2949
2     1761
3     4635
4     1468
5     2283
6     1858
7     4976
8      427
9      962
10    1800
11    3999
12    4761
13    2739
14    1635
Name: size, dtype: int64

In [14]:
# Calculate the total school budget and per capita spending
per_school_budget = school_data["budget"]
per_school_capita = per_school_budget / per_school_counts
per_school_capita

0     655.0
1     639.0
2     600.0
3     652.0
4     625.0
5     578.0
6     582.0
7     628.0
8     581.0
9     609.0
10    583.0
11    637.0
12    650.0
13    644.0
14    638.0
dtype: float64

In [23]:
# Per student budget

school_data["per_student_budget"] = school_data["budget"] / per_school_counts
# per_student_budget


In [76]:
student_data

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score
0,0,Paul Bradley,M,9th,Huang High School,66,79
1,1,Victor Smith,M,12th,Huang High School,94,61
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58
4,4,Bonnie Ray,F,9th,Huang High School,97,84
...,...,...,...,...,...,...,...
39165,39165,Donna Howard,F,12th,Thomas High School,99,90
39166,39166,Dawn Bell,F,10th,Thomas High School,95,70
39167,39167,Rebecca Tanner,F,9th,Thomas High School,73,84
39168,39168,Desiree Kidd,F,10th,Thomas High School,99,90


In [93]:
#average test score per school

per_school_math = student_data.groupby(["school_name"])
print(per_school_math)
per_school_math.mean().head(15)

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fa15a86fa10>


Unnamed: 0_level_0,Student ID,reading_score,math_score
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Bailey High School,20358.5,81.033963,77.048432
Cabrera High School,16941.5,83.97578,83.061895
Figueroa High School,4391.0,81.15802,76.711767
Ford High School,36165.0,80.746258,77.102592
Griffin High School,12995.5,83.816757,83.351499
Hernandez High School,9944.0,80.934412,77.289752
Holden High School,23060.0,83.814988,83.803279
Huang High School,1458.0,81.182722,76.629414
Johnson High School,32415.0,80.966394,77.072464
Pena High School,23754.5,84.044699,83.839917


In [24]:
school_summary1 = school_data[["school_name","type","size","budget","per_student_budget"]]
school_summary1


Unnamed: 0,school_name,type,size,budget,per_student_budget
0,Huang High School,District,2917,1910635,655.0
1,Figueroa High School,District,2949,1884411,639.0
2,Shelton High School,Charter,1761,1056600,600.0
3,Hernandez High School,District,4635,3022020,652.0
4,Griffin High School,Charter,1468,917500,625.0
5,Wilson High School,Charter,2283,1319574,578.0
6,Cabrera High School,Charter,1858,1081356,582.0
7,Bailey High School,District,4976,3124928,628.0
8,Holden High School,Charter,427,248087,581.0
9,Pena High School,Charter,962,585858,609.0


In [25]:
school_summary2 = pd.merge(school_summary1, student_data, how='outer', on='school_name')
school_summary2


Unnamed: 0,school_name,type,size,budget,per_student_budget,Student ID,student_name,gender,grade,reading_score,math_score
0,Huang High School,District,2917,1910635,655.0,0,Paul Bradley,M,9th,66,79
1,Huang High School,District,2917,1910635,655.0,1,Victor Smith,M,12th,94,61
2,Huang High School,District,2917,1910635,655.0,2,Kevin Rodriguez,M,12th,90,60
3,Huang High School,District,2917,1910635,655.0,3,Dr. Richard Scott,M,12th,67,58
4,Huang High School,District,2917,1910635,655.0,4,Bonnie Ray,F,9th,97,84
...,...,...,...,...,...,...,...,...,...,...,...
39165,Thomas High School,Charter,1635,1043130,638.0,39165,Donna Howard,F,12th,99,90
39166,Thomas High School,Charter,1635,1043130,638.0,39166,Dawn Bell,F,10th,95,70
39167,Thomas High School,Charter,1635,1043130,638.0,39167,Rebecca Tanner,F,9th,73,84
39168,Thomas High School,Charter,1635,1043130,638.0,39168,Desiree Kidd,F,10th,99,90


In [27]:
# % passing math (the percentage of students who passed math)

passing_math_count = school_summary2[(school_summary2["math_score"] >= 70)].count()["student_name"]
passing_math_percentage = passing_math_count / float(student_count) * 100
passing_math_percentage


74.9808526933878

In [33]:
students_cleaned_df = student_data.loc[(student_data["reading_score"]>69),:]
students_cleaned_df.head()


Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score
1,1,Victor Smith,M,12th,Huang High School,94,61
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60
4,4,Bonnie Ray,F,9th,Huang High School,97,84
5,5,Bryan Miranda,M,9th,Huang High School,94,94
6,6,Sheena Carter,F,11th,Huang High School,82,80


In [29]:
approved_df = students_cleaned_df[(students_cleaned_df[reading_score]>69)]

NameError: name 'reading_score' is not defined

In [31]:
grouped_schools = students_cleaned_df.groupby(["school_name"])
print(grouped_schools)
total_grades_school = grouped_schools.sum()
total_grades_school

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fa15af65d50>


Unnamed: 0_level_0,Student ID,reading_score,math_score
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Bailey High School,83006203,343946,313998
Cabrera High School,30544600,152232,149831
Figueroa High School,10477295,201832,182254
Ford High School,78558499,183779,167684
Griffin High School,18533799,120145,118910
Hernandez High School,37253547,316645,289312
Holden High School,9477891,34685,34444
Huang High School,3471441,200888,181956
Johnson High School,125411472,326493,297795
Pena High School,21927071,78160,77351


In [34]:
passing_math = students_cleaned_df["math_score"] 

In [35]:
summary_3 = pd.merge(school_summary1,total_grades_school, how='outer', on='school_name')
summary_3

Unnamed: 0,school_name,type,size,budget,per_student_budget,Student ID,reading_score,math_score
0,Huang High School,District,2917,1910635,655.0,3471441,200888,181956
1,Figueroa High School,District,2949,1884411,639.0,10477295,201832,182254
2,Shelton High School,Charter,1761,1056600,600.0,11383469,142404,140821
3,Hernandez High School,District,4635,3022020,652.0,37253547,316645,289312
4,Griffin High School,Charter,1468,917500,625.0,18533799,120145,118910
5,Wilson High School,Charter,2283,1319574,578.0,32776470,186297,183550
6,Cabrera High School,Charter,1858,1081356,582.0,30544600,152232,149831
7,Bailey High School,District,4976,3124928,628.0,83006203,343946,313998
8,Holden High School,Charter,427,248087,581.0,9477891,34685,34444
9,Pena High School,Charter,962,585858,609.0,21927071,78160,77351


In [52]:
summary_4 = pd.DataFrame(summary_3)
summary_4


Unnamed: 0,school_name,type,size,budget,per_student_budget,Student ID,reading_score,math_score,average_math_score,average_reading_score
0,Huang High School,District,2917,1910635,655.0,3471441,200888,181956,62.377785,68.868015
1,Figueroa High School,District,2949,1884411,639.0,10477295,201832,182254,61.801967,68.440827
2,Shelton High School,Charter,1761,1056600,600.0,11383469,142404,140821,79.966496,80.865417
3,Hernandez High School,District,4635,3022020,652.0,37253547,316645,289312,62.418986,68.316073
4,Griffin High School,Charter,1468,917500,625.0,18533799,120145,118910,81.001362,81.842643
5,Wilson High School,Charter,2283,1319574,578.0,32776470,186297,183550,80.398598,81.60184
6,Cabrera High School,Charter,1858,1081356,582.0,30544600,152232,149831,80.641012,81.933262
7,Bailey High School,District,4976,3124928,628.0,83006203,343946,313998,63.102492,69.120981
8,Holden High School,Charter,427,248087,581.0,9477891,34685,34444,80.665105,81.229508
9,Pena High School,Charter,962,585858,609.0,21927071,78160,77351,80.406445,81.247401


In [37]:
summary_4["average_math_score"] = summary_4["math_score"] / summary_4["size"]

In [38]:
summary_4["average_reading_score"] = summary_4["reading_score"] / summary_4["size"]

In [54]:
student_data

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score
0,0,Paul Bradley,M,9th,Huang High School,66,79
1,1,Victor Smith,M,12th,Huang High School,94,61
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58
4,4,Bonnie Ray,F,9th,Huang High School,97,84
...,...,...,...,...,...,...,...
39165,39165,Donna Howard,F,12th,Thomas High School,99,90
39166,39166,Dawn Bell,F,10th,Thomas High School,95,70
39167,39167,Rebecca Tanner,F,9th,Thomas High School,73,84
39168,39168,Desiree Kidd,F,10th,Thomas High School,99,90


In [75]:
# % passing math (the percentage of students who passed math)

passing_math_count = student_data[(student_data["math_score"] >= 70)]
passing_math_percentage = passing_math_count / float(student_count) * 100
passing_math_percentage

TypeError: unsupported operand type(s) for /: 'str' and 'float'

In [71]:
summary_6 = student_data.groupby(["school_name","grade"])
summary_6.mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Student ID,reading_score,math_score
school_name,grade,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bailey High School,10th,20365.058918,80.907183,76.996772
Bailey High School,11th,20345.148681,80.945643,77.515588
Bailey High School,12th,20386.724708,80.912451,76.492218
Bailey High School,9th,20344.481481,81.303155,77.083676
Cabrera High School,10th,16909.487124,84.253219,83.154506
Cabrera High School,11th,16955.047718,83.788382,82.76556
Cabrera High School,12th,16924.570681,84.287958,83.277487
Cabrera High School,9th,16969.63447,83.676136,83.094697
Figueroa High School,10th,4332.703801,81.408912,76.539974
Figueroa High School,11th,4424.478138,80.640339,76.884344


In [55]:
# % passing math (the percentage of students who passed math)

passing_math_count = school_data_complete[(school_data_complete["math_score"] >= 70)].count()["student_name"]
passing_math_percentage = passing_math_count / float(student_count) * 100
passing_math_percentage

74.9808526933878