Py City Schools Analysis



In [2]:
# Dependencies and Setup
import pandas as pd
from pathlib import Path

# Directory holding the two source CSV files. It is defined once so the
# notebook can be pointed at a different checkout by editing a single line.
# NOTE(review): this is still a machine-specific absolute path; consider making
# it relative to the notebook so the analysis can run on other machines.
DATA_DIR = Path("/Users/grahammarsh/Documents/GitHub/pandas-challenge/Resources")

# Files to load
school_data_to_load = DATA_DIR / "schools_complete.csv"
student_data_to_load = DATA_DIR / "students_complete.csv"

# Read the school and student data files and store them in pandas DataFrames
school_data = pd.read_csv(school_data_to_load)
student_data = pd.read_csv(student_data_to_load)

# Combine the data into a single dataset: a left join keeps every student row
# and attaches the columns of the matching school.
# The join key is given once (the same column name exists in both frames).
# validate="many_to_one" makes pandas raise if a school name occurs more than
# once in school_data, which would otherwise silently duplicate student rows.
school_data_complete = pd.merge(
    student_data,
    school_data,
    how="left",
    on="school_name",
    validate="many_to_one",
)
school_data_complete.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635


District Summary

In [3]:
# Count the distinct school names present in the merged dataset.
# dropna=False keeps a missing name counted as its own value.
total_schools = school_data_complete["school_name"].nunique(dropna=False)
print(total_schools)

15


In [4]:
# Count the distinct student IDs present in the merged dataset.
# dropna=False keeps a missing ID counted as its own value.
total_students = school_data_complete["Student ID"].nunique(dropna=False)
print(total_students)

39170


In [5]:
# Add up the "budget" column of the per-school table to get the district total
total_budget = school_data.loc[:, "budget"].sum()
print(total_budget)

24649428


In [6]:
# District-wide mean of the "math_score" column across all student rows
average_math_score = student_data.loc[:, "math_score"].mean()
print(average_math_score)

78.98537145774827


In [7]:
# District-wide mean of the "reading_score" column across all student rows
average_reading_score = student_data.loc[:, "reading_score"].mean()
print(average_reading_score)

81.87784018381414


In [8]:
# Calculate the percentage of students with a passing math score (70 or greater)
passing_math = student_data.loc[student_data["math_score"] >= 70]

# Count the matching rows themselves. Counting the non-null values of the
# "student_name" column would silently leave out any passing student whose
# name is missing.
passing_math_count = len(passing_math)

# True division already yields a float, so no explicit float() cast is needed.
passing_math_percentage = passing_math_count / total_students * 100
print(passing_math_percentage)

74.9808526933878


In [9]:
# Calculate the percentage of students with a passing reading score (70 or greater)
passing_reading = student_data.loc[student_data["reading_score"] >= 70]

# Count the matching rows themselves. Counting the non-null values of the
# "student_name" column would silently leave out any passing student whose
# name is missing.
passing_reading_count = len(passing_reading)

# True division already yields a float, so no explicit float() cast is needed.
passing_reading_percentage = passing_reading_count / total_students * 100
print(passing_reading_percentage)

85.80546336482001


In [10]:
# Calculate the percentage of students who passed math and reading (% Overall Passing)
# A student passes overall only when BOTH scores are 70 or greater.
overall_passing = student_data.loc[
    (student_data["math_score"] >= 70) & (student_data["reading_score"] >= 70)
]

# Count the matching rows themselves. Counting the non-null values of the
# "student_name" column would silently leave out any passing student whose
# name is missing.
overall_passing_count = len(overall_passing)

# True division already yields a float, so no explicit float() cast is needed.
overall_passing_percentage = overall_passing_count / total_students * 100
print(overall_passing_percentage)

65.17232575950983


In [11]:
# Gather the district-level results computed above into one record;
# the keys become the column headers of the summary table.
district_metrics = {
    "Total Schools": total_schools,
    "Total Students": total_students,
    "Total Budget": total_budget,
    "Average Math Score": average_math_score,
    "Average Reading Score": average_reading_score,
    "% Passing Math": passing_math_percentage,
    "% Passing Reading": passing_reading_percentage,
    "% Overall Passing": overall_passing_percentage,
}

# A one-element list of records produces a single-row DataFrame
district_summary_df = pd.DataFrame([district_metrics])

# Display formats, applied column by column (values become strings):
#   "Total Students" -> comma as thousands separator
#   "Total Budget"   -> "$", comma as thousands separator, two decimals
column_formats = {
    "Total Students": "{:,}",
    "Total Budget": "${:,.2f}",
}
for column, template in column_formats.items():
    district_summary_df[column] = district_summary_df[column].map(template.format)

# Show the formatted summary as the cell output
district_summary_df


Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,15,39170,"$24,649,428.00",78.985371,81.87784,74.980853,85.805463,65.172326


School Summary

In [30]:
# Build a Series of school type indexed by school name, then show it
school_types = school_data.set_index("school_name")["type"]
print(school_types)


school_name
Huang High School        District
Figueroa High School     District
Shelton High School       Charter
Hernandez High School    District
Griffin High School       Charter
Wilson High School        Charter
Cabrera High School       Charter
Bailey High School       District
Holden High School        Charter
Pena High School          Charter
Wright High School        Charter
Rodriguez High School    District
Johnson High School      District
Ford High School         District
Thomas High School        Charter
Name: type, dtype: object


In [31]:
# Student count per school.
# The merged frame has one row per student, so the number of times a school
# name occurs in "school_name" is that school's enrolment.
per_school_counts = school_data_complete.loc[:, "school_name"].value_counts()
print(per_school_counts)

school_name
Bailey High School       4976
Johnson High School      4761
Hernandez High School    4635
Rodriguez High School    3999
Figueroa High School     2949
Huang High School        2917
Ford High School         2739
Wilson High School       2283
Cabrera High School      1858
Wright High School       1800
Shelton High School      1761
Thomas High School       1635
Griffin High School      1468
Pena High School          962
Holden High School        427
Name: count, dtype: int64


In [37]:
# Total budget and per-student spending for each school.
# The budget per school is taken as the mean of the "budget" column within each
# "school_name" group of the merged frame; dividing it by the school's student
# count gives the spending per student.
budget_by_school = school_data_complete.groupby("school_name")["budget"]
per_school_budget = budget_by_school.mean()
per_school_capita = per_school_budget.div(per_school_counts)

# Each section: title, underline, then the Series; a blank line separates them
print("Budget per School", "-----------------", per_school_budget, sep="\n")
print()
print("Budget per Capita", "-----------------", per_school_capita, sep="\n")

Budget per School
-----------------
school_name
Bailey High School       3124928.0
Cabrera High School      1081356.0
Figueroa High School     1884411.0
Ford High School         1763916.0
Griffin High School       917500.0
Hernandez High School    3022020.0
Holden High School        248087.0
Huang High School        1910635.0
Johnson High School      3094650.0
Pena High School          585858.0
Rodriguez High School    2547363.0
Shelton High School      1056600.0
Thomas High School       1043130.0
Wilson High School       1319574.0
Wright High School       1049400.0
Name: budget, dtype: float64

Budget per Capita
-----------------
school_name
Bailey High School       628.0
Cabrera High School      582.0
Figueroa High School     639.0
Ford High School         644.0
Griffin High School      625.0
Hernandez High School    652.0
Holden High School       581.0
Huang High School        655.0
Johnson High School      650.0
Pena High School         609.0
Rodriguez High School    637.0
Shelton 

In [38]:
# Average test scores for each school.
# The merged frame is grouped by "school_name" once, and the mean of each score
# column is taken from that grouping.
students_by_school = school_data_complete.groupby("school_name")
per_school_math = students_by_school["math_score"].mean()
per_school_reading = students_by_school["reading_score"].mean()

# Each section: title, underline, then the Series; a blank line separates them
print("Average Math Score per School", "-----------------------------", per_school_math, sep="\n")
print()
print("Average Reading Score per School", "--------------------------------", per_school_reading, sep="\n")

Average Math Score per School
-----------------------------
school_name
Bailey High School       77.048432
Cabrera High School      83.061895
Figueroa High School     76.711767
Ford High School         77.102592
Griffin High School      83.351499
Hernandez High School    77.289752
Holden High School       83.803279
Huang High School        76.629414
Johnson High School      77.072464
Pena High School         83.839917
Rodriguez High School    76.842711
Shelton High School      83.359455
Thomas High School       83.418349
Wilson High School       83.274201
Wright High School       83.682222
Name: math_score, dtype: float64

Average Reading Score per School
--------------------------------
school_name
Bailey High School       81.033963
Cabrera High School      83.975780
Figueroa High School     81.158020
Ford High School         80.746258
Griffin High School      83.816757
Hernandez High School    80.934412
Holden High School       83.814988
Huang High School        81.182722
Johnson Hig

In [44]:
# Number of students per school with a math score of 70 or greater.
# Keep only the rows of the merged frame where "math_score" >= 70, then count
# the rows in each "school_name" group: every remaining row is one student of
# that school who passed math.
math_pass_mask = school_data_complete["math_score"] >= 70
per_school_passing_math = (
    school_data_complete.loc[math_pass_mask]
    .groupby("school_name")["school_name"]
    .count()
)

print(
    "Number of Students Passing Math per School",
    "-----------------------------------------",
    per_school_passing_math,
    sep="\n",
)

Number of Students Passing Math per School
-----------------------------------------
school_name
Bailey High School       3318
Cabrera High School      1749
Figueroa High School     1946
Ford High School         1871
Griffin High School      1371
Hernandez High School    3094
Holden High School        395
Huang High School        1916
Johnson High School      3145
Pena High School          910
Rodriguez High School    2654
Shelton High School      1653
Thomas High School       1525
Wilson High School       2143
Wright High School       1680
Name: school_name, dtype: int64


In [45]:
# Number of students per school with a reading score of 70 or greater.
# Keep only the rows of the merged frame where "reading_score" >= 70, then
# count the rows in each "school_name" group: every remaining row is one
# student of that school who passed reading.
reading_pass_mask = school_data_complete["reading_score"] >= 70
per_school_passing_reading = (
    school_data_complete.loc[reading_pass_mask]
    .groupby("school_name")["school_name"]
    .count()
)

print(
    "Number of Students Passing Reading per School",
    "--------------------------------------------",
    per_school_passing_reading,
    sep="\n",
)

Number of Students Passing Reading per School
--------------------------------------------
school_name
Bailey High School       4077
Cabrera High School      1803
Figueroa High School     2381
Ford High School         2172
Griffin High School      1426
Hernandez High School    3748
Holden High School        411
Huang High School        2372
Johnson High School      3867
Pena High School          923
Rodriguez High School    3208
Shelton High School      1688
Thomas High School       1591
Wilson High School       2204
Wright High School       1739
Name: school_name, dtype: int64


In [49]:
# Number of students per school who passed both math and reading (70 or higher).
# Rows of the merged frame are kept only when both score conditions hold; the
# size of each "school_name" group is then the number of such students.
passed_reading = school_data_complete["reading_score"] >= 70
passed_math = school_data_complete["math_score"] >= 70
students_passing_math_and_reading = school_data_complete.loc[passed_reading & passed_math]
school_students_passing_math_and_reading = (
    students_passing_math_and_reading.groupby("school_name").size()
)

print(
    "Number of Students Passing Math and Reading per School",
    "-----------------------------------------------------",
    school_students_passing_math_and_reading,
    sep="\n",
)


Number of Students Passing Math and Reading per School
-----------------------------------------------------
school_name
Bailey High School       2719
Cabrera High School      1697
Figueroa High School     1569
Ford High School         1487
Griffin High School      1330
Hernandez High School    2481
Holden High School        381
Huang High School        1561
Johnson High School      2549
Pena High School          871
Rodriguez High School    2119
Shelton High School      1583
Thomas High School       1487
Wilson High School       2068
Wright High School       1626
dtype: int64


In [50]:
# Passing rates per school, as percentages.
# Each passing count is divided by the school's total student count (the two
# Series are aligned on the school-name index) and scaled by 100.
per_school_passing_math_percentage = per_school_passing_math.div(per_school_counts).mul(100)
per_school_passing_reading_percentage = per_school_passing_reading.div(per_school_counts).mul(100)
per_school_passing_math_and_reading_percentage = (
    school_students_passing_math_and_reading.div(per_school_counts).mul(100)
)

# Each section: title, underline, then the Series; a blank line separates them
print(
    "Percentage of Students Passing Math per School",
    "---------------------------------------------",
    per_school_passing_math_percentage,
    sep="\n",
)
print()
print(
    "Percentage of Students Passing Reading per School",
    "------------------------------------------------",
    per_school_passing_reading_percentage,
    sep="\n",
)
print()
print(
    "Percentage of Students Passing Math and Reading per School",
    "---------------------------------------------------------",
    per_school_passing_math_and_reading_percentage,
    sep="\n",
)


Percentage of Students Passing Math per School
---------------------------------------------
school_name
Bailey High School       66.680064
Cabrera High School      94.133477
Figueroa High School     65.988471
Ford High School         68.309602
Griffin High School      93.392371
Hernandez High School    66.752967
Holden High School       92.505855
Huang High School        65.683922
Johnson High School      66.057551
Pena High School         94.594595
Rodriguez High School    66.366592
Shelton High School      93.867121
Thomas High School       93.272171
Wilson High School       93.867718
Wright High School       93.333333
dtype: float64

Percentage of Students Passing Reading per School
------------------------------------------------
school_name
Bailey High School       81.933280
Cabrera High School      97.039828
Figueroa High School     80.739234
Ford High School         79.299014
Griffin High School      97.138965
Hernandez High School    80.862999
Holden High School       96.25292

In [52]:
# Create a DataFrame called `per_school_summary_df` with one column per
# calculation above. Every input is a Series indexed by school name, so pandas
# aligns them on that index and each row describes one school.
per_school_summary_df = pd.DataFrame({
    "School Type": school_types,
    "Total Students": per_school_counts,
    "Total School Budget": per_school_budget,
    "Per Student Budget": per_school_capita,
    "Average Math Score": per_school_math,
    "Average Reading Score": per_school_reading,
    "% Passing Math": per_school_passing_math_percentage,
    "% Passing Reading": per_school_passing_reading_percentage,
    "% Overall Passing": per_school_passing_math_and_reading_percentage})

# Formatting: show both budget columns as dollar amounts with a thousands
# separator and two decimals. The formatted columns hold strings, not numbers.
per_school_summary_df["Total School Budget"] = per_school_summary_df["Total School Budget"].map("${:,.2f}".format)
per_school_summary_df["Per Student Budget"] = per_school_summary_df["Per Student Budget"].map("${:,.2f}".format)

# Display the DataFrame as the cell's last expression so the notebook renders
# it as a table (as the district summary cell does) instead of the wrapped
# plain-text dump that print() produces for wide frames.
per_school_summary_df


                      School Type  Total Students Total School Budget  \
school_name                                                             
Bailey High School       District            4976       $3,124,928.00   
Cabrera High School       Charter            1858       $1,081,356.00   
Figueroa High School     District            2949       $1,884,411.00   
Ford High School         District            2739       $1,763,916.00   
Griffin High School       Charter            1468         $917,500.00   
Hernandez High School    District            4635       $3,022,020.00   
Holden High School        Charter             427         $248,087.00   
Huang High School        District            2917       $1,910,635.00   
Johnson High School      District            4761       $3,094,650.00   
Pena High School          Charter             962         $585,858.00   
Rodriguez High School    District            3999       $2,547,363.00   
Shelton High School       Charter            1761  