In [61]:
# Dependencies and Setup
from pathlib import Path

import pandas as pd

# Files to load. Paths are relative to the notebook's working directory so the
# notebook is not tied to one user's home folder; change DATA_DIR here if the
# Resources folder lives somewhere else.
DATA_DIR = Path("Resources")
school_data_to_load = DATA_DIR / "schools_complete.csv"
student_data_to_load = DATA_DIR / "students_complete.csv"

# Read School and Student Data File and store into Pandas DataFrames
school_data = pd.read_csv(school_data_to_load)
student_data = pd.read_csv(student_data_to_load)

# Combine the data into a single dataset: every row of `student_data` is kept
# and the columns of its school are attached to it.
# `validate` makes the merge fail loudly if a school name appears more than
# once in `school_data`, which would otherwise silently duplicate student rows.
school_data_complete = pd.merge(
    student_data,
    school_data,
    how="left",
    on="school_name",
    validate="many_to_one",
)

# Calculate the total number of unique schools
school_count = school_data_complete["school_name"].nunique()

# Calculate the total number of students (one row per student in the merged data)
student_count = len(school_data_complete)

# Calculate the total budget.
# Each school's budget is repeated on every one of its student rows, so keep one
# row per school before summing. De-duplicating on the school name (rather than
# on the budget value) keeps two schools that happen to share the same budget.
total_budget = school_data_complete.drop_duplicates(subset="school_name")["budget"].sum()

# Calculate the average (mean) math score
average_math_score = school_data_complete["math_score"].mean()

# Calculate the average (mean) reading score
average_reading_score = school_data_complete["reading_score"].mean()

# Row-wise pass/fail flags (passing means a score greater than or equal to 70)
passed_math = school_data_complete["math_score"] >= 70
passed_reading = school_data_complete["reading_score"] >= 70

# Calculate the percentage of students who passed math.
# Summing a boolean mask counts rows, the same unit `student_count` is measured in.
passing_math_count = passed_math.sum()
passing_math_percentage = passing_math_count / float(student_count) * 100

# Calculate the percentage of students who passed reading
passing_reading_count = passed_reading.sum()
passing_reading_percentage = passing_reading_count / float(student_count) * 100

# Calculate the percentage of students that passed both math and reading
passing_math_reading_count = (passed_math & passed_reading).sum()
overall_passing_rate = passing_math_reading_count / float(student_count) * 100

# Create a high-level snapshot of the district's key metrics in a DataFrame
district_summary = pd.DataFrame([
    {"District Metric": "Total Number of Unique Schools", "Value": school_count},
    {"District Metric": "Total Students", "Value": student_count},
    {"District Metric": "Total Budget", "Value": total_budget},
    {"District Metric": "Average Math Score", "Value": average_math_score},
    {"District Metric": "Average Reading Score", "Value": average_reading_score},
    {"District Metric": "Percent Passing Math", "Value": passing_math_percentage},
    {"District Metric": "Percent Passing Reading", "Value": passing_reading_percentage},
    {"District Metric": "Percent Overall Passing", "Value": overall_passing_rate}
])

# Formatting: render every value with thousands separators and two decimals,
# then decorate the percentage rows with "%" and the budget row with "$".
district_summary["Value"] = district_summary["Value"].map("{:,.2f}".format)

# Select rows by metric name and write through .loc. Chained indexing such as
# district_summary["Value"][5:8] = ... assigns into a temporary object, which
# triggers SettingWithCopyWarning and has no effect under pandas Copy-on-Write.
metric_names = district_summary["District Metric"]
is_percentage = metric_names.str.startswith("Percent")
is_currency = metric_names == "Total Budget"
district_summary.loc[is_percentage, "Value"] = district_summary.loc[is_percentage, "Value"] + "%"
district_summary.loc[is_currency, "Value"] = "$" + district_summary.loc[is_currency, "Value"]

# Display the DataFrame
district_summary

Unnamed: 0,District Metric,Value
0,Total Number of Unique Schools,15.00
1,Total Students,39170.00
2,Total Budget,"$24,649,428.00"
3,Average Math Score,78.99
4,Average Reading Score,81.88
5,Percent Passing Math,74.98%
6,Percent Passing Reading,85.81%
7,Percent Overall Passing,65.17%


In [86]:
# Use the code provided to select the school type (indexed by school name)
school_types = school_data.set_index(["school_name"])["type"]

# Group the merged student rows by school once and reuse the grouping below
students_by_school = school_data_complete.groupby("school_name")

# Calculate the total student count per school
per_school_counts = students_by_school["student_name"].count()

# Calculate the total school budget and per capita spending.
# The column is selected *before* aggregating: calling .mean() on the whole
# grouped frame also tries to average the text columns, which raises TypeError
# on pandas >= 2.0. The budget is constant within a school, so its mean is the
# school's budget.
per_school_budget = students_by_school["budget"].mean()
per_school_capita = (per_school_budget / per_school_counts).rename("Per Student Budget")

# Calculate the average test scores per school
per_school_math = students_by_school["math_score"].mean()
per_school_reading = students_by_school["reading_score"].mean()

# Students with math scores of 70 or higher
school_passing_math = school_data_complete[(school_data_complete["math_score"] >= 70)]

# Students with reading scores of 70 or higher
school_passing_reading = school_data_complete[(school_data_complete["reading_score"] >= 70)]

# Students who passed both math and reading with scores of 70 or higher
passing_math_and_reading = school_data_complete[
    (school_data_complete["reading_score"] >= 70) & (school_data_complete["math_score"] >= 70)
]


def passing_rate_by_school(passing_students):
    """Return, per school, the percentage of its students found in `passing_students`.

    `passing_students` is a subset of the rows of `school_data_complete`.
    The result is a Series indexed like `per_school_counts`.
    """
    passing_counts = passing_students.groupby("school_name")["student_name"].count()
    # A school with no passing students is absent from the grouped counts;
    # report it as 0 instead of dropping it or producing NaN.
    passing_counts = passing_counts.reindex(per_school_counts.index, fill_value=0)
    return passing_counts / per_school_counts * 100


# Calculate the passing rates.
# NOTE: `overall_passing_rate` was bound to the district-wide scalar in the
# first cell; from here on it holds the per-school Series instead.
per_school_passing_math = passing_rate_by_school(school_passing_math)
per_school_passing_reading = passing_rate_by_school(school_passing_reading)
overall_passing_rate = passing_rate_by_school(passing_math_and_reading)

# Create a DataFrame called `per_school_summary` with columns for the calculations above.
# Every Series is indexed by school name, so they are aligned on that index and
# named explicitly here. `index=school_types.index` keeps the schools in the
# same order as `school_data`.
per_school_summary = pd.DataFrame(
    {
        "School Type": school_types,
        "Total Students": per_school_counts,
        "Total School Budget": per_school_budget,
        "Per Student Budget": per_school_capita,
        "Average Math Score": per_school_math,
        "Average Reading Score": per_school_reading,
        "Percent Passing Math": per_school_passing_math,
        "Percent Passing Reading": per_school_passing_reading,
        "Percent Overall Passing": overall_passing_rate,
    },
    index=school_types.index,
)

# Display the DataFrame
per_school_summary

Unnamed: 0_level_0,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,Percent Passing Math,Percent Passing Reading,Percent Overall Passing
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Huang High School,District,2917,1910635.0,655.0,76.629414,81.182722,65.683922,81.316421,53.513884
Figueroa High School,District,2949,1884411.0,639.0,76.711767,81.15802,65.988471,80.739234,53.204476
Shelton High School,Charter,1761,1056600.0,600.0,83.359455,83.725724,93.867121,95.854628,89.892107
Hernandez High School,District,4635,3022020.0,652.0,77.289752,80.934412,66.752967,80.862999,53.527508
Griffin High School,Charter,1468,917500.0,625.0,83.351499,83.816757,93.392371,97.138965,90.599455
Wilson High School,Charter,2283,1319574.0,578.0,83.274201,83.989488,93.867718,96.539641,90.582567
Cabrera High School,Charter,1858,1081356.0,582.0,83.061895,83.97578,94.133477,97.039828,91.334769
Bailey High School,District,4976,3124928.0,628.0,77.048432,81.033963,66.680064,81.93328,54.642283
Holden High School,Charter,427,248087.0,581.0,83.803279,83.814988,92.505855,96.252927,89.227166
Pena High School,Charter,962,585858.0,609.0,83.839917,84.044699,94.594595,95.945946,90.540541


In [87]:
# Rank the schools from highest to lowest overall passing rate and show the first five
top_schools = per_school_summary.sort_values(
    by="Percent Overall Passing",
    ascending=False,
)
top_schools.head(5)

Unnamed: 0_level_0,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,Percent Passing Math,Percent Passing Reading,Percent Overall Passing
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Cabrera High School,Charter,1858,1081356.0,582.0,83.061895,83.97578,94.133477,97.039828,91.334769
Thomas High School,Charter,1635,1043130.0,638.0,83.418349,83.84893,93.272171,97.308869,90.948012
Griffin High School,Charter,1468,917500.0,625.0,83.351499,83.816757,93.392371,97.138965,90.599455
Wilson High School,Charter,2283,1319574.0,578.0,83.274201,83.989488,93.867718,96.539641,90.582567
Pena High School,Charter,962,585858.0,609.0,83.839917,84.044699,94.594595,95.945946,90.540541


In [88]:
# Rank the schools from lowest to highest overall passing rate and show the first five
bottom_schools = per_school_summary.sort_values(
    by="Percent Overall Passing",
    ascending=True,
)
bottom_schools.head(5)

Unnamed: 0_level_0,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,Percent Passing Math,Percent Passing Reading,Percent Overall Passing
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Rodriguez High School,District,3999,2547363.0,637.0,76.842711,80.744686,66.366592,80.220055,52.988247
Figueroa High School,District,2949,1884411.0,639.0,76.711767,81.15802,65.988471,80.739234,53.204476
Huang High School,District,2917,1910635.0,655.0,76.629414,81.182722,65.683922,81.316421,53.513884
Hernandez High School,District,4635,3022020.0,652.0,77.289752,80.934412,66.752967,80.862999,53.527508
Johnson High School,District,4761,3094650.0,650.0,77.072464,80.966394,66.057551,81.222432,53.539172


In [89]:
# Separate the merged student data into one DataFrame per grade level
grade_levels = ["9th", "10th", "11th", "12th"]
ninth_graders, tenth_graders, eleventh_graders, twelfth_graders = [
    school_data_complete[school_data_complete["grade"] == level]
    for level in grade_levels
]

# Mean math score per school within each grade
(
    ninth_graders_scores,
    tenth_graders_scores,
    eleventh_graders_scores,
    twelfth_graders_scores,
) = [
    grade_frame.groupby("school_name")["math_score"].mean()
    for grade_frame in (ninth_graders, tenth_graders, eleventh_graders, twelfth_graders)
]

# Combine the per-grade Series into a single DataFrame called `math_scores_by_grade`:
# one row per grade, one column per school
math_scores_by_grade = pd.DataFrame(
    [
        ninth_graders_scores,
        tenth_graders_scores,
        eleventh_graders_scores,
        twelfth_graders_scores,
    ],
    index=grade_levels,
)

# Minor data wrangling: label the row index
math_scores_by_grade = math_scores_by_grade.rename_axis(index="grade")

# Display the DataFrame
math_scores_by_grade

school_name,Bailey High School,Cabrera High School,Figueroa High School,Ford High School,Griffin High School,Hernandez High School,Holden High School,Huang High School,Johnson High School,Pena High School,Rodriguez High School,Shelton High School,Thomas High School,Wilson High School,Wright High School
grade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
9th,77.083676,83.094697,76.403037,77.361345,82.04401,77.438495,83.787402,77.027251,77.187857,83.625455,76.859966,83.420755,83.590022,83.085578,83.264706
10th,76.996772,83.154506,76.539974,77.672316,84.229064,77.337408,83.429825,75.908735,76.691117,83.372,76.6125,82.917411,83.087886,83.724422,84.010288
11th,77.515588,82.76556,76.884344,76.918058,83.842105,77.136029,85.0,76.446602,77.491653,84.328125,76.395626,83.383495,83.498795,83.195326,83.836782
12th,76.492218,83.277487,77.151369,76.179963,83.356164,77.186567,82.855422,77.225641,76.863248,84.121547,77.690748,83.778976,83.497041,83.035794,83.644986


In [90]:
# Per-grade student frames built in the previous cell, keyed by grade label
grade_frames = {
    "9th": ninth_graders,
    "10th": tenth_graders,
    "11th": eleventh_graders,
    "12th": twelfth_graders,
}

# Group each grade by "school_name" and take the mean reading score
reading_means = {
    grade: frame.groupby("school_name")["reading_score"].mean()
    for grade, frame in grade_frames.items()
}

# NOTE: these four names held the math means in the previous cell; they are
# rebound to the reading means here.
ninth_graders_scores = reading_means["9th"]
tenth_graders_scores = reading_means["10th"]
eleventh_graders_scores = reading_means["11th"]
twelfth_graders_scores = reading_means["12th"]

# Combine the per-grade Series into a single DataFrame called `reading_scores_by_grade`:
# one row per grade, one column per school
reading_scores_by_grade = pd.DataFrame(
    list(reading_means.values()),
    index=list(reading_means),
)

# Minor data wrangling: label the row index
reading_scores_by_grade = reading_scores_by_grade.rename_axis(index="grade")

# Display the DataFrame
reading_scores_by_grade

school_name,Bailey High School,Cabrera High School,Figueroa High School,Ford High School,Griffin High School,Hernandez High School,Holden High School,Huang High School,Johnson High School,Pena High School,Rodriguez High School,Shelton High School,Thomas High School,Wilson High School,Wright High School
grade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
9th,81.303155,83.676136,81.198598,80.632653,83.369193,80.86686,83.677165,81.290284,81.260714,83.807273,80.993127,84.122642,83.72885,83.939778,83.833333
10th,80.907183,84.253219,81.408912,81.262712,83.706897,80.660147,83.324561,81.512386,80.773431,83.612,80.629808,83.441964,84.254157,84.021452,83.812757
11th,80.945643,83.788382,80.640339,80.403642,84.288089,81.39614,83.815534,81.417476,80.616027,84.335938,80.864811,84.373786,83.585542,83.764608,84.156322
12th,80.912451,84.287958,81.384863,80.662338,84.013699,80.857143,84.698795,80.305983,81.227564,84.59116,80.376426,82.781671,83.831361,84.317673,84.073171


In [91]:
# Bin edges for per-student spending and one label per bin
# (both are passed to `pd.cut` in the next cell)
spending_bins = [0, 585, 630, 645, 680]
labels = [
    "<$585",
    "$585-630",
    "$630-645",
    "$645-680",
]

# Work on a copy of the school summary (it carries the "Per Student Budget"
# column) so `per_school_summary` itself is left unmodified
school_spending_df = per_school_summary.copy(deep=True)

In [92]:
# Assign each school to a spending range with `pd.cut`, based on its per-student budget
per_student_budget = per_school_summary["Per Student Budget"]
school_spending_df["Spending Ranges (Per Student)"] = pd.cut(
    per_student_budget,
    bins=spending_bins,
    labels=labels,
    include_lowest=True,
)

In [93]:
# Calculate averages for the desired columns within each spending range.
# The metric columns are selected *before* aggregating: calling .mean() on the
# whole grouped frame also tries to average the text column "School Type",
# which raises TypeError on pandas >= 2.0. Aggregating once also avoids
# repeating the same groupby five times.
spending_metric_columns = [
    "Average Math Score",
    "Average Reading Score",
    "Percent Passing Math",
    "Percent Passing Reading",
    "Percent Overall Passing",
]
# observed=False keeps every spending range in the result, even one that
# contains no schools, and makes the categorical-groupby behaviour explicit.
spending_averages = (
    school_spending_df
    .groupby("Spending Ranges (Per Student)", observed=False)[spending_metric_columns]
    .mean()
)

spending_math_scores = spending_averages["Average Math Score"]
spending_reading_scores = spending_averages["Average Reading Score"]
spending_passing_math = spending_averages["Percent Passing Math"]
spending_passing_reading = spending_averages["Percent Passing Reading"]
overall_passing_spending = spending_averages["Percent Overall Passing"]

In [94]:
# Assemble into a DataFrame: one row per metric, one column per spending range
spending_metric_rows = [
    spending_math_scores,
    spending_reading_scores,
    spending_passing_math,
    spending_passing_reading,
    overall_passing_spending,
]
spending_summary = pd.DataFrame(data=spending_metric_rows)

# Display results
spending_summary

Spending Ranges (Per Student),<$585,$585-630,$630-645,$645-680
Average Math Score,83.455399,81.899826,78.518855,76.99721
Average Reading Score,83.933814,83.155286,81.624473,81.027843
Percent Passing Math,93.460096,87.133538,73.484209,66.164813
Percent Passing Reading,96.610877,92.718205,84.391793,81.133951
Percent Overall Passing,90.369459,81.418596,62.857656,53.526855


In [95]:
# Bin edges for school size (student count) and one label per bin.
# NOTE: `labels` held the spending-range labels in an earlier cell and is
# rebound here to the school-size labels.
size_bins = [0, 1000, 2000, 5000]
labels = [
    "Small (<1000)",
    "Medium (1000-2000)",
    "Large (2000-5000)",
]

In [96]:
# Work on a copy of the school summary so `per_school_summary` is left unmodified
school_size_df = per_school_summary.copy(deep=True)

# Assign each school to a size category with `pd.cut`, based on the
# "Total Students" column of the `per_school_summary` DataFrame
total_students = per_school_summary["Total Students"]
school_size_df["School Size"] = pd.cut(
    total_students,
    bins=size_bins,
    labels=labels,
    include_lowest=True,
)

In [97]:
# Calculate averages for the desired columns within each school-size category.
# The metric columns are selected *before* aggregating: calling .mean() on the
# whole grouped frame also tries to average the text column "School Type",
# which raises TypeError on pandas >= 2.0. Aggregating once also avoids
# repeating the same groupby five times.
size_metric_columns = [
    "Average Math Score",
    "Average Reading Score",
    "Percent Passing Math",
    "Percent Passing Reading",
    "Percent Overall Passing",
]
# observed=False keeps every size category in the result, even one that
# contains no schools, and makes the categorical-groupby behaviour explicit.
size_averages = (
    school_size_df
    .groupby("School Size", observed=False)[size_metric_columns]
    .mean()
)

size_math_scores = size_averages["Average Math Score"]
size_reading_scores = size_averages["Average Reading Score"]
size_passing_math = size_averages["Percent Passing Math"]
size_passing_reading = size_averages["Percent Passing Reading"]
size_overall_passing = size_averages["Percent Overall Passing"]

In [98]:
# Create a DataFrame called `size_summary` that breaks down school performance
# by school size (small, medium, or large): one row per metric, one column per size
size_metric_rows = [
    size_math_scores,
    size_reading_scores,
    size_passing_math,
    size_passing_reading,
    size_overall_passing,
]
size_summary = pd.DataFrame(data=size_metric_rows)

# Display results
size_summary

School Size,Small (<1000),Medium (1000-2000),Large (2000-5000)
Average Math Score,83.821598,83.374684,77.746417
Average Reading Score,83.929843,83.864438,81.344493
Percent Passing Math,93.550225,93.599695,69.963361
Percent Passing Reading,96.099437,96.79068,82.766634
Percent Overall Passing,89.883853,90.621535,58.286003


In [99]:
# Calculate averages for the desired columns within each school type.
# Grouping `per_school_summary` directly (instead of `school_size_df`) keeps
# this cell independent of the school-size binning. The metric columns are
# selected *before* aggregating, because calling .mean() on the whole grouped
# frame would also try to average non-numeric columns (e.g. the categorical
# "School Size" column of `school_size_df`) and raise TypeError on pandas >= 2.0.
type_metric_columns = [
    "Average Math Score",
    "Average Reading Score",
    "Percent Passing Math",
    "Percent Passing Reading",
    "Percent Overall Passing",
]
type_averages = per_school_summary.groupby("School Type")[type_metric_columns].mean()

type_math_scores = type_averages["Average Math Score"]
type_reading_scores = type_averages["Average Reading Score"]
type_passing_math = type_averages["Percent Passing Math"]
type_passing_reading = type_averages["Percent Passing Reading"]
type_overall_passing = type_averages["Percent Overall Passing"]

In [100]:
# Create a DataFrame called `type_summary` that breaks down school performance
# by school type (District or Charter): one row per metric, one column per type
type_metric_rows = [
    type_math_scores,
    type_reading_scores,
    type_passing_math,
    type_passing_reading,
    type_overall_passing,
]
type_summary = pd.DataFrame(data=type_metric_rows)

# Display results
type_summary

School Type,Charter,District
Average Math Score,83.473852,76.956733
Average Reading Score,83.896421,80.966636
Percent Passing Math,93.62083,66.548453
Percent Passing Reading,96.586489,80.799062
Percent Overall Passing,90.432244,53.672208
