# PyCitySchools Challenge

In [1]:
# Dependencies and Setup
import pandas as pd

# File to Load (Remember to change the path if needed.)
school_data_to_load = "Resources/schools_complete.csv"
student_data_to_load = "Resources/students_complete.csv"

# Read the School Data and Student Data and store into a Pandas DataFrame
school_data_df = pd.read_csv(school_data_to_load)
student_data_df = pd.read_csv(student_data_to_load)

# Cleaning Student Names and Replacing Substrings in a Python String
# Add each prefix and suffix to remove to a list.
prefixes_suffixes = ["Dr. ", "Mr. ","Ms. ", "Mrs. ", "Miss ", " MD", " DDS", " DVM", " PhD"]

# Iterate through the words in the "prefixes_suffixes" list and replace them with an empty space, "".
for word in prefixes_suffixes:
    student_data_df["student_name"] = student_data_df["student_name"].str.replace(word,"")

# Check names.
student_data_df.head(5)



Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score
0,0,Paul Bradley,M,9th,Huang High School,66,79
1,1,Victor Smith,M,12th,Huang High School,94,61
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60
3,3,Richard Scott,M,12th,Huang High School,67,58
4,4,Bonnie Ray,F,9th,Huang High School,97,84


## Deliverable 1: Replace the reading and math scores.

### Replace the 9th grade reading and math scores at Thomas High School with NaN.

In [2]:
# Install numpy using conda install numpy or pip install numpy. 
# Step 1. Import numpy as np.
import numpy as np

In [3]:
# Step 2. Use the loc method on the student_data_df to select all the reading scores from the 9th 
# grade at Thomas High School and replace them with NaN.

# IF row has "Thomas High School" AND "9th" change "reading_score" to NaN
student_data_df.loc[(student_data_df['grade'] == '9th') & (student_data_df['school_name'] == 'Thomas High School'), ['reading_score']] = np.nan

In [4]:
#  Step 3. Refactor the code in Step 2 to replace the math scores with NaN.

student_data_df.loc[(student_data_df['grade'] == '9th') & (student_data_df['school_name'] == 'Thomas High School'), ['math_score']] = np.nan

In [5]:
#  Step 4. Check the student data for NaN's.
# student_data_df.loc[(student_data_df['math_score'] == np.nan) & (student_data_df['reading_score'] == np.nan)]
student_data_df.isnull().sum()

Student ID         0
student_name       0
gender             0
grade              0
school_name        0
reading_score    461
math_score       461
dtype: int64

In [6]:
student_data_df.tail(10)

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score
39160,39160,Katie Weaver,F,11th,Thomas High School,89.0,86.0
39161,39161,April Reyes,F,10th,Thomas High School,70.0,84.0
39162,39162,Derek Weeks,M,12th,Thomas High School,94.0,77.0
39163,39163,John Reese,M,11th,Thomas High School,90.0,75.0
39164,39164,Joseph Anthony,M,9th,Thomas High School,,
39165,39165,Donna Howard,F,12th,Thomas High School,99.0,90.0
39166,39166,Dawn Bell,F,10th,Thomas High School,95.0,70.0
39167,39167,Rebecca Tanner,F,9th,Thomas High School,,
39168,39168,Desiree Kidd,F,10th,Thomas High School,99.0,90.0
39169,39169,Carolyn Jackson,F,11th,Thomas High School,95.0,75.0


## Deliverable 2 : Repeat the school district analysis

### District Summary

In [7]:
# Combine the data into a single dataset
school_data_complete_df = pd.merge(student_data_df, school_data_df, how="left", on=["school_name", "school_name"])
school_data_complete_df.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66.0,79.0,0,District,2917,1910635
1,1,Victor Smith,M,12th,Huang High School,94.0,61.0,0,District,2917,1910635
2,2,Kevin Rodriguez,M,12th,Huang High School,90.0,60.0,0,District,2917,1910635
3,3,Richard Scott,M,12th,Huang High School,67.0,58.0,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97.0,84.0,0,District,2917,1910635


In [8]:
# Calculate the Totals (Schools and Students)
school_count = len(school_data_complete_df["school_name"].unique())
student_count = school_data_complete_df["Student ID"].count()

# Calculate the Total Budget
total_budget = school_data_df["budget"].sum()

In [9]:
# Calculate the Average Scores using the "clean_student_data".
average_reading_score = school_data_complete_df["reading_score"].mean()
average_math_score = school_data_complete_df["math_score"].mean()

In [10]:
# Step 1. Get the number of students that are in ninth grade at Thomas High School.
# These students have no grades. 


# Get the total student count 
student_count = school_data_complete_df["Student ID"].count()


# Step 2. Subtract the number of students that are in ninth grade at 
# Thomas High School from the total student count to get the new total student count.


In [11]:
# Calculate the passing rates using the "clean_student_data".
passing_math_count = school_data_complete_df[(school_data_complete_df["math_score"] >= 70)].count()["student_name"]
passing_reading_count = school_data_complete_df[(school_data_complete_df["reading_score"] >= 70)].count()["student_name"]

In [12]:
# Step 3. Calculate the passing percentages with the new total student count.

In [13]:
# Calculate the students who passed both reading and math.
passing_math_reading = school_data_complete_df[(school_data_complete_df["math_score"] >= 70)
                                               & (school_data_complete_df["reading_score"] >= 70)]

# Calculate the number of students that passed both reading and math.
overall_passing_math_reading_count = passing_math_reading["student_name"].count()


# Step 4.Calculate the overall passing percentage with new total student count.

In [14]:
# Create a DataFrame
district_summary_df = pd.DataFrame(
          [{"Total Schools": school_count, 
          "Total Students": student_count, 
          "Total Budget": total_budget,
          "Average Math Score": average_math_score, 
          "Average Reading Score": average_reading_score,
          "% Passing Math": passing_math_percentage,
         "% Passing Reading": passing_reading_percentage,
        "% Overall Passing": overall_passing_percentage}])



# Format the "Total Students" to have the comma for a thousands separator.
district_summary_df["Total Students"] = district_summary_df["Total Students"].map("{:,}".format)
# Format the "Total Budget" to have the comma for a thousands separator, a decimal separator and a "$".
district_summary_df["Total Budget"] = district_summary_df["Total Budget"].map("${:,.2f}".format)
# Format the columns.
district_summary_df["Average Math Score"] = district_summary_df["Average Math Score"].map("{:.1f}".format)
district_summary_df["Average Reading Score"] = district_summary_df["Average Reading Score"].map("{:.1f}".format)
district_summary_df["% Passing Math"] = district_summary_df["% Passing Math"].map("{:.1f}".format)
district_summary_df["% Passing Reading"] = district_summary_df["% Passing Reading"].map("{:.1f}".format)
district_summary_df["% Overall Passing"] = district_summary_df["% Overall Passing"].map("{:.1f}".format)

# Display the data frame
district_summary_df

NameError: name 'passing_math_percentage' is not defined

##  School Summary

In [None]:
# Determine the School Type
per_school_types = school_data_df.set_index(["school_name"])["type"]

# Calculate the total student count.
per_school_counts = school_data_complete_df["school_name"].value_counts()

# Calculate the total school budget and per capita spending
per_school_budget = school_data_complete_df.groupby(["school_name"]).mean()["budget"]
# Calculate the per capita spending.
per_school_capita = per_school_budget / per_school_counts

# Calculate the average test scores.
per_school_math = school_data_complete_df.groupby(["school_name"]).mean()["math_score"]
per_school_reading = school_data_complete_df.groupby(["school_name"]).mean()["reading_score"]

# Calculate the passing scores by creating a filtered DataFrame.
per_school_passing_math = school_data_complete_df[(school_data_complete_df["math_score"] >= 70)]
per_school_passing_reading = school_data_complete_df[(school_data_complete_df["reading_score"] >= 70)]

# Calculate the number of students passing math and passing reading by school.
per_school_passing_math = per_school_passing_math.groupby(["school_name"]).count()["student_name"]
per_school_passing_reading = per_school_passing_reading.groupby(["school_name"]).count()["student_name"]

# Calculate the percentage of passing math and reading scores per school.
per_school_passing_math = per_school_passing_math / per_school_counts * 100
per_school_passing_reading = per_school_passing_reading / per_school_counts * 100

# Calculate the students who passed both reading and math.
per_passing_math_reading = school_data_complete_df[(school_data_complete_df["reading_score"] >= 70)
                                               & (school_data_complete_df["math_score"] >= 70)]

# Calculate the number of students passing math and passing reading by school.
per_passing_math_reading = per_passing_math_reading.groupby(["school_name"]).count()["student_name"]

# Calculate the percentage of passing math and reading scores per school.
per_overall_passing_percentage = per_passing_math_reading / per_school_counts * 100

In [None]:
# Create the DataFrame
per_school_summary_df = pd.DataFrame({
    "School Type": per_school_types,
    "Total Students": per_school_counts,
    "Total School Budget": per_school_budget,
    "Per Student Budget": per_school_capita,
    "Average Math Score": per_school_math,
    "Average Reading Score": per_school_reading,
    "% Passing Math": per_school_passing_math,
    "% Passing Reading": per_school_passing_reading,
    "% Overall Passing": per_overall_passing_percentage})


# per_school_summary_df.head()

In [None]:
# Format the Total School Budget and the Per Student Budget
per_school_summary_df["Total School Budget"] = per_school_summary_df["Total School Budget"].map("${:,.2f}".format)
per_school_summary_df["Per Student Budget"] = per_school_summary_df["Per Student Budget"].map("${:,.2f}".format)

# Display the data frame
per_school_summary_df

In [None]:
# Step 5.  Get the number of 10th-12th graders from Thomas High School (THS).


In [None]:
# Step 6. Get all the students passing math from THS


In [None]:
# Step 7. Get all the students passing reading from THS


In [None]:
# Step 8. Get all the students passing math and reading from THS


In [None]:
# Step 9. Calculate the percentage of 10th-12th grade students passing math from Thomas High School. 


In [None]:
# Step 10. Calculate the percentage of 10th-12th grade students passing reading from Thomas High School.


In [None]:
# Step 11. Calculate the overall passing percentage of 10th-12th grade from Thomas High School. 


In [None]:
# Step 12. Replace the passing math percent for Thomas High School in the per_school_summary_df.


In [None]:
# Step 13. Replace the passing reading percentage for Thomas High School in the per_school_summary_df.


In [None]:
# Step 14. Replace the overall passing percentage for Thomas High School in the per_school_summary_df.


In [None]:
# per_school_summary_df


## High and Low Performing Schools 

In [None]:
# Sort and show top five schools.


In [None]:
# Sort and show top five schools.


## Math and Reading Scores by Grade

In [None]:
# Create a Series of scores by grade levels using conditionals.


# Group each school Series by the school name for the average math score.


# Group each school Series by the school name for the average reading score.


In [None]:
# Combine each Series for average math scores by school into single data frame.


In [None]:
# Combine each Series for average reading scores by school into single data frame.


In [None]:
# Format each grade column.


In [None]:
# Remove the index.


# Display the data frame


In [None]:
## Remove the index.


# Display the data frame


## Scores by School Spending

In [None]:
# Establish the spending bins and group names.


# Categorize spending based on the bins.


In [None]:
# Calculate averages for the desired columns. 


In [None]:
# Create the DataFrame


In [None]:
# Format the DataFrame 


## Scores by School Size

In [None]:
# Establish the bins.

# Categorize spending based on the bins.


In [None]:
# Calculate averages for the desired columns. 


In [None]:
# Assemble into DataFrame. 


In [None]:
# Format the DataFrame  


## Scores by School Type

In [None]:
# Calculate averages for the desired columns. 


In [None]:
# Assemble into DataFrame. 


In [None]:
# # Format the DataFrame 


# !!!!PyCitySchools ORIGINAL STARTS BELOW!!!!

In [None]:
# Import Dependencies
import pandas as pd

In [None]:
# Declare Variables
# Files to Load
school_data_to_load = "Resources/schools_complete.csv"
student_data_to_load = "Resources/students_complete.csv"

In [None]:
# Import School data into Pandas DataFram using the read_csv() function
school_data_df = pd.read_csv(school_data_to_load)

# View dataframe. Use head() function to view only the first 5 rows.
school_data_df.head()

In [None]:
# View dataframe. Use head() function to view only the first 5 rows.
school_data_df.head()

In [None]:
# View dataframe. Use tail() function to view only the last 5 rows.
school_data_df.tail()

In [None]:
# Import Student data intoPandas DataFram using the read_csv() function
student_data_df = pd.read_csv(student_data_to_load)
student_data_df.head()

In [None]:
# 4.5.1 - Find Missing Values
# Use count() method to inspect the student data.

# Determine if there are any missing values in the school data
school_data_df.count()

In [None]:
# Determine if there are any missing values in the student data
student_data_df.count()

In [None]:
# Use isnull() method to find NULL values
school_data_df.isnull()

In [None]:
student_data_df.isnull()

In [None]:
# Use sum method to get the number of empty rows, or rows that are "True"
student_data_df.isnull().sum()

In [None]:
# Alternatively, use the notnull() method. Returns the opposite of the isnull() method.
school_data_df.notnull()

In [None]:
student_data_df.notnull().sum()

In [None]:
# Determine Data Types
school_data_df.dtypes

In [None]:
student_data_df.dtypes

Next step, remove the prefixes and suffixes so student names match their records.

In [None]:
# Add each prefix and suffix to be removed to a list.
prefixes_suffixes = ["Dr. ", "Mr. ","Ms. ", "Mrs. ", "Miss ", " MD", " DDS", " DVM", " PhD"]

In [None]:
# Iterate through the words in the prefixes_suffixes variable and replace them with an empty space. ""
for word in prefixes_suffixes:
    student_data_df['student_name'] = student_data_df['student_name'].str.replace(word, "")

In [None]:
student_data_df.head()

In [None]:
# Load the clean student data from CSV
clean_student_data_to_load = "Resources/clean_students_complete.csv"

# Import Student data intoPandas DataFram using the read_csv() function
clean_student_data_df = pd.read_csv(clean_student_data_to_load)
clean_student_data_df.head()

In [None]:
# Combine the two data frames into one merged dataframe.
school_data_complete_df = pd.merge(clean_student_data_df, school_data_df, on=["school_name", "school_name"])
school_data_complete_df.head()

In [None]:
# Get the total number of students
student_count = school_data_complete_df["Student ID"].count()
student_count

In [None]:
# Get the total number of schools
school_count = len(school_data_complete_df["school_name"].unique())
school_count

In [None]:
# Calculate the total budget
total_budget = school_data_df["budget"].sum()
total_budget

In [None]:
# Find the average reading score
average_reading_score = school_data_complete_df["reading_score"].mean()
average_reading_score

In [None]:
# Find the average math score
average_math_score = school_data_complete_df["math_score"].mean()
average_math_score

In [None]:
passing_math = school_data_complete_df["math_score"] >= 70
passing_reading = school_data_complete_df["reading_score"] >= 70

passing_math.head()

In [None]:
# Get all the students who are passing math in a new DataFrame
passing_math = school_data_complete_df[school_data_complete_df["math_score"] >= 70]
passing_math.head()

In [None]:
# Get all the students who are passing reading in a new DataFrame
passing_reading = school_data_complete_df[school_data_complete_df["reading_score"] >= 70]
passing_reading.head()

In [None]:
# Calculate the number of students passing math
passing_math_count = passing_math["student_name"].count()

# Calculate the number of students passing reading
passing_reading_count = passing_reading["student_name"].count()

print(passing_math_count)
print(passing_reading_count)

In [None]:
# Calculate the percentage of students passing math
passing_math_percentage = (passing_math_count / float(student_count)) * 100 

# Calculate teh percentage of students passing reading
passing_reading_percentage = (passing_reading_count / float(student_count)) * 100

print(passing_math_percentage)
print(passing_reading_percentage)

In [None]:
# Combine students passing both math and reading into a new DataFrame

passing_math_reading = school_data_complete_df[(school_data_complete_df["math_score"] >= 70) & (school_data_complete_df["reading_score"] >= 70)]
passing_math_reading.head()

In [None]:
# Calculate the number of students passing both math & reading
overall_passing_math_reading_count = passing_math_reading["Student ID"].count()
overall_passing_math_reading_count

In [None]:
# Calculate the percentage of students passing both math & reading
overall_passing_percentage = overall_passing_math_reading_count / float(student_count) * 100
overall_passing_percentage

In [None]:
# Create a District Summry DataFrame

# Adding a list of values with keys to create a new DataFrame
district_summary_df = pd.DataFrame(
    [
        {
            "Total Schools": school_count,
            "Total Students": int(student_count),
            "Total Budget": float(total_budget),
            "Average Math Score": float(average_math_score),
            "Average Reading Score": float(average_reading_score),
            "% Passing Math": float(passing_math_percentage),
            "% Passing Reading": float(passing_reading_percentage),
            "% Overall Passing": float(overall_passing_percentage)
        }
    ]
)
district_summary_df

In [None]:
district_summary_df["Total Students"] = district_summary_df["Total Students"].map("{:,}".format)

district_summary_df["Total Students"]


In [None]:
# Check if the change was made to the DF.
district_summary_df

In [None]:
# Format "Total Budget" to have the comma for a thousands separator, a decimal separator, and a '$.'

district_summary_df["Total Budget"] = district_summary_df["Total Budget"].map("${:,.2f}".format)
district_summary_df

In [None]:
# Format the remaining columns.
district_summary_df["Average Math Score"] = district_summary_df["Average Math Score"].map("{:.1f}".format)
district_summary_df["Average Reading Score"] = district_summary_df["Average Reading Score"].map("{:.1f}".format)
district_summary_df["% Passing Math"] = district_summary_df["% Passing Math"].map("{:.0f}%".format)
district_summary_df["% Passing Reading"] = district_summary_df["% Passing Reading"].map("{:.0f}%".format)
district_summary_df["% Overall Passing"] = district_summary_df["% Overall Passing"].map("{:.0f}%".format)

district_summary_df

## 4.8.1 - Set the Index to the School Name

In [None]:
# Determine the school type
per_school_types = school_data_df.set_index(["school_name"])["type"]
per_school_types.head()

In [None]:
df = pd.DataFrame(per_school_types)
df.head()

In [None]:
# Rename "Type" column to "School Type"
df = df.rename(columns = {'type':'School Type'})
df.head()

In [None]:
# Calculate the total student count per school
per_school_counts = school_data_df["size"]
per_school_counts.head()

In [None]:
# No school_name column to use as the index.

# Calculate the total student count
per_school_counts = school_data_df.set_index(["school_name"])["size"]
per_school_counts.head()

In [None]:
# Calculate the total student count using school_data_complete_df DataFrame
per_school_counts = school_data_complete_df["school_name"].value_counts()
per_school_counts.head()

Ether method above will work.

In [None]:
# Calculate the total school budget
per_school_budget = school_data_df.set_index(["school_name"])["budget"]
per_school_budget.head()

In [None]:
# Calculate the per capita spending.
per_school_capita = per_school_budget / per_school_counts
per_school_capita.head()

In [None]:
# Calculate math scores
student_school_math = student_data_df.set_index(["school_name"])["math_score"]
student_school_math.head()

In [None]:
# This will not work. Need to use the Pandas "groupby" function.

# Calculate the average math scores per school
per_school_averages = school_data_complete_df.groupby(["school_name"]).mean()
per_school_averages.head()

In [None]:
# We only want the average of the Math & Reading Scores, so add each column to the previous code to get 
# each column individually.

per_school_math = school_data_complete_df.groupby(["school_name"]).mean()["math_score"]
per_school_math.head()

In [None]:
per_school_reading = school_data_complete_df.groupby(["school_name"]).mean()["reading_score"]
per_school_reading.head()

### 4.8.5: Get Passing percentages per school

Pseudocode steps:
1. Determine what is the passing grade.
  * We have this from previous calculation (70)
2. Get the number of students who passed math and reading.
  * We have this formula from calculating the numbers for the entire school district. Can use to calculate per school counts.
3. Get the students who passed math and passed reading

In [None]:
# Calculate the passing scores by creating a filtered DataFrame
per_school_passing_math = school_data_complete_df[(school_data_complete_df["math_score"] >= 70)]
per_school_passing_reading = school_data_complete_df[(school_data_complete_df["reading_score"] >= 70)]

In [None]:
per_school_passing_math.head()

In [None]:
per_school_passing_reading.head()

In [None]:
# Use the groupby() function to calculate the number of students passing math
per_school_passing_math = per_school_passing_math.groupby(["school_name"]).count()["student_name"]
per_school_passing_math.head()

In [None]:
per_school_passing_reading = per_school_passing_reading.groupby(["school_name"]).count()["student_name"]
per_school_passing_reading.head()

In [None]:
# Calculate the percentage of passing math and reading scores per school
per_school_passing_math = per_school_passing_math / per_school_counts * 100
per_school_passing_reading = per_school_passing_reading / per_school_counts * 100

per_school_passing_math.head()

In [None]:
per_school_passing_reading.head()

In [None]:
# Calculate the students who based both math and reading
per_passing_math_reading = school_data_complete_df[(school_data_complete_df["math_score"] >= 70) & (school_data_complete_df["reading_score"] >= 70)]
per_passing_math_reading.head()

In [None]:
# Calculate the number of students who passed both math and reading
per_passing_math_reading = per_passing_math_reading.groupby(["school_name"]).count()["student_name"]
per_passing_math_reading.head()

In [None]:
per_overall_passing_percentage = per_passing_math_reading / per_school_counts * 100
per_overall_passing_percentage.head()

In [None]:
# Create the School Summary DataFrame

#Adding a list of values with keys to create a new DataFrame
per_school_summary_df = pd.DataFrame(
        {
            "School Type": per_school_types,
            "Total Students": per_school_counts,
            "Total School Budget": per_school_budget,
            "Per Student Budget": per_school_capita,
            "Average Math Score": per_school_math,
            "Average Reading Score": per_school_reading,
            "% Passing Math": per_school_passing_math,
            "% Passing Reading": per_school_passing_reading,
            "% Overall Passing": per_overall_passing_percentage
        }
)
per_school_summary_df.dtypes

### 4.8.7 - Clean Up the DataFrame
#### Formatting

In [None]:
# Format the "Total School Budget" & "Per Student Budget" columns w/ '$' sign, 2 decimal places & thousands separator
# Format "Total Budget" to have the comma for a thousands separator, a decimal separator, and a '$.'

per_school_summary_df["Total School Budget"] = per_school_summary_df["Total School Budget"].map("${:,.2f}".format)
per_school_summary_df["Per Student Budget"] = per_school_summary_df["Per Student Budget"].map("${:,.2f}".format)
per_school_summary_df["Average Math Score"] = per_school_summary_df["Average Math Score"].map("{:.2f}".format)
per_school_summary_df["Average Reading Score"] = per_school_summary_df["Average Reading Score"].map("{:.2f}".format)
per_school_summary_df["% Passing Math"] = per_school_summary_df["% Passing Math"].map("{:.2f}".format)
per_school_summary_df["% Passing Reading"] = per_school_summary_df["% Passing Reading"].map("{:.2f}".format)
per_school_summary_df["% Overall Passing"] = per_school_summary_df["% Overall Passing"].map("{:.2f}".format)

per_school_summary_df.head()


# district_summary_df["% Passing Math"] = district_summary_df["% Passing Math"].map("{:.0f}%".format)
# district_summary_df["% Passing Reading"] = district_summary_df["% Passing Reading"].map("{:.0f}%".format)
# district_summary_df["% Overall Passing"] = district_summary_df["% Overall Passing"].map("{:.0f}%".format)


In [None]:
# Sort and show the top five schools
top_schools = per_school_summary_df.sort_values(["% Overall Passing"], ascending=False)
top_schools.head(5)

In [None]:
# Sort and show the lowest performing schools 
bottom_schools = per_school_summary_df.sort_values(["% Overall Passing"], ascending=True)
bottom_schools.head(5)

## 4.10.1 - Create Grade-Level DataFrames

In [None]:
# Create Grade Level DataFrames
ninth_graders = school_data_complete_df[(school_data_complete_df["grade"] == "9th")]
tenth_graders = school_data_complete_df[(school_data_complete_df["grade"] == "10th")]
eleventh_graders = school_data_complete_df[(school_data_complete_df["grade"] == "11th")]
twelfth_graders = school_data_complete_df[(school_data_complete_df["grade"] == "12th")]

ninth_graders.head(5)

In [None]:
# Group each grade level DataFrame by the school name for the average math score.
ninth_grade_math_scores = ninth_graders.groupby(["school_name"]).mean()["math_score"]
tenth_grade_math_scores = tenth_graders.groupby(["school_name"]).mean()["math_score"]
eleventh_grade_math_scores = eleventh_graders.groupby(["school_name"]).mean()["math_score"]
twelfth_grade_math_scores = twelfth_graders.groupby(["school_name"]).mean()["math_score"]

ninth_grade_math_scores.head()


In [None]:
# Group each grade level DataFrame by the school name for the average reading score.
ninth_grade_reading_scores = ninth_graders.groupby(["school_name"]).mean()["reading_score"]
tenth_grade_reading_scores = tenth_graders.groupby(["school_name"]).mean()["reading_score"]
eleventh_grade_reading_scores = eleventh_graders.groupby(["school_name"]).mean()["reading_score"]
twelfth_grade_reading_scores = twelfth_graders.groupby(["school_name"]).mean()["reading_score"]

twelfth_grade_reading_scores.head()

## 4.10.3 - Combine each grade level Series into a DataFrame

In [None]:
# Combine each grade level Series for avg math scores by school into a single DataFrame
math_scores_by_grade = pd.DataFrame({
        
    "9th": ninth_grade_math_scores,
    "10th": tenth_grade_math_scores,
    "11th": eleventh_grade_math_scores,
    "12th": twelfth_grade_math_scores })

math_scores_by_grade.head()

In [None]:
# Combine each grade level Series for avg reading scores by school into a single DataFrame
reading_scores_by_grade = pd.DataFrame({
        
    "9th": ninth_grade_reading_scores,
    "10th": tenth_grade_reading_scores,
    "11th": eleventh_grade_reading_scores,
    "12th": twelfth_grade_reading_scores })

reading_scores_by_grade.head()

In [None]:
# Format the Math & Reading scores by grade DataFrames
math_scores_by_grade["9th"] = math_scores_by_grade["9th"].map("{:.1f}".format)
math_scores_by_grade["10th"] = math_scores_by_grade["10th"].map("{:.1f}".format)
math_scores_by_grade["11th"] = math_scores_by_grade["11th"].map("{:.1f}".format)
math_scores_by_grade["12th"] = math_scores_by_grade["12th"].map("{:.1f}".format)

reading_scores_by_grade["9th"] = reading_scores_by_grade["9th"].map("{:.1f}".format)
reading_scores_by_grade["10th"] = reading_scores_by_grade["10th"].map("{:.1f}".format)
reading_scores_by_grade["11th"] = reading_scores_by_grade["11th"].map("{:.1f}".format)
reading_scores_by_grade["12th"] = reading_scores_by_grade["12th"].map("{:.1f}".format)

# Make sure the grade columns are in the correct order
math_scores_by_grade = math_scores_by_grade[["9th", "10th", "11th", "12th"]]
reading_scores_by_grade = reading_scores_by_grade[["9th", "10th", "11th", "12th"]]

# Remove the index name
math_scores_by_grade.index.name = None

#Display the Math DataFrame
math_scores_by_grade.head()

In [None]:
reading_scores_by_grade.head()

## 4.11 - Establish the Spending Ranges per Student

In [None]:
# Get the descriptive statistics for the per_school_capita Series.
per_school_capita.describe()

In [None]:
# Ranges determined by looking the per capita spending & descriptive statistics.
# In this case, there are 4 schools that spend $585 or less so that is the lowest bucket.
# Then round up the standard deviation to 30, then add 30 to 585 to get the next three buckets.

# Cut the per_school_capita into the spending ranges.
spending_bins = [0, 585, 630, 645, 675]
group_names = ["<$586", "$586-630", "$631-645", "$646-675"]
per_school_capita.groupby(pd.cut(per_school_capita, spending_bins)).count()

In [None]:
# Categorize spending based on the bins.
per_school_summary_df["Spending Ranges (Per Student)"] = pd.cut(per_school_capita, spending_bins, labels=group_names)
per_school_summary_df.head()

In [None]:
per_school_summary_df.dtypes

In [None]:
# Convert Average Math Score, Average Reading Score, % Passing Math, % Passing Reading, and % Overall Passing 
# columns to float types
per_school_summary_df["Average Math Score"] = per_school_summary_df["Average Math Score"].astype('float')
per_school_summary_df["Average Reading Score"] = per_school_summary_df["Average Reading Score"].astype('float')
per_school_summary_df["% Passing Math"] = per_school_summary_df["% Passing Math"].astype('float')
per_school_summary_df["% Passing Reading"] = per_school_summary_df["% Passing Reading"].astype('float')
per_school_summary_df["% Overall Passing"] = per_school_summary_df["% Overall Passing"].astype('float')
per_school_summary_df.dtypes

In [None]:
# Calculate averages for the desired columns.

spending_math_scores = per_school_summary_df.groupby(["Spending Ranges (Per Student)"]).mean()["Average Math Score"]
spending_reading_scores = per_school_summary_df.groupby(["Spending Ranges (Per Student)"]).mean()["Average Reading Score"]
spending_passing_math = per_school_summary_df.groupby(["Spending Ranges (Per Student)"]).mean()["% Passing Math"]
spending_passing_reading = per_school_summary_df.groupby(["Spending Ranges (Per Student)"]).mean()["% Passing Reading"]
overall_passing_spending = per_school_summary_df.groupby(["Spending Ranges (Per Student)"]).mean()["% Overall Passing"]

In [None]:
spending_math_scores

In [None]:
spending_reading_scores

In [None]:
spending_passing_math

In [None]:
spending_passing_reading

In [None]:
overall_passing_spending

In [None]:
# Assemble into DataFrame
spending_summary_df = pd.DataFrame({
    "Average Math Score": spending_math_scores,
    "Average Reading Score": spending_reading_scores,
    "% Passing Math": spending_passing_math,
    "% Passing Reading": spending_passing_reading,
    "% Overall Passing": overall_passing_spending
})
spending_summary_df

In [None]:
spending_summary_df.dtypes

In [None]:
# Format the spending_summary_df DataFrame

# Format scores to one decimal
spending_summary_df["Average Math Score"] = spending_summary_df["Average Math Score"].map("{:.1f}".format)
spending_summary_df["Average Reading Score"] = spending_summary_df["Average Reading Score"].map("{:.1f}".format)

# Format nearest whole number
spending_summary_df["% Passing Math"] = spending_summary_df["% Passing Math"].map("{:.0f}".format)
spending_summary_df["% Passing Reading"] = spending_summary_df["% Passing Reading"].map("{:.0f}".format)
spending_summary_df["% Overall Passing"] = spending_summary_df["% Overall Passing"].map("{:.0f}".format)

spending_summary_df

## 4.12 - Create Bins for School Size

In the last section we grouped the students scores by spending per student. Now, we will group the 
scores by the size of the school.

In [None]:
# Step 1: Determine the size of the bins (Small, Medium, & Large)
per_school_counts

In [None]:
# "Small" < 1,000 students
# "Medium" > 1,000 && < 1,999
# "Large" > 2,000 && < 5,000

# Create the bins
size_bins = [0, 999, 1999, 5000]
group_names = ["Small (<1,000)", "Medium (1,000-1,999)", "Large (2,000-5,000)"]

In [None]:
# Categorize spending based on the bins
per_school_summary_df["School Size"] = pd.cut(per_school_summary_df['Total Students'], size_bins, labels=group_names)


per_school_summary_df.head(2)

In [None]:
# Create new DataFrame that provides the following based on the school sizes: average math & reading scores, 
# average of students who passed may & reading. & the overall percentage.

# Create four new series

# Average Math Score
size_math_scores = per_school_summary_df.groupby(["School Size"]).mean()["Average Math Score"]

# Average Reading Score
size_reading_scores = per_school_summary_df.groupby(["School Size"]).mean()["Average Reading Score"]

# Average % Passing Math
size_passing_math = per_school_summary_df.groupby(["School Size"]).mean()["% Passing Math"]

# Average % Passing Reading
size_passing_reading = per_school_summary_df.groupby(["School Size"]).mean()["% Passing Reading"]

# Average Overall % Passing
size_overall_passing = per_school_summary_df.groupby(["School Size"]).mean()["% Overall Passing"]

In [None]:
# Add the new Series to the new DataFrame
size_summary_df = pd.DataFrame ({
    "Average Math Score": size_math_scores,
    "Average Reading Score": size_reading_scores,
    "% Passing Math": size_passing_math,
    "% Passing Reading": size_passing_reading,
    "% Overall Passing": size_overall_passing
})

size_summary_df

In [None]:
size_summary_df.dtypes

In [None]:
# Format scores to one decimal
size_summary_df["Average Math Score"] = size_summary_df["Average Math Score"].map("{:.1f}".format)
size_summary_df["Average Reading Score"] = size_summary_df["Average Reading Score"].map("{:.1f}".format)

# Format nearest whole number
size_summary_df["% Passing Math"] = size_summary_df["% Passing Math"].map("{:.0f}".format)
size_summary_df["% Passing Reading"] = size_summary_df["% Passing Reading"].map("{:.0f}".format)
size_summary_df["% Overall Passing"] = size_summary_df["% Overall Passing"].map("{:.0f}".format)

size_summary_df

## 4.13 - Group By School Type

In [None]:
# Group the averages and percentages by school type.
# Step 1: Create the Series

# Average Math Score
type_math_scores = per_school_summary_df.groupby(["School Type"]).mean()["Average Math Score"]

# Average Reading Score
type_reading_scores = per_school_summary_df.groupby(["School Type"]).mean()["Average Reading Score"]

# Percent Passing Math
type_passing_math = per_school_summary_df.groupby(["School Type"]).mean()["% Passing Math"]

# Percent Passing Reading
type_passing_reading = per_school_summary_df.groupby(["School Type"]).mean()["% Passing Reading"]

# Percent Overall Passing
type_overall_passing = per_school_summary_df.groupby(["School Type"]).mean()["% Overall Passing"]

# Step 2: Create the new DataFrame
type_summary_df = pd.DataFrame({
    "Average Math Score": type_math_scores,
    "Average Reading Score": type_reading_scores,
    "% Passing Math": type_passing_math,
    "% Passing Reading": type_passing_reading,
    "% Overall Passing": type_overall_passing
})
type_summary_df

In [None]:
# Format scores to one decimal
type_summary_df["Average Math Score"] = type_summary_df["Average Math Score"].map("{:.1f}".format)
type_summary_df["Average Reading Score"] = type_summary_df["Average Reading Score"].map("{:.1f}".format)

# Format nearest whole number
type_summary_df["% Passing Math"] = type_summary_df["% Passing Math"].map("{:.0f}".format)
type_summary_df["% Passing Reading"] = type_summary_df["% Passing Reading"].map("{:.0f}".format)
type_summary_df["% Overall Passing"] = type_summary_df["% Overall Passing"].map("{:.0f}".format)

type_summary_df