# Unit 4 Homework: Pandas, Pandas, Pandas

Using Pandas and Jupyter Notebook, create a report that includes the following data. Your report must include a written description of at least two observable trends based on the data.

In [1]:
# Dependencies and set up
import os
import csv
import pandas as pd

# Load files
school_data_path= "./Resources/schools_complete.csv"
student_data_path = "./Resources/students_complete.csv"

# Read School and Student Data File and store into Pandas DataFrames
school_data = pd.read_csv(school_data_path)
student_data = pd.read_csv(student_data_path)

# Combine the data into a single dataset
school_data_df = pd.merge(student_data, school_data, how="left", on=["school_name", "school_name"])

# Check dataframe
school_data_df.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635


In [22]:
school_data_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39170 entries, 0 to 39169
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Student ID     39170 non-null  int64 
 1   student_name   39170 non-null  object
 2   gender         39170 non-null  object
 3   grade          39170 non-null  object
 4   school_name    39170 non-null  object
 5   reading_score  39170 non-null  int64 
 6   math_score     39170 non-null  int64 
 7   School ID      39170 non-null  int64 
 8   type           39170 non-null  object
 9   size           39170 non-null  int64 
 10  budget         39170 non-null  int64 
dtypes: int64(6), object(5)
memory usage: 3.6+ MB


# District Summary
Create a high-level snapshot, in a DataFrame, of the district's key metrics, including the following:
* Total schools
* Total students
* Total budget
* Average math score
* Average reading score
* % passing math (the percentage of students who passed math)
* % passing reading (the percentage of students who passed reading)
* % overall passing (the percentage of students who passed math AND reading)

In [41]:
# number of schools
total_schools = school_data_df["school_name"].nunique()

# number of students
total_students = school_data_df["student_name"].count()

# total budget
# sum(school_data["budget"])
total_budget = sum(school_data_df["budget"].unique())

# average math score
avg_math = school_data_df["math_score"].mean()

# average reading score
avg_read = school_data_df["reading_score"].mean()

# percentage of passing math score 70%
pass_math = school_data_df.query('math_score >=70')
total_pass_math = pass_math["math_score"].count()

perc_math = (total_pass_math / total_students) * 100

# percentage of passing reading score 70%
pass_read = school_data_df.query('reading_score >=70')
total_pass_read = pass_read["reading_score"].count()

perc_read = (total_pass_read / total_students) * 100

# percentage of overall passing
both_pass_count = school_data_df[
    (school_data_df["math_score"] >= 70) &
    (school_data_df["reading_score"] >=70)
].count()["student_name"]
perc_all = (both_pass_count / total_students) * 100

# Create summary dataframe
district_summary_df = pd.DataFrame({
    "Total Schools": total_schools,
    "Total Students": total_students,
    "Total Budget": total_budget,
    "Average Math Score": avg_math,
    "Average Reading Score": avg_read,
    "% Passing Math": [perc_math],
    "% Passing Reading": perc_read,
    "% Overall Passing": perc_all
})

# apply clean formatting
pd.options.display.float_format = "{:,.2f}".format
district_summary_df["Total Budget"] = district_summary_df[
    "Total Budget"].map("${:,.2f}".format)

district_summary_df

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,15,39170,"$24,649,428.00",78.99,81.88,74.98,85.81,65.17


# School Summary
Create an overview table that summarizes key metrics about each school, including:
* School Name
* School Type
* Total Students
* Total School Budget
* Per Student Budget
* Average Math Score
* Average Reading Score
* % Passing Math
* % Passing Reading
* % Overall Passing (The percentage of students that passed math and reading.)

Create a dataframe to hold the above results

In [43]:
# List school names and set as index
schools = school_data_df["school_name"].unique()
schools

array(['Huang High School', 'Figueroa High School', 'Shelton High School',
       'Hernandez High School', 'Griffin High School',
       'Wilson High School', 'Cabrera High School', 'Bailey High School',
       'Holden High School', 'Pena High School', 'Wright High School',
       'Rodriguez High School', 'Johnson High School', 'Ford High School',
       'Thomas High School'], dtype=object)

In [44]:
# List school districts
schooldistrict = school_data_df["type"].unique()
schooldistrict

array(['District', 'Charter'], dtype=object)

In [46]:
school_data_gdf = school_data_df.groupby(["school_name"])


school_summary = {"School Type":
#     school_data.set_index("school_name")['type'],
#     # Complicated way:
#     # school_data_complete.sort_values(by=["school_name"]).groupby("school_name")["type"].head(1).tolist(),
    
#     "Total Students":
#     grouped_school_data_complete['school_name'].count(),
    
#     "Total School Budget":
#     grouped_school_data_complete['budget'].mean(),
    
#     "Per Student Budget":
#     grouped_school_data_complete['budget'].mean() / 
#     grouped_school_data_complete['school_name'].count(),
    
#     "Average Math Score":
#     round(grouped_school_data_complete['math_score'].mean(), 6),
    
#     "Average Reading Score":
#     round(grouped_school_data_complete['reading_score'].mean(), 6),
    
#     "% Passing Math":
#     round(100 * (school_data_complete.loc[school_data_complete['math_score'] >= 70, :].groupby('school_name')['student_name'].count() 
#                  / grouped_school_data_complete['school_name'].count()), 6),
    
#     "% Passing Reading":
#     round(100 * (school_data_complete.loc[school_data_complete['reading_score'] >= 70, :].groupby('school_name')['student_name'].count() 
#                  / grouped_school_data_complete['school_name'].count()), 6),
    
#     "% Overall Passing":
#     round(100 * (school_data_complete.loc[(school_data_complete['reading_score'] >= 70) & 
#                                           (school_data_complete['math_score'] >= 70), :].groupby('school_name')['student_name'].count() 
#                  / grouped_school_data_complete['school_name'].count()), 6)

# }

    
# school_summary_df = pd.DataFrame(school_summary)
# school_summary_df2 = school_summary_df.copy()
# school_summary_df['Total School Budget'] = school_summary_df['Total School Budget'].map('${:,.2f}'.format)
# school_summary_df['Per Student Budget'] = school_summary_df['Per Student Budget'].map('${:,.2f}'.format)
# school_summary_df.head(15)

# Create summary dataframe
school_summary_df = pd.DataFrame({
    "School Name": schoolnames
    "School Type": schooltype,
    "Total Students": totalstudent,
    "Total School Budget": totalbudget,
    "Per Student Budget": perstudentbudget,
    "Average Math Score": avgmath,
    "Average Reading Score": avgreading,
    "% Passing Math": mathpassperc,
    "% Passing Reading": readingpassperc,
    "% Overall Passing": overallpassperc
})
# pd.options.display.float_format = "{:,.2f}".format

school_summary_df

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x122b2d1f0>

# Highest-Performing Schools (By % Overall Passing)
Create a DataFrame that highlights the top 5 performing schools based on % Overall Passing. Include the following metrics:
* School name
* School type
* Total students
* Total school budget
* Per student budget
* Average math score
* Average reading score
* % passing math (the percentage of students who passed math)
* % passing reading (the percentage of students who passed reading)
* % overall passing (the percentage of students who passed math AND reading)

In [None]:
# School

In [None]:
# Create summary dataframe
highest_summary_df = pd.DataFrame({
    "School Type": [],
    "Total Students": [],
    "Total School Budget": [],
    "Per Student Budget": [],
    "Average Math Score": [],
    "Average Reading Score": [],
    "% Passing Math": [],
    "% Passing Reading": [],
    "% Overall Passing": []
})
pd.options.display.float_format = "{:,.2f}".format

highest 


# Lowest-Performing Schools (By % Overall Passing)
Create a DataFrame that highlights the top 5 performing schools based on % Overall Passing. Include the following metrics:
* School name
* School type
* Total students
* Total school budget
* Per student budget
* Average math score
* Average reading score
* % passing math (the percentage of students who passed math)
* % passing reading (the percentage of students who passed reading)
* % overall passing (the percentage of students who passed math AND reading)

In [None]:
# School

In [None]:
# Create summary dataframe
lowest_summary_df = pd.DataFrame({
    "School Type": [],
    "Total Students": [],
    "Total School Budget": [],
    "Per Student Budget": [],
    "Average Math Score": [],
    "Average Reading Score": [],
    "% Passing Math": [],
    "% Passing Reading": [],
    "% Overall Passing": []
})
pd.options.display.float_format = "{:,.2f}".format

lowest_summary_df 


# Math Scores by Grade
Create a table that lists the average Reading Score for students of each grade level (9th, 10th, 11th, 12th) at each school.
* Create a pandas series for each grade. Hint: use a conditional statement.
* Group each series by school
* Combine the series into a dataframe
* Optional: give the displayed data cleaner formatting

In [None]:
# School

In [None]:
# Create summary dataframe
math_summary_df = pd.DataFrame({
    "9th": [],
    "10th": [],
    "11th": [],
    "12th": [],
})
pd.options.display.float_format = "{:,.2f}".format

math_summary_df 


# Reading Score by Grade
Create a table that lists the average Reading Score for students of each grade level (9th, 10th, 11th, 12th) at each school.
* Create a pandas series for each grade. Hint: use a conditional statement.
* Group each series by school
* Combine the series into a dataframe
* Optional: give the displayed data cleaner formatting

In [None]:
# School

In [None]:
# Create summary dataframe
reading_summary_df = pd.DataFrame({
    "9th": [],
    "10th": [],
    "11th": [],
    "12th": [],
})
pd.options.display.float_format = "{:,.2f}".format

reading_summary_df 


# Scores by School Spending
Create a table that breaks down school performance based on average spending ranges (per student). Use your judgment to create four bins with reasonable cutoff values to group school spending. Include the following metrics in the table:
* Average math score
* Average reading score
* % passing math (the percentage of students who passed math)
* % passing reading (the percentage of students who passed reading)
* % overall passing (the percentage of students who passed math AND reading)

In [None]:
# Spending Ranges (Per Student) $:
# <585
# 585-630
# 630-645
# 645-680

In [None]:
# Create summary dataframe
spending_summary_df = pd.DataFrame({
    "Average Math Score": [],
    "Average Reading Score": [],
    "% Passing Math": [],
    "% Passing Reading": [],
    "% Overall Passing": []
})
pd.options.display.float_format = "{:,.2f}".format

spending_summary_df 


## Scored by School Size
Create a table that breaks down school performance based on school size (small, medium, or large).

In [None]:
# School Size
# Small (<1000)
# Medium (1000-2000)
# Large (2000-5000)

In [None]:
# Create summary dataframe
schoolsize_summary_df = pd.DataFrame({
    "Average Math Score": [],
    "Average Reading Score": [],
    "% Passing Math": [],
    "% Passing Reading": [],
    "% Overall Passing": []
})
pd.options.display.float_format = "{:,.2f}".format

schoolsize_summary_df 


## Scores by School Type
Create a table that breaks down school performance based on type of school (district or charter).

In [None]:
# School Type
# charter
# district

In [None]:
# Create summary dataframe
schooltype_summary_df = pd.DataFrame({
    "Average Math Score": [],
    "Average Reading Score": [],
    "% Passing Math": [],
    "% Passing Reading": [],
    "% Overall Passing": []
})
pd.options.display.float_format = "{:,.2f}".format

schooltype_summary_df 
