In [22]:
 # Import libraries and setup
import pandas as pd
import locale
# Select a US-English locale for the currency formatting done with the
# `locale` module. Locale names are platform-specific, so the spelling this
# notebook originally used is tried first, followed by common equivalents.
for _locale_name in ('en_US', 'en_US.UTF-8', 'English_United States.1252'):
    try:
        locale.setlocale(locale.LC_ALL, _locale_name)
        break
    except locale.Error:
        continue
else:
    # None of the candidates is installed: fail loudly, as the original call did.
    raise locale.Error("unsupported locale setting: no en_US locale available")

# Files to load (remember to change these)
school_data_to_load = "Resources/schools_complete.csv"
student_data_to_load = "Resources/students_complete.csv"

# Read the school and student data files into DataFrames.
# Schools are sorted alphabetically by name; sort_values returns a new frame,
# so no in-place mutation is needed (the original row index is kept).
school_data = pd.read_csv(school_data_to_load).sort_values(by="school_name")
student_data = pd.read_csv(student_data_to_load)

# Combine the data into a single df (combined_df): one row per student, with
# the matching school's columns attached. The join key is listed once.
combined_df = pd.merge(student_data, school_data, how="left", on="school_name")

In [10]:
# Preview the first five rows of the merged student/school frame.
combined_df.head(n=5)

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635


In [4]:
# --- District-wide counts ---
# number of schools / students = number of rows in the respective frames
total_schools = school_data.shape[0]
total_students = student_data.shape[0]

# total budget is the sum of the individual school budgets
total_budget = school_data['budget'].sum()

# --- Average scores (rounded to 2 decimals) ---
student_rows = len(combined_df)
average_math = round(combined_df['math_score'].sum() / student_rows, 2)
average_reading = round(combined_df['reading_score'].sum() / student_rows, 2)

# --- Passing rates (a score of 70 or more counts as passing) ---
# students passing math
passing_math = combined_df.loc[combined_df['math_score'] >= 70]
percent_passed_math = round(len(passing_math) / student_rows * 100, 3)

# students passing reading
passing_reading = combined_df.loc[combined_df['reading_score'] >= 70]
percent_passed_reading = round(len(passing_reading) / student_rows * 100, 3)

# students passing both: narrow the math passers down to those who also pass reading
passing_both = passing_math.loc[passing_math['reading_score'] >= 70]
percent_passing_both = round(len(passing_both) / student_rows * 100, 3)

# --- Collect everything into a one-row dataframe and display it ---
overview_row = {
    "Total Schools": total_schools,
    "Total Students": total_students,
    "Total Budget": total_budget,
    "Average Math Score": average_math,
    "Average Reading Score": average_reading,
    "% Passing Math": percent_passed_math,
    "% Passing Reading": percent_passed_reading,
    "% Overall Passing": percent_passing_both,
}
overview = pd.DataFrame([overview_row])
display(overview)

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,15,39170,24649428,78.99,81.88,74.981,85.805,65.172


In [5]:
# Build one summary record per school.
# All intermediate values live in loop-local names and a single list of
# records, so the district-level scalars total_students, total_budget and
# percent_passing_both are not overwritten with lists by this cell.
school_records = []

# iterate through the schools in the order they appear in school_data
for school in school_data['school_name']:

    # rows of the merged frame that belong to the current school
    school_df = combined_df[combined_df['school_name'] == school]
    student_count = len(school_df)

    # School-level attributes are read from the school's first row, by column
    # name rather than by positional index, so a change in column order
    # cannot silently pick up the wrong field.
    school_size = school_df['size'].iloc[0]
    school_budget = school_df['budget'].iloc[0]

    # pass/fail masks (a score of 70 or more counts as passing)
    math_pass_mask = school_df['math_score'] >= 70
    reading_pass_mask = school_df['reading_score'] >= 70

    # plain Python ints keep the percentage arithmetic and its string
    # formatting identical to counting rows with len()
    math_pass_count = int(math_pass_mask.sum())
    reading_pass_count = int(reading_pass_mask.sum())
    both_pass_count = int((math_pass_mask & reading_pass_mask).sum())

    school_records.append({
        "School": school,
        "School Type": school_df['type'].iloc[0],
        "Total Students": school_size,
        # budgets are shown with currency formatting
        "Total School Budget": locale.currency(school_budget, grouping=True),
        # budget per student = total budget / total students
        "Per Student Budget": locale.currency(school_budget / school_size, grouping=True),
        "Average Math Score": round(school_df['math_score'].sum() / student_count, 3),
        "Average Reading Score": round(school_df['reading_score'].sum() / student_count, 3),
        # percentages are stored as display strings such as "66.68%";
        # convert them back to numbers before sorting or comparing
        "% Passing Math": f"{round(math_pass_count / student_count * 100, 3)}%",
        "% Passing Reading": f"{round(reading_pass_count / student_count * 100, 3)}%",
        "% Overall Passing": f"{round(both_pass_count / student_count * 100, 3)}%",
    })

# merge results into a dataframe and display
school_summary = pd.DataFrame(school_records)

display(school_summary)



Unnamed: 0,School,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,Bailey High School,District,4976,"$3,124,928.00",$628.00,77.048,81.034,66.68%,81.933%,54.642%
1,Cabrera High School,Charter,1858,"$1,081,356.00",$582.00,83.062,83.976,94.133%,97.04%,91.335%
2,Figueroa High School,District,2949,"$1,884,411.00",$639.00,76.712,81.158,65.988%,80.739%,53.204%
3,Ford High School,District,2739,"$1,763,916.00",$644.00,77.103,80.746,68.31%,79.299%,54.29%
4,Griffin High School,Charter,1468,"$917,500.00",$625.00,83.351,83.817,93.392%,97.139%,90.599%
5,Hernandez High School,District,4635,"$3,022,020.00",$652.00,77.29,80.934,66.753%,80.863%,53.528%
6,Holden High School,Charter,427,"$248,087.00",$581.00,83.803,83.815,92.506%,96.253%,89.227%
7,Huang High School,District,2917,"$1,910,635.00",$655.00,76.629,81.183,65.684%,81.316%,53.514%
8,Johnson High School,District,4761,"$3,094,650.00",$650.00,77.072,80.966,66.058%,81.222%,53.539%
9,Pena High School,Charter,962,"$585,858.00",$609.00,83.84,84.045,94.595%,95.946%,90.541%


In [6]:
# Top 5 schools by '% Overall Passing', highest first.
# The column holds display strings such as "91.335%", and sorting strings is
# lexicographic (e.g. "100.0%" would sort below "53.2%"). Rank on the numeric
# value instead; astype(str) keeps this working if the column is already numeric.
overall_passing = pd.to_numeric(
    school_summary["% Overall Passing"].astype(str).str.rstrip("%")
)
top_performers = school_summary.loc[overall_passing.nlargest(5).index]

display(top_performers)

Unnamed: 0,School,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
1,Cabrera High School,Charter,1858,"$1,081,356.00",$582.00,83.062,83.976,94.133%,97.04%,91.335%
12,Thomas High School,Charter,1635,"$1,043,130.00",$638.00,83.418,83.849,93.272%,97.309%,90.948%
4,Griffin High School,Charter,1468,"$917,500.00",$625.00,83.351,83.817,93.392%,97.139%,90.599%
13,Wilson High School,Charter,2283,"$1,319,574.00",$578.00,83.274,83.989,93.868%,96.54%,90.583%
9,Pena High School,Charter,962,"$585,858.00",$609.00,83.84,84.045,94.595%,95.946%,90.541%


In [9]:
# Bottom 5 schools by '% Overall Passing', lowest first.
# The column holds display strings such as "52.988%", and sorting strings is
# lexicographic (e.g. "9.5%" would sort above "53.2%"). Rank on the numeric
# value instead; astype(str) keeps this working if the column is already numeric.
overall_passing = pd.to_numeric(
    school_summary["% Overall Passing"].astype(str).str.rstrip("%")
)
bottom_performers = school_summary.loc[overall_passing.nsmallest(5).index]

display(bottom_performers)

Unnamed: 0,School,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
10,Rodriguez High School,District,3999,"$2,547,363.00",$637.00,76.843,80.745,66.367%,80.22%,52.988%
2,Figueroa High School,District,2949,"$1,884,411.00",$639.00,76.712,81.158,65.988%,80.739%,53.204%
7,Huang High School,District,2917,"$1,910,635.00",$655.00,76.629,81.183,65.684%,81.316%,53.514%
5,Hernandez High School,District,4635,"$3,022,020.00",$652.00,77.29,80.934,66.753%,80.863%,53.528%
8,Johnson High School,District,4761,"$3,094,650.00",$650.00,77.072,80.966,66.058%,81.222%,53.539%


In [54]:
# Average math score per school and grade.
# Rows are schools (alphabetical), columns are grades in school order.
grades = ['9th', '10th', '11th', '12th']
schools = sorted(combined_df['school_name'].unique())

# A single grouped mean replaces the nested school/grade loops: it scans the
# data once, yields a float table instead of an object-dtype one, and leaves
# NaN (rather than raising ZeroDivisionError) for a school with no students
# in a given grade.
grade_summary = (
    combined_df
    .groupby(['school_name', 'grade'])['math_score']
    .mean()
    .unstack('grade')
    # fix row/column order; grades outside the list are dropped, as before
    .reindex(index=schools, columns=grades)
    .round(3)
    # drop the axis labels so the table shows only school and grade headers
    .rename_axis(index=None, columns=None)
)

display(grade_summary)
        
    

Unnamed: 0,9th,10th,11th,12th
Bailey High School,77.084,76.997,77.516,76.492
Cabrera High School,83.095,83.155,82.766,83.277
Figueroa High School,76.403,76.54,76.884,77.151
Ford High School,77.361,77.672,76.918,76.18
Griffin High School,82.044,84.229,83.842,83.356
Hernandez High School,77.438,77.337,77.136,77.187
Holden High School,83.787,83.43,85.0,82.855
Huang High School,77.027,75.909,76.447,77.226
Johnson High School,77.188,76.691,77.492,76.863
Pena High School,83.625,83.372,84.328,84.122


In [55]:
# Average reading score per school and grade.
# Rows are schools (alphabetical), columns are grades in school order.
grades = ['9th', '10th', '11th', '12th']
schools = sorted(combined_df['school_name'].unique())

# A single grouped mean replaces the nested school/grade loops: it scans the
# data once, yields a float table instead of an object-dtype one, and leaves
# NaN (rather than raising ZeroDivisionError) for a school with no students
# in a given grade.
grade_summary = (
    combined_df
    .groupby(['school_name', 'grade'])['reading_score']
    .mean()
    .unstack('grade')
    # fix row/column order; grades outside the list are dropped, as before
    .reindex(index=schools, columns=grades)
    .round(3)
    # drop the axis labels so the table shows only school and grade headers
    .rename_axis(index=None, columns=None)
)

display(grade_summary)

Unnamed: 0,9th,10th,11th,12th
Bailey High School,81.303,80.907,80.946,80.912
Cabrera High School,83.676,84.253,83.788,84.288
Figueroa High School,81.199,81.409,80.64,81.385
Ford High School,80.633,81.263,80.404,80.662
Griffin High School,83.369,83.707,84.288,84.014
Hernandez High School,80.867,80.66,81.396,80.857
Holden High School,83.677,83.325,83.816,84.699
Huang High School,81.29,81.512,81.417,80.306
Johnson High School,81.261,80.773,80.616,81.228
Pena High School,83.807,83.612,84.336,84.591
