# PyCity Schools Analysis

* As a whole, schools with higher budgets, did not yield better test results. By contrast, schools with higher spending per student actually (\$645-675) underperformed compared to schools with smaller budgets (<\$585 per student).

* As a whole, smaller and medium sized schools dramatically out-performed large sized schools on passing math performances (89-91% passing vs 67%).

* As a whole, charter schools out-performed the public district schools across all metrics. However, more analysis will be required to glean if the effect is due to school practices or the fact that charter schools tend to serve smaller student populations per school. 
---

In [None]:
# Dependencies and Setup
import pandas as pd

# File to Load 
school_data_to_load = "Resources/schools_complete.csv"
student_data_to_load = "Resources/students_complete.csv"

# Read School and Student Data File and store into Pandas Data Frames
school_data = pd.read_csv(school_data_to_load)
student_data = pd.read_csv(student_data_to_load)

# Combine the data into a single dataset (consider using a left join)
pyschools_merge = pd.merge(school_data, student_data, on="school_name", how='left')

pyschools_merge = pyschools_merge.rename(columns={'budget': 'Budget', 'grade': 'Grade', "school_name":"School", "size": 'Enrollment', "student_name": 'Student', 'reading_score': "Reading Score", "math_score": 'Math Score'})

student_data.head()


In [None]:
student_data.head()
total_enr = student_data.groupby('school_name').count()['Student ID']
total_enr

In [None]:
passed_reading = student_data.loc[student_data['reading_score']>=70].groupby('school_name').count()['Student ID']
passed_reading

In [None]:
passed_math = student_data.loc[student_data['math_score']>=70].groupby('school_name').count()['Student ID']
passed_math

In [None]:
pct_passed_reading_by_school = passed_reading/total_enr
pct_passed_reading_by_school

In [None]:
pct_passed_math_by_school = passed_math/total_enr
pct_passed_math_by_school

In [None]:
overall_pct_passed_by_school = (pct_passed_reading_by_school + pct_passed_math_by_school)/2
overall_pct_passed_by_school

In [None]:
school_data.head()

In [None]:
pyschools_merge.head()

In [2]:

#dataframe needs to have school ID, budget, schoool, enrollment,  

#Calculate the Totals (Schools and Students)
numschools=school_data["School ID"].count()

districtbudget=school_data['budget'].sum()


#create customized dataframe for rest of analysis
districtsum_df = pyschools_merge [['Student ID', 'Reading Score', 'Math Score']]


numstudents=districtsum_df["Student ID"].count()


# Calculate the Average Scores
readingave=districtsum_df["Reading Score"].mean()


mathave=districtsum_df["Math Score"].mean()


# Calculate the Percentage Pass Rates
passingreading = districtsum_df.loc[districtsum_df['Reading Score'] >= 70]['Reading Score']
percpassreading = (len(passingreading)/numstudents)*100



passingmath = districtsum_df.loc[districtsum_df['Math Score'] >= 70]['Math Score']
percpassmath = (len(passingmath)/numstudents) * 100

percpassboth = (percpassreading+percpassmath)/2




school_data_complete = pd.DataFrame({"Number of Schools": [numschools],
                             "Total Enrollment": [numstudents],
                             "Total Budget": [districtbudget],
                              "Average Reading Score": [readingave],
                              "Average Math Score": [mathave],
                              "Reading % Passing": [percpassreading],
                              "Math % Passing": [percpassmath],
                              "Overall % Passing": [percpassboth]})

school_data_complete







# Display the data frame


Unnamed: 0,Number of Schools,Total Enrollment,Total Budget,Average Reading Score,Average Math Score,Reading % Passing,Math % Passing,Overall % Passing
0,15,39170,24649428,81.87784,78.985371,85.805463,74.980853,80.393158


## School Summary

In [None]:
#school group you need to set index to school name, merge on index
#score_ave = pyschools_merge[['School', 'type', 'Student', 'Reading Score', 'Math Score']]

#core_ave = score_ave.groupby(['School', 'Student']).sum()
#reading_df = score_ave[score_ave["Reading Score"]>=70]["Reading Score"].groupby(["School"]).count()
#students_took_reading = score_ave["Reading Score"].groupby(["School"]).count()
#schoolreadingpass = (reading_df/students_took_reading)*100
#reset index, merge on column name (same for both


In [8]:
#create school_data_complete



In [9]:
school_sum

Unnamed: 0,School,type,Enrollment,Budget,Reading Score,Math Score
0,Huang High School,District,2917,1910635,66,79
1,Huang High School,District,2917,1910635,94,61
2,Huang High School,District,2917,1910635,90,60
3,Huang High School,District,2917,1910635,67,58
4,Huang High School,District,2917,1910635,97,84
...,...,...,...,...,...,...
39165,Thomas High School,Charter,1635,1043130,99,90
39166,Thomas High School,Charter,1635,1043130,95,70
39167,Thomas High School,Charter,1635,1043130,73,84
39168,Thomas High School,Charter,1635,1043130,99,90


In [1]:
# Calculate the total school budget and per capita spending: DONE

school_sum = pyschools_merge [['School', 'type','Enrollment', 'Budget', 'Reading Score', 'Math Score']]
school_group = school_sum.groupby(["School"]).mean()
school_group['Per Pupil Budget'] = school_group ["Budget"]/school_group['Enrollment']


passed_reading = student_data.loc[student_data['reading_score']>=70].groupby('school_name').count()['Student ID']

passed_math = student_data.loc[student_data['math_score']>=70].groupby('school_name').count()['Student ID']
total_enr = student_data.groupby('school_name').count()['Student ID']
pct_passed_reading_by_school = (passed_reading/total_enr)*100
pct_passed_reading_by_school
pct_passed_math_by_school = (passed_math/total_enr)*100
overall_pct_passed_by_school = ((pct_passed_reading_by_school + pct_passed_math_by_school)/2)


#    "school_summary_df=pd.DataFrame(school_group)\n",
#    "school_summary_df['Passing % Reading'] = pct_passed_reading_by_school\n",
#    "school_summary_df['Passing % Math'] = pct_passed_math_by_school\n",
#    "school_summary_df['Overall % Passing'] = overall_pct_passed_by_school \n",
 

#Reading_pass_pct = Read_pass/ ['Enrollment']
#schoolspassingreading.count()

#school_group['Reading % Passing'] = (schoolpassingreading/school_group['Enrollment'])*100

#schoolpassingreading



#schoolpassingmath = school_data.loc[school_data['Math Score'] >= 70]['Math Score']
#schoolpercpassmath = (len(schoolpassingmath)/"Enrollment") * 100

#schoolpercpassboth = (schoolpercpassreading+schoolpercpassmath)/2




#pd.DataFrame({"Number of Schools": [numschools],
#                             "Total Enrollment": [numstudents],
#                             "Total Budget": [districtbudget],
#                              "Average Reading Score": [readingave],
#                              "Average Math Score": [mathave],
#                              "Reading % Passing": [percpassreading],
#                              "Math % Passing": [percpassmath],
#                              "Overall % Passing": [percpassboth]})

#school group you need to set index to school name, merge on index
#score_ave = pyschools_merge[['School', 'type', 'Student', 'Reading Score', 'Math Score']]

#score_ave = score_ave.groupby(['School', 'Student']).sum()
#reading_df = score_ave[score_ave["Reading Score"]>=70]["Reading Score"].groupby(["School"]).count()
#students_took_reading = score_ave["Reading Score"].groupby(["School"]).count()
#schoolreadingpass = (reading_df/students_took_reading)*100
#reset index, merge on column name (same for both



#pyschools_merge = pd.merge(school_data, student_data, on="school_name", how='left')



# Calculate the average test scores:DONE through the dataframe

# Calculate the passing scores by creating a filtered data frame?????

# Convert to data frame

# Minor data munging: rearrange collumns, format numbers

#pyschools_merge.head()







                                              

NameError: name 'pyschools_merge' is not defined

In [None]:
school_sum


## Top Performing Schools (By Passing Rate)

In [None]:
# Sort and show top five schools


## Bottom Performing Schools (By Passing Rate)

In [None]:
# Sort and show bottom five schools


## Math Scores by Grade

In [5]:
# Create data series of scores by grade levels using conditionals

gradeleveldf=pyschools_merge[['School', 'Grade', 'Reading Score', 'Math Score']]

ninth_graders=gradeleveldf[(gradeleveldf['Grade'] == "9th")]
tenth_graders=gradeleveldf[(gradeleveldf['Grade'] == "10th")]
eleventh_graders=gradeleveldf[(gradeleveldf['Grade'] == "11th")]
twelfth_graders=gradeleveldf[(gradeleveldf['Grade'] == "12th")]

# Group each by school name
ninth_graders_scores=ninth_graders.groupby(["School"]).mean()["Math Score"]
tenth_graders_scores=tenth_graders.groupby(["School"]).mean()["Math Score"]
eleventh_graders_scores=eleventh_graders.groupby(["School"]).mean()["Math Score"]
twelfth_graders_scores=twelfth_graders.groupby(["School"]).mean()["Math Score"]

# Combine series into single data frame
math_by_grade=pd.DataFrame({'9th':ninth_graders_scores, '10th':tenth_graders_scores, '11th': eleventh_graders_scores, '12th': twelfth_graders_scores})

# Minor data munging
math_by_grade=math_by_grade[['9th', '10th', '11th', '12th']]
math_by_grade.index.name=None


# Display the data frame
math_by_grade


Unnamed: 0,9th,10th,11th,12th
Bailey High School,77.083676,76.996772,77.515588,76.492218
Cabrera High School,83.094697,83.154506,82.76556,83.277487
Figueroa High School,76.403037,76.539974,76.884344,77.151369
Ford High School,77.361345,77.672316,76.918058,76.179963
Griffin High School,82.04401,84.229064,83.842105,83.356164
Hernandez High School,77.438495,77.337408,77.136029,77.186567
Holden High School,83.787402,83.429825,85.0,82.855422
Huang High School,77.027251,75.908735,76.446602,77.225641
Johnson High School,77.187857,76.691117,77.491653,76.863248
Pena High School,83.625455,83.372,84.328125,84.121547


## Reading Score by Grade 

In [6]:
# Create data series of scores by grade levels using conditionals



#ninth_graders=gradeleveldf[(gradeleveldf['Grade'] == "9th")]
#tenth_graders=gradeleveldf[(gradeleveldf['Grade'] == "10th")]
#eleventh_graders=gradeleveldf[(gradeleveldf['Grade'] == "11th")]
#twelfth_graders=gradeleveldf[(gradeleveldf['Grade'] == "12th")]

# Group each by school name
ninth_graders_reading=ninth_graders.groupby(["School"]).mean()["Reading Score"]
tenth_graders_reading=tenth_graders.groupby(["School"]).mean()["Reading Score"]
eleventh_graders_reading=eleventh_graders.groupby(["School"]).mean()["Reading Score"]
twelfth_graders_reading=twelfth_graders.groupby(["School"]).mean()["Reading Score"]

# Combine series into single data frame
reading_by_grade=pd.DataFrame({'9th':ninth_graders_scores, '10th':tenth_graders_scores, '11th': eleventh_graders_scores, '12th': twelfth_graders_scores})

# Minor data munging
reading_by_grade=reading_by_grade[['9th', '10th', '11th', '12th']]
reading_by_grade.index.name=None


# Display the data frame
reading_by_grade


Unnamed: 0,9th,10th,11th,12th
Bailey High School,77.083676,76.996772,77.515588,76.492218
Cabrera High School,83.094697,83.154506,82.76556,83.277487
Figueroa High School,76.403037,76.539974,76.884344,77.151369
Ford High School,77.361345,77.672316,76.918058,76.179963
Griffin High School,82.04401,84.229064,83.842105,83.356164
Hernandez High School,77.438495,77.337408,77.136029,77.186567
Holden High School,83.787402,83.429825,85.0,82.855422
Huang High School,77.027251,75.908735,76.446602,77.225641
Johnson High School,77.187857,76.691117,77.491653,76.863248
Pena High School,83.625455,83.372,84.328125,84.121547


## Scores by School Spending

In [None]:
# Establish the bins -- choose any set of bins you would like, but see below for testing bins
# to test, set your bins as follows: [0, 585, 615, 645, 675]
# ALSO -- Note that the values for `% Passing Math`, `% Passing Reading` and `% Overall Passing Rate`
# were computed using averages of averages -- your results may vary if you use weighted averages 

# Categorize the spending based on the bins

# Assemble into data frame

# Minor data munging

# Display results


## Scores by School Size

In [None]:
#establish a data frame with average math score, average reading score, %passing math, %passing reading, %overall passing rate

schoolsize_merge = pd.merge(school_data, student_data, on="school_name", how='left')




schoolsize=school_data [['School', 'Enrollment', 'Reading Score', 'Math Score']]
 
# Establish the bins 

bins = [0, 1000, 2000, 5000]



# Categorize the spending based on the bins
group_names=["Small", "Medium", "Large"]
# Calculate the scores based on bins

schoolsize=pd.cut(schoolsize["Enrollment"], bins, labels=group_names, include_lowest=True)

schoolsize


# Assemble into data frame

# Minor data munging

# Display results


## Scores by School Type

In [7]:
# Type | Average Math Score | Average Reading Score | % Passing Math | % Passing Reading | % Overall Passing Rate

# Assemble into data frame

schooltype = pyschools_merge[['type', 'Math Score', 'Reading Score']]

schooltype = schooltype.groupby(["type"]).mean()

schooltype = schooltype.reset_index(drop=False)
schooltype

# Minor data munging

# Display results


Unnamed: 0,type,Math Score,Reading Score
0,Charter,83.406183,83.902821
1,District,76.987026,80.962485
