In [1]:
# Dependencies and Setup
from bokeh.io import output_notebook, show
output_notebook()
import pandas as pd
import pandas_bokeh
pandas_bokeh.output_notebook()
import os

# Locate the two input CSV files inside the Resources folder.
folder = 'Resources'
school_data_to_load = os.path.join(folder, 'schools_complete.csv')
student_data_to_load = os.path.join(folder, 'students_complete.csv')

# Read the school and student data files into pandas DataFrames.
school_data = pd.read_csv(school_data_to_load)
student_data = pd.read_csv(student_data_to_load)

# Combine the data into a single dataset. The left join keeps every student
# row and attaches the columns of the matching school; the shared
# "school_name" column is the only join key, so it is named once.
school_data_complete = pd.merge(student_data, school_data, how="left", on="school_name")

ModuleNotFoundError: No module named 'pandas_bokeh'

## District Summary

* Calculate the total number of schools

* Calculate the total number of students

* Calculate the total budget

* Calculate the average math score 

* Calculate the average reading score

* Calculate the percentage of students with a passing math score (70 or greater)

* Calculate the percentage of students with a passing reading score (70 or greater)

* Calculate the percentage of students who passed math **and** reading (% Overall Passing)

* Create a dataframe to hold the above results

* Optional: give the displayed data cleaner formatting

In [None]:
# Map the raw CSV column names to the display names used by the later cells.
display_names = {
    'student_name': 'Student',
    'gender': 'Gender',
    'grade': 'Grade',
    'school_name': 'School',
    'reading_score': 'Reading Score',
    'math_score': 'Math Score',
    'type': 'School type',
    'size': 'School size',
    'budget': 'School Budget',
}
school_data_complete = school_data_complete.rename(columns=display_names)
# Non-null count per column: identical counts mean no values are missing.
school_data_complete.count()


In [None]:
# Number of distinct schools and number of student rows in the district.
school_count = school_data_complete['School ID'].nunique(dropna=False)
students_count = len(school_data_complete.index)

# Each student row repeats its school's budget, so take one value per school
# (the per-school mean) and add those up.
budget_per_school = school_data_complete.groupby('School ID')['School Budget'].mean()
total_budget = sum(budget_per_school)

# District-wide average scores.
avg_math_score = school_data_complete['Math Score'].sum() / students_count
avg_reading_score = school_data_complete['Reading Score'].sum() / students_count

# Students with a passing (70 or greater) math score. The filtered frame is
# kept because the per-school cell reuses it.
passing_math = school_data_complete[school_data_complete['Math Score'] >= 70]
passing_math_percentage = len(passing_math.index) / students_count * 100

# Students with a passing (70 or greater) reading score.
passing_read = school_data_complete[school_data_complete['Reading Score'] >= 70]
passing_read_percentage = len(passing_read.index) / students_count * 100

# Students passing math AND reading: those present in both filtered frames.
# The suffixes keep the duplicated columns apart (e.g. 'School_math').
passing_math_reading = passing_math.merge(
    passing_read,
    on='Student ID',
    how='inner',
    suffixes=('_math', '_reading'),
)
math_reading_percentage = len(passing_math_reading.index) / students_count * 100

# Collect everything into a one-row summary table.
summary_values = {
    "Total Schools": school_count,
    "Total Students": students_count,
    "Total Budget": '${:0,.2f}'.format(total_budget),
    "Average Math Score": avg_math_score,
    "Average Reading Score": avg_reading_score,
    "% Passing Math": passing_math_percentage,
    "% Passing Reading": passing_read_percentage,
    "% Overall Passing": math_reading_percentage,
}
district_summary = pd.DataFrame({label: [value] for label, value in summary_values.items()})
district_summary

## School Summary

* Create an overview table that summarizes key metrics about each school, including:
  * School Name
  * School Type
  * Total Students
  * Total School Budget
  * Per Student Budget
  * Average Math Score
  * Average Reading Score
  * % Passing Math
  * % Passing Reading
  * % Overall Passing (The percentage of students that passed math **and** reading.)
  
* Create a dataframe to hold the above results

In [None]:
# Build one row of key metrics per school, indexed by school name.
column_name = 'School'
by_school = school_data_complete.groupby(column_name)

# Enrolment and budget. Every student row repeats its school's budget, so the
# per-school mean recovers that single figure.
total_students = by_school['Student'].count()
total_school_budget = by_school['School Budget'].mean()

# Flag passing scores (70 or greater) per student; summing the flags per
# school gives the number of passing students, including 0 when nobody passes.
school_key = school_data_complete[column_name]
math_pass = school_data_complete['Math Score'] >= 70
read_pass = school_data_complete['Reading Score'] >= 70
both_pass = math_pass & read_pass

_school_summary = pd.DataFrame({
    # First type recorded for the school, taken as a plain string value.
    'School Type': by_school['School type'].first(),
    'Total Students': total_students,
    'Total School Budget': total_school_budget,
    'Per Student Budget': total_school_budget / total_students,
    '% Passing Math': math_pass.groupby(school_key).sum() / total_students * 100,
    '% Passing Reading': read_pass.groupby(school_key).sum() / total_students * 100,
    '% Overall Passing': both_pass.groupby(school_key).sum() / total_students * 100,
    'Average Math Score': by_school['Math Score'].mean(),
    'Average Reading Score': by_school['Reading Score'].mean(),
}).sort_index()
# Drop the index label so the table displays without a 'School' header row.
_school_summary.index.name = None


In [None]:
# Format a copy for display; _school_summary keeps its numeric values for the
# calculations in the later cells.
school_summary = _school_summary.copy()

# Render the two budget columns as currency strings.
currency_formats = {
    'Total School Budget': "${:,.2f}",
    'Per Student Budget': "${:.2f}",
}
for budget_column, template in currency_formats.items():
    school_summary[budget_column] = school_summary[budget_column].map(template.format)

school_summary

## Top Performing Schools (By % Overall Passing)

* Sort and display the top five performing schools by % overall passing.

In [None]:
# Rank the schools from highest to lowest % Overall Passing and show the first five.
top_schools = school_summary.sort_values('% Overall Passing', ascending=False)
top_schools.iloc[:5]

## Bottom Performing Schools (By % Overall Passing)

* Sort and display the five worst-performing schools by % overall passing.

In [None]:
# Rank the schools from lowest to highest % Overall Passing and show the first five.
bottom_schools = school_summary.sort_values('% Overall Passing', ascending=True)
bottom_schools.iloc[:5]

## Math Scores by Grade

* Create a table that lists the average Math Score for students of each grade level (9th, 10th, 11th, 12th) at each school.

  * Create a pandas series for each grade. Hint: use a conditional statement.
  
  * Group each series by school
  
  * Combine the series into a dataframe
  
  * Optional: give the displayed data cleaner formatting

In [None]:
# https://stackoverflow.com/questions/5967500/how-to-correctly-sort-a-string-with-a-number-inside
#unutbu is the creator of these two functions. They were taken from the link above.
import re

def atof(text):
    """Return ``text`` converted to a float, or ``text`` unchanged if it does not parse."""
    try:
        return float(text)
    except ValueError:
        # Not a number: hand the original value back untouched.
        return text

def natural_keys(text):
    '''
    Build a sort key that orders strings in human ("natural") order.

    alist.sort(key=natural_keys) sorts in human order, e.g. "9th" before
    "10th".
    http://nedbatchelder.com/blog/200712/human_sorting.html
    (See Toothy's implementation in the comments)
    float regex comes from https://stackoverflow.com/a/12643073/190597

    Returns a list that alternates text fragments (str) and the numbers found
    between them (float). A leading sign is consumed by the pattern but is not
    part of the captured number.
    '''
    # re.split with one capturing group yields [text, number, text, ...]:
    # even positions are plain text, odd positions are the captured numbers.
    tokens = re.split(r'[+-]?([0-9]+(?:[.][0-9]*)?|[.][0-9]+)', text)
    # Convert only the captured numbers. Text such as "inf" or "nan" must stay
    # a string; otherwise keys would mix str and float at the same position
    # and sorting would raise TypeError.
    return [float(token) if position % 2 else token
            for position, token in enumerate(tokens)]

In [None]:
# Unique grade labels; the reading-score cell reuses this list.
grade = list(school_data_complete['Grade'].unique())

# Average math score for every (school, grade) pair, held as a DataFrame with
# a (School, Grade) MultiIndex and a single 'Math Score' column.
math_score = pd.DataFrame(
    school_data_complete.groupby(['School', 'Grade'])['Math Score'].mean()
)

# Pivot the grade level of the index into columns: one row per school and one
# column per grade.
mgrades_df = math_score['Math Score'].unstack(level='Grade')

# Order the columns by grade number ("9th" before "10th") instead of
# alphabetically, and drop the axis labels for a cleaner display.
mgrades_df = mgrades_df[sorted(grade, key=natural_keys)]
mgrades_df.index.name = None
mgrades_df.columns.name = None
# Show the final dataframe.
mgrades_df

## Reading Score by Grade 

* Perform the same operations as above for reading scores

In [None]:
# Average reading score for every (school, grade) pair, held as a DataFrame
# with a (School, Grade) MultiIndex and a single 'Reading Score' column.
reading_score = pd.DataFrame(
    school_data_complete.groupby(['School', 'Grade'])['Reading Score'].mean()
)

# Pivot the grade level of the index into columns: one row per school and one
# column per grade.
grades_df = reading_score['Reading Score'].unstack(level='Grade')

# Order the columns by grade number ("9th" before "10th") instead of
# alphabetically, and drop the axis labels for a cleaner display.
# `grade` is the list of unique grade labels built in the math-score cell.
grades_df = grades_df[sorted(grade, key=natural_keys)]
grades_df.index.name = None
grades_df.columns.name = None

# Show the final dataframe.
grades_df

## Scores by School Spending

* Create a table that breaks down school performances based on average Spending Ranges (Per Student). Use 4 reasonable bins to group school spending. Include in the table each of the following:
  * Average Math Score
  * Average Reading Score
  * % Passing Math
  * % Passing Reading
  * Overall Passing Rate (Average of the above two)

In [None]:
def summary_binning(df, column_tobin, new_column_name, bins, bin_labels):
    """Bin ``df`` on one column and average the score metrics within each bin.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``column_tobin`` and the five metric columns averaged
        below. It is modified in place: the bin assigned to each row is
        stored in a column named ``new_column_name``.
    column_tobin : str
        Name of the numeric column to bin.
    new_column_name : str
        Name of the column that receives the bin labels; it also becomes the
        index name of the returned table.
    bins : sequence of numbers
        Bin edges, passed to ``pandas.cut``.
    bin_labels : sequence of str
        One label per bin, passed to ``pandas.cut``.

    Returns
    -------
    pandas.DataFrame
        One row per bin label holding the mean of each metric column,
        formatted as a string with two decimals. A bin that contains no rows
        is kept and shows 'nan'.
    """
    metric_columns = ['Average Math Score', 'Average Reading Score',
                      '% Passing Math', '% Passing Reading', '% Overall Passing']
    # Assign every row to a bin.
    df[new_column_name] = pd.cut(df[column_tobin], bins, labels=bin_labels)
    # Average the metrics per bin. observed=False is stated explicitly so that
    # empty bins stay in the table whatever the pandas default is; recent
    # pandas versions warn when it is left implicit for categorical groupers.
    new_df = df.groupby(new_column_name, observed=False)[metric_columns].mean()
    # Format every value as a two-decimal string for display.
    for column in new_df.columns:
        new_df[column] = new_df[column].map('{:.2f}'.format)
    return new_df

In [None]:
# Per-student spending bin edges, with one label for each of the four bins.
spending_bins = [0, 585, 630, 645, 680]
spending_labels = ['<$585', '$585-630', '$630-645', '$645-680']
# Bin the schools by per-student budget and average the metrics per bin.
school_spending_score = summary_binning(
    df=_school_summary,
    column_tobin='Per Student Budget',
    new_column_name='Spending Ranges per Student',
    bins=spending_bins,
    bin_labels=spending_labels,
)
# Display the resulting table.
school_spending_score

## Scores by School Size

* Perform the same operations as above, based on school size.

In [None]:
# School-size bin edges (number of students), with one label for each of the three bins.
size_bins = [0, 1000, 2000, 5000]
size_labels = ['Small (<1000)', 'Medium (1000-2000)', 'Large (2000-5000)']
# Bin the schools by total students and average the metrics per bin.
school_size = summary_binning(
    df=_school_summary,
    column_tobin='Total Students',
    new_column_name='School Size',
    bins=size_bins,
    bin_labels=size_labels,
)
# Display the resulting table.
school_size

## Scores by School Type

* Perform the same operations as above, based on school type

In [None]:
# 'School Type' is already a column of the summary, so no binning step is
# needed: group on it directly and average each metric.
metric_columns = [
    'Average Math Score',
    'Average Reading Score',
    '% Passing Math',
    '% Passing Reading',
    '% Overall Passing',
]
school_type = _school_summary.groupby('School Type')[metric_columns].mean()
school_type

## Trends observable from the analysis above

1. __*Charter schools perform better than district schools*__. Indeed, the top five schools are all charter schools, while the bottom five are all district schools. Additionally, the overall passing rate is 53.67% for district schools versus 90.43% for charter schools.
2. Surprisingly, **_overall student performance decreased as spending per student increased_**.
3. There is no difference in performance between small and medium schools. However, **large schools** (with more than 2000 students) **performed worse than small and medium schools**.