In [1]:
import pandas as pd
import numpy as np
import collections

In [2]:
# Location of the schools CSV (a relative path, resolved against the working directory)
schools_data = "schools_complete.csv"

# Load the schools table into a DataFrame and preview its first five rows
schools_data_df = pd.read_csv(filepath_or_buffer=schools_data)
schools_data_df.head(n=5)

Unnamed: 0,School ID,name,type,size,budget
0,0,Huang High School,District,2917,1910635
1,1,Figueroa High School,District,2949,1884411
2,2,Shelton High School,Charter,1761,1056600
3,3,Hernandez High School,District,4635,3022020
4,4,Griffin High School,Charter,1468,917500


In [3]:
# Rename the schools table's "name" column to "school" so it shares a column
# name with the students table (used as the merge key below).
# index=str additionally converts the row labels to strings.
schools_data_df = schools_data_df.rename(columns={"name": "school"}, index=str)

In [4]:
# Location of the students CSV (a relative path, resolved against the working directory)
student_data = "students_complete.csv"

# Load the students table into a DataFrame and preview its first five rows
student_data_df = pd.read_csv(filepath_or_buffer=student_data)
student_data_df.head(n=5)


Unnamed: 0,Student ID,name,gender,grade,school,reading_score,math_score
0,0,Paul Bradley,M,9th,Huang High School,66,79
1,1,Victor Smith,M,12th,Huang High School,94,61
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58
4,4,Bonnie Ray,F,9th,Huang High School,97,84


In [5]:
# List the column labels of the schools table for reference
# (the original "name" column appears here under its new label, "school")
schools_data_df.columns

Index(['School ID', 'school', 'type', 'size', 'budget'], dtype='object')

In [6]:
# List the column labels of the students table for reference
student_data_df.columns

Index(['Student ID', 'name', 'gender', 'grade', 'school', 'reading_score',
       'math_score'],
      dtype='object')

In [7]:
# Left-join the students table onto the schools table by school name, so every
# student row also carries its school's type, size and budget.
# Preview the first five rows of the combined frame.
merge_table_df = schools_data_df.merge(student_data_df, on="school", how="left")
merge_table_df.head()

Unnamed: 0,School ID,school,type,size,budget,Student ID,name,gender,grade,reading_score,math_score
0,0,Huang High School,District,2917,1910635,0,Paul Bradley,M,9th,66,79
1,0,Huang High School,District,2917,1910635,1,Victor Smith,M,12th,94,61
2,0,Huang High School,District,2917,1910635,2,Kevin Rodriguez,M,12th,90,60
3,0,Huang High School,District,2917,1910635,3,Dr. Richard Scott,M,12th,67,58
4,0,Huang High School,District,2917,1910635,4,Bonnie Ray,F,9th,97,84


In [8]:
# Discard the two ID columns in a single call; they are not needed below
merge_table_df = merge_table_df.drop(columns=["Student ID", "School ID"])

In [9]:
# District-wide summary metrics, computed from the merged student/school table.

# Minimum score that counts as a pass
PASSING_SCORE = 70

# Total number of rows in the merged table (one per student record)
total_records = len(merge_table_df)

# Total number of distinct schools
number_of_schools = merge_table_df["school"].nunique()

# Total number of students: add up each school's size exactly once.
# (Every merged row repeats its school's size, so keep one row per school
# before summing.)
number_students = merge_table_df.drop_duplicates(subset="school")["size"].sum()

# Total budget: collapse to one row per school, then add up the budgets.
# numeric_only=True is required because the merged frame also holds string
# columns (type, name, gender, grade) that cannot be averaged.
reduced_df = merge_table_df.groupby("school").mean(numeric_only=True)
total_budget = reduced_df["budget"].sum()

# Average math and reading scores across all students
average_math_score = merge_table_df["math_score"].mean()
average_reading_score = merge_table_df["reading_score"].mean()

# Percent of records with a passing math score
passed_math_count = (merge_table_df["math_score"] >= PASSING_SCORE).sum()
percent_passing_math = (passed_math_count / total_records) * 100

# Percent of records with a passing reading score
passed_reading_count = (merge_table_df["reading_score"] >= PASSING_SCORE).sum()
percent_passing_reading = (passed_reading_count / total_records) * 100

# Overall passing rate: the average of the two passing percentages
overall_passing_rate = (percent_passing_math + percent_passing_reading) / 2

In [10]:
# Assemble the district metrics into a one-row summary table
from collections import OrderedDict

# (column label, value) pairs, listed in the order the columns are displayed
summary_fields = [
    ("Total Schools", number_of_schools),
    ("Total Students", number_students),
    ("Total Budget", total_budget),
    ("Average Math Score", average_math_score),
    ("Average Reading Score", average_reading_score),
    ("Percent Passing Math", percent_passing_math),
    ("Percent Passing Reading", percent_passing_reading),
    ("Overall Passing Rate", overall_passing_rate),
]

# Alphabetically ordered variant of the same mapping (not used for the table below)
district_summary_dict = OrderedDict(
    sorted((label, [value]) for label, value in summary_fields)
)

# Display-ordered mapping; each value is wrapped in a list so it forms one row
new_district_summary = OrderedDict(
    (label, [value]) for label, value in summary_fields
)

district_summary_df = pd.DataFrame(new_district_summary)
district_summary_df

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,Percent Passing Math,Percent Passing Reading,Overall Passing Rate
0,15,39170,24649428.0,78.985371,81.87784,74.980853,85.805463,80.393158


In [11]:
#Improve formatting and rename columns

In [12]:
# School Summary
# Average every numeric column per school. numeric_only=True skips the string
# columns (type, name, gender, grade), which cannot be averaged and would
# otherwise raise a TypeError on current pandas versions.
# reset_index() turns the "school" group key back into an ordinary column.
new_merge_df = (
    merge_table_df
    .groupby("school")
    .mean(numeric_only=True)
    .reset_index()
)
new_merge_df


Unnamed: 0,school,size,budget,reading_score,math_score
0,Bailey High School,4976.0,3124928.0,81.033963,77.048432
1,Cabrera High School,1858.0,1081356.0,83.97578,83.061895
2,Figueroa High School,2949.0,1884411.0,81.15802,76.711767
3,Ford High School,2739.0,1763916.0,80.746258,77.102592
4,Griffin High School,1468.0,917500.0,83.816757,83.351499
5,Hernandez High School,4635.0,3022020.0,80.934412,77.289752
6,Holden High School,427.0,248087.0,83.814988,83.803279
7,Huang High School,2917.0,1910635.0,81.182722,76.629414
8,Johnson High School,4761.0,3094650.0,80.966394,77.072464
9,Pena High School,962.0,585858.0,84.044699,83.839917


In [13]:
# Build a school -> type lookup: take the "school" through "type" columns and
# reduce each school's repeated rows to a single row with min().
school_type_df = (
    merge_table_df.loc[:, "school":"type"]
    .groupby("school")
    .min()
    .reset_index()
)

# Attach the per-school averages to the type lookup, keyed on school name
school_summary_table_df = school_type_df.merge(new_merge_df, on="school", how="left")
school_summary_table_df

Unnamed: 0,school,type,size,budget,reading_score,math_score
0,Bailey High School,District,4976.0,3124928.0,81.033963,77.048432
1,Cabrera High School,Charter,1858.0,1081356.0,83.97578,83.061895
2,Figueroa High School,District,2949.0,1884411.0,81.15802,76.711767
3,Ford High School,District,2739.0,1763916.0,80.746258,77.102592
4,Griffin High School,Charter,1468.0,917500.0,83.816757,83.351499
5,Hernandez High School,District,4635.0,3022020.0,80.934412,77.289752
6,Holden High School,Charter,427.0,248087.0,83.814988,83.803279
7,Huang High School,District,2917.0,1910635.0,81.182722,76.629414
8,Johnson High School,District,4761.0,3094650.0,80.966394,77.072464
9,Pena High School,Charter,962.0,585858.0,84.044699,83.839917


In [29]:
# TODO: add "% Passing Reading" and "% Passing Math" columns. These need the
# student-level rows in merge_table_df (share of each school's students scoring
# >= 70); school_summary_table_df only holds each school's average score, so
# filtering it on >= 70 would select schools rather than count students.

# TODO: add "Overall Passing Rate" as the average of the two passing percentages.

# Create a new column called "Per Student Budget" and add to existing dataframe.
# Budget per student is the school's budget divided by its size, not the reverse.
school_summary_table_df["Per Student Budget"] = (
    school_summary_table_df["budget"] / school_summary_table_df["size"]
)
school_summary_table_df

In [None]:
# Top performing schools
# Create a table that highlights the top 5 performing schools based on overall passing rate. Include
#School Name
#School Type
#Total Students
#Total School Budget
#Per Student Budget
#Average Math Score
#Average Reading Score
#% Passing Math
#% Passing Reading
#Overall Passing Rate (Average of the above two)
#new_merge_df = new_merge_df.loc[new_merge_df["size"]]
#new_merge_df.head()

In [None]:
#Bottom performing schools

In [None]:
# Math scores for each grade, grouped by school
#
# One groupby over (school, grade) replaces four filtered copies and three
# chained merges. The result has one row per school and the columns
# school, math_score_9th, math_score_10th, math_score_11th, math_score_12th.
math_grade_order = ["9th", "10th", "11th", "12th"]

merge_mathbygrade_df = (
    student_data_df
    .groupby(["school", "grade"])["math_score"]
    .mean()
    .unstack("grade")                    # grades become columns
    .reindex(columns=math_grade_order)   # fixed 9th -> 12th column order
    .add_prefix("math_score_")           # explicit labels, no _x/_y merge suffixes
    .rename_axis(None, axis="columns")   # drop the leftover "grade" axis name
    .reset_index()                       # "school" back to an ordinary column
)
merge_mathbygrade_df

In [None]:
# Reading scores by grade, grouped by school
#
# One groupby over (school, grade) replaces four filtered copies and three
# chained merges. The result has one row per school and the columns
# school, reading_score_9th, reading_score_10th, reading_score_11th,
# reading_score_12th.
reading_grade_order = ["9th", "10th", "11th", "12th"]

merge_readingbygrade_df = (
    student_data_df
    .groupby(["school", "grade"])["reading_score"]
    .mean()
    .unstack("grade")                       # grades become columns
    .reindex(columns=reading_grade_order)   # fixed 9th -> 12th column order
    .add_prefix("reading_score_")           # explicit labels, no _x/_y merge suffixes
    .rename_axis(None, axis="columns")      # drop the leftover "grade" axis name
    .reset_index()                          # "school" back to an ordinary column
)
merge_readingbygrade_df

In [None]:
# Scores by school spending -- planned outputs:
#   Average Math Score
#   Average Reading Score
#   % Passing Math
#   % Passing Reading
#   Overall Passing Rate (Average of the above two)
# Scores by school size
#
# NOTE(review): the result is named school_spending, but the column being
# binned is "size" and the labels are size categories -- confirm whether
# spending bins were intended here.

# Bin edges for the three categories; pd.cut's default intervals are right-inclusive
bins = [0, 1750, 3250, 5000]
group_names = ["Small", "Medium", "Large"]

# Label every school with its category and keep the result as a one-column frame
school_spending = pd.cut(new_merge_df["size"], bins, labels=group_names).to_frame()
school_spending

In [None]:
# Scores by school size
# Bin edges for the three categories; pd.cut's default intervals are right-inclusive
bins = [0, 1750, 3250, 5000]
group_names = ["Small", "Medium", "Large"]

# Tag each school with its size category
new_merge_df["School Size Category"] = pd.cut(new_merge_df["size"], bins, labels=group_names)

# Group by the category. observed=False keeps a row for every label, including
# any category that happens to contain no schools.
school_size_categories = new_merge_df.groupby("School Size Category", observed=False)

# Report the average of the per-school scores in each category. max() would
# instead show only the best school per category, plus the alphabetically last
# school name.
school_size_categories[["math_score", "reading_score"]].mean()

In [None]:
#Scores by School type