In [1]:
import pandas as pd
import numpy as np
import collections

In [2]:
# File path for first csv
schools_data= "schools_complete.csv"

# Read our csv into Pandas as a dataframe and display first 5 rows
schools_data_df = pd.read_csv(schools_data)
schools_data_df.head()

Unnamed: 0,School ID,name,type,size,budget
0,0,Huang High School,District,2917,1910635
1,1,Figueroa High School,District,2949,1884411
2,2,Shelton High School,Charter,1761,1056600
3,3,Hernandez High School,District,4635,3022020
4,4,Griffin High School,Charter,1468,917500


In [3]:
# Rename "name" column to schools
schools_data_df = schools_data_df.rename(index=str, columns={"name": "school"})
schools_data_df.head()

Unnamed: 0,School ID,school,type,size,budget
0,0,Huang High School,District,2917,1910635
1,1,Figueroa High School,District,2949,1884411
2,2,Shelton High School,Charter,1761,1056600
3,3,Hernandez High School,District,4635,3022020
4,4,Griffin High School,Charter,1468,917500


In [4]:
# File path for second csv
student_data = "students_complete.csv"

# Read our csv into Pandas and display first 5 rows
student_data_df = pd.read_csv(student_data)
student_data_df.head()


Unnamed: 0,Student ID,name,gender,grade,school,reading_score,math_score
0,0,Paul Bradley,M,9th,Huang High School,66,79
1,1,Victor Smith,M,12th,Huang High School,94,61
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58
4,4,Bonnie Ray,F,9th,Huang High School,97,84


In [5]:
number_student_records = len(student_data_df)
print(number_student_records)

39170


In [6]:
# Get a list of all of our columns for reference, in first csv
schools_data_df.columns

Index(['School ID', 'school', 'type', 'size', 'budget'], dtype='object')

In [7]:
#Get a list of all of our columns for reference, in 2nd csv
student_data_df.columns

Index(['Student ID', 'name', 'gender', 'grade', 'school', 'reading_score',
       'math_score'],
      dtype='object')

In [8]:
# Merge Dataframes
merge_table_df = pd.merge(schools_data_df, student_data_df, on="school")


In [9]:
merge_table_df.drop(["Student ID", "School ID"], axis=1, inplace=True)
merge_table_df.head()

Unnamed: 0,school,type,size,budget,name,gender,grade,reading_score,math_score
0,Huang High School,District,2917,1910635,Paul Bradley,M,9th,66,79
1,Huang High School,District,2917,1910635,Victor Smith,M,12th,94,61
2,Huang High School,District,2917,1910635,Kevin Rodriguez,M,12th,90,60
3,Huang High School,District,2917,1910635,Dr. Richard Scott,M,12th,67,58
4,Huang High School,District,2917,1910635,Bonnie Ray,F,9th,97,84


In [10]:
total_records = len(merge_table_df)
print(total_records)

39170


In [11]:
# Find total number of schools
number_of_schools = (merge_table_df["school"]).nunique()
print(number_of_schools)

15


In [12]:
# Find total count of number of students 
# Should be the total of the .unique values [2917 2949 1761 4635 1468 2283 1858 4976  427  962 1800 3999 4761 2739 1635]
number_students = (merge_table_df["size"]).sum()
print(number_students)

130551930


In [13]:
# Find the total budget 
budget = (merge_table_df["budget"]).sum()
print(budget)

82932329558


In [14]:
# Find the average math score
average_math_score = merge_table_df.math_score.mean()
print(average_math_score)

78.98537145774827


In [15]:
# Find the average reading score
average_reading_score = merge_table_df.reading_score.mean()
print(average_reading_score)

81.87784018381414


In [16]:
# Find the percent passing math
percent_passing_math = merge_table_df[merge_table_df["math_score"]>= 70]
percent_passing_math = len(percent_passing_math)
percent_passing_math = (percent_passing_math/total_records) * 100
print(percent_passing_math)

74.9808526933878


In [17]:
# Find the percent passing reading
percent_passing_reading = merge_table_df[merge_table_df["reading_score"]>= 70]

percent_passing_reading = len(percent_passing_reading)
percent_passing_reading = (percent_passing_reading/total_records) * 100
print(percent_passing_reading)

85.80546336482001


In [18]:
# Find the overall passing rate
overall_passing_rate = (percent_passing_math + percent_passing_reading)/2
print(overall_passing_rate)

80.39315802910392


In [19]:
# Create new dataframe from calculations
from collections import OrderedDict
district_summary_dict = {"Total Schools": [number_of_schools],
                        "Total Students": [number_students],
                        "Total Budget": [budget],
                        "Average Math Score": [average_math_score],
                        "Average Reading Score": [average_reading_score],
                        "Percent Passing Math":[percent_passing_math],
                        "Percent Passing Reading":[percent_passing_reading],
                        "Overall Passing Rate":[overall_passing_rate]}
district_summary_dict = OrderedDict(sorted(district_summary_dict.items()))
new_district_summary = OrderedDict([("Total Schools", [number_of_schools]),
                        ("Total Students", [number_students]),
                        ("Total Budget", [budget]),
                        ("Average Math Score", [average_math_score]),
                        ("Average Reading Score", [average_reading_score]),
                        ("Percent Passing Math", [percent_passing_math]),
                        ("Percent Passing Reading", [percent_passing_reading]),
                        ("Overall Passing Rate", [overall_passing_rate])])

district_summary_df = pd.DataFrame(new_district_summary)
district_summary_df

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,Percent Passing Math,Percent Passing Reading,Overall Passing Rate
0,15,130551930,82932329558,78.985371,81.87784,74.980853,85.805463,80.393158


In [20]:
#Improve formatting

In [21]:
#School Summary

#Create an overview table that summarizes key metrics about each school, including:
#School Name
#School Type
#Total Students
#Total School Budget
#Per Student Budget
#Average Math Score
#Average Reading Score
#% Passing Math
#% Passing Reading
#Overall Passing Rate (Average of the above two)

#merge_table_df.groupby("school").mean() produces the same results as code below
#new_merge_df.apply(np.mean, axis = 1) is also the same
new_merge_df = merge_table_df.groupby("school").agg(np.mean)
new_merge_df
#average_reading_score = [merge_table_df.reading_score.mean()]
#merge_table_df["Average Reading Score"] = average_reading_score
#merge_table_df['Average Reading Score'] = pd.Series([average_reading_score])
#merge_table_df

Unnamed: 0_level_0,size,budget,reading_score,math_score
school,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bailey High School,4976.0,3124928.0,81.033963,77.048432
Cabrera High School,1858.0,1081356.0,83.97578,83.061895
Figueroa High School,2949.0,1884411.0,81.15802,76.711767
Ford High School,2739.0,1763916.0,80.746258,77.102592
Griffin High School,1468.0,917500.0,83.816757,83.351499
Hernandez High School,4635.0,3022020.0,80.934412,77.289752
Holden High School,427.0,248087.0,83.814988,83.803279
Huang High School,2917.0,1910635.0,81.182722,76.629414
Johnson High School,4761.0,3094650.0,80.966394,77.072464
Pena High School,962.0,585858.0,84.044699,83.839917


In [26]:
new_merge_df.apply(np.max, axis = 1)
new_merge_df

Unnamed: 0_level_0,size,budget,reading_score,math_score
school,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bailey High School,4976.0,3124928.0,81.033963,77.048432
Cabrera High School,1858.0,1081356.0,83.97578,83.061895
Figueroa High School,2949.0,1884411.0,81.15802,76.711767
Ford High School,2739.0,1763916.0,80.746258,77.102592
Griffin High School,1468.0,917500.0,83.816757,83.351499
Hernandez High School,4635.0,3022020.0,80.934412,77.289752
Holden High School,427.0,248087.0,83.814988,83.803279
Huang High School,2917.0,1910635.0,81.182722,76.629414
Johnson High School,4761.0,3094650.0,80.966394,77.072464
Pena High School,962.0,585858.0,84.044699,83.839917


In [22]:
# Top performing schools
# Create a table that highlights the top 5 performing schools based on overall passing rate. Include
#School Name
#School Type
#Total Students
#Total School Budget
#Per Student Budget
#Average Math Score
#Average Reading Score
#% Passing Math
#% Passing Reading
#Overall Passing Rate (Average of the above two)


In [None]:
#Bottom performing schools

In [None]:
#Math scores by grade
#student_data_df.groupby("grade").mean
#mathbygrade_merge_table_df = merge_table_df.groupby("grade")["math_score"].mean()
#mathbygrade_merge_table_df.head()

In [None]:
#Reading scores by grade
#

In [None]:
#Scores by school spending

In [None]:
#Scores by school size

In [None]:
#Scores by School type