In [3]:
import pandas as pd
import os

In [5]:
# Input CSV locations, given as paths relative to the working directory.
# Per-student records (names, grade, school, reading/math scores).
student_data_to_load = "Resources/students_complete.csv"
# Per-school records (type, size, budget).
school_data_to_load = "Resources/schools_complete.csv"

In [6]:
# Read both input CSVs into DataFrames using pandas' default parsing options.
student_data_df = pd.read_csv(filepath_or_buffer=student_data_to_load)
school_data_df = pd.read_csv(filepath_or_buffer=school_data_to_load)

In [23]:
# Initial exploration: for each frame, report per-column non-null counts,
# per-column null counts, and column dtypes.
# Note: the "dims" sections print DataFrame.count() (non-null values per
# column), not the frame's shape.
exploration_report = [
    ("Students dataframe dims:\n", student_data_df.count()),
    ("\nNulls in student dataframe:\n", student_data_df.isnull().sum()),
    ("\nData types in student dataframe:\n", student_data_df.dtypes),
    ("\nSchools dataframe dims:\n", school_data_df.count()),
    ("\nNulls in schools dataframe:\n", school_data_df.isnull().sum()),
    ("Data types in schools dataframe:\n", school_data_df.dtypes),
]
for heading, summary in exploration_report:
    print(heading, summary)


Students dataframe dims:
 Student ID       39170
student_name     39170
gender           39170
grade            39170
school_name      39170
reading_score    39170
math_score       39170
dtype: int64

Nulls in student dataframe:
 Student ID       0
student_name     0
gender           0
grade            0
school_name      0
reading_score    0
math_score       0
dtype: int64

Data types in student dataframe:
 Student ID        int64
student_name     object
gender           object
grade            object
school_name      object
reading_score     int64
math_score        int64
dtype: object

Schools dataframe dims:
 School ID      15
school_name    15
type           15
size           15
budget         15
dtype: int64

Nulls in schools dataframe:
 School ID      0
school_name    0
type           0
size           0
budget         0
dtype: int64
Data types in schools dataframe:
 School ID       int64
school_name    object
type           object
size            int64
budget          int64
dtype: object

In [28]:
# Honorific prefixes and professional suffixes to strip from student names.
# Each entry carries its separating space so no stray space is left behind.
prefixes_suffixes = ["Dr. ", "Mr. ","Ms. ", "Mrs. ", "Miss ", " MD", " DDS", " DVM", " PhD"]
# Remove every occurrence of each token by replacing it with an empty string, "".
# regex=False forces a literal match: without it, pandas versions before 2.0
# treat the pattern as a regular expression, so the "." in "Dr. ", "Mr. " etc.
# would match any character and the result would depend on the pandas version.
# The match is not anchored, so a token is removed wherever it appears in the name.
for word in prefixes_suffixes:
    student_data_df["student_name"] = student_data_df["student_name"].str.replace(word, "", regex=False)

In [30]:
# Combine the data into a single dataset: each student row gains the columns
# of its school, matched on "school_name".
# - The join key is listed once (the original repeated it in the `on` list).
# - validate="many_to_one" makes pandas raise MergeError if a school name
#   appears more than once in school_data_df, which would otherwise silently
#   duplicate student rows.
# - The join is the default inner join, so a student whose school_name has no
#   match in school_data_df is dropped; the count displayed below should equal
#   the number of student rows if every student matched.
school_data_complete_df = pd.merge(
    student_data_df,
    school_data_df,
    on="school_name",
    validate="many_to_one",
)
school_data_complete_df["school_name"].count()

39170

In [46]:
# Number of distinct schools present in the merged data (expected: 15,
# matching the row count of the schools file).
unique_school_names = school_data_complete_df["school_name"].unique()
school_count = len(unique_school_names)
school_count

15

In [41]:
# District-wide budget, kept for the summary table built later.
# Summed from the schools frame rather than the merged frame, where each
# school's budget is repeated on every one of its students' rows.
school_budgets = school_data_df["budget"]
total_budget = school_budgets.sum()
print("All schools total budget: $", total_budget)

All schools total budget: $ 24649428


In [40]:
# District-wide mean scores, taken over every student row in the merged data.
reading_scores = school_data_complete_df["reading_score"]
math_scores = school_data_complete_df["math_score"]

average_reading_score = reading_scores.mean()
average_math_score = math_scores.mean()

print("All schools average reading score: ", average_reading_score)
print("All schools average math score: ", average_math_score)

All schools average reading score:  81.87784018381414
All schools average math score:  78.98537145774827


In [45]:
# Row-wise pass/fail flags; a score of 70 or above counts as passing.
math_pass_mask = school_data_complete_df["math_score"] >= 70
reading_pass_mask = school_data_complete_df["reading_score"] >= 70

# Denominator for every percentage below: all students in the merged data.
student_count = school_data_complete_df["Student ID"].count()

# Subsets of students passing math, passing reading, and passing both.
passing_math = school_data_complete_df[math_pass_mask]
passing_reading = school_data_complete_df[reading_pass_mask]
passing_math_reading = school_data_complete_df[math_pass_mask & reading_pass_mask]

# Head counts for each subset (non-null student names).
passing_math_count = passing_math["student_name"].count()
passing_reading_count = passing_reading["student_name"].count()
overall_passing_math_reading_count = passing_math_reading["student_name"].count()

# Convert the head counts to percentages of the whole student body.
passing_math_percentage = passing_math_count / float(student_count) * 100
passing_reading_percentage = passing_reading_count / float(student_count) * 100
overall_passing_percentage = overall_passing_math_reading_count / student_count * 100

In [53]:
# Create district summary dataframe: a single row of district-wide metrics
# computed in the cells above.
district_summary_df = pd.DataFrame(
    [{"Total Schools": school_count,
    "Total Students": student_count,
    "Total Budget": total_budget,
    "Average Math Score": average_math_score,
    "Average Reading Score": average_reading_score,
    "% Passing Math": passing_math_percentage,
    "% Passing Reading": passing_reading_percentage,
    "% Overall Passing": overall_passing_percentage}]
)

# Display format for each metric column.
# Both average scores are shown to the tenths place; the math score was
# previously formatted with "{:.0f}", which contradicted its own comment and
# rendered it as a whole number next to a one-decimal reading score.
# NOTE: formatting turns these columns into strings, so any numeric work on
# the metrics must use the source variables, not this dataframe.
column_formats = {
    # Comma as thousands separator.
    "Total Students": "{:,}",
    # "$", comma thousands separator, and two decimal places.
    "Total Budget": "${:,.2f}",
    # Scores to the tenths place.
    "Average Math Score": "{:.1f}",
    "Average Reading Score": "{:.1f}",
    # Percentages rounded to the nearest whole percent.
    "% Passing Math": "{:.0f}%",
    "% Passing Reading": "{:.0f}%",
    "% Overall Passing": "{:.0f}%",
}
for column, template in column_formats.items():
    district_summary_df[column] = district_summary_df[column].map(template.format)

# Reorder the columns in the order you want them to appear.
new_column_order = ["Total Schools", "Total Students", "Total Budget","Average Math Score", "Average Reading Score", "% Passing Math", "% Passing Reading", "% Overall Passing"]

# Assign district summary df the new column order.
district_summary_df = district_summary_df[new_column_order]

# Display dataframe
district_summary_df

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,15,39170,"$24,649,428.00",79,81.9,75%,86%,65%
