In [70]:
import pandas as pd
from pprint import pprint

In [71]:
# Announce the load step. Passing '\n' as its own argument with a newline
# separator emits the same blank lines ahead of the status message.
print('\n', 'Loading students records from CSV...', sep='\n')

# Read the student records into a DataFrame (the path is relative).
students_df = pd.read_csv('../../files/students_record.csv')



Loading students records from CSV...


In [72]:
# Overview of the loaded frame: size, column names, dtypes, a sample of rows
# and the number of missing values per column.
print('\n============== DATA EXPLORATION ==============')

# DataFrame.shape is a (row_count, column_count) tuple.
rows, columns = students_df.shape
print(f"\n{rows} rows, {columns} columns")

print('\n------ COLUMN NAMES ------')
pprint(list(students_df.columns))

print('\n------ DATA TYPES ------')
pprint(students_df.dtypes)

# First five records.
print(students_df.head(n=5))

# Count of null entries in every column.
print('\n Missing values in each column')
print(students_df.isna().sum())



21 rows, 11 columns

------ COLUMN NAMES ------
['student_id',
 'first_name',
 'last_name',
 'gender',
 'age',
 'grade_level',
 'math_score',
 'science_score',
 'english_score',
 'history_score',
 'attendance_rate']

------ DATA TYPES ------
student_id          object
first_name          object
last_name           object
gender              object
age                  int64
grade_level          int64
math_score         float64
science_score      float64
english_score      float64
history_score        int64
attendance_rate    float64
dtype: object
  student_id first_name last_name gender  age  grade_level  math_score  \
0     STU001       Emma   Johnson      F   16           10        92.0   
1     STU002       Liam     Smith      M   15           10        78.0   
2     STU003     Olivia  Williams      F   16           10        95.0   
3     STU004       Noah     Brown      M   17           11        45.0   
4     STU005        Ava     Jones      F   16           10        82.0   



In [73]:
print('\n============== BASIC STATISTICS ==============')

# Subject columns are the ones whose name contains '_score'. The derived
# 'average_score' column (added further down) matches that pattern too, so it
# is excluded explicitly; otherwise re-running this cell would pick up the
# previous average as if it were another subject.
# The list comprehension is shorthand for a for-loop that appends every
# matching column name to an initially empty list.
subject_columns = [
    col for col in students_df.columns
    if '_score' in col and col != 'average_score'
]

print(students_df.describe())

# Row-wise mean over the subject columns, rounded to 2 decimals.
# DataFrame.mean skips NaN by default, so a student with a missing score is
# averaged over the subjects that are present.
students_df['average_score'] = students_df[subject_columns].mean(axis=1).round(2)

# Five highest averages, best first.
top_students = students_df.sort_values('average_score', ascending=False).head(5)

print('\n------ TOP 5 STUDENTS ------')
print(top_students[['student_id', 'first_name', 'last_name', 'average_score']])


             age  grade_level  math_score  science_score  english_score  \
count  21.000000    21.000000   17.000000      19.000000      20.000000   
mean   16.047619    10.142857   79.117647      81.105263      81.050000   
std     0.804748     0.727029   12.559271       9.085365       9.779005   
min    15.000000     9.000000   45.000000      58.000000      65.000000   
25%    15.000000    10.000000   73.000000      77.000000      74.250000   
50%    16.000000    10.000000   82.000000      81.000000      81.500000   
75%    17.000000    11.000000   89.000000      87.500000      88.500000   
max    17.000000    11.000000   95.000000      96.000000      95.000000   

       history_score  attendance_rate  
count      21.000000        21.000000  
mean       81.428571         0.900476  
std         9.228063         0.060123  
min        63.000000         0.750000  
25%        78.000000         0.880000  
50%        81.000000         0.910000  
75%        88.000000         0.940000  
max

In [None]:
print('\n============== PASS/FAIL STATUS ==============')

def determine_pass_or_fail(score, pass_mark=75):
    """
    Return True when `score` is strictly greater than `pass_mark`, else False.

    Parameters:
        score: the numeric score to evaluate.
        pass_mark: the threshold the score must exceed. Defaults to 75, so a
            score of exactly 75 does not pass.

    The comparison already evaluates to a boolean, so it is returned directly
    instead of going through an if/else block or a conditional expression.
    """
    return score > pass_mark

# Flag every student as passing (True) or failing (False) based on their
# average score.
students_df['pass_fail'] = students_df['average_score'].apply(determine_pass_or_fail)

# Number of students per outcome, indexed by the boolean flag.
pass_fail_counts = students_df['pass_fail'].value_counts()

# value_counts() only lists outcomes that actually occur, so indexing with
# [True] raises KeyError when nobody passed; .get() falls back to 0 instead.
# The built-in round() is used because it works for both numpy and plain
# Python numbers.
print(
    '------ Pass Percentage:',
    round(pass_fail_counts.get(True, 0) / len(students_df) * 100, 2),
    '------'
)

def calculate_pass_percentage(group):
    """
    Return the percentage of True values in `group`, rounded to 2 decimals.

    `group` is a pandas Series of pass/fail flags (for example, the
    'pass_fail' values of one grade level). The number of True entries is
    divided by the total number of entries and multiplied by 100.

    An empty group has no defined pass rate, so NaN is returned instead of
    dividing by zero.
    """
    total = len(group)
    if total == 0:
        return float('nan')

    # value_counts() only lists values that occur in the group; .get()
    # supplies 0 when no entry is True instead of raising KeyError.
    pass_count = group.value_counts().get(True, 0)

    return round(pass_count / total * 100, 2)


# Split the 'pass_fail' flags into one group per grade level. Selecting the
# column after groupby() means each group holds only that column's values.
grade_level_groups = students_df.groupby(by='grade_level')['pass_fail']

# Run calculate_pass_percentage once per grade-level group to get the
# percentage of students who passed in each grade.
grade_level_pass = grade_level_groups.apply(calculate_pass_percentage)
print('\n------ PASS PERCENTAGE BY GRADE LEVEL ------')
print(grade_level_pass)

# Mean of 'average_score' within each grade level, rounded to 2 decimal
# places for cleaner presentation.
grade_level_avg = (
    students_df
    .groupby(by='grade_level')['average_score']
    .mean()
    .round(2)
)
print('\n------ AVERAGE SCORE BY GRADE LEVEL ------')
print(grade_level_avg)


------ Pass Percentage: 80.95 ------

------ PASS PERCENTAGE BY GRADE LEVEL ------
<class 'pandas.core.series.Series'>

------ AVERAGE SCORE BY GRADE LEVEL ------
grade_level
9     81.88
10    82.38
11    77.96
Name: average_score, dtype: float64


The "AVERAGE SCORE BY GRADE LEVEL" output above shows the average performance of students in each grade level. It's a useful metric for comparing performance across grades and can help identify trends (e.g., whether performance improves as students progress through grades).

Using `groupby` is much more efficient than manually filtering the dataframe for each grade level and then calculating the mean, especially when dealing with large datasets.

In [75]:
# Write the frame to a new CSV file; index=False leaves the row index out of
# the file.
students_df.to_csv(path_or_buf='../../files/students_updated_record.csv', index=False)