In [1]:
from pathlib import Path

import pandas as pd

# Folder that receives the CSV export, relative to the notebook's working directory.
DATA_DIR = Path("../data")

# Raw records stored column-wise: position i in every list describes the same student.
data = {
    "student_id": [1,2,3,4,5,6,7,8,9,10],
    "gender": ["M","F","M","F","M","F","M","F","M","F"],
    "math": [78,88,35,65,90,55,20,72,60,95],
    "science": [72,91,40,70,85,58,30,75,62,94],
    "english": [75,90,38,68,88,60,25,70,58,96],
    "attendance": [85,92,60,80,95,70,40,82,75,98],
    "grade": ["B","A","D","C","A","C","F","B","C","A"]
}

df = pd.DataFrame(data)

# Create the target folder first: to_csv raises OSError when the parent
# directory is missing, which would break a fresh Restart & Run All.
DATA_DIR.mkdir(parents=True, exist_ok=True)

# index=False keeps the RangeIndex out of the file so it round-trips cleanly.
df.to_csv(DATA_DIR / "students.csv", index=False)

df


Unnamed: 0,student_id,gender,math,science,english,attendance,grade
0,1,M,78,72,75,85,B
1,2,F,88,91,90,92,A
2,3,M,35,40,38,60,D
3,4,F,65,70,68,80,C
4,5,M,90,85,88,95,A
5,6,F,55,58,60,70,C
6,7,M,20,30,25,40,F
7,8,F,72,75,70,82,B
8,9,M,60,62,58,75,C
9,10,F,95,94,96,98,A


## Dataset Overview

This dataset represents student academic performance.
Each row corresponds to one student.
The data includes subject-wise marks, attendance percentage, and final grade.

Objective:
To understand performance patterns and identify factors affecting grades.


In [5]:
# Dimensions of the table as (rows, columns).
df.shape

(10, 7)

In [6]:
# Column labels, in the order they appear in the frame.
df.columns

Index(['student_id', 'gender', 'math', 'science', 'english', 'attendance',
       'grade'],
      dtype='object')

In [7]:
# Per-column non-null counts and dtypes, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   student_id  10 non-null     int64 
 1   gender      10 non-null     object
 2   math        10 non-null     int64 
 3   science     10 non-null     int64 
 4   english     10 non-null     int64 
 5   attendance  10 non-null     int64 
 6   grade       10 non-null     object
dtypes: int64(5), object(2)
memory usage: 692.0+ bytes


In [9]:
# Number of missing values in each column.
df.isna().sum()

student_id    0
gender        0
math          0
science       0
english       0
attendance    0
grade         0
dtype: int64

In [10]:
# Number of rows that exactly repeat an earlier row (all columns compared).
df.duplicated().sum()

np.int64(0)

In [11]:
# Count, mean, std, min, quartiles and max for every numeric column.
df.describe()

Unnamed: 0,student_id,math,science,english,attendance
count,10.0,10.0,10.0,10.0,10.0
mean,5.5,65.8,67.7,66.8,77.7
std,3.02765,24.30272,20.8862,22.65588,17.619749
min,1.0,20.0,30.0,25.0,40.0
25%,3.25,56.25,59.0,58.5,71.25
50%,5.5,68.5,71.0,69.0,81.0
75%,7.75,85.5,82.5,84.75,90.25
max,10.0,95.0,94.0,96.0,98.0


## Data Quality Summary

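- Total records: 10
- Total features: 7
- Missing values: 0 in every column
- Duplicate records: 0
- Initial observations: five integer columns (student_id, math, science, english, attendance) and two text columns (gender, grade); marks range from 20 to 96 and attendance from 40 to 98.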


In [13]:
# Data type of each column.
df.dtypes

student_id     int64
gender        object
math           int64
science        int64
english        int64
attendance     int64
grade         object
dtype: object

In [15]:
# Summary statistics for the math marks.
df["math"].describe()

count    10.00000
mean     65.80000
std      24.30272
min      20.00000
25%      56.25000
50%      68.50000
75%      85.50000
max      95.00000
Name: math, dtype: float64

In [16]:
# How often each math mark occurs, listed from the lowest mark to the highest.
(
    df["math"]
    .value_counts()
    .sort_index()
)

math
20    1
35    1
55    1
60    1
65    1
72    1
78    1
88    1
90    1
95    1
Name: count, dtype: int64

In [17]:
# Summary statistics for the science marks.
df["science"].describe()

count    10.0000
mean     67.7000
std      20.8862
min      30.0000
25%      59.0000
50%      71.0000
75%      82.5000
max      94.0000
Name: science, dtype: float64

In [19]:
# Summary statistics for the english marks.
df["english"].describe()

count    10.00000
mean     66.80000
std      22.65588
min      25.00000
25%      58.50000
50%      69.00000
75%      84.75000
max      96.00000
Name: english, dtype: float64

In [21]:
# Summary statistics for the attendance percentages.
df["attendance"].describe()

count    10.000000
mean     77.700000
std      17.619749
min      40.000000
25%      71.250000
50%      81.000000
75%      90.250000
max      98.000000
Name: attendance, dtype: float64

In [23]:
# Number of students of each gender.
df["gender"].value_counts()

gender
M    5
F    5
Name: count, dtype: int64

In [24]:
# Share of each gender, scaled from a 0-1 proportion to a percentage.
(
    df["gender"]
    .value_counts(normalize=True)
    .mul(100)
)

gender
M    50.0
F    50.0
Name: proportion, dtype: float64

In [26]:
# Number of students per grade, most common grade first.
df["grade"].value_counts()

grade
A    3
C    3
B    2
D    1
F    1
Name: count, dtype: int64

In [27]:
# Number of students per grade, listed in grade-label order (A first).
(
    df["grade"]
    .value_counts()
    .sort_index()
)

grade
A    3
B    2
C    3
D    1
F    1
Name: count, dtype: int64

## Univariate Analysis Summary

### Numerical Columns
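- Math: mean 65.8, median 68.5, std 24.3, range 20–95
- Science: mean 67.7, median 71.0, std 20.9, range 30–94
- English: mean 66.8, median 69.0, std 22.7, range 25–96
- Attendance: mean 77.7, median 81.0, std 17.6, range 40–98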

### Categorical Columns
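- Gender distribution: balanced, with 5 male and 5 female students (50% each)
- Grade distribution: A = 3, B = 2, C = 3, D = 1, F = 1

Key observations:
- In every numeric column the median is higher than the mean, so a few low values (students 3 and 7) pull the averages down.
- Math has the widest spread of the three subjects and Science the narrowest.
- Grades A and C are the most common (3 students each); only one student each received D and F.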


In [31]:
# Average attendance within each grade, highest average first.
(
    df.groupby("grade")["attendance"]
    .mean()
    .sort_values(ascending=False)
)

grade
A    95.0
B    83.5
C    75.0
D    60.0
F    40.0
Name: attendance, dtype: float64

In [36]:
# Pairwise correlation matrix (pandas default method) between the three subject marks and attendance.
df[
    [
        "math",
        "science",
        "english",
        "attendance",
    ]
].corr()

Unnamed: 0,math,science,english,attendance
math,1.0,0.988853,0.993987,0.989499
science,0.988853,1.0,0.992402,0.977058
english,0.993987,0.992402,1.0,0.982376
attendance,0.989499,0.977058,0.982376,1.0


In [39]:
# Average mark in each subject, split by gender.
(
    df.groupby("gender")[["math", "science", "english"]]
    .mean()
)

Unnamed: 0_level_0,math,science,english
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
F,75.0,77.6,76.8
M,56.6,57.8,56.8


In [43]:
# Average mark in each subject per grade, ordered by the math average (highest first).
(
    df.groupby("grade")[["math", "science", "english"]]
    .mean()
    .sort_values(by="math", ascending=False)
)

Unnamed: 0_level_0,math,science,english
grade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,91.0,90.0,91.333333
B,75.0,73.5,72.5
C,60.0,63.333333,62.0
D,35.0,40.0,38.0
F,20.0,30.0,25.0


## Bivariate Analysis Summary

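- Attendance vs Grade: mean attendance falls steadily with grade (A 95.0, B 83.5, C 75.0, D 60.0, F 40.0).
- Attendance vs Scores: attendance is very strongly correlated with every subject (math 0.989, english 0.982, science 0.977).
- Gender vs Performance: female students average higher in all three subjects (75.0 / 77.6 / 76.8) than male students (56.6 / 57.8 / 56.8); with only 5 students per group this is indicative, not conclusive.
- Grade vs Subject Scores: average marks decline from grade A (about 90–91) to grade F (20–30) in every subject.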

Key insights:
"Students with higher attendance generally receive better grades."

## Key Insights

1. Students with higher attendance generally achieve better grades, indicating attendance is a strong performance factor.
2. Science and Math scores show a strong positive relationship, suggesting similar skill requirements.
3. Science scores are the most consistent across students (std ≈ 20.9), followed by English (≈ 22.7); Math shows the widest spread (≈ 24.3).
4. Grade distribution aligns well with subject-wise performance, validating the grading system.
5. A small group of students with very low attendance also shows poor academic performance, indicating risk cases.


## Executive Summary

This analysis examined student academic performance using subject-wise marks, attendance, and grades.

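Key findings show that attendance is strongly associated with academic outcomes: higher attendance is linked to better grades and subject scores (correlation of roughly 0.98–0.99 with each subject). Because the sample contains only 10 students, this should be read as an association rather than proof that attendance causes better results.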

The grading system accurately reflects student performance across subjects. A small subset of students with low attendance and poor scores may require targeted intervention.

Overall, improving attendance could be a key strategy to enhance academic performance.
