In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Location of the raw CSV. The default is the original author's local path; set the
# STUDENTS_PERFORMANCE_CSV environment variable to run the notebook on another
# machine without editing this cell.
DATA_PATH = os.environ.get(
    "STUDENTS_PERFORMANCE_CSV",
    r"D:\Farooq_Hussain\Python\Data Analysis\08 - EDA_Students_Performance_Analysis\data\StudentsPerformance.csv",
)

# The file is read as latin-1 rather than pandas' default UTF-8.
df = pd.read_csv(DATA_PATH, encoding='latin-1')

# Preview the first rows to confirm the load worked.
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [3]:
# Report the frame's dimensions, then its column schema and summary statistics.
n_rows, n_cols = df.shape
print(f"Dataset contains {n_rows} rows and {n_cols} columns")
df.info()
df.describe()

Dataset contains 1000 rows and 8 columns
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


Unnamed: 0,math score,reading score,writing score
count,1000.0,1000.0,1000.0
mean,66.089,69.169,68.054
std,15.16308,14.600192,15.195657
min,0.0,17.0,10.0
25%,57.0,59.0,57.75
50%,66.0,70.0,69.0
75%,77.0,79.0,79.0
max,100.0,100.0,100.0


In [5]:
# Count missing entries in each column.
df.isna().sum()

gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math score                     0
reading score                  0
writing score                  0
dtype: int64

In [7]:
# Count rows that repeat an earlier row across every column.
duplicate_mask = df.duplicated()

duplicate_mask.sum()

0

In [9]:
# Show the exact column labels used by the group-bys in the cells that follow.
column_labels = df.columns
column_labels

Index(['gender', 'race/ethnicity', 'parental level of education', 'lunch',
       'test preparation course', 'math score', 'reading score',
       'writing score'],
      dtype='object')

In [11]:
# 1. Which parental education level is linked with the highest average math score?

# Mean math score per parental education level, ranked from highest to lowest.
math_by_education = df.groupby('parental level of education')['math score'].mean()
parental_education_level = math_by_education.sort_values(ascending=False)

parental_education_level

parental level of education
master's degree       69.745763
bachelor's degree     69.389831
associate's degree    67.882883
some college          67.128319
some high school      63.497207
high school           62.137755
Name: math score, dtype: float64

In [13]:
# 2. Is there a significant score difference between males and females across all subjects?

# Average score per gender in each subject. This compares group means only;
# no statistical significance test is run here.
score_columns = ['math score', 'reading score', 'writing score']
gender_groups = df.groupby('gender')
score_difference = gender_groups[score_columns].mean()

score_difference

Unnamed: 0_level_0,math score,reading score,writing score
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,63.633205,72.608108,72.467181
male,68.728216,65.473029,63.311203


In [None]:
subjects = ['math score', 'reading score', 'writing score']

# Draw one box plot per subject, each on its own figure, split by gender.
for subject in subjects:
    fig, ax = plt.subplots()
    sns.boxplot(data=df, x='gender', y=subject, ax=ax)
    ax.set_title(f'{subject} Distribution by Gender')
    plt.show()

In [17]:
# 3. How much does completing the test preparation course improve performance in each subject?

# Mean score per subject for each test-preparation status.
prep_groups = df.groupby('test preparation course')
test_preparation_course = prep_groups[['math score', 'reading score', 'writing score']].mean()

test_preparation_course

Unnamed: 0_level_0,math score,reading score,writing score
test preparation course,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
completed,69.695531,73.893855,74.418994
none,64.077882,66.534268,64.504673


In [19]:
# 4. Which combination of gender, lunch type and test preparation status produces the top 10% of scores?

# Combined score over the three subjects. This adds a column to df that the
# later cells in this notebook read.
math_plus_reading = df['math score'] + df['reading score']
df['total_score'] = math_plus_reading + df['writing score']

# 90th-percentile threshold; rows tied exactly at the threshold are kept.
cutoff = df['total_score'].quantile(0.90)
is_top_decile = df['total_score'] >= cutoff

top_10 = df[is_top_decile]

# Number of top-decile students per (gender, lunch, test prep) combination, largest first.
top_10per_of_scores = (
    top_10
    .groupby(['gender', 'lunch', 'test preparation course'])
    .size()
    .sort_values(ascending=False)
)

top_10per_of_scores

gender  lunch         test preparation course
female  standard      none                       31
                      completed                  29
male    standard      completed                  20
                      none                        9
female  free/reduced  completed                   6
male    free/reduced  completed                   3
female  free/reduced  none                        2
male    free/reduced  none                        2
dtype: int64

In [21]:
# 5. Does lunch type have a uniform impact across all race/ethnicity groups or does its effect vary?

# Mean total score for every (race/ethnicity, lunch) pair.
# Requires the 'total_score' column to already exist on df.
race_lunch_groups = df.groupby(['race/ethnicity', 'lunch'])
impact_across_all_race = race_lunch_groups['total_score'].mean()

impact_across_all_race

race/ethnicity  lunch       
group A         free/reduced    172.972222
                standard        199.849057
group B         free/reduced    182.927536
                standard        204.090909
group C         free/reduced    181.236842
                standard        212.604878
group D         free/reduced    194.000000
                standard        215.239521
group E         free/reduced    202.487805
                standard        224.787879
Name: total_score, dtype: float64

In [25]:
# 6. What is the correlation between reading and writing scores? Is it stronger than math and writing?

# Pairwise correlation matrix of the three score columns (pandas' default method,
# Pearson). It is computed once and kept in a variable so both coefficients below
# come from the same result instead of being recomputed separately.
score_corr = df[['reading score', 'writing score', 'math score']].corr()

corr_read_write = score_corr.loc['reading score', 'writing score']
corr_math_write = score_corr.loc['math score', 'writing score']

print(f"Correlation (Reading vs Writing): {corr_read_write:.4f}")
print(f"Correlation (Math vs Writing): {corr_math_write:.4f}")

Correlation (Reading vs Writing): 0.9546
Correlation (Math vs Writing): 0.8026


In [27]:
# 7. Identify the top 5% performing students and analyze their demographic profiles. What patterns emerge?

# 95th-percentile threshold on total score; rows tied at the threshold are kept.
cutoff_95 = df['total_score'].quantile(0.95)
is_top_5 = df['total_score'] >= cutoff_95

top_5 = df[is_top_5]

# (heading, column) pairs: each one prints the heading, then the percentage
# share of every category of that column among the top performers.
profile_sections = [
    ("Gender distribution:", 'gender'),
    ("\nLunch type distribution:", 'lunch'),
    ("\nTest preparation status distribution:", 'test preparation course'),
    ("\nRace/ethnicity distribution:", 'race/ethnicity'),
]

for heading, column in profile_sections:
    print(heading)
    display(top_5[column].value_counts(normalize=True) * 100)

Gender distribution:


gender
female    72.0
male      28.0
Name: proportion, dtype: float64


Lunch type distribution:


lunch
standard        92.0
free/reduced     8.0
Name: proportion, dtype: float64


Test preparation status distribution:


test preparation course
completed    66.0
none         34.0
Name: proportion, dtype: float64


Race/ethnicity distribution:


race/ethnicity
group E    28.0
group C    26.0
group D    24.0
group B    14.0
group A     8.0
Name: proportion, dtype: float64

In [29]:
# 8. Can we cluster students into performance categories (e.g., low, medium, high performers) using just Pandas logic? If yes, how?

# Bucket students into three quantile-based bands of total score, labelled from
# lowest to highest. This adds a 'performance_level' column to df.
tier_labels = ['Low', 'Medium', 'High']
df['performance_level'] = pd.qcut(df['total_score'], q=3, labels=tier_labels)

# Number of students that landed in each band.
df['performance_level'].value_counts()

performance_level
Low       336
Medium    332
High      332
Name: count, dtype: int64