# Student Grades EDA — NumPy · Pandas · Matplotlib · SciPy · Scikit‑learn
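This notebook walks through a compact, hands-on exploratory data analysis (EDA) of a small student-grades dataset: NumPy arrays, Pandas DataFrames, Matplotlib charts, and basic statistics with SciPy and scikit-learn.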

In [None]:

# If running in Jupyter, ensure inline plotting
%matplotlib inline

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from pathlib import Path

# Report the versions of the core libraries in use.
for label, module in (("NumPy:", np), ("Pandas:", pd)):
    print(label, module.__version__)


## Exploring data arrays with NumPy

In [None]:

# Grade values kept as a plain Python list for now.
data = [
    50, 50, 47, 97, 49, 3, 53, 42, 26, 74, 82,
    62, 37, 15, 70, 27, 36, 35, 48, 52, 63, 64,
]
print(data)


In [None]:

# Wrap the list in a NumPy array and compare how each type handles `* 2`.
grades = np.array(data)
print(data)
print(grades)

# Multiplying a list repeats it ...
doubled_list = data * 2
print(type(data), 'x 2:', doubled_list)
print('---')
# ... whereas multiplying an array scales every element.
doubled_array = grades * 2
print(type(grades), 'x 2:', doubled_array)


In [None]:
# Shape of the grades array, as a tuple of dimension sizes.
print(np.shape(grades))

In [None]:

# Study-hours values as a plain list of floats.
study_hours = [
    10.0, 11.5, 9.0, 16.0, 9.25, 1.0, 11.5, 9.0, 8.5, 14.5, 15.5,
    13.75, 9.0, 8.0, 15.5, 8.0, 9.0, 6.0, 10.0, 12.0, 12.5, 12.0,
]

# Combine both sequences into one 2-D array: row 0 holds the hours, row 1 the grades.
student_data = np.array([study_hours, grades])

# Display the array together with its shape.
student_data, student_data.shape


In [None]:

# Unpacking the 2-D array yields its rows: study hours first, grades second.
hours_row, grade_row = student_data
print(hours_row)
print(hours_row[0])

# Mean of each row.
avg_study = hours_row.mean()
avg_grade = grade_row.mean()

report = 'Average study hours: {:.2f}\nAverage grade: {:.2f}'
print(report.format(avg_study, avg_grade))


## Exploring tabular data with Pandas

In [None]:

# One row per student: the name plus the matching entries from the two rows of `student_data`.
student_names = [
    'Dan', 'Joann', 'Pedro', 'Rosie', 'Ethan', 'Vicky', 'Frederic', 'Jimmie',
    'Rhonda', 'Giovanni', 'Francesca', 'Rajab', 'Naiyana', 'Kian', 'Jenny',
    'Jakeem', 'Helena', 'Ismat', 'Anila', 'Skye', 'Daniel', 'Aisha',
]
df_students = pd.DataFrame({
    'Name': student_names,
    'StudyHours': student_data[0],
    'Grade': student_data[1],
})
df_students


In [None]:

# loc selects by index label (a label slice includes its end);
# iloc selects by integer position (a position slice excludes its end).
row_five = df_students.loc[5]
print(row_five)
rows_zero_to_five = df_students.loc[0:5]
print(rows_zero_to_five)
df_students.iloc[:5]


In [None]:

# Positional lookup: the first row, restricted to the columns at positions 1 and 2.
df_students.iloc[0, [1, 2]]


In [None]:

# Label-based lookup of a single cell: row label 0, column 'Grade'.
df_students.loc[0, 'Grade']


In [None]:

# Two ways to select the row(s) whose Name is "Aisha".
# 1) a query expression string
print(df_students.query('Name=="Aisha"'))
# 2) a boolean mask
is_aisha = df_students['Name'] == 'Aisha'
print(df_students[is_aisha])


## Loading a DataFrame from a file

In [None]:

# Locate the grades dataset relative to the notebook's working directory.
# The file must already be on disk (use the helper script or curl beforehand).
csv_path = Path('..') / 'data' / 'grades.csv'
if not csv_path.is_file():
    # Build the message from the path that was actually checked, so the hint
    # cannot disagree with the code (and a directory named grades.csv is rejected).
    raise FileNotFoundError(
        "Dataset not found at {}. Run `python scripts/download_data.py` "
        "or place grades.csv in {}".format(csv_path.resolve(), csv_path.parent)
    )

# Read the comma-separated file, taking column names from the first row.
df_students = pd.read_csv(csv_path, delimiter=',', header=0)
df_students.head()


## Handling missing values

In [None]:

# Count the missing entries in each column (isna is the same check as isnull).
df_students.isna().sum()


In [None]:

# Show only the rows that have a missing value in at least one column.
has_missing = df_students.isna().any(axis='columns')
df_students[has_missing]


In [None]:

# Fill missing StudyHours with the column mean, then drop any row that still has a gap.
mean_hours = df_students['StudyHours'].mean()
df_students['StudyHours'] = df_students['StudyHours'].fillna(mean_hours)
df_students = df_students.dropna(axis='index', how='any')
df_students.head()


## Explore data in a DataFrame

In [None]:

# Mean of each numeric column, reported to two decimals.
mean_study, mean_grade = (df_students[name].mean() for name in ('StudyHours', 'Grade'))
print('Average weekly study hours: {:.2f}\nAverage grade: {:.2f}'.format(mean_study, mean_grade))

# Rows whose study hours exceed that mean.
studied_more = df_students['StudyHours'] > mean_study
df_students[studied_more]


In [None]:

# Average grade among students whose study hours exceed the mean.
df_students.loc[df_students['StudyHours'] > mean_study, 'Grade'].mean()


In [None]:

# Add Pass column: True where the grade is at least 60.
passes = df_students['Grade'] >= 60
# `assign` replaces an existing 'Pass' column instead of appending another one,
# so re-running this cell still leaves exactly one 'Pass' column.
df_students = df_students.assign(Pass=passes)
df_students.head()


In [None]:

# Group rows by the Pass flag, then print the name count and the mean hours/grade per group.
by_pass = df_students.groupby(df_students.Pass)
print(by_pass.Name.count())
print(by_pass[['StudyHours', 'Grade']].mean())


## Visualizing data with Matplotlib

In [None]:

# Bar chart of each student's grade with Matplotlib's default styling.
plt.bar(df_students['Name'], df_students['Grade'])
plt.show()


In [None]:

# Same bar chart with colour, title, axis labels, horizontal grid lines and rotated names.
ax = plt.gca()
ax.bar(x=df_students.Name, height=df_students.Grade, color='orange')
ax.set(title='Student Grades', xlabel='Student', ylabel='Grade')
ax.grid(color='#95a5a6', linestyle='--', linewidth=2, axis='y', alpha=0.7)
plt.xticks(rotation=90)
plt.show()


In [None]:

# Create the figure explicitly so its size can be chosen, then draw the styled bar chart on it.
fig = plt.figure(figsize=(8,3))
ax = fig.gca()
ax.bar(x=df_students.Name, height=df_students.Grade, color='orange')
ax.set(title='Student Grades', xlabel='Student', ylabel='Grade')
ax.grid(color='#95a5a6', linestyle='--', linewidth=2, axis='y', alpha=0.7)
plt.xticks(rotation=90)
plt.show()


In [None]:

# One figure, two panels: bar chart of grades (left) and pie chart of Pass counts (right).
fig, ax = plt.subplots(1, 2, figsize = (12,5))

ax[0].bar(x=df_students.Name, height=df_students.Grade, color='orange')
ax[0].set_title('Grades')
# Rotate the tick labels the bar chart already has. Re-setting them with
# set_xticklabels is discouraged unless the tick positions were fixed first.
ax[0].tick_params(axis='x', labelrotation=90)

# Number of rows for each value of 'Pass'. Each wedge is labelled with its count,
# and the legend names the value the wedge stands for.
pass_counts = df_students['Pass'].value_counts()
ax[1].pie(pass_counts, labels=pass_counts)
ax[1].set_title('Passing Grades')
ax[1].legend(pass_counts.keys().tolist())

fig.suptitle('Student Data')
# plt.show() matches the other plotting cells; Figure.show() can warn on non-GUI backends.
plt.show()


In [None]:

# Let pandas draw the StudyHours bar chart directly from the DataFrame.
df_students.plot(kind='bar', x='Name', y='StudyHours', color='teal', figsize=(8, 4))
plt.show()


## Getting started with statistical analysis

In [None]:

# Histogram of grades
var_data = df_students['Grade']
fig = plt.figure(figsize=(10,4))
plt.hist(var_data)
plt.title('Data Distribution')
plt.xlabel('Value')
plt.ylabel('Frequency')
# plt.show() matches the other plotting cells; Figure.show() can warn on non-GUI backends.
plt.show()


In [None]:

# Central tendency lines on histogram
var = df_students['Grade']

min_val = var.min()
max_val = var.max()
mean_val = var.mean()
med_val = var.median()
# mode() can return several values; the first one is used.
mod_val = var.mode()[0]

print('Minimum:{:.2f}\nMean:{:.2f}\nMedian:{:.2f}\nMode:{:.2f}\nMaximum:{:.2f}\n'.format(min_val,mean_val,med_val,mod_val,max_val))

fig = plt.figure(figsize=(10,4))
plt.hist(var)
# One dashed vertical line per statistic: gray = min/max, cyan = mean, red = median, yellow = mode.
stat_lines = (
    (min_val, 'gray'),
    (mean_val, 'cyan'),
    (med_val, 'red'),
    (mod_val, 'yellow'),
    (max_val, 'gray'),
)
for line_x, line_color in stat_lines:
    plt.axvline(x=line_x, color=line_color, linestyle='dashed', linewidth=2)
plt.title('Data Distribution')
plt.xlabel('Value')
plt.ylabel('Frequency')
# plt.show() matches the other plotting cells; Figure.show() can warn on non-GUI backends.
plt.show()


In [None]:

# Boxplot for grades
var = df_students['Grade']
fig = plt.figure(figsize=(10,4))
plt.boxplot(var)
plt.title('Data Distribution')
# plt.show() matches the other plotting cells; Figure.show() can warn on non-GUI backends.
plt.show()


In [None]:

# Combined histogram + boxplot function
def show_distribution(var_data):
    """Print summary statistics for a numeric Series and plot its distribution.

    Prints the minimum, mean, median, mode and maximum, then shows a figure with
    a histogram on top (dashed lines mark those five statistics) and a
    horizontal box plot underneath.

    Uses the notebook-level ``plt`` imported in the setup cell.

    Args:
        var_data: pandas Series of numeric values. It needs at least one value,
            because the first entry of ``mode()`` is read.

    Returns:
        None. The statistics are printed and the figure is displayed.
    """
    min_val = var_data.min()
    max_val = var_data.max()
    mean_val = var_data.mean()
    med_val = var_data.median()
    # mode() can return several values; the first one is used.
    mod_val = var_data.mode()[0]

    print('Minimum:{:.2f}\nMean:{:.2f}\nMedian:{:.2f}\nMode:{:.2f}\nMaximum:{:.2f}\n'.format(min_val,mean_val,med_val,mod_val,max_val))

    # Two stacked panels: histogram above, box plot below.
    fig, ax = plt.subplots(2, 1, figsize = (10,4))

    ax[0].hist(var_data)
    ax[0].set_ylabel('Frequency')

    # Dashed markers: gray = min/max, cyan = mean, red = median, yellow = mode.
    stat_lines = (
        (min_val, 'gray'),
        (mean_val, 'cyan'),
        (med_val, 'red'),
        (mod_val, 'yellow'),
        (max_val, 'gray'),
    )
    for line_x, line_color in stat_lines:
        ax[0].axvline(x=line_x, color=line_color, linestyle='dashed', linewidth=2)

    ax[1].boxplot(var_data, vert=False)
    ax[1].set_xlabel('Value')

    fig.suptitle('Data Distribution')
    # plt.show() matches show_density; Figure.show() can warn on non-GUI backends.
    plt.show()

# Print the statistics and plot the distribution of the Grade column.
col = df_students.Grade
show_distribution(col)


In [None]:

# Density function
def show_density(var_data):
    """Plot the density curve of a numeric Series with its mean, median and mode marked.

    Uses the notebook-level ``plt`` imported in the setup cell.

    Args:
        var_data: pandas Series of numeric values. It needs at least one value,
            because the first entry of ``mode()`` is read.

    Returns:
        None. The figure is displayed.
    """
    # Open a new figure; the pandas density plot below draws onto it.
    plt.figure(figsize=(10,4))
    var_data.plot.density()

    plt.title('Data Density')
    # Dashed markers: cyan = mean, red = median, yellow = first mode.
    stat_lines = (
        (var_data.mean(), 'cyan'),
        (var_data.median(), 'red'),
        (var_data.mode()[0], 'yellow'),
    )
    for line_x, line_color in stat_lines:
        plt.axvline(x=line_x, color=line_color, linestyle='dashed', linewidth=2)
    plt.show()

# Draw the density curve for the Grade column.
col = df_students.Grade
show_density(col)


In [None]:

# Distribution of StudyHours with every row included (no filtering in this cell).
col = df_students.StudyHours
show_distribution(col)


In [None]:

# Keep only the values above 1 study hour, then re-plot the distribution.
more_than_one_hour = df_students['StudyHours'] > 1
col = df_students.loc[more_than_one_hour, 'StudyHours']
show_distribution(col)


In [None]:

# Keep only the values strictly above the 0.01 quantile of StudyHours, then plot both views.
q01 = df_students['StudyHours'].quantile(0.01)
col = df_students.loc[df_students['StudyHours'] > q01, 'StudyHours']
show_distribution(col)
show_density(col)


In [None]:

# Range, variance and standard deviation for each of the two numeric columns.
for col_name in ('Grade', 'StudyHours'):
    col = df_students[col_name]
    rng, var, std = col.max() - col.min(), col.var(), col.std()
    summary = '\n{}:\n - Range: {:.2f}\n - Variance: {:.2f}\n - Std.Dev: {:.2f}'
    print(summary.format(col_name, rng, var, std))


In [None]:

# Normal distribution stdev bands
import scipy.stats as stats

col = df_students['Grade']

# Kernel density estimate used to find the curve height at each band edge;
# pandas draws the density curve itself.
density = stats.gaussian_kde(col)
col.plot.density()

s = col.std()
m = col.mean()

# For 1, 2 and 3 standard deviations either side of the mean, join the two
# points where the band edges meet the estimated density and label the right end.
bands = (
    (1, 'magenta', '1 std (68.26%)'),
    (2, 'green', '2 std (95.45%)'),
    (3, 'orange', '3 std (99.73%)'),
)
for k, band_color, band_label in bands:
    xs = [m - (s * k), m + (s * k)]
    ys = density(xs)
    plt.plot(xs, ys, color=band_color)
    plt.annotate(band_label, (xs[1], ys[1]))

# Mark the mean, hide the axes, and render.
plt.axvline(m, color='cyan', linestyle='dashed', linewidth=1)
plt.axis('off')
plt.show()


## Comparing data

In [None]:

# Working sample: rows with more than one study hour.
df_sample = df_students.loc[df_students['StudyHours'] > 1]
df_sample.head()


In [None]:

# Box plots of StudyHours, one box per value of the Pass column.
df_sample.boxplot('StudyHours', by='Pass', figsize=(8, 5))
plt.show()


In [None]:

# Grade and StudyHours as side-by-side bars per student, using the raw (unscaled) values.
df_sample.plot.bar(x='Name', y=['Grade', 'StudyHours'], figsize=(8, 5))
plt.show()


In [None]:

# Normalize with MinMaxScaler
from sklearn.preprocessing import MinMaxScaler

# Columns to rescale; 'Name' is kept only to label the bars.
numeric_cols = ['Grade', 'StudyHours']
scaler = MinMaxScaler()
df_normalized = df_sample[['Name'] + numeric_cols].copy()
df_normalized[numeric_cols] = scaler.fit_transform(df_normalized[numeric_cols])
df_normalized.plot(x='Name', y=numeric_cols, kind='bar', figsize=(8,5))
plt.show()

# Correlation between the two scaled columns.
df_normalized['Grade'].corr(df_normalized['StudyHours'])


In [None]:

# Scatter + regression line
from scipy import stats

df_regression = df_sample[['Grade', 'StudyHours']].copy()

# Linear regression of Grade on StudyHours; only the slope and intercept are used below.
hours = df_regression['StudyHours']
m, b, r, p, se = stats.linregress(hours, df_regression['Grade'])
print('slope: {:.4f}\ny-intercept: {:.4f}'.format(m,b))
print('so...\n f(x) = {:.4f}x + {:.4f}'.format(m,b))

# Fitted value for every row, and its difference from the actual grade.
df_regression['fx'] = (m * hours) + b
df_regression['error'] = df_regression['fx'] - df_regression['Grade']

# Scatter of the observations with the fitted line drawn on the same axes.
ax = df_regression.plot.scatter(x='StudyHours', y='Grade', title='Study Time vs Grade (with regression)')
ax.plot(hours, df_regression['fx'], color='cyan')
plt.show()

df_regression[['StudyHours', 'Grade', 'fx', 'error']]


In [None]:

# Prediction function using fixed coefficients from earlier example
def f(x, m=6.3134, b=-17.9164):
    """Predict a grade from study hours using the line ``m * x + b``.

    Args:
        x: Study hours (any value that supports ``m * x + b``).
        m: Slope of the line. Defaults to the coefficient previously hard-coded here.
        b: Intercept of the line. Defaults to the coefficient previously hard-coded here.

    Returns:
        ``m * x + b``. The result is not clamped to any range.
    """
    return m * x + b

# Predict the grade for a given number of study hours, then clamp it to the 0-100 range.
study_time = 14
prediction = f(study_time)
capped = prediction if prediction < 100 else 100
expected_grade = capped if capped > 0 else 0
message = 'Studying for {} hours per week may result in a grade of {:.0f}'
print(message.format(study_time, expected_grade))
