# 01 – EDA & Bias Analysis
Fair & Explainable AI – Student Performance Project

## 1. Data Cleaning

In [9]:
# 1. Imports & Settings

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Plot style
sns.set(style="whitegrid")
plt.rcParams["figure.figsize"] = (8, 5)

# Correct paths INSIDE your current directory: C:\Users\Kal
DATA_DIR = "data"
FIG_DIR = "figures/eda"
REPORTS_DIR = "reports"

# Create folders if they don't exist
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(FIG_DIR, exist_ok=True)
os.makedirs(REPORTS_DIR, exist_ok=True)

# Show all columns when inspecting data
pd.set_option("display.max_columns", None)


In [11]:
# 2. Load & Merge Both Datasets (correct with semicolon separator)

# Load the Math and Portuguese datasets using ; as separator
df_math = pd.read_csv("student-mat.csv", sep=";")    # Math course
df_por = pd.read_csv("student-por.csv", sep=";")     # Portuguese course

print("Math dataset shape:", df_math.shape)
print("Portuguese dataset shape:", df_por.shape)

# Merge/stack them into one combined dataset
df = pd.concat([df_math, df_por], ignore_index=True)

print("Combined dataset shape:", df.shape)

df.head()


Math dataset shape: (395, 33)
Portuguese dataset shape: (649, 33)
Combined dataset shape: (1044, 33)


Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,course,mother,2,2,0,yes,no,no,no,yes,yes,no,no,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,course,father,1,2,0,no,yes,no,no,no,yes,yes,no,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,other,mother,1,2,3,yes,no,yes,no,yes,yes,yes,no,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,home,mother,1,3,0,no,yes,yes,yes,yes,yes,yes,yes,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,home,father,1,2,0,no,yes,yes,no,yes,yes,no,no,4,3,2,1,2,5,4,6,10,10


In [12]:
# 3. Basic Structure & Summary

df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1044 entries, 0 to 1043
Data columns (total 33 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   school      1044 non-null   object
 1   sex         1044 non-null   object
 2   age         1044 non-null   int64 
 3   address     1044 non-null   object
 4   famsize     1044 non-null   object
 5   Pstatus     1044 non-null   object
 6   Medu        1044 non-null   int64 
 7   Fedu        1044 non-null   int64 
 8   Mjob        1044 non-null   object
 9   Fjob        1044 non-null   object
 10  reason      1044 non-null   object
 11  guardian    1044 non-null   object
 12  traveltime  1044 non-null   int64 
 13  studytime   1044 non-null   int64 
 14  failures    1044 non-null   int64 
 15  schoolsup   1044 non-null   object
 16  famsup      1044 non-null   object
 17  paid        1044 non-null   object
 18  activities  1044 non-null   object
 19  nursery     1044 non-null   object
 20  higher  

In [13]:
df.describe(include="all").T


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
school,1044.0,2.0,GP,772.0,,,,,,,
sex,1044.0,2.0,F,591.0,,,,,,,
age,1044.0,,,,16.726054,1.239975,15.0,16.0,17.0,18.0,22.0
address,1044.0,2.0,U,759.0,,,,,,,
famsize,1044.0,2.0,GT3,738.0,,,,,,,
Pstatus,1044.0,2.0,T,923.0,,,,,,,
Medu,1044.0,,,,2.603448,1.124907,0.0,2.0,3.0,4.0,4.0
Fedu,1044.0,,,,2.387931,1.099938,0.0,1.0,2.0,3.0,4.0
Mjob,1044.0,5.0,other,399.0,,,,,,,
Fjob,1044.0,5.0,other,584.0,,,,,,,


In [15]:
# 4. Missing Values & Data Quality

missing_counts = df.isna().sum().sort_values(ascending=False)
missing_perc = (missing_counts / len(df) * 100).round(2)

missing_table = pd.DataFrame({
    "missing_count": missing_counts,
    "missing_percent": missing_perc
})

missing_table


Unnamed: 0,missing_count,missing_percent
school,0,0.0
paid,0,0.0
G2,0,0.0
G1,0,0.0
absences,0,0.0
health,0,0.0
Walc,0,0.0
Dalc,0,0.0
goout,0,0.0
freetime,0,0.0


In [16]:
# 5. Feature Types (Numeric vs Categorical)

numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()

print("Numeric features:", numeric_cols)
print("Categorical features:", categorical_cols)


Numeric features: ['age', 'Medu', 'Fedu', 'traveltime', 'studytime', 'failures', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'G1', 'G2', 'G3']
Categorical features: ['school', 'sex', 'address', 'famsize', 'Pstatus', 'Mjob', 'Fjob', 'reason', 'guardian', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic']


## 2. Exploratory Data Analysis (EDA)

In [18]:
# 6. Univariate Distributions

# Histograms for numeric features
for col in numeric_cols:
    plt.figure()
    sns.histplot(df[col].dropna(), kde=True)
    plt.title(f"Distribution of {col}")
    plt.tight_layout()
    plt.savefig(os.path.join(FIG_DIR, f"hist_{col}.png"))
    plt.close()

# Bar plots for categorical features (top 20 categories max)
for col in categorical_cols:
    plt.figure()
    value_counts = df[col].value_counts().head(20)
    sns.barplot(x=value_counts.index, y=value_counts.values)
    plt.xticks(rotation=45, ha="right")
    plt.title(f"Category counts: {col}")
    plt.tight_layout()
    plt.savefig(os.path.join(FIG_DIR, f"bar_{col}.png"))
    plt.close()


In [19]:
# 7. Outcome Variable by Groups (Bias-Oriented)

# Target column
TARGET_COL = "G3"

# Sensitive / fairness-related attributes to examine
GROUP_VARS = [
    "sex",           # gender: F / M
    "school",        # school: GP / MS
    "Pstatus",       # parent cohabitation
    "Medu",          # mother's education
    "Fedu",          # father's education
    "internet",      # internet access
    "romantic",      # in a romantic relationship
    "famsize",       # family size
]

# Keep only those that actually exist in df
GROUP_VARS = [g for g in GROUP_VARS if g in df.columns]

GROUP_VARS


['sex', 'school', 'Pstatus', 'Medu', 'Fedu', 'internet', 'romantic', 'famsize']

In [20]:
# Group-wise stats for target (G3)

group_stats_list = []

for var in GROUP_VARS:
    stats = (
        df.groupby(var)[TARGET_COL]
          .agg(["count", "mean", "std", "min", "max"])
          .reset_index()
    )
    stats.insert(0, "group_var", var)
    group_stats_list.append(stats)

group_stats = pd.concat(group_stats_list, ignore_index=True)
group_stats


Unnamed: 0,group_var,sex,count,mean,std,min,max,school,Pstatus,Medu,Fedu,internet,romantic,famsize
0,sex,F,591,11.448393,3.874334,0,19,,,,,,,
1,sex,M,453,11.203091,3.852167,0,20,,,,,,,
2,school,,772,11.63342,3.809208,0,20,GP,,,,,,
3,school,,272,10.514706,3.908521,0,19,MS,,,,,,
4,Pstatus,,121,11.669421,3.617984,0,19,,A,,,,,
5,Pstatus,,923,11.299025,3.895805,0,20,,T,,,,,
6,Medu,,9,12.111111,2.315407,9,15,,,0.0,,,,
7,Medu,,202,10.178218,3.674292,0,18,,,1.0,,,,
8,Medu,,289,10.972318,3.806874,0,19,,,2.0,,,,
9,Medu,,238,11.247899,3.893289,0,19,,,3.0,,,,


In [21]:
# Save group stats to CSV
group_stats_path = os.path.join(REPORTS_DIR, "group_target_stats.csv")
group_stats.to_csv(group_stats_path, index=False)
group_stats_path


'reports\\group_target_stats.csv'

In [23]:
# 8. Boxplots of G3 by Each Sensitive Group

for var in GROUP_VARS:
    plt.figure()
    sns.boxplot(data=df, x=var, y=TARGET_COL)
    plt.title(f"{TARGET_COL} by {var}")
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    plt.savefig(os.path.join(FIG_DIR, f"box_{TARGET_COL}_by_{var}.png"))
    plt.close()


In [24]:
# 9. Correlations & Relationships

# Correlation matrix for numeric features
corr = df[numeric_cols].corr()

plt.figure(figsize=(10, 8))
sns.heatmap(corr, annot=False, cmap="coolwarm", center=0)
plt.title("Correlation Matrix – Numeric Features")
plt.tight_layout()
plt.savefig(os.path.join(FIG_DIR, "corr_heatmap_numeric.png"))
plt.close()

# Optional: correlation of features with the target G3
if TARGET_COL in numeric_cols:
    target_corr = corr[TARGET_COL].sort_values(ascending=False)
    target_corr


In [25]:
# 10. Initial Fairness / Bias Checks
# Simple gap analysis: mean target (G3) difference between groups

bias_results = []

for var in GROUP_VARS:
    means = df.groupby(var)[TARGET_COL].mean()
    gap = means.max() - means.min()   # largest difference between groups
    
    bias_results.append({
        "group_var": var,
        "min_group": means.idxmin(),
        "min_mean": means.min(),
        "max_group": means.idxmax(),
        "max_mean": means.max(),
        "gap": round(gap, 3)
    })

bias_df = pd.DataFrame(bias_results).sort_values("gap", ascending=False)
bias_df


Unnamed: 0,group_var,min_group,min_mean,max_group,max_mean,gap
3,Medu,1,10.178218,4,12.509804,2.332
4,Fedu,1,10.367188,0,12.333333,1.966
1,school,MS,10.514706,GP,11.63342,1.119
5,internet,no,10.534562,yes,11.553809,1.019
6,romantic,yes,10.830189,no,11.624071,0.794
7,famsize,GT3,11.189702,LE3,11.70915,0.519
2,Pstatus,T,11.299025,A,11.669421,0.37
0,sex,M,11.203091,F,11.448393,0.245


In [29]:
# 11. Export Short Narrative Notes (Markdown)

notes_path = os.path.join(REPORTS_DIR, "eda_bias_notes.md")

with open(notes_path, "w", encoding="utf-8") as f:
    f.write("# EDA & Initial Bias Notes\n\n")

    f.write("## Dataset Overview\n")
    f.write(f"- Total rows: {df.shape[0]}\n")
    f.write(f"- Total columns: {df.shape[1]}\n")
    f.write(f"- Numeric features: {len(numeric_cols)} → {numeric_cols}\n")
    f.write(f"- Categorical features: {len(categorical_cols)} → {categorical_cols}\n\n")

    f.write("## Missing Values\n")
    f.write("- No missing values detected across all features.\n\n")

    f.write("## Outcome Variable (G3) Summary\n")
    f.write(f"- Mean G3: {df['G3'].mean():.2f}\n")
    f.write(f"- Min G3: {df['G3'].min()}\n")
    f.write(f"- Max G3: {df['G3'].max()}\n\n")

    f.write("## Group-wise G3 Differences (Bias Inspection)\n")
    f.write("The table below shows the largest differences in mean G3 across key demographic and socio-economic groups:\n\n")
    f.write(bias_df.to_markdown(index=False))
    f.write("\n\n")

    f.write("## Interpretations (Auto-generated, edit as needed)\n")
    f.write("- Some groups show meaningful differences in average G3 scores.\n")
    f.write("- These gaps may indicate potential bias or structural inequality.\n")
    f.write("- Further fairness evaluation will be needed during modeling.\n\n")

    f.write("---\n")
    f.write("_This report was auto-generated from Week 2 notebook output._\n")

notes_path


'reports\\eda_bias_notes.md'