In [11]:
"""
Student Performance Dataset - Data Exploration Mastery
Author: Elysée NIYIBIZI
Date: 05/01/2026

This comprehensive exploration script demonstrates professional data analysis practices
that showcase both technical skills and clear communication - essential for career growth.

Dataset Source: Student Performance Data (student_performance.csv)
Skills Demonstrated: Data loading, exploration, quality assessment, and professional documentation
"""

# CELL 1: Import Essential Data Science Libraries
# ================================================
# pandas: primary library for data manipulation and analysis
# numpy: foundation for numerical computing (not referenced by the cells
#        visible here; imported for future mathematical operations)
import pandas as pd
import numpy as np

# Confirmation banner. The leading symbol is a real check-mark character
# (U+2705) instead of its mis-decoded byte sequence.
print("✅ Libraries imported successfully!")
print("   - pandas for data manipulation")
print("   - numpy for numerical operations")

✅ Libraries imported successfully!
   - pandas for data manipulation
   - numpy for numerical operations


In [12]:
# CELL 2: Load and Validate the Dataset
# =====================================
# Loading data is the first critical step in any data analysis project.
# The path is relative to the notebook's working directory: the CSV is
# expected one level up, under data/raw/ (not next to the notebook).
DATA_PATH = '../data/raw/student_performance.csv'

# read_csv raises on its own if the file is missing or unreadable.
student_data = pd.read_csv(DATA_PATH)

# Validate before announcing success: a file that parses to zero rows or
# zero columns would make every later cell silently meaningless.
if student_data.empty:
    raise ValueError(f"No data loaded from {DATA_PATH!r}: the table is empty")

print("📁 Dataset loaded successfully!")
# Derive the displayed file name from the path so the two cannot drift apart.
print(f"   File: {DATA_PATH.rsplit('/', 1)[-1]}")
print("   Loaded into variable: student_data")

📁 Dataset loaded successfully!
   File: student_performance.csv
   Loaded into variable: student_data


In [13]:
# CELL 3: First Glimpse - Understanding What We're Working With
# ==============================================================
# Displaying the first 5 rows gives us immediate context about:
# 1. The data structure
# 2. Sample values
# 3. Overall layout

# Banner text; the magnifying-glass symbol is restored from its mis-decoded
# byte sequence.
print("🔍 First Look at the Data (First 5 Rows):")
print("   This shows us real examples from the dataset")
print("   Each row represents one student's information")
print("-" * 60)

# Bare last expression so Jupyter renders the preview as a rich table.
# head() defaults to 5 rows, which is what the banner above promises.
student_data.head()

🔍 First Look at the Data (First 5 Rows):
   This shows us real examples from the dataset
   Each row represents one student's information
------------------------------------------------------------


Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


In [14]:
# CELL 4: Understanding the Dataset Scope
# =======================================
# The shape tells us the size of our dataset - crucial for planning analysis

# Banner text; the ruler symbol is restored from its mis-decoded byte sequence.
print("📐 Dataset Dimensions:")
print("   Shape shows (number_of_students, number_of_variables_per_student)")

# shape is a (rows, columns) tuple; unpack once instead of indexing repeatedly.
dataset_shape = student_data.shape
n_students, n_variables = dataset_shape
print(f"\n   The dataset contains: {n_students:,} student records")
print(f"   Each record has: {n_variables} different variables")
print(f"   Total data points: {n_students * n_variables:,}")

print("\n" + "=" * 60)
# Last expression: echo the raw tuple as the cell's Out[] value.
dataset_shape

📐 Dataset Dimensions:
   Shape shows (number_of_students, number_of_variables_per_student)

   The dataset contains: 395 student records
   Each record has: 33 different variables
   Total data points: 13,035



(395, 33)

In [15]:
# CELL 5: Exploring Available Variables
# ======================================
# Knowing what columns exist helps us plan what questions we can answer

# Banner text; the clipboard symbol is restored from its mis-decoded byte
# sequence.
print("📋 Available Variables (Column Names):")
print("   These are all the different pieces of information we have about each student")
print("   Variables include demographics, habits, family background, and academic performance")
print("-" * 60)

columns = student_data.columns
print(f"\n   Total variables: {len(columns)}")
print("\n   Variable List:")
# Number the columns from 1 so the listing reads like a human-facing index.
for i, col in enumerate(columns, 1):
    print(f"   {i:2d}. {col}")

print("\n" + "=" * 60)
# Last expression: echo the Index object itself as the cell's Out[] value.
columns

📋 Available Variables (Column Names):
   These are all the different pieces of information we have about each student
   Variables include demographics, habits, family background, and academic performance
------------------------------------------------------------

   Total variables: 33

   Variable List:
    1. school
    2. sex
    3. age
    4. address
    5. famsize
    6. Pstatus
    7. Medu
    8. Fedu
    9. Mjob
   10. Fjob
   11. reason
   12. guardian
   13. traveltime
   14. studytime
   15. failures
   16. schoolsup
   17. famsup
   18. paid
   19. activities
   20. nursery
   21. higher
   22. internet
   23. romantic
   24. famrel
   25. freetime
   26. goout
   27. Dalc
   28. Walc
   29. health
   30. absences
   31. G1
   32. G2
   33. G3



Index(['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
       'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',
       'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
       'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',
       'Walc', 'health', 'absences', 'G1', 'G2', 'G3'],
      dtype='object')

In [16]:
# CELL 6: Understanding Data Types and Structure
# ===============================================
# Data types tell us how to handle each variable in analysis
# object  = text/categorical data (some pandas versions/settings report a
#           dedicated string dtype instead)
# int64   = whole numbers
# float64 = decimal numbers

# Banner text; the microscope symbol is restored from its mis-decoded byte
# sequence.
print("🔬 Data Types and Structure Analysis:")
print("   This helps us understand:")
print("   - What kind of data each variable contains")
print("   - If there are any missing values")
print("   - How much memory the dataset uses")
print("-" * 60)

# info() prints per-column non-null counts and dtypes plus memory usage.
student_data.info()

# Classify by dtype family rather than by "is / is not object":
# the old `dtypes != 'object'` test counted every non-object column (bool,
# datetime, category, dedicated string dtypes) as numerical.
# Here "numerical" means a genuine numeric dtype; everything else is reported
# under the Categorical/Object heading.
numeric_column_count = student_data.select_dtypes(include='number').shape[1]
non_numeric_column_count = student_data.shape[1] - numeric_column_count

print("\n" + "=" * 60)
print("📊 Quick Data Type Summary:")
print(f"   Categorical/Object columns: {non_numeric_column_count}")
print(f"   Numerical columns: {numeric_column_count}")

🔬 Data Types and Structure Analysis:
   This helps us understand:
   - What kind of data each variable contains
   - If there are any missing values
   - How much memory the dataset uses
------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 33 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   school      395 non-null    object
 1   sex         395 non-null    object
 2   age         395 non-null    int64 
 3   address     395 non-null    object
 4   famsize     395 non-null    object
 5   Pstatus     395 non-null    object
 6   Medu        395 non-null    int64 
 7   Fedu        395 non-null    int64 
 8   Mjob        395 non-null    object
 9   Fjob        395 non-null    object
 10  reason      395 non-null    object
 11  guardian    395 non-null    object
 12  traveltime  395 non-null    int64 
 13  studytime   395 non-null    int64 
 14 

In [17]:
# CELL 7: Data Quality Assessment - Missing Values Check
# =======================================================
# Missing values can significantly impact analysis results
# This check ensures our data is complete and reliable

# Banner text; the symbols in this cell are restored from their mis-decoded
# byte sequences.
print("🧹 Data Quality Check - Missing Values:")
print("   Missing values appear as NaN or None in the dataset")
print("   They need special handling in analysis")
print("-" * 60)

# Per-column count of null cells, and the grand total across all columns.
missing_values = student_data.isnull().sum()
total_missing = missing_values.sum()

if total_missing == 0:
    print("🎉 EXCELLENT NEWS!")
    print(f"   Zero missing values found in all {len(student_data.columns)} columns")
    print("   This means we have complete data for every student and variable")
else:
    print(f"⚠️  Found {total_missing} missing values")
    print("\n   Columns with missing values:")
    # Only list the columns that actually have gaps, with their share of rows.
    for col, count in missing_values[missing_values > 0].items():
        print(f"   - {col}: {count} missing ({count/len(student_data)*100:.1f}%)")

print("\n" + "=" * 60)
# Last expression: show the per-column counts already computed above instead
# of scanning the frame a second time.
missing_values

🧹 Data Quality Check - Missing Values:
   Missing values appear as NaN or None in the dataset
   They need special handling in analysis
------------------------------------------------------------
🎉 EXCELLENT NEWS!
   Zero missing values found in all 33 columns
   This means we have complete data for every student and variable



school        0
sex           0
age           0
address       0
famsize       0
Pstatus       0
Medu          0
Fedu          0
Mjob          0
Fjob          0
reason        0
guardian      0
traveltime    0
studytime     0
failures      0
schoolsup     0
famsup        0
paid          0
activities    0
nursery       0
higher        0
internet      0
romantic      0
famrel        0
freetime      0
goout         0
Dalc          0
Walc          0
health        0
absences      0
G1            0
G2            0
G3            0
dtype: int64

In [18]:
# CELL 8: Comprehensive Summary - Key Takeaways
# ==============================================
# This summary helps both technical and non-technical stakeholders
# understand what we've discovered about the dataset
#
# The bullets and symbols in the printed text are restored from their
# mis-decoded byte sequences.

# Compute each figure once, then report it.
n_rows, n_cols = student_data.shape
total_cells = n_rows * n_cols

print("📈 EXPLORATION SUMMARY")
print("=" * 60)

print("\n1. DATASET OVERVIEW")
print("-" * 40)
print(f"   • Students analyzed: {n_rows:,}")
print(f"   • Variables per student: {n_cols}")
print(f"   • Total data points: {total_cells:,}")

print("\n2. DATA TYPES DISTRIBUTION")
print("-" * 40)
# One definition of "numerical" for both the counts and the examples:
# genuine numeric dtypes. Previously the count used `dtypes != 'object'`
# while the examples used only int64/float64, so the two could disagree.
numeric_columns = student_data.select_dtypes(include='number').columns
non_numeric_columns = student_data.select_dtypes(exclude='number').columns
categorical_count = len(non_numeric_columns)
numerical_count = len(numeric_columns)
print(f"   • Categorical variables: {categorical_count}")
print(f"   • Numerical variables: {numerical_count}")
print(f"   • Example categorical: {list(non_numeric_columns[:3])}")
print(f"   • Example numerical: {list(numeric_columns[:3])}")

print("\n3. DATA QUALITY STATUS")
print("-" * 40)
# Total number of null cells across the whole frame, evaluated a single time.
total_missing_cells = student_data.isnull().sum().sum()
if total_missing_cells == 0:
    print("   • ✅ PERFECT: No missing values detected")
    print("   • All analysis can proceed without data imputation")
else:
    # total_cells is non-zero here: at least one (missing) cell exists.
    missing_pct = (total_missing_cells / total_cells) * 100
    print(f"   • ⚠️  Missing values: {total_missing_cells:,} total ({missing_pct:.2f}%)")
    print("   • Data cleaning required before analysis")

print("\n4. NEXT STEPS RECOMMENDED")
print("-" * 40)
print("   • Statistical analysis of grades (G1, G2, G3)")
print("   • Relationship exploration between variables")
print("   • Feature engineering for predictive modeling")
print("   • Visualization creation for insights")

print("\n" + "=" * 60)
print("✅ Initial exploration complete! Ready for detailed analysis.")

📈 EXPLORATION SUMMARY

1. DATASET OVERVIEW
----------------------------------------
   • Students analyzed: 395
   • Variables per student: 33
   • Total data points: 13,035

2. DATA TYPES DISTRIBUTION
----------------------------------------
   • Categorical variables: 17
   • Numerical variables: 16
   • Example categorical: ['school', 'sex', 'address']
   • Example numerical: ['age', 'Medu', 'Fedu']

3. DATA QUALITY STATUS
----------------------------------------
   • ✅ PERFECT: No missing values detected
   • All analysis can proceed without data imputation

4. NEXT STEPS RECOMMENDED
----------------------------------------
   • Statistical analysis of grades (G1, G2, G3)
   • Relationship exploration between variables
   • Feature engineering for predictive modeling
   • Visualization creation for insights

✅ Initial exploration complete! Ready for detailed analysis.
