In [1]:
"""
Exploratory Data Analysis (EDA) - Demonstration Script
Author: Elysée NIYIBIZI
Role: Machine Learning / Data Science Candidate

Description:
This single-cell notebook script demonstrates structured exploratory 
data analysis using pandas. It follows a typical ML workflow:

1. Data creation / loading
2. Structural inspection
3. Statistical summary
4. Column and row selection
5. Filtering
6. Missing value analysis
7. Basic aggregation

Designed to be clean, readable, and recruiter-friendly.
"""

import pandas as pd
import numpy as np
from IPython.display import display


# -------------------------------------------------------
# Display configuration
# -------------------------------------------------------
# Never elide columns when a frame is rendered, and allow wide
# (1000-character) lines in the plain-text representation.
pd.options.display.max_columns = None
pd.options.display.width = 1000


def create_dataframe() -> pd.DataFrame:
    """
    Build the small, hard-coded employee table used by this demo.

    Returns:
        pd.DataFrame: Three rows with the columns "Name", "Age" and
        "Salary", in that order.
    """
    names = ["Alice", "Bob", "Eric"]
    ages = [25, 30, 35]
    salaries = [50000, 60000, 70000]

    # Dict insertion order determines the column order of the frame.
    return pd.DataFrame({"Name": names, "Age": ages, "Salary": salaries})


def _print_banner(title: str) -> None:
    """Print a section banner: a blank line, then the title between two 60-character rules."""
    rule = "=" * 60
    print(f"\n{rule}\n{title}\n{rule}")


def _columns_present(df: pd.DataFrame, *columns: str) -> bool:
    """Return True if every named column is in ``df``; otherwise print a skip note and return False."""
    missing = [name for name in columns if name not in df.columns]
    if missing:
        print(f"(skipped: missing column(s) {', '.join(missing)})")
        return False
    return True


def run_basic_eda(df: pd.DataFrame) -> None:
    """
    Print and display a fixed sequence of exploratory views of ``df``.

    The full dump, shape/dtypes/info, head/tail preview and null counts are
    produced for every frame. Sections that need something specific -- the
    "Name", "Age" or "Salary" columns, a row at position 2, or at least one
    column for ``describe()`` -- print a "(skipped: ...)" note instead of
    raising when the frame does not provide it.

    Args:
        df (pd.DataFrame): Input dataset. It is only read, never modified.

    Returns:
        None. Results are written with ``print`` / ``df.info()`` or handed
        to ``display``.
    """

    # -------------------------------
    # Full Dataset
    # -------------------------------
    _print_banner("FULL DATAFRAME")
    display(df)

    # -------------------------------
    # Structural Overview
    # -------------------------------
    _print_banner("STRUCTURAL INFORMATION")
    print("Shape:", df.shape)
    display(df.dtypes)
    # info() prints its report itself and returns None, so it is not
    # wrapped in display().
    df.info()

    # -------------------------------
    # Preview Data
    # -------------------------------
    _print_banner("DATA PREVIEW")
    print("First 2 rows:")
    display(df.head(2))

    print("Last 2 rows:")
    display(df.tail(2))

    # -------------------------------
    # Statistical Summary
    # -------------------------------
    _print_banner("STATISTICAL SUMMARY")
    if df.columns.empty:
        # describe() raises ValueError on a frame that has no columns.
        print("(skipped: DataFrame has no columns)")
    else:
        display(df.describe())

    # -------------------------------
    # Column Selection
    # -------------------------------
    _print_banner("COLUMN SELECTION")
    print("Single column (Age):")
    if _columns_present(df, "Age"):
        display(df["Age"])

    print("Multiple columns (Name, Age):")
    if _columns_present(df, "Name", "Age"):
        display(df[["Name", "Age"]])

    # -------------------------------
    # Row Selection
    # -------------------------------
    _print_banner("ROW SELECTION")
    print("Row at index 2:")
    if len(df) > 2:
        display(df.iloc[2])
    else:
        # iloc[2] raises IndexError when the frame has fewer than three rows.
        print(f"(skipped: DataFrame has only {len(df)} row(s))")

    print("Rows from index 0 to 2:")
    # Slicing is safe on short frames: it returns whatever rows exist.
    display(df.iloc[0:3])

    # -------------------------------
    # Conditional Filtering
    # -------------------------------
    _print_banner("FILTERING")
    if _columns_present(df, "Age"):
        display(df[df["Age"] > 30])

    # -------------------------------
    # Missing Values
    # -------------------------------
    _print_banner("MISSING VALUE ANALYSIS")
    display(df.isnull().sum())

    # -------------------------------
    # Categorical & Aggregation
    # -------------------------------
    _print_banner("CATEGORICAL ANALYSIS & AGGREGATION")
    print("Value counts (Name):")
    if _columns_present(df, "Name"):
        display(df["Name"].value_counts())

    print("Mean Salary grouped by Age:")
    if _columns_present(df, "Age", "Salary"):
        display(df.groupby("Age")["Salary"].mean())


# -------------------------------------------------------
# Execution
# -------------------------------------------------------
# Build the sample dataset, then print/display every EDA section for it.
# `df` is bound at module level, so it stays inspectable in the kernel
# after this cell has run.
df = create_dataframe()
run_basic_eda(df)



FULL DATAFRAME


Unnamed: 0,Name,Age,Salary
0,Alice,25,50000
1,Bob,30,60000
2,Eric,35,70000



STRUCTURAL INFORMATION
Shape: (3, 3)


Name      object
Age        int64
Salary     int64
dtype: object

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    3 non-null      object
 1   Age     3 non-null      int64 
 2   Salary  3 non-null      int64 
dtypes: int64(2), object(1)
memory usage: 204.0+ bytes

DATA PREVIEW
First 2 rows:


Unnamed: 0,Name,Age,Salary
0,Alice,25,50000
1,Bob,30,60000


Last 2 rows:


Unnamed: 0,Name,Age,Salary
1,Bob,30,60000
2,Eric,35,70000



STATISTICAL SUMMARY


Unnamed: 0,Age,Salary
count,3.0,3.0
mean,30.0,60000.0
std,5.0,10000.0
min,25.0,50000.0
25%,27.5,55000.0
50%,30.0,60000.0
75%,32.5,65000.0
max,35.0,70000.0



COLUMN SELECTION
Single column (Age):


0    25
1    30
2    35
Name: Age, dtype: int64

Multiple columns (Name, Age):


Unnamed: 0,Name,Age
0,Alice,25
1,Bob,30
2,Eric,35



ROW SELECTION
Row at index 2:


Name       Eric
Age          35
Salary    70000
Name: 2, dtype: object

Rows from index 0 to 2:


Unnamed: 0,Name,Age,Salary
0,Alice,25,50000
1,Bob,30,60000
2,Eric,35,70000



FILTERING


Unnamed: 0,Name,Age,Salary
2,Eric,35,70000



MISSING VALUE ANALYSIS


Name      0
Age       0
Salary    0
dtype: int64


CATEGORICAL ANALYSIS & AGGREGATION
Value counts (Name):


Name
Alice    1
Bob      1
Eric     1
Name: count, dtype: int64

Mean Salary grouped by Age:


Age
25    50000.0
30    60000.0
35    70000.0
Name: Salary, dtype: float64