# 01 — Data Profiling

The goal of this notebook is to understand the raw dataset before touching it.
We will inspect structure, data types, missing values, and basic statistics.
No cleaning happens here — only observation and documentation of findings.

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from pathlib import Path
from loguru import logger
import yaml

# Plot defaults: seaborn "darkgrid" theme and a 12 x 5 default figure size.
sns.set_theme(style="darkgrid")
plt.rcParams.update({"figure.figsize": (12, 5)})

## Load config file

In [4]:
# Read the YAML config and convert it to a plain Python dictionary.
# The encoding is passed explicitly so the file is decoded the same way on
# every platform instead of falling back to the locale's default encoding.
with open("../configs/config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Paths
# `Path("../")` creates a Path object pointing one folder up (to `powersight/`).
# `config["paths"]["raw_data"]` is the value stored in the config file
# ("data/raw/household_power_consumption.txt").
# `/` is not division here: between `Path` objects it joins path segments,
# so the result is:
#   ../data/raw/household_power_consumption.txt
raw_path = Path("../") / config["paths"]["raw_data"]
logger.info(f"Raw data path: {raw_path}")

2026-02-24 08:18:27.810 | INFO     | __main__:<module>:6 - Raw data path: ../data/raw/household_power_consumption.txt


## Import data using pandas

In [None]:
# Missing values in this dataset are marked with a "?" character (that is how
# UCI encoded them), so "?" is passed as an NA marker and pandas reads it as NaN.
df = pd.read_csv(
    raw_path,
    sep=";",
    na_values=["?"],
    low_memory=False,
)

# loguru fills the `{}` placeholders from the positional arguments, producing
# the same text the inline f-strings did.
# NOTE(review): "succesfully" is misspelled in the log message; the runtime
# text is deliberately left unchanged here.
logger.info("Dataset loaded succesfully")
logger.info("Shape: {:,} rows* {} columns", df.shape[0], df.shape[1])

2026-02-24 08:19:30.574 | INFO     | __main__:<module>:3 - Dataset loaded succesfully
2026-02-24 08:19:30.575 | INFO     | __main__:<module>:4 - Shape: 2,075,259 rows* 9 columns
