# Setup: Generate Sample Dataset

This cell creates the required folder structure (`../data/raw/` and `../data/processed/`, i.e. one level above the notebook's directory) and generates a sample dataset with missing values. 
The dataset is saved to `../data/raw/sample_data.csv` so it is ready for the cleaning functions; if that file already exists, it is left untouched.

In [None]:
import os
import pandas as pd
import numpy as np

# Folder locations, written relative to the notebook's working directory
raw_dir = '../data/raw'
processed_dir = '../data/processed'

# Make sure both folders are present (no error when they already exist)
for folder in (raw_dir, processed_dir):
    os.makedirs(folder, exist_ok=True)

# Sample records; `nan` marks the deliberately missing entries
nan = np.nan
data = {
    'age': [34, 45, 29, 50, 38, nan, 41],
    'income': [55000, nan, 42000, 58000, nan, nan, 49000],
    'score': [0.82, 0.91, nan, 0.76, 0.88, 0.65, 0.79],
    'zipcode': ['90210', '10001', '60614', '94103', '73301', '12345', '94105'],
    'city': ['Beverly', 'New York', 'Chicago', 'SF', 'Austin', 'Unknown', 'San Francisco'],
    'extra_data': [nan, 42, nan, nan, nan, 5, nan],
}

# Build the DataFrame from the column dictionary
df = pd.DataFrame(data)

# Write the CSV into the raw folder, but never replace a file that is already there
csv_path = os.path.join(raw_dir, 'sample_data.csv')
if os.path.exists(csv_path):
    print(f'File already exists at {csv_path}. Skipping CSV creation to avoid overwrite.')
else:
    df.to_csv(csv_path, index=False)
    print(f'Sample dataset created and saved to {csv_path}')


# Homework Starter — Stage 6: Data Preprocessing
Use this notebook to apply your cleaning functions and save processed data.

In [None]:
import pandas as pd
# NOTE(review): this package-style import only resolves when the folder that
# contains `src/` (the notebook's parent directory) is on sys.path — confirm
# before relying on it. Later cells add ../src to sys.path and use
# `import cleaning` instead.
from src import cleaning

In [1]:
from pathlib import Path
import pandas as pd

# Gather every CSV in the raw-data folder, in sorted order
RAW_DIR = Path("../data/raw")
raw_files = sorted(RAW_DIR.glob("*.csv"))
if not raw_files:
    raise AssertionError("Put at least one CSV into homework/homework6/data/raw/")

# Use the last entry of the sorted listing as the input file
raw_path = raw_files[-1]
print(f"Using RAW file: {raw_path}")

# Load it and preview the first rows
df_raw = pd.read_csv(raw_path)
df_raw.head()


Using RAW file: ../data/raw/sample_raw.csv


Unnamed: 0,id,ticker,price,volume,sector
0,1,AAPL,189.4,1000000.0,Tech
1,2,MSFT,,850000.0,Tech
2,3,NVDA,111.9,,Tech
3,4,AMZN,175.2,920000.0,Consumer
4,5,TSLA,,,


## Load Raw Dataset

In [1]:
from pathlib import Path

# Report where the notebook is running and whether ../src/cleaning.py is present
module_path = Path("..") / "src" / "cleaning.py"
print("Notebook cwd:", Path(".").resolve())
print("Expected module path:", module_path.resolve())
print("Exists? ", module_path.exists())


Notebook cwd: /Users/liucanxin/bootcamp_mary_liu/homework/homework6/notebooks
Expected module path: /Users/liucanxin/bootcamp_mary_liu/homework/homework6/src/cleaning.py
Exists?  False


In [2]:
from pathlib import Path

# Source text of the helper module. It is written verbatim to
# ../src/cleaning.py below so that later cells can `import cleaning`.
code = r'''
"""Reusable cleaning helpers: median imputation, missing-row filtering, scaling."""
from __future__ import annotations
import pandas as pd
import numpy as np

def fill_missing_median(df: pd.DataFrame, cols: list[str]) -> pd.DataFrame:
    """Return a copy of ``df`` with NaNs in ``cols`` replaced by each column's median.

    Names in ``cols`` that are not columns of ``df`` are skipped. The input
    frame is not modified.
    """
    out = df.copy()
    for c in cols:
        if c in out.columns:
            med = out[c].median(skipna=True)
            out[c] = out[c].fillna(med)
    return out

def drop_missing(df: pd.DataFrame, thresh: float = 0.0, subset: list[str] | None = None) -> pd.DataFrame:
    """Drop rows of ``df`` based on how many values in ``subset`` are missing.

    Parameters
    ----------
    thresh:
        ``<= 0`` drops every row with at least one missing value in ``subset``.
        ``0 < thresh <= 1`` keeps rows whose fraction of missing values across
        ``subset`` is at most ``thresh``.
    subset:
        Columns to inspect; defaults to all columns.

    Raises
    ------
    ValueError
        If ``thresh`` is greater than 1.
    """
    out = df.copy()
    if subset is None:
        subset = out.columns.tolist()
    if thresh <= 0:
        return out.dropna(subset=subset)
    if 0 < thresh <= 1:
        mask = out[subset].isna().mean(axis=1) <= thresh
        return out[mask]
    raise ValueError("thresh must be between 0 and 1 (inclusive) or 0 to behave like dropna.")

def normalize_data(df: pd.DataFrame, cols: list[str], method: str = "zscore") -> pd.DataFrame:
    """Return a copy of ``df`` with ``cols`` rescaled by ``method``.

    ``method`` is ``"zscore"`` (subtract mean, divide by standard deviation) or
    ``"minmax"`` (map to the 0-1 range). Values are coerced to numeric first;
    names in ``cols`` that are not columns of ``df`` are skipped. A column whose
    spread (standard deviation or range) is zero is set to 0.0.

    Raises
    ------
    ValueError
        If ``method`` is not one of the two supported names.
    """
    # Validate before touching any column so that a bad method is reported even
    # when ``cols`` is empty or none of its names exist in ``df``.
    if method not in ("zscore", "minmax"):
        raise ValueError("method must be 'zscore' or 'minmax'")
    out = df.copy()
    for c in cols:
        if c not in out.columns:
            continue
        s = pd.to_numeric(out[c], errors="coerce")
        if method == "zscore":
            mu, sd = s.mean(skipna=True), s.std(skipna=True)
            out[c] = (s - mu) / sd if (sd and sd != 0) else 0.0
        else:
            lo, hi = s.min(skipna=True), s.max(skipna=True)
            rng = hi - lo
            out[c] = (s - lo) / rng if (rng and rng != 0) else 0.0
    return out
'''
# Create ../src if needed, then write the module and confirm it is on disk.
p = Path("..") / "src" / "cleaning.py"
p.parent.mkdir(parents=True, exist_ok=True)
p.write_text(code, encoding="utf-8")
print("Wrote:", p.resolve())
print("Exists now? ", p.exists())


Wrote: /Users/liucanxin/bootcamp_mary_liu/homework/homework6/src/cleaning.py
Exists now?  True


In [5]:
import pandas as pd
from pathlib import Path

# Explicit input location; edit the filename if data/raw holds a different CSV
raw_path = Path("../data/raw/sample_raw.csv")

# Read the file, report which one was used, and preview the first rows
df_raw = pd.read_csv(raw_path)
print(f"Loaded: {raw_path}")
df_raw.head()


Loaded: ../data/raw/sample_raw.csv


Unnamed: 0,id,ticker,price,volume,sector
0,1,AAPL,189.4,1000000.0,Tech
1,2,MSFT,,850000.0,Tech
2,3,NVDA,111.9,,Tech
3,4,AMZN,175.2,920000.0,Consumer
4,5,TSLA,,,


In [6]:
import sys
from pathlib import Path
from importlib import reload

# `cleaning` lives in ../src, which is not on sys.path when the kernel runs
# from notebooks/. Register it here so this cell works after
# Restart Kernel -> Run All instead of depending on a later cell.
src_dir = str((Path("..") / "src").resolve())
if src_dir not in sys.path:
    sys.path.append(src_dir)

import cleaning
reload(cleaning)  # pick up edits made to cleaning.py since the first import

# Columns with a numeric dtype; the fill and scaling steps operate on these
numeric_cols = [c for c in df_raw.columns if pd.api.types.is_numeric_dtype(df_raw[c])]
numeric_cols




['id', 'price', 'volume']

In [None]:
# Read the sample dataset from the raw folder and preview its first rows
sample_csv = '../data/raw/sample_data.csv'
df = pd.read_csv(sample_csv)
df.head()

## Apply Cleaning Functions

In [9]:
import sys
from pathlib import Path
from importlib import reload

# Register ../src (as an absolute path) on the import path, at most once
src_path = Path("..") / "src"
src_abs = str(src_path.resolve())
if src_abs not in sys.path:
    sys.path.append(src_abs)

# Import the helper module and reload it so recent edits take effect
import cleaning
reload(cleaning)


<module 'cleaning' from '/Users/liucanxin/bootcamp_mary_liu/homework/homework6/src/cleaning.py'>

In [10]:
# Step 1 - impute: replace NaNs in each numeric column with that column's median
df_filled = cleaning.fill_missing_median(df=df_raw, cols=numeric_cols)

# Step 2 - filter: keep only rows whose share of missing values is at most 50%
df_dropped = cleaning.drop_missing(df=df_filled, thresh=0.5)

# Step 3 - scale: z-score the first three numeric columns (all of them if fewer)
# NOTE(review): numeric_cols is ['id', 'price', 'volume'] for the sample file, so
# the `id` column is scaled as well - confirm that is intended.
cols_to_scale = numeric_cols[:3]
df_clean = cleaning.normalize_data(df=df_dropped, cols=cols_to_scale, method="zscore")

# Show the first rows of the cleaned frame
df_clean.head()



Unnamed: 0,id,ticker,price,volume,sector
0,-1.336306,AAPL,-0.239063,1.073767,Tech
1,-0.801784,MSFT,-0.292579,-0.063163,Tech
2,-0.267261,NVDA,-0.823217,0.202121,Tech
3,0.267261,AMZN,-0.346095,0.467404,Consumer
4,0.801784,TSLA,-0.292579,0.202121,


In [11]:
# Three-stage cleaning: median fill -> drop sparse rows -> z-score scaling,
# with each stage applied through DataFrame.pipe
df_filled = df_raw.pipe(cleaning.fill_missing_median, numeric_cols)
df_dropped = df_filled.pipe(cleaning.drop_missing, thresh=0.5)
cols_to_scale = numeric_cols[:3]  # at most the first three numeric columns
df_clean = df_dropped.pipe(cleaning.normalize_data, cols_to_scale, method="zscore")
df_clean.head()




Unnamed: 0,id,ticker,price,volume,sector
0,-1.336306,AAPL,-0.239063,1.073767,Tech
1,-0.801784,MSFT,-0.292579,-0.063163,Tech
2,-0.267261,NVDA,-0.823217,0.202121,Tech
3,0.267261,AMZN,-0.346095,0.467404,Consumer
4,0.801784,TSLA,-0.292579,0.202121,


## Save Cleaned Dataset

In [13]:
from pathlib import Path
from datetime import datetime, timezone

# make sure processed directory exists
PROC_DIR = Path("../data/processed")
PROC_DIR.mkdir(parents=True, exist_ok=True)

# UTC timestamp (minute resolution) for the filename. datetime.utcnow() is
# deprecated since Python 3.12, so an aware UTC datetime is requested instead;
# the formatted string is the same.
ts = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M")
out_path = PROC_DIR / f"cleaned_{ts}.csv"

# save cleaned dataframe without the index column
df_clean.to_csv(out_path, index=False)

print("Saved cleaned dataset to:", out_path)


Saved cleaned dataset to: ../data/processed/cleaned_20250824-0357.csv


## HW6 Cleaning Notes

- **Fill missing values:** Used `fill_missing_median` on numeric columns (robust to outliers).
- **Drop missing:** Dropped rows with >50% missing across all columns using `drop_missing(thresh=0.5)`.
- **Normalize:** Applied z-score scaling on up to 3 numeric columns with `normalize_data`.
- **Comparison:** Checked raw vs cleaned shape and missing-value fractions.
- **Saved Output:** Final cleaned dataset written to `homework/homework6/data/processed/cleaned_<timestamp>.csv`.
- **Reproducibility:** All cleaning functions live in `homework/homework6/src/cleaning.py`.


## Stage 06 — Data Preprocessing

- Implemented cleaning functions in `homework/homework6/src/cleaning.py`:
  - `fill_missing_median`
  - `drop_missing`
  - `normalize_data`
- Applied pipeline in `homework/homework6/notebooks/hw06_preprocessing.ipynb`
- Input raw data: `homework/homework6/data/raw/`
- Output cleaned data: `homework/homework6/data/processed/`
- Compared raw vs cleaned dataset and documented assumptions.
