# Homework Starter — Stage 6: Data Preprocessing
Use this notebook to apply your cleaning functions and save processed data.

In [1]:
import os

import pandas as pd

from src import cleaning

In [3]:
# Generate the demo CSV on first run so the notebook works from a fresh checkout.
csv_path = 'data/raw/sample_data.csv'

# Create the CSV's actual parent directory (data/raw), not just 'data':
# DataFrame.to_csv does not create missing directories and would raise.
os.makedirs(os.path.dirname(csv_path), exist_ok=True)

if not os.path.exists(csv_path):
    # Small frame with deliberately missing values in most columns, plus a
    # currency-formatted string column, to exercise the cleaning functions.
    df_demo = pd.DataFrame({
        'numeric_col': [10, None, 40, 55, 70],
        'category_col': ['A', 'B', 'A', 'B', 'C'],
        'price': ['$100', '$200', '$150', None, '$250'],
        'date_str': ['2025-08-01','2025-08-02',None,'2025-08-04','2025-08-05'],
        'category': ['Electronics','Furniture','Toys','Clothing',None]
    })
    df_demo.to_csv(csv_path, index=False)
    print(f"Demo CSV created at {csv_path}")
else:
    # Never overwrite an existing raw file.
    print(f"CSV already exists at {csv_path}")

Demo CSV created at data/raw/sample_data.csv


## Load Raw Dataset

In [4]:
# Read the raw sample CSV into a DataFrame and preview its first rows.
raw_csv = 'data/raw/sample_data.csv'
df = pd.read_csv(raw_csv)
df.head()

Unnamed: 0,numeric_col,category_col,price,date_str,category
0,10.0,A,$100,2025-08-01,Electronics
1,,B,$200,2025-08-02,Furniture
2,40.0,A,$150,,Toys
3,55.0,B,,2025-08-04,Clothing
4,70.0,C,$250,2025-08-05,


## Apply Cleaning Functions

In [10]:
# Run the three cleaning steps from src.cleaning in order, each receiving the
# previous step's result:
#   1. fill_missing_median on 'numeric_col' and 'price'
#   2. drop_missing with threshold=0.5
#   3. normalize_data on 'numeric_col' with method='minmax'
# NOTE(review): exact semantics of each helper (e.g. what `threshold` measures)
# live in src/cleaning.py — confirm there.
df = (
    df
    .pipe(cleaning.fill_missing_median, ['numeric_col', 'price'])
    .pipe(cleaning.drop_missing, threshold=0.5)
    .pipe(cleaning.normalize_data, ['numeric_col'], method='minmax')
)

## Save Cleaned Dataset

In [11]:
# Persist the cleaned frame. The target directory is created first because
# DataFrame.to_csv does not create missing parent directories.
processed_path = 'data/processed/sample_data_cleaned.csv'
os.makedirs(os.path.dirname(processed_path), exist_ok=True)
df.to_csv(processed_path, index=False)

In [12]:
# Compare original vs cleaned data.
# The raw file is loaded under its own name so the in-memory cleaned `df`
# is not silently replaced by uncleaned data.
df_raw = pd.read_csv('data/raw/sample_data.csv')
df_cleaned = pd.read_csv('data/processed/sample_data_cleaned.csv')

# Only the last expression of a cell is rendered automatically, so the
# mid-cell preview of the raw frame needs an explicit display().
df_raw.info()
display(df_raw.head())

df_cleaned.info()
df_cleaned.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   numeric_col   4 non-null      float64
 1   category_col  5 non-null      object 
 2   price         4 non-null      object 
 3   date_str      4 non-null      object 
 4   category      4 non-null      object 
dtypes: float64(1), object(4)
memory usage: 328.0+ bytes
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   numeric_col   5 non-null      float64
 1   category_col  5 non-null      object 
 2   price         5 non-null      float64
 3   date_str      4 non-null      object 
 4   category      4 non-null      object 
dtypes: float64(2), object(3)
memory usage: 328.0+ bytes


Unnamed: 0,numeric_col,category_col,price,date_str,category
0,0.0,A,100.0,2025-08-01,Electronics
1,0.625,B,200.0,2025-08-02,Furniture
2,0.5,A,150.0,,Toys
3,0.75,B,175.0,2025-08-04,Clothing
4,1.0,C,250.0,2025-08-05,


## Assumptions
- Missingness: Missing numeric values can be imputed with median without introducing significant bias.
- Row Removal: Dropping rows with more than 50% missing data does not significantly distort the dataset.
- Scaling: Normalization does not change the underlying relationships between variables but ensures features are comparable for analysis or ML tasks.
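- Column Selection: Only numeric columns are targeted for median imputation and scaling. Note that `price` is stored as currency strings (e.g. `$100`) in the raw file, so it is treated as numeric only after those strings are converted to numbers; of the imputed columns, only `numeric_col` is scaled.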