# Setup: Generate Sample Dataset

This cell creates the required folder structure (`data/raw/` and `data/processed/`) relative to the notebook, and generates the sample CSV dataset with missing values. 
This ensures the dataset is ready for cleaning functions and saves it to `data/raw/sample_data.csv`.

In [24]:
import os
import pandas as pd
import numpy as np

# Define folder paths relative to this notebook
raw_dir = 'C:/Users/박서아/bootcamp_andrew_song/homework/homework6/data/raw'
processed_dir = 'C:/Users/박서아/bootcamp_andrew_song/homework/homework6/data/processed'

# Create folders if they don't exist
os.makedirs(raw_dir, exist_ok=True)
os.makedirs(processed_dir, exist_ok=True)

# Define the sample data
data = {
    'age': [34, 45, 29, 50, 38, np.nan, 41],
    'income': [55000, np.nan, 42000, 58000, np.nan, np.nan, 49000],
    'score': [0.82, 0.91, np.nan, 0.76, 0.88, 0.65, 0.79],
    'zipcode': ['90210', '10001', '60614', '94103', '73301', '12345', '94105'],
    'city': ['Beverly', 'New York', 'Chicago', 'SF', 'Austin', 'Unknown', 'San Francisco'],
    'extra_data': [np.nan, 42, np.nan, np.nan, np.nan, 5, np.nan]
}

# Create DataFrame
df = pd.DataFrame(data)

# Save to CSV in raw data folder
csv_path = os.path.join(raw_dir, 'sample_data.csv')
if not os.path.exists(csv_path):
    df.to_csv(csv_path, index=False)
    print(f'Sample dataset created and saved to {csv_path}')
else:
    print(f'File already exists at {csv_path}. Skipping CSV creation to avoid overwrite.')


File already exists at C:/Users/박서아/bootcamp_andrew_song/homework/homework6/data/raw\sample_data.csv. Skipping CSV creation to avoid overwrite.


# Homework Starter — Stage 6: Data Preprocessing
Use this notebook to apply your cleaning functions and save processed data.

In [25]:
import pandas as pd
from src import cleaning

## Load Raw Dataset

In [26]:
df = pd.read_csv('C:/Users/박서아/bootcamp_andrew_song/homework/homework6/data/raw/sample_data.csv')
df.head()

Unnamed: 0,age,income,score,zipcode,city,extra_data
0,34.0,55000.0,0.82,90210,Beverly,
1,45.0,,0.91,10001,New York,42.0
2,29.0,42000.0,,60614,Chicago,
3,50.0,58000.0,0.76,94103,SF,
4,38.0,,0.88,73301,Austin,


## Apply Cleaning Functions

In [27]:
# TODO: Apply your functions here

# Define numeric columns for processing
numeric_columns = ['age', 'income', 'score', 'extra_data']

df_clean = cleaning.drop_missing(df, threshold=0.5)

df_clean = cleaning.fill_missing_median(df_clean, numeric_columns)

df_clean = cleaning.normalize_data(df_clean, numeric_columns)

# Show summary statistics
print("\\n=== SUMMARY STATISTICS ===")
print("Original data stats:")
print(df[numeric_columns].describe())


\n=== SUMMARY STATISTICS ===
Original data stats:
             age        income     score  extra_data
count   6.000000      4.000000  6.000000    2.000000
mean   39.500000  51000.000000  0.801667   23.500000
std     7.556454   7071.067812  0.092826   26.162951
min    29.000000  42000.000000  0.650000    5.000000
25%    35.000000  47250.000000  0.767500   14.250000
50%    39.500000  52000.000000  0.805000   23.500000
75%    44.000000  55750.000000  0.865000   32.750000
max    50.000000  58000.000000  0.910000   42.000000


## Save Cleaned Dataset

In [28]:
df.to_csv('C:/Users/박서아/bootcamp_andrew_song/homework/homework6/data/processed/sample_data_cleaned.csv', index=False)