In [1]:
import pandas as pd
import numpy as np
import os

# Data directories, given relative to the notebook's working directory.
raw_dir = '../data/raw'
processed_dir = '../data/processed'

# Small demo table: every column contains at least one np.nan entry so the
# cleaning helpers have something to work on.
data = dict(
    Age=[25, 30, np.nan, 40, 35, np.nan],
    Salary=[50000, 60000, 55000, np.nan, 65000, 70000],
    Department=["HR", "IT", "IT", "Finance", np.nan, "HR"],
)
df = pd.DataFrame(data)

# Persist the sample frame as the raw input file. An existing file is left
# untouched, so re-running this cell never overwrites it.
os.makedirs(raw_dir, exist_ok=True)  # to_csv does not create missing parent directories
csv_path = os.path.join(raw_dir, 'sample_data.csv')
if not os.path.exists(csv_path):
    df.to_csv(csv_path, index=False)
    print('Sample dataset created and saved')
else:
    print('File already exists')

File already exists


#### Import the `cleaning` module from `../src`

In [2]:
import sys
from pathlib import Path

# Put the source directory (relative to the working directory) on the module
# search path so that `cleaning` can be imported below.
src = Path("../src")
# Skip the append when the entry is already present; otherwise every re-run of
# this cell would add another duplicate to sys.path.
if str(src) not in sys.path:
    sys.path.append(str(src))

import cleaning

In [9]:
numeric_cols = ["Age", "Salary"]

# Fill missing values in the numeric columns with the helper from `cleaning`.
df_filled = cleaning.fill_missing_median(df, numeric_cols)

# Guard against a silent no-op: pandas warns that the in-place `fillna` used by
# the helper will stop working in pandas 3.0, which would leave the NaNs in place.
assert not df_filled[numeric_cols].isna().any().any(), "numeric columns still contain NaN"
df_filled

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)


Unnamed: 0,Age,Salary,Department
0,25.0,50000.0,HR
1,30.0,60000.0,IT
2,32.5,55000.0,IT
3,40.0,60000.0,Finance
4,35.0,65000.0,
5,32.5,70000.0,HR


In [7]:
# Alternative to filling: discard the incomplete rows instead.
df_dropped = df.pipe(cleaning.drop_missing)
df_dropped

Unnamed: 0,Age,Salary,Department
0,25.0,50000.0,HR
1,30.0,60000.0,IT


In [6]:
# Rescale the numeric columns of the already-filled frame.
df_normalized = df_filled.pipe(cleaning.normalize_data, numeric_cols)
df_normalized

Unnamed: 0,Age,Salary,Department
0,0.0,0.0,HR
1,0.333333,0.5,IT
2,0.5,0.25,IT
3,1.0,0.5,Finance
4,0.666667,0.75,
5,0.5,1.0,HR


In [12]:
numeric_cols = ["Age", "Salary"]
df_original = df.copy()

# Full cleaning pipeline in one chain: fill the numeric gaps, then normalize.
df_cleaned = (
    df_original
    .pipe(cleaning.fill_missing_median, numeric_cols)
    .pipe(cleaning.normalize_data, numeric_cols)
)

# The side-by-side view below is only meaningful if the helpers left their
# input untouched and the fill actually took effect, so check both.
assert df_original.equals(df), "cleaning helpers mutated their input frame"
assert not df_cleaned[numeric_cols].isna().any().any(), "numeric columns still contain NaN"

# Column-wise concat with a top-level key per frame gives an
# Original / Cleaned comparison table.
comparison = pd.concat([df_original, df_cleaned], axis=1, keys=["Original", "Cleaned"])
comparison

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)


Unnamed: 0_level_0,Original,Original,Original,Cleaned,Cleaned,Cleaned
Unnamed: 0_level_1,Age,Salary,Department,Age,Salary,Department
0,25.0,50000.0,HR,0.0,0.0,HR
1,30.0,60000.0,IT,0.333333,0.5,IT
2,,55000.0,IT,0.5,0.25,IT
3,40.0,,Finance,1.0,0.5,Finance
4,35.0,65000.0,,0.666667,0.75,
5,,70000.0,HR,0.5,1.0,HR


In [13]:
# Save the cleaned frame under `processed_dir` (set in the first cell) rather
# than repeating the path literal. Create the directory first, because to_csv
# does not create missing parent directories.
os.makedirs(processed_dir, exist_ok=True)
df_cleaned.to_csv(os.path.join(processed_dir, 'sample_data_cleaned.csv'), index=False)