# Python data cleaning demo
This notebook demonstrates reading a CSV (`HN/data/sample_sales.csv`) with pandas, previewing its first rows, removing duplicates, handling nulls (dropping or filling them), saving the cleaned data, and reading the file in chunks.

In [None]:
import pandas as pd
# Widen pandas' console output: show up to 20 columns and wrap at 120 characters
pd.options.display.max_columns = 20
pd.options.display.width = 120

# Location of the sample sales data read by the cells below
csv_path = 'HN/data/sample_sales.csv'

In [None]:
# Read the sample CSV from disk into the working DataFrame `df`
df = pd.read_csv(filepath_or_buffer=csv_path)

# Quick summary: print (row count, column count), then preview the first 12 rows.
# The trailing bare expression is what the notebook renders as the cell output.
print('Rows, Columns:', (len(df.index), len(df.columns)))
df.head(n=12)

In [None]:
# Step 1: remove fully duplicated rows, retaining the first occurrence of each
df_no_dupes = df.drop_duplicates(keep='first')
print('After drop_duplicates — rows:', df_no_dupes.shape[0])

# Step 2a: strictest option, discard every row that has a null in any column
df_no_nulls = df_no_dupes.dropna(how='any')
print('After dropna (any nulls removed) — rows:', df_no_nulls.shape[0])

# Step 2b: softer option, discard a row only when one of these columns is null
important_cols = ['product_id', 'price', 'quantity']
df_filtered = df_no_dupes.dropna(subset=important_cols)
print('After dropna on important columns — rows:', df_filtered.shape[0])

# Step 2c: keep every row and substitute placeholders instead:
# 0 for missing quantity/price, '1970-01-01' for a missing sale date.
# `assign` returns a new frame, so df_no_dupes itself is left untouched.
df_filled = df_no_dupes.assign(
    quantity=df_no_dupes['quantity'].fillna(0),
    price=df_no_dupes['price'].fillna(0),
    sale_date=df_no_dupes['sale_date'].fillna('1970-01-01'),
)
print('\nSample after fillna:')
preview_text = df_filled.head(8).to_string(index=False)
print(preview_text)

In [None]:
# Persist the null-filled frame next to the source file, omitting the index column
cleaned_path = 'HN/data/cleaned_sales.csv'
df_filled.to_csv(path_or_buf=cleaned_path, index=False)

print('Cleaned CSV written to:', cleaned_path)

# Example: reading in chunks (useful for very large files).
# With `chunksize`, read_csv returns a reader that yields DataFrames of up to
# `chunksize` rows each and keeps the underlying file open while iterating.
# Using the reader as a context manager (pandas >= 1.2) closes the file even
# though the loop below stops early; a bare `for` + `break` would leave the
# file handle open.
print('\nChunked read example (2-row chunks):')
with pd.read_csv(csv_path, chunksize=2) as reader:
    for i, chunk in enumerate(reader):
        print(f'Chunk {i} shape:', chunk.shape)
        if i >= 2:  # limit demonstration to first three chunks
            break