# Task 5 — Data Cleaning & EDA (House Prices)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

# Widen pandas' display limits so wide frames are not truncated when shown.
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 120)

# Load the training data from a path relative to the working directory.
data_path = Path('data/train.csv')
if not data_path.exists():
    # Stop here: continuing without the file would only fail later with a
    # confusing NameError on `df`, hiding the real cause.
    print('⚠️ train.csv not found!')
    raise FileNotFoundError(f'Dataset not found at {data_path.resolve()}')

df = pd.read_csv(data_path)
print('✅ Dataset loaded successfully!')
df.head()

In [None]:
# Overview of the loaded frame: the dtype of every column, followed by the
# 20 columns with the highest count of missing values (largest first).
print('Data Types:')
print(df.dtypes)
# The leading '\n' puts a blank line between the two reports.
print('\nMissing Values:')
print(df.isna().sum().sort_values(ascending=False).head(20))

In [None]:
# Split the columns by dtype: numeric columns are imputed with their median,
# every other column with its most frequent value.
numeric_cols = df.select_dtypes(include=[np.number]).columns
cat_cols = df.select_dtypes(exclude=[np.number]).columns

for col in numeric_cols:
    df[col] = df[col].fillna(df[col].median())

for col in cat_cols:
    # mode() ignores NaN, so it is empty only when the column has no values
    # at all; in that case fall back to a placeholder label.
    modes = df[col].mode()
    fill_value = 'Unknown' if modes.empty else modes[0]
    df[col] = df[col].fillna(fill_value)

# Total number of cells that are still missing across the whole frame.
print('Remaining Missing Values:', df.isna().sum().sum())

In [None]:
# Print basic descriptive statistics for two columns of interest.
for col in ['SalePrice', 'GrLivArea']:
    # The leading '\n' separates each column's report with a blank line.
    print(f'\nColumn: {col}')
    print('Mean:', df[col].mean())
    print('Median:', df[col].median())
    # mode() may return several equally frequent values; report the first.
    print('Mode:', df[col].mode()[0])
    # pandas computes var()/std() with ddof=1 (sample estimates) by default.
    print('Variance:', df[col].var())
    print('Std Dev:', df[col].std())

In [None]:
# Distribution of SalePrice as a 30-bin histogram on the current axes.
hist_ax = plt.gca()
hist_ax.hist(df['SalePrice'], bins=30, color='skyblue', edgecolor='black')
hist_ax.set_title('Histogram of SalePrice')
hist_ax.set_xlabel('SalePrice')
hist_ax.set_ylabel('Frequency')
plt.show()

# Number of rows per MSZoning category, drawn as a bar chart.
zoning_counts = df['MSZoning'].value_counts()
bar_ax = zoning_counts.plot.bar(color='orange', edgecolor='black')
bar_ax.set_title('Bar Chart of MSZoning')
bar_ax.set_xlabel('MSZoning')
bar_ax.set_ylabel('Count')
plt.show()

In [None]:
# Write the imputed frame to the working directory; index=False leaves the
# row index out of the CSV.
cleaned_csv_name = 'cleaned_house_prices.csv'
df.to_csv(cleaned_csv_name, index=False)
print('✅ Cleaned dataset saved!')

### Reflection
- Learned how to explore and clean a dataset.
- The visualizations showed the skewness of `SalePrice` and the category frequencies of `MSZoning`.
- Filled missing values using the median (numeric columns) and the mode (categorical columns).