Load the messy data

In [1]:
import pandas as pd
import numpy as np

# Toy housing records with gaps: two listings lack a bedroom count and one
# lacks a price (np.nan marks each missing entry).
data = dict(
    House_ID=[1, 2, 3, 4, 5],
    City=['Mumbai', 'Delhi', 'Mumbai', 'Bangalore', 'Delhi'],
    Bedrooms=[3, np.nan, 2, 4, np.nan],
    Price_USD=[150000, 85000, 120000, np.nan, 95000],
)

# Columns appear in the same order as the keys above.
df = pd.DataFrame(data)

# Banner line followed by the plain-text dump of the untouched frame.
print("--- Raw Corrupted Data ---", df, sep="\n")

--- Raw Corrupted Data ---
   House_ID       City  Bedrooms  Price_USD
0         1     Mumbai       3.0   150000.0
1         2      Delhi       NaN    85000.0
2         3     Mumbai       2.0   120000.0
3         4  Bangalore       4.0        NaN
4         5      Delhi       NaN    95000.0


Diagnosis

In [2]:
# Per-column tally of missing entries (isna is pandas' alias for isnull).
missing_values = df.isna().sum()
missing_values

House_ID     0
City         0
Bedrooms     2
Price_USD    1
dtype: int64

Fix (Imputation)

In [4]:
# Fill each gap with a statistic of the observed values in the same column:
# the median for Bedrooms and the mean for Price_USD. Both pandas reductions
# skip NaN by default, so they are computed from the known entries only.
median_beds = df["Bedrooms"].median()
mean_price = df["Price_USD"].mean()

df["Bedrooms"] = df["Bedrooms"].fillna(median_beds)
df["Price_USD"] = df["Price_USD"].fillna(mean_price)

# A column with no observed values has a NaN median/mean, which would turn the
# fillna above into a silent no-op; fail here rather than pass NaN downstream.
assert df[["Bedrooms", "Price_USD"]].notna().all().all(), "imputation left missing values"

df

Unnamed: 0,House_ID,City,Bedrooms,Price_USD
0,1,Mumbai,3.0,150000.0
1,2,Delhi,3.0,85000.0
2,3,Mumbai,2.0,120000.0
3,4,Bangalore,4.0,112500.0
4,5,Delhi,3.0,95000.0


Translation (One-hot Encoding)

In [6]:
# Replace the City column with one indicator column per distinct city
# (named City_<value>); dtype=int makes the indicators 0/1 integers.
df_clean = pd.get_dummies(
    data=df,
    columns=["City"],
    dtype=int,
)
df_clean

Unnamed: 0,House_ID,Bedrooms,Price_USD,City_Bangalore,City_Delhi,City_Mumbai
0,1,3.0,150000.0,0,0,1
1,2,3.0,85000.0,0,1,0
2,3,2.0,120000.0,0,0,1
3,4,4.0,112500.0,1,0,0
4,5,3.0,95000.0,0,1,0
