## Missing category imputation - pandas

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Columns to read: two categorical predictors plus the SalePrice target
cols_to_use = [
    "BsmtQual",
    "FireplaceQu",
    "SalePrice",
]

In [3]:
# Read the House Prices dataset, restricted to the columns in cols_to_use.
fileUrl = (
    'https://raw.githubusercontent.com/cmps460s26/cmps460-content/refs/heads/main/examples/03.dp/datasets/houseprice.csv'
)
df = pd.read_csv(filepath_or_buffer=fileUrl, usecols=cols_to_use)

# Report the frame's dimensions, then preview its first rows.
print(df.shape)
df.head()

(1460, 3)


Unnamed: 0,BsmtQual,FireplaceQu,SalePrice
0,Gd,,208500
1,Gd,TA,181500
2,Gd,TA,223500
3,TA,Gd,140000
4,Gd,TA,250000


In [4]:
# Hold out 30% of the rows as a test set; the fixed random_state
# keeps the split the same on every run.

X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns="SalePrice"),  # predictors: everything except the target
    df["SalePrice"],               # target
    test_size=0.3,
    random_state=0,
)

(X_train.shape, X_test.shape)

((1022, 2), (438, 2))

In [5]:
# Fraction of missing values in each training column

X_train.isna().mean()

Unnamed: 0,0
BsmtQual,0.023483
FireplaceQu,0.46771


In [6]:
# Map each column to be imputed onto the
# label that will stand in for its missing values

imputation_dict = dict.fromkeys(["BsmtQual", "FireplaceQu"], "Missing")

imputation_dict

{'BsmtQual': 'Missing', 'FireplaceQu': 'Missing'}

In [7]:
# Replace missing data using the per-column values in imputation_dict.
# fillna returns a new DataFrame, so the names are rebound to the result
# rather than mutating the split frames with inplace=True.
X_train = X_train.fillna(imputation_dict)
X_test = X_test.fillna(imputation_dict)

In [10]:
# Count the missing values left in each training column

X_train.isna().sum()

Unnamed: 0,0
BsmtQual,0
FireplaceQu,0


In [None]:
# Count the missing values left in each test column

X_test.isna().sum()

BsmtQual       0
FireplaceQu    0
dtype: int64