In [2]:
import pandas as pd

# Read the data
X_train = pd.read_csv('../input/housing-prices-competition-for-kaggle-learn-users/train.csv', index_col='Id') 
X_test = pd.read_csv('../input/housing-prices-competition-for-kaggle-learn-users/test.csv', index_col='Id')

print(X_train.shape)
print(X_test.shape)

#Remove rows with missing target, separate target from predictors
X_train.dropna(axis=0, subset=['SalePrice'], inplace=True)
print(X_train.shape)

#Separate target from predictors (i.e. input features)
y_train = X_train.SalePrice
X_train.drop(['SalePrice'], axis=1, inplace=True)

#Perform ordinal encoding to categorical feature columns 
from sklearn.preprocessing import OrdinalEncoder

# Get the columns that contain strings and treat them as categorical
object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]

# Columns that can be safely ordinal encoded
good_label_cols = [col for col in object_cols if 
                   set(X_test[col]).issubset(set(X_train[col]))]
        
# Problematic columns that will be dropped from the dataset
bad_label_cols = list(set(object_cols)-set(good_label_cols))

print('Categorical columns that will be ordinal encoded:', good_label_cols)
print('\nCategorical columns that will be dropped from the dataset:', bad_label_cols)

# Drop categorical columns that will not be encoded
X_train_ordinal = X_train.drop(bad_label_cols, axis=1)
X_test_ordinal = X_test.drop(bad_label_cols, axis=1)

# Apply ordinal encoder 
ordinal_encoder = OrdinalEncoder() # Your code here
X_train_ordinal[good_label_cols] = ordinal_encoder.fit_transform(X_train_ordinal[good_label_cols])
X_test_ordinal[good_label_cols] = ordinal_encoder.transform(X_test_ordinal[good_label_cols])

#fill missing values in train and test sets using IterativeImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer 

final_imputer = IterativeImputer(max_iter=3, random_state=0) 
X_train_imputed = final_imputer.fit_transform(X_train_ordinal)
X_test_imputed = final_imputer.transform(X_test_ordinal)


(1460, 80)
(1459, 79)
(1460, 80)
Categorical columns that will be ordinal encoded: ['Street', 'Alley', 'LotShape', 'LandContour', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleCondition']

Categorical columns that will be dropped from the dataset: ['Exterior2nd', 'Functional', 'MSZoning', 'Utilities', 'Exterior1st', 'KitchenQual', 'SaleType']
