In [1]:

import numpy as np
import pandas as pd 
import matplotlib as plt

# ML libraries
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [2]:
# Locations of the two input CSV files, relative to the notebook.
TRAIN_DATA_PATH, TEST_DATA_PATH = "./data/train.csv", "./data/test.csv"

# Read both files the same way, using each file's "Id" column as the row index.
full_df, X_test = (
    pd.read_csv(csv_path, index_col="Id")
    for csv_path in (TRAIN_DATA_PATH, TEST_DATA_PATH)
)

In [3]:
# Show the column labels of the frame loaded from TRAIN_DATA_PATH.
full_df.columns

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'Wo

In [4]:
# Separate the target (SalePrice) from the feature columns.
#
# The split is guarded and non-mutating so the cell can be re-run safely:
# once SalePrice has been removed, running the cell again keeps the existing
# `y` instead of failing on the missing column.
if "SalePrice" in full_df.columns:
    y = full_df["SalePrice"].copy()
    # Rebind to a new frame rather than dropping in place.
    full_df = full_df.drop(columns="SalePrice")
elif "y" not in globals():
    # Neither the column nor a previously extracted target exists.
    raise KeyError("'SalePrice' column not found in the training data")

In [5]:
# Largest number of distinct values (inclusive) an object-dtype column may
# have and still be kept as a categorical feature.
MAX_CARDINALITY = 10

# Every column whose dtype is not `object` is treated as numerical.
num_cols = full_df.select_dtypes(exclude='object').columns.to_list()

# Keep object-dtype columns with at most MAX_CARDINALITY distinct values;
# object columns above that limit are left out of the feature set.
categorical_cols = [
    cname for cname in full_df.columns
    if full_df[cname].dtype == "object"
    and full_df[cname].nunique() <= MAX_CARDINALITY
]

# Categorical columns first, then numerical ones.
feature_cols = categorical_cols + num_cols
print(f"Number of categorical columns:{len(categorical_cols)}")
print(f"Number of numerical columns:{len(num_cols)}")
print(f"Number of total features to be used for training: {len(feature_cols)}")

Number of categorical columns:40
Number of numerical columns:36
Number of total features to be used for training: 76


In [6]:
# Restrict both frames to the selected feature columns; .copy() gives each
# result its own data, independent of the frame it was taken from.
X_train_full = full_df.loc[:, feature_cols].copy()
X_test_f = X_test.loc[:, feature_cols].copy()

In [7]:
# Hold out 20% of the rows for validation; the fixed random_state makes the
# split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full,
    y,
    test_size=0.2,
    random_state=1,
)
print(f"Shape of features {X_train.shape}")

Shape of features (1168, 76)


## Building the pipeline

In [8]:
# Numerical columns: fill missing values with the column mean.
num_transformer = SimpleImputer(strategy="mean")

# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode. handle_unknown="ignore" stops the encoder from raising on
# categories it did not see during fit.
categorical_steps = [
    ('imputer', SimpleImputer(strategy="most_frequent")),
    ('encoder', OneHotEncoder(handle_unknown="ignore")),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Send each group of columns through its own transformer.
column_routes = [
    ('num', num_transformer, num_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_routes)

# Random forest with 100 trees and a fixed seed.
my_model = RandomForestRegressor(n_estimators=100, random_state=1)

# Full pipeline: preprocessing followed by the regressor.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('model', my_model),
]
my_pipeline = Pipeline(steps=pipeline_steps)


## Training and evaluating the model

In [9]:
# Fit the preprocessing steps and the model on the training split.
my_pipeline.fit(X=X_train, y=y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='mean',
                                                                verbose=0),
                                                  ['MSSubClass', 'LotFrontage',
                                                   'LotArea', 'OverallQual',
                                                   'OverallCond',

In [10]:
# Score the fitted pipeline on the held-out validation split.
predictions = my_pipeline.predict(X=X_valid)
mae = mean_absolute_error(y_true=y_valid, y_pred=predictions)
print(f"Mean Absolute Error {mae:0.2f}")

Mean Absolute Error 16447.12


In [11]:
# Preview the test-set predictions. Use the feature-selected frame X_test_f,
# which has the same columns the pipeline was fitted on, rather than the raw
# X_test, which also carries columns outside feature_cols.
my_pipeline.predict(X_test_f)



array([130555.5 , 154640.25, 179534.32, ..., 150011.09, 121966.5 ,
       226474.79])

In [13]:
# Predictions for the feature-selected test frame, kept for the submission.
test_pred = my_pipeline.predict(X=X_test_f)

In [16]:
# Pair each test-row identifier with its predicted price.
# The identifier column is labelled "Id", the same label as the index column
# of the input CSVs (they are read with index_col="Id"), not lowercase "id".
submission_df = pd.DataFrame({"Id": X_test_f.index,
                              "SalePrice": test_pred})
submission_df

Unnamed: 0,id,SalePrice
0,1461,130555.50
1,1462,154640.25
2,1463,179534.32
3,1464,181399.50
4,1465,198449.12
...,...,...
1454,2915,85617.50
1455,2916,85916.00
1456,2917,150011.09
1457,2918,121966.50


In [18]:
# Write the submission to disk; index=False leaves the positional row index
# out of the file.
submission_df.to_csv(path_or_buf="./data/submission2.csv", index=False)