# Set up environment

In [None]:
# Mount Google Drive inside the Colab runtime so the project files stored there can be read.
# The Drive is mounted at '/drive'.
# 'force_remount=True' remounts the Drive even if it is already mounted.

from google.colab import drive
drive.mount('/drive', force_remount=True)

# Switch the working directory to the project folder so that the relative paths used
# later in the notebook (e.g. 'train.csv', 'test.csv') resolve against it.
%cd '/drive/MyDrive/Colab Notebooks/Projects/House Price Competition'

Mounted at /drive
/drive/MyDrive/Colab Notebooks/Projects/House Price Competition


In [None]:
import pandas as pd
pd.plotting.register_matplotlib_converters()
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import numpy as np

# use the 'bmh' matplotlib style sheet to make the plots more visually appealing
plt.style.use('bmh')

# seed NumPy's global random number generator for reproducibility
np.random.seed(0)

# Load Data

In [None]:
# test.csv ships without the actual sale price (it is the competition hold-out), so any
# validation has to be carved out of the labelled training data
from sklearn.model_selection import train_test_split

# read both competition files, keyed by the house Id
X = pd.read_csv('train.csv', index_col='Id')
X_test = pd.read_csv('test.csv', index_col='Id')  # predictors only, no target column

# keep only the rows whose target (SalePrice) is known
X = X.dropna(axis=0, subset=['SalePrice'])

# pull the target out of the frame, leaving X with the predictors only
y = X.pop('SalePrice')

In [None]:
# sanity check: the training and test predictors should share the same layout
for frame in (X, X_test):
    print(frame.shape)
print(X.columns.equals(X_test.columns))

# count the columns stored with object dtype (the categorical features)
object_mask = X.dtypes == 'object'
cat_cols = object_mask[object_mask].index.tolist()
print(len(cat_cols))

(1460, 79)
(1459, 79)
True
43


# Preliminary Investigation

In [None]:
# shape of training data (num_rows, num_columns)
print(X.shape)

# Percentage of missing cells across the whole training frame.
# DataFrame.size (rows * columns) is used for the cell count instead of np.product,
# which NumPy deprecated in 1.25 and removed in 2.0.
total_cells = X.size
missing_value_cols = X.isnull().sum()  # per-column count of missing cells
total_missing_cells = missing_value_cols.sum()
missing_percentage = total_missing_cells / total_cells * 100
print(f'Percentage of missing values: {missing_percentage:.2f}%')

# list only the columns that actually contain missing values
print(missing_value_cols[missing_value_cols > 0])

(1460, 79)
Percentage of missing values: 6.79%
LotFrontage      259
Alley           1369
MasVnrType       872
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64


In [None]:
# columns that contain at least one missing value, in their original order
na_counts = X.isnull().sum()
cols_with_na = na_counts[na_counts > 0].index

# True for columns stored with object dtype (categorical), False otherwise (numerical)
is_object = X.dtypes == 'object'

# numerical columns with missing value
missing_value_num_cols = [col for col in cols_with_na if not is_object[col]]
print('Numerical columns with missing value:')
print(X[missing_value_num_cols].isnull().sum())
print(len(missing_value_num_cols))

# categorical columns with missing value
missing_value_cat_cols = [col for col in cols_with_na if is_object[col]]
print('Categorical columns with missing value:')
print(X[missing_value_cat_cols].isnull().sum())
print(len(missing_value_cat_cols))

Numerical columns with missing value:
LotFrontage    259
MasVnrArea       8
GarageYrBlt     81
dtype: int64
3
Categorical columns with missing value:
Alley           1369
MasVnrType       872
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64
16


In [None]:
# examine one feature in detail
feature = 'Electrical'
unique_values = X[feature].unique()
print(unique_values)

# Work on the non-missing entries only. unique() lists NaN as a value, and when NaN
# happens to come first, value_counts()[unique_values[0]] raises KeyError because
# value_counts() leaves NaN out of its index by default.
observed = X[feature].dropna()

# Total count of non-missing values in the selected column
total_count = len(observed)

if total_count == 0:
    print('No non-missing values in', feature)
else:
    # First observed value in row order (equals unique_values[0] whenever that is not NaN)
    first_value = observed.iloc[0]

    # Count of occurrences of that value
    count = (observed == first_value).sum()

    # Share of the non-missing entries taken by that value
    percentage = (count / total_count) * 100

    # Print the result
    print('Percentage of the first value:', format(percentage, '.2f'))

['SBrkr' 'FuseF' 'FuseA' 'FuseP' 'Mix' nan]
Percentage of the first value: 91.43


# Handle Missing Value and Encode Categorical Data

### Create a Data Process Pipeline

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
import pandas as pd

# Split the predictors by dtype: object columns are categorical, everything else numerical
object_mask = X.dtypes == 'object'
numerical_cols = object_mask[~object_mask].index.tolist()
categorical_cols = object_mask[object_mask].index.tolist()

# Numerical data: fill gaps with the column mean
numerical_transformer = SimpleImputer(strategy='mean')

# Categorical data: categories not seen during fit are encoded as -1 instead of raising
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# Categorical data: fill gaps with the literal string 'missing', then ordinal-encode
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('ordinal', ordinal_encoder),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each group of columns to its own transformer
column_routes = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_routes)

# Training model
model = XGBRegressor(learning_rate=0.05, n_estimators=500, random_state=0)

# Chain preprocessing and the model so both are always fitted and applied together
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

### Apply Cross-Validation to Evaluate the Pipeline

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error

# 5-fold cross-validation: in each of the 5 iterations, 80% of X is used for training
# and the remaining 20% for validation
cv_scores = cross_val_score(
    estimator=pipeline,
    X=X,
    y=y,
    scoring='neg_mean_absolute_error',
    cv=5,
)

# The scorer returns negated MAE, so flip the sign before reporting the average
mean_mae = -cv_scores.mean()
print('Mean Cross-Validation Score (MAE):', mean_mae)


Mean Cross-Validation Score (MAE): 16357.951607983734


**Results:**
- RandomForestRegressor, n_estimators = 100: Mean Cross-Validation Score (MAE): 17635.59697260274
- RandomForestRegressor, n_estimators = 500: Mean Cross-Validation Score (MAE): 17625.086326027398
- XGBRegressor, n_estimators = 500, learning_rate = 0.05: Mean Cross-Validation Score (MAE): 16357.951607983734


### Apply the Pipeline

In [None]:
# Fit the entire pipeline (preprocessing + model) on the raw training data
pipeline.fit(X, y)

# Run only the fitted preprocessing steps (everything except the final model) to get
# the encoded/imputed features for inspection. These frames are already preprocessed,
# so they must not be passed back into pipeline.predict(), which preprocesses again.
fitted_preprocessing = pipeline[:-1]
X_transformed = fitted_preprocessing.transform(X)
X_test_transformed = fitted_preprocessing.transform(X_test)

# Get feature names after encoding
feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out()

# Remove the transformer prefixes ('num__', 'cat__') from the feature names
feature_names = [name.split('__')[-1] for name in feature_names]

# Convert to DataFrames. The original Id index is passed through so the rows still
# align by label with y and X_test (without it the frames get a fresh 0..n-1 index).
X_transformed = pd.DataFrame(X_transformed, columns=feature_names, index=X.index)
X_test_transformed = pd.DataFrame(X_test_transformed, columns=feature_names, index=X_test.index)

# Check if columns in transformed datasets are equal
print(X_transformed.columns.equals(X_test_transformed.columns))

True


In [74]:
# Predict on the raw test data and save to file for submission.
# pipeline.predict() runs the fitted preprocessing itself, so it must be given the raw
# X_test. Feeding it already-transformed data would preprocess twice: the ordinal
# encoder would then see numeric codes instead of the category strings it was fitted
# on and map every categorical value to unknown_value (-1).
test_preds = pipeline.predict(X_test)

# One row per test house: its Id and the predicted SalePrice
output = pd.DataFrame({'Id': X_test.index,
                       'SalePrice': test_preds})
output.to_csv('submission.csv', index=False)