In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
def rf_mae(max_depth, train_X, val_X, train_y, val_y):
    """Fit a random forest and return its mean absolute error on the validation data.

    Parameters
    ----------
    max_depth
        Forwarded to ``RandomForestRegressor(max_depth=...)``.
    train_X, train_y
        Features and target the model is fitted on.
    val_X, val_y
        Features and target the fitted model is scored against.

    Returns
    -------
    The value of ``mean_absolute_error(val_y, predictions)``.
    """
    model = RandomForestRegressor(n_estimators=100, max_depth=max_depth, random_state=0)
    model.fit(train_X, train_y)
    predictions = model.predict(val_X)
    # scikit-learn metrics are declared as (y_true, y_pred). MAE is symmetric, so the
    # number is unchanged, but the documented order keeps the call correct if sample
    # weights or a non-symmetric metric are ever used here.
    return mean_absolute_error(val_y, predictions)
    
def gbm_rmse(train_X, val_X, train_y, val_y):
    """Fit a gradient-boosting regressor and return its RMSE on the validation data.

    Parameters
    ----------
    train_X, train_y
        Features and target the model is fitted on.
    val_X, val_y
        Features and target the fitted model is scored against.

    Returns
    -------
    The square root of ``mean_squared_error(val_y, predictions)``.
    """
    model = GradientBoostingRegressor(n_estimators=100, random_state=1)
    model.fit(train_X, train_y)
    predictions = model.predict(val_X)
    # scikit-learn metrics are declared as (y_true, y_pred); MSE is symmetric so the
    # value is unchanged, but the documented order avoids surprises later.
    return np.sqrt(mean_squared_error(val_y, predictions))

In [3]:
# Folder holding the competition CSVs. Keeping it in one constant means the notebook
# can be pointed at another location by editing a single line.
DATA_DIR = "E:/Projects/Learning/ML/Housing Prices Prediction/Data"

# "Id" is removed from the training frame right away; the test frame keeps its "Id"
# column because the submission cell reads it back.
train_df = pd.read_csv(f"{DATA_DIR}/train.csv").drop("Id", axis=1)
test_df = pd.read_csv(f"{DATA_DIR}/test.csv")
train_df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
# train_df.columns

In [5]:
# Remove rows without a target FIRST and take y from that same frame, so the feature
# matrix and the target can never get out of row alignment.
train_labeled = train_df.dropna(axis=0, subset=['SalePrice'])
y = train_labeled.SalePrice
X = train_labeled.drop('SalePrice', axis=1)
X_test = test_df.drop("Id", axis=1)

# Categorical (object) columns that contain NaN in the training data are dropped
# from both the training and the test features.
missing_value_ocol_train = [col for col in X.columns if X[col].isna().any() and X[col].dtype == 'object']
X = X.drop(missing_value_ocol_train, axis=1)
object_cols = [col for col in X.columns if X[col].dtype == 'object']
X_test = X_test.drop(missing_value_ocol_train, axis=1)

# There are more columns in test data with missing values
missing_value_ocol_test = [col for col in X_test.columns if X_test[col].isna().any() and X_test[col].dtype == 'object']
# Filling NaN values because dropping isn't an option
# NOTE(review): the fill value is the test set's own mode; the numeric imputer later in
# the notebook is fitted on training data instead - confirm which is intended.
for col in missing_value_ocol_test:
    most_frequent_value = X_test[col].mode()[0]
    # Assign the result back: `X_test[col].fillna(..., inplace=True)` works on a temporary
    # object and silently does nothing once pandas Copy-on-Write is active.
    X_test[col] = X_test[col].fillna(most_frequent_value)

# X_test must have 1459 rows
print("Shape of X:", X.shape, "| NaN Values:", X.isna().sum().sum())
print("Shape of X_test:", X_test.shape, "| NaN Values:", X_test.isna().sum().sum())
# Checking if both have same Columns
if list(X_test.columns) == list(X.columns):
    print("Both DataFrames have same Columns")
else:
    print("There's a difference in columns between the DataFrames")

X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

Shape of X: (1460, 63) | NaN Values: 348
Shape of X_test: (1459, 63) | NaN Values: 330
Both DataFrames have same Columns


## Imputing Numerical Values

In [6]:
# Keep only the non-object columns of each split.
num_X_train, num_X_valid, num_X_test = (
    frame.select_dtypes(exclude='object') for frame in (X_train, X_valid, X_test)
)

# The imputer is fitted on the training split only; the validation and test
# splits are transformed with the values learned there.
my_imputer = SimpleImputer(strategy='most_frequent')

# The imputer returns plain arrays, so the original column labels and row index
# are restored while building each DataFrame.
imputed_X_train = pd.DataFrame(
    my_imputer.fit_transform(num_X_train),
    columns=num_X_train.columns,
    index=num_X_train.index,
)
imputed_X_valid = pd.DataFrame(
    my_imputer.transform(num_X_valid),
    columns=num_X_valid.columns,
    index=num_X_valid.index,
)
imputed_X_test = pd.DataFrame(
    my_imputer.transform(num_X_test),
    columns=num_X_test.columns,
    index=num_X_test.index,
)

## Ordinal Encoding

In [7]:
# A column is "good" when every label found in the validation AND test data also occurs
# in the training split; otherwise the ordinal encoder (fitted on the training split)
# would meet an unknown category when transforming that data.
good_label_cols = [
    col for col in object_cols
    if set(X_valid[col]).issubset(set(X_train[col]))
    and set(X_test[col]).issubset(set(X_train[col]))
]
# Built with a comprehension (not a set difference) so the order follows object_cols
# and is identical on every run.
bad_label_cols = [col for col in object_cols if col not in good_label_cols]

print("Columns with good labels that are gonna be ordinally encoded:\n", good_label_cols)
print()
print("Columns with bad labels that'll be dropped:", bad_label_cols)

Columns with good labels that are gonna be ordinally encoded:
 ['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'BldgType', 'HouseStyle', 'RoofStyle', 'Exterior1st', 'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation', 'Heating', 'HeatingQC', 'CentralAir', 'KitchenQual', 'PavedDrive', 'SaleType', 'SaleCondition']

Columns with bad labels that'll be dropped: ['RoofMatl', 'Condition2', 'Functional']


In [8]:
ordinal_encoder = OrdinalEncoder()

# Work on copies restricted to the columns selected for ordinal encoding.
object_X_train = X_train[good_label_cols].copy()
object_X_valid = X_valid[good_label_cols].copy()
object_X_test = X_test[good_label_cols].copy()

# Fit on the training split, reuse the fitted encoder for the other two; the row index
# is restored immediately because the encoder returns plain arrays.
ord_X_train = pd.DataFrame(ordinal_encoder.fit_transform(object_X_train), index=object_X_train.index)
ord_X_valid = pd.DataFrame(ordinal_encoder.transform(object_X_valid), index=object_X_valid.index)
ord_X_test = pd.DataFrame(ordinal_encoder.transform(object_X_test), index=object_X_test.index)

# Replace the positional integer column labels with string labels "col_1", "col_2", ...
col_names_to_replace_with = {position: "col_" + str(position + 1) for position in ord_X_train.columns}
ord_X_train, ord_X_valid, ord_X_test = (
    frame.rename(columns=col_names_to_replace_with)
    for frame in (ord_X_train, ord_X_valid, ord_X_test)
)

# Imputed numeric columns followed by the ordinal-encoded columns.
label_X_train = pd.concat([imputed_X_train, ord_X_train], axis=1)
label_X_valid = pd.concat([imputed_X_valid, ord_X_valid], axis=1)
label_X_test = pd.concat([imputed_X_test, ord_X_test], axis=1)

In [9]:
# Validation scores with the ordinal-encoded features: (random-forest MAE, GBM RMSE)
(
    rf_mae(10, label_X_train, label_X_valid, y_train, y_valid),
    gbm_rmse(label_X_train, label_X_valid, y_train, y_valid),
)

(17296.766156112557, 30077.35838597809)

## One-Hot Encoding

In [10]:
# Distinct-value count of every categorical column in the training split ...
d = X_train[object_cols].nunique().to_dict()
# ... and the same (column, count) pairs ordered from lowest to highest count.
d_sorted = sorted(d.items(), key=lambda pair: pair[1])

In [11]:
# Categorical columns with fewer than 10 distinct training values get one-hot encoded.
low_cardinal_col = [col for col in object_cols if X_train[col].nunique() < 10]
# Remaining categorical columns; a comprehension (instead of a set difference) keeps the
# order of object_cols, so the list is the same on every run.
high_cardinal_col = [col for col in object_cols if col not in low_cardinal_col]

In [12]:
# OneHotEncoder is already imported in the notebook's import cell, so it is not
# re-imported here. Categories unseen during fitting are ignored rather than raising.
OH_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Work on copies restricted to the low-cardinality categorical columns.
low_cardinality_X_train = X_train[low_cardinal_col].copy()
low_cardinality_X_valid = X_valid[low_cardinal_col].copy()
low_cardinality_X_test = X_test[low_cardinal_col].copy()

# Fit on the training split, reuse the fitted encoder for the other two; the row index
# is restored immediately because the encoder returns plain arrays.
ohe_train = pd.DataFrame(OH_encoder.fit_transform(low_cardinality_X_train), index=low_cardinality_X_train.index)
ohe_valid = pd.DataFrame(OH_encoder.transform(low_cardinality_X_valid), index=low_cardinality_X_valid.index)
ohe_test = pd.DataFrame(OH_encoder.transform(low_cardinality_X_test), index=low_cardinality_X_test.index)

# Replace the positional integer column labels with string labels "col_1", "col_2", ...
OH_col_names_to_replace_with = {position: "col_" + str(position + 1) for position in ohe_train.columns}
ohe_train = ohe_train.rename(columns=OH_col_names_to_replace_with)
ohe_valid = ohe_valid.rename(columns=OH_col_names_to_replace_with)
ohe_test = ohe_test.rename(columns=OH_col_names_to_replace_with)

# Imputed numeric columns followed by the one-hot columns.
OH_X_train = pd.concat([imputed_X_train, ohe_train], axis=1)
OH_X_valid = pd.concat([imputed_X_valid, ohe_valid], axis=1)
OH_X_test = pd.concat([imputed_X_test, ohe_test], axis=1)

In [13]:
# Validation scores with the one-hot-encoded features: (random-forest MAE, GBM RMSE)
(
    rf_mae(10, OH_X_train, OH_X_valid, y_train, y_valid),
    gbm_rmse(OH_X_train, OH_X_valid, y_train, y_valid),
)

(17624.56109655572, 29251.68800032398)

In [14]:
def final_pred_result(X_train, y_train, X_test):
    """Blend a random forest and a gradient-boosting model by averaging their predictions.

    Both regressors are fitted on ``(X_train, y_train)`` and asked to predict ``X_test``;
    the element-wise mean of the two prediction arrays is returned.
    """
    estimators = (
        RandomForestRegressor(n_estimators=100, max_depth=20, random_state=1),
        GradientBoostingRegressor(n_estimators=100, random_state=1),
    )
    # Fit and predict one estimator at a time, forest first, then boosting.
    forest_preds, boosting_preds = (
        estimator.fit(X_train, y_train).predict(X_test) for estimator in estimators
    )
    return (forest_preds + boosting_preds) / 2

# Blended test-set predictions for each encoding variant; both are fitted on the
# training split only (the validation rows are not used here).
Label_final_preds = final_pred_result(X_train=label_X_train, y_train=y_train, X_test=label_X_test)
OH_final_preds = final_pred_result(X_train=OH_X_train, y_train=y_train, X_test=OH_X_test)

In [15]:
# As of now, my best score is from Label_final_preds

In [16]:
# Build the submission table (test Ids next to the predicted prices) and write it as CSV.
# NOTE(review): the note in the previous cell says the best score came from
# Label_final_preds, yet the one-hot predictions are written here - confirm which
# set is meant to be submitted.
output = pd.DataFrame(data={'Id': test_df['Id'], 'SalePrice': OH_final_preds})
output.to_csv(path_or_buf="Submission.csv", index=False)

In [17]:
# TODO: the final models are still fitted on the training split only; retrain them on the full training data before submitting.