In [16]:
import os
from pathlib import Path

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt

In [17]:
def rf_mae(max_depth, train_X, val_X, train_y, val_y):
    """Fit a random forest and return its mean absolute error on validation data.

    Parameters
    ----------
    max_depth : int or None
        Forwarded as ``max_depth`` to ``RandomForestRegressor``.
    train_X, train_y
        Features and targets the model is fitted on.
    val_X, val_y
        Features and true targets the fitted model is scored on.

    Returns
    -------
    float
        Mean absolute error between ``val_y`` and the model's predictions.
    """
    model = RandomForestRegressor(n_estimators=100, max_depth=max_depth, random_state=0)
    model.fit(train_X, train_y)
    predictions = model.predict(val_X)
    # scikit-learn metrics are declared as (y_true, y_pred); pass them in that order.
    return mean_absolute_error(val_y, predictions)
    
def gbm_rmse(train_X, val_X, train_y, val_y):
    """Fit a gradient-boosting regressor and return its RMSE on validation data.

    Parameters
    ----------
    train_X, train_y
        Features and targets the model is fitted on.
    val_X, val_y
        Features and true targets the fitted model is scored on.

    Returns
    -------
    float
        Square root of the mean squared error between ``val_y`` and the
        model's predictions.
    """
    model = GradientBoostingRegressor(n_estimators=100, random_state=1)
    model.fit(train_X, train_y)
    predictions = model.predict(val_X)
    # scikit-learn metrics are declared as (y_true, y_pred); pass them in that order.
    return np.sqrt(mean_squared_error(val_y, predictions))

In [18]:
# Directory holding train.csv and test.csv. Set the HOUSING_DATA_DIR environment
# variable to run the notebook on another machine; the default is the original location.
DATA_DIR = Path(os.environ.get("HOUSING_DATA_DIR", "E:/Projects/Learning/ML/Housing Prices Prediction/Data"))

# "Id" is dropped from the training frame here; test_df keeps its "Id" column.
train_df = pd.read_csv(DATA_DIR / "train.csv").drop("Id", axis=1)
test_df = pd.read_csv(DATA_DIR / "test.csv")
train_df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
# Uncomment to list every column available in the training data.
# train_df.columns

In [5]:
# Rows without a target cannot be used for training. X and y are both taken from
# the same filtered frame so they always stay aligned row-for-row.
train_labeled = train_df.dropna(axis=0, subset=['SalePrice'])
X = train_labeled.drop('SalePrice', axis=1)
y = train_labeled.SalePrice
X_test = test_df.drop("Id", axis=1)

# Every column with a missing value in the training data is removed from both frames.
missing_value_col_train = [col for col in X.columns if X[col].isna().any()]
X = X.drop(missing_value_col_train, axis=1)
X_test = X_test.drop(missing_value_col_train, axis=1)
# There are more columns in test data with missing values
missing_value_col_test = [col for col in X_test.columns if X_test[col].isna().any()]
# Filling NaN values because dropping isn't an option
# NOTE(review): the fill values are computed from the test data itself, not from
# the training data - confirm that this is intended.
for col in missing_value_col_test:
    if X_test[col].dtype == 'object':
        most_frequent_value = X_test[col].mode()[0]
        # Assign the result back instead of using a chained inplace fillna, which
        # does not update X_test when pandas Copy-on-Write is active.
        X_test[col] = X_test[col].fillna(most_frequent_value)
    else:
        mean_value = X_test[col].mean()
        X_test[col] = X_test[col].fillna(mean_value)

# X_test must have 1459 rows
print("Shape of X:", X.shape, "| NaN Values:", X.isna().sum().sum())
print("Shape of X_test:", X_test.shape, "| NaN Values:", X_test.isna().sum().sum())
# Checking if both have same Columns
if list(X_test.columns) == list(X.columns):
    print("Both DataFrames have same Columns")
else:
    print("There's a difference in columns between the DataFrames")

# Hold out 20% of the labelled rows for validation; the fixed seed keeps the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

Shape of X: (1460, 60) | NaN Values: 0
Shape of X_test: (1459, 60) | NaN Values: 0
Both DataFrames have same Columns


## Ordinal Encoding

In [6]:
# Categorical (object-dtype) feature columns, in their original column order.
# Defined here so the notebook does not depend on a variable left over in the kernel.
object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]

# A column is "good" when every value seen in the validation split also occurs in
# the training split, so an encoder fitted on training data can transform it.
good_label_cols = [col for col in object_cols if set(X_valid[col]).issubset(set(X_train[col]))]
# Built with a comprehension rather than a set difference so the order is the same on every run.
bad_label_cols = [col for col in object_cols if col not in good_label_cols]

print("Columns with good labels that are gonna be ordinally encoded:\n", good_label_cols)
print()
print("Columns with bad labels that'll be dropped:", bad_label_cols)

Columns with good labels that are gonna be ordinally encoded:
 ['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'BldgType', 'HouseStyle', 'RoofStyle', 'Exterior1st', 'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation', 'Heating', 'HeatingQC', 'CentralAir', 'KitchenQual', 'PavedDrive', 'SaleType', 'SaleCondition']

Columns with bad labels that'll be dropped: ['RoofMatl', 'Condition2', 'Functional']


In [7]:
# good_label_cols was only checked against the validation split, so the test set
# may still contain categories the encoder never saw during fit. Map those to -1
# instead of letting transform() raise.
ordinal_encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)

label_X_train = X_train.drop(bad_label_cols, axis=1)
label_X_valid = X_valid.drop(bad_label_cols, axis=1)
label_X_test = X_test.drop(bad_label_cols, axis=1)

# Fit on the training split only, then reuse the fitted encoder for the other two.
# The encoder output carries no index, so the source frame's index is attached.
ord_X_train = pd.DataFrame(
    ordinal_encoder.fit_transform(label_X_train[good_label_cols]), index=label_X_train.index
)
ord_X_valid = pd.DataFrame(
    ordinal_encoder.transform(label_X_valid[good_label_cols]), index=label_X_valid.index
)
ord_X_test = pd.DataFrame(
    ordinal_encoder.transform(label_X_test[good_label_cols]), index=label_X_test.index
)

# The encoded frames have integer column labels 0..k-1; rename them to "col_1".."col_k".
col_names_to_replace_with = {col: "col_"+str(col+1) for col in ord_X_train.columns}
ord_X_train = ord_X_train.rename(columns=col_names_to_replace_with)
ord_X_valid = ord_X_valid.rename(columns=col_names_to_replace_with)
ord_X_test = ord_X_test.rename(columns=col_names_to_replace_with)

# Swap the raw categorical columns for their encoded counterparts.
label_X_train = pd.concat([label_X_train.drop(good_label_cols, axis=1), ord_X_train], axis=1)
label_X_valid = pd.concat([label_X_valid.drop(good_label_cols, axis=1), ord_X_valid], axis=1)
label_X_test = pd.concat([label_X_test.drop(good_label_cols, axis=1), ord_X_test], axis=1)

In [8]:
# Preview the first five rows of the ordinal-encoded training features.
label_X_train.head(n=5)

Unnamed: 0,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,...,col_15,col_16,col_17,col_18,col_19,col_20,col_21,col_22,col_23,col_24
618,20,11694,9,5,2007,2007,48,0,1774,1822,...,0.0,4.0,2.0,1.0,0.0,1.0,2.0,2.0,6.0,5.0
870,20,6600,5,5,1962,1962,0,0,894,894,...,3.0,4.0,1.0,1.0,2.0,0.0,3.0,2.0,8.0,4.0
92,30,13360,5,7,1921,2006,713,0,163,876,...,3.0,2.0,0.0,1.0,0.0,1.0,3.0,2.0,8.0,4.0
817,20,13265,8,5,2002,2002,1218,0,350,1568,...,2.0,4.0,2.0,1.0,0.0,1.0,2.0,2.0,8.0,4.0
302,20,13704,7,5,2001,2002,0,0,1541,1541,...,2.0,4.0,2.0,1.0,0.0,1.0,2.0,2.0,8.0,4.0


In [9]:
# Validation MAE of a depth-10 random forest on the ordinal-encoded features.
rf_mae(
    max_depth=10,
    train_X=label_X_train,
    val_X=label_X_valid,
    train_y=y_train,
    val_y=y_valid,
)

17073.798638387438

## One-Hot Encoding

In [10]:
# Number of distinct training values per categorical column ...
d = {name: values.nunique() for name, values in X_train[object_cols].items()}
# ... and the same (column, count) pairs ordered from lowest to highest count.
d_sorted = sorted(d.items(), key=lambda pair: pair[1])

In [11]:
# Categorical columns with fewer distinct training values than this count as low-cardinality.
MAX_ONE_HOT_CARDINALITY = 10

low_cardinal_col = [col for col in object_cols if X_train[col].nunique() < MAX_ONE_HOT_CARDINALITY]
# Built with a comprehension rather than a set difference so the order is the same on every run.
high_cardinal_col = [col for col in object_cols if col not in low_cardinal_col]

In [12]:
# OneHotEncoder is already imported in the notebook's import cell, so it is not re-imported here.
# handle_unknown='ignore' keeps transform() from raising on categories absent from the training split.
OH_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Fit on the training split only, then reuse the fitted encoder for the other two.
# The encoder output carries no index, so the source frame's index is attached.
ohe_train = pd.DataFrame(OH_encoder.fit_transform(X_train[low_cardinal_col]), index=X_train.index)
ohe_valid = pd.DataFrame(OH_encoder.transform(X_valid[low_cardinal_col]), index=X_valid.index)
ohe_test = pd.DataFrame(OH_encoder.transform(X_test[low_cardinal_col]), index=X_test.index)

# The encoded frames have integer column labels 0..k-1; rename them to "col_1".."col_k".
OH_col_names_to_replace_with = {col: "col_"+str(col+1) for col in ohe_train.columns}
ohe_train = ohe_train.rename(columns=OH_col_names_to_replace_with)
ohe_valid = ohe_valid.rename(columns=OH_col_names_to_replace_with)
ohe_test = ohe_test.rename(columns=OH_col_names_to_replace_with)

# Drop every categorical column; only the low-cardinality ones come back, one-hot encoded.
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)
num_X_test = X_test.drop(object_cols, axis=1)

OH_X_train = pd.concat([num_X_train, ohe_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, ohe_valid], axis=1)
OH_X_test = pd.concat([num_X_test, ohe_test], axis=1)

In [13]:
# Validation MAE of a depth-10 random forest on the one-hot-encoded features.
rf_mae(
    max_depth=10,
    train_X=OH_X_train,
    val_X=OH_X_valid,
    train_y=y_train,
    val_y=y_valid,
)

17580.087214090203

In [14]:
def final_pred_result(X_train, y_train, X_test):
    """Average the predictions of a random forest and a gradient-boosting model.

    Both regressors are fitted on ``(X_train, y_train)`` and asked to predict
    ``X_test``; the element-wise mean of the two prediction arrays is returned.
    """
    forest = RandomForestRegressor(n_estimators=100, max_depth=20, random_state=1)
    booster = GradientBoostingRegressor(n_estimators=100, random_state=1)

    # fit() returns the estimator itself, so fitting and predicting can be chained.
    forest_preds = forest.fit(X_train, y_train).predict(X_test)
    booster_preds = booster.fit(X_train, y_train).predict(X_test)

    return (forest_preds + booster_preds) / 2

# Ensemble test-set predictions, once per encoding of the categorical features.
Label_final_preds = final_pred_result(X_train=label_X_train, y_train=y_train, X_test=label_X_test)
OH_final_preds = final_pred_result(X_train=OH_X_train, y_train=y_train, X_test=OH_X_test)

In [15]:
# Uncomment to write the submission file from the ordinal-encoded ensemble predictions.
# output = pd.DataFrame({'Id': test_df['Id'], 'SalePrice': Label_final_preds})
# output.to_csv("Submission.csv", index=False)