In [2]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

In [3]:
# Folder holding train.csv and test.csv. The default is the location this
# notebook was originally run from; set the HOUSING_DATA_DIR environment
# variable to point at the data on another machine without editing the cell.
DATA_DIR = Path(
    os.environ.get(
        "HOUSING_DATA_DIR",
        "E:/Projects/Learning/ML/Housing Prices Prediction/Data",
    )
)

# "Id" is dropped from the training frame (presumably a pure row identifier —
# TODO confirm). test_df keeps it because the submission cell reads test_df['Id'].
train_df = pd.read_csv(DATA_DIR / "train.csv").drop(columns="Id")
test_df = pd.read_csv(DATA_DIR / "test.csv")
train_df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
# train_df.columns

In [4]:
def rf_rmse(max_depth, train_X, val_X, train_y, val_y, n_estimators=100, random_state=1):
    """Fit a RandomForestRegressor and return its RMSE on the validation data.

    Parameters
    ----------
    max_depth
        Maximum tree depth, forwarded to ``RandomForestRegressor``.
    train_X, train_y
        Features and targets the model is fitted on.
    val_X, val_y
        Held-out features and targets the fitted model is scored on.
    n_estimators : int, default 100
        Number of trees, forwarded to ``RandomForestRegressor``.
    random_state : int, default 1
        Seed forwarded to ``RandomForestRegressor``.

    Returns
    -------
    float
        Square root of the mean squared error between ``val_y`` and the
        model's predictions for ``val_X``.
    """
    model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=random_state,
    )
    model.fit(train_X, train_y)
    predictions = model.predict(val_X)
    # scikit-learn metrics are declared as (y_true, y_pred). Plain MSE is
    # symmetric, so the value is unchanged, but the documented order keeps the
    # call correct if the metric is later swapped or weighted.
    return np.sqrt(mean_squared_error(val_y, predictions))
    
def gbm_rmse(train_X, val_X, train_y, val_y, n_estimators=100, random_state=1):
    """Fit a GradientBoostingRegressor and return its RMSE on the validation data.

    Parameters
    ----------
    train_X, train_y
        Features and targets the model is fitted on.
    val_X, val_y
        Held-out features and targets the fitted model is scored on.
    n_estimators : int, default 100
        Number of boosting stages, forwarded to ``GradientBoostingRegressor``.
    random_state : int, default 1
        Seed forwarded to ``GradientBoostingRegressor``.

    Returns
    -------
    float
        Square root of the mean squared error between ``val_y`` and the
        model's predictions for ``val_X``.
    """
    model = GradientBoostingRegressor(
        n_estimators=n_estimators,
        random_state=random_state,
    )
    model.fit(train_X, train_y)
    predictions = model.predict(val_X)
    # scikit-learn metrics are declared as (y_true, y_pred). Plain MSE is
    # symmetric, so the value is unchanged, but the documented order keeps the
    # call correct if the metric is later swapped or weighted.
    return np.sqrt(mean_squared_error(val_y, predictions))

# Target column of the labelled data.
y = train_df["SalePrice"]

# Feature matrices: every non-object column, minus the target (training data)
# or the "Id" column (test data).
X = train_df.select_dtypes(exclude='object').drop(columns='SalePrice')
X_test = test_df.select_dtypes(exclude='object').drop(columns="Id")

# 90/10 train/validation split with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.1,
    train_size=0.9,
    random_state=1,
)

In [7]:
# Per-column count of missing values in the test feature matrix.
X_test.isnull().sum()

MSSubClass         0
LotFrontage      227
LotArea            0
OverallQual        0
OverallCond        0
YearBuilt          0
YearRemodAdd       0
MasVnrArea        15
BsmtFinSF1         1
BsmtFinSF2         1
BsmtUnfSF          1
TotalBsmtSF        1
1stFlrSF           0
2ndFlrSF           0
LowQualFinSF       0
GrLivArea          0
BsmtFullBath       2
BsmtHalfBath       2
FullBath           0
HalfBath           0
BedroomAbvGr       0
KitchenAbvGr       0
TotRmsAbvGrd       0
Fireplaces         0
GarageYrBlt       78
GarageCars         1
GarageArea         1
WoodDeckSF         0
OpenPorchSF        0
EnclosedPorch      0
3SsnPorch          0
ScreenPorch        0
PoolArea           0
MiscVal            0
MoSold             0
YrSold             0
dtype: int64

In [5]:
# Replace missing values with per-column means. The imputer is fitted on the
# training split only and then re-used, unchanged, for validation and test.
my_imputer = SimpleImputer(strategy='mean')

# The imputer returns plain arrays, so each frame is rebuilt with its source
# column labels and row index in a single step. Keeping the index leaves the
# imputed rows label-aligned with y_train / y_valid.
imputed_X_train = pd.DataFrame(
    my_imputer.fit_transform(X_train), columns=X_train.columns, index=X_train.index
)
imputed_X_valid = pd.DataFrame(
    my_imputer.transform(X_valid), columns=X_valid.columns, index=X_valid.index
)
imputed_X_test = pd.DataFrame(
    my_imputer.transform(X_test), columns=X_test.columns, index=X_test.index
)

# Score both model families on the held-out validation split.
gbm_error = gbm_rmse(imputed_X_train, imputed_X_valid, y_train, y_valid)
rf_error = rf_rmse(25, imputed_X_train, imputed_X_valid, y_train, y_valid)

print(f"Validation RMSE for GradientBoostingRegressor: {gbm_error}")
print(f"Validation RMSE for RandomForestRegressor: {rf_error}")

Validation RMSE for GradientBoostionRegressor: 21014.722808714032
Validation RMSE for RandomForestRegressor: 22289.395560339508


In [6]:
# Fit both regressors on the imputed training split, using the same
# hyperparameters that were scored on the validation split.
# NOTE(review): imputed_X_train is only the 90% split; refitting on all
# labelled rows before predicting the test set may be worth considering.
rf_model = RandomForestRegressor(max_depth=25, n_estimators=100, random_state=1)
gbm_model = GradientBoostingRegressor(random_state=1, n_estimators=100)
rf_model.fit(imputed_X_train, y_train)
gbm_model.fit(imputed_X_train, y_train)

# Predict the test set with each model.
rf_preds = rf_model.predict(imputed_X_test)
gbm_preds = gbm_model.predict(imputed_X_test)

# Ensemble: unweighted average of the two prediction vectors.
final_preds = 0.5 * (rf_preds + gbm_preds)

In [7]:
# Two-column submission table: the test-set Id next to the averaged prediction.
output = pd.DataFrame(
    data={
        'Id': test_df['Id'],
        'SalePrice': final_preds,
    }
)
# Written to the working directory without the DataFrame index column.
output.to_csv(path_or_buf="Submission.csv", index=False)