In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import seaborn as sns
import matplotlib.pyplot as plt
color_pal = sns.color_palette('bright')

In [2]:
train_df = pd.read_csv("E:/Projects/Learning/ML/Housing Prices Prediction/train.csv")
test_df = pd.read_csv("E:/Projects/Learning/ML/Housing Prices Prediction/test.csv")
train_df.tail()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125
1459,1460,20,RL,75.0,9937,Pave,,Reg,Lvl,AllPub,...,0,,,,0,6,2008,WD,Normal,147500


In [3]:
def LotFrontage_rf_model(X, y, test_X):
    lot_frontage_model = RandomForestRegressor(max_depth=8, n_estimators=100, random_state=1)
    lot_frontage_model.fit(X, y)
    return lot_frontage_model.predict(test_X)
    
def preprocess_TRAIN_DF(train_df):
    conti_train_df = train_df.select_dtypes(exclude='object').drop('SalePrice', axis=1)
    conti_train_df = conti_train_df.drop("Id", axis=1)
    missing_value_col = [col for col in conti_train_df.columns if conti_train_df[col].isna().any()]
    conti_train_df['MasVnrArea'] = conti_train_df['MasVnrArea'].fillna(0)
    conti_train_df.drop("GarageYrBlt", axis=1, inplace=True)
    feature_train_df = conti_train_df.loc[conti_train_df['LotFrontage'].notna()]
    feature_test_df = conti_train_df.loc[conti_train_df['LotFrontage'].isna()]
    
    X = feature_train_df.drop("LotFrontage", axis=1)
    y = feature_train_df.LotFrontage
    test_X = feature_test_df.drop("LotFrontage", axis=1)
    
    predictions = LotFrontage_rf_model(X, y, test_X)
        
    p_lot_frontage_df = test_X.copy()
    p_lot_frontage_df['LotFrontage'] = predictions
    p_lot_frontage_df = p_lot_frontage_df[feature_train_df.columns]
    
    add_train_preds = pd.concat([feature_train_df, p_lot_frontage_df], axis=0).sort_index()
    train_output = pd.concat([add_train_preds, train_df['SalePrice']], axis=1)
    return train_output

def preprocess_test_DF(test_df):    
    conti_train_df = test_df.select_dtypes(exclude='object')
    conti_train_df = conti_train_df.drop("Id", axis=1)
    missing_value_col = [col for col in conti_train_df.columns if conti_train_df[col].isna().any()]
    missing_value_col.remove('LotFrontage')
    missing_value_col.remove('MasVnrArea')
    missing_value_col.remove('GarageYrBlt')
    conti_train_df['MasVnrArea'] = conti_train_df['MasVnrArea'].fillna(0)
    conti_train_df.drop("GarageYrBlt", axis=1, inplace=True)
    conti_train_df[missing_value_col] = conti_train_df[missing_value_col].fillna(-1)
    # print(conti_train_df.isna().sum())
    
    feature_train_df = conti_train_df.loc[conti_train_df['LotFrontage'].notna()]
    feature_test_df = conti_train_df.loc[conti_train_df['LotFrontage'].isna()]
    
    X = feature_train_df.drop("LotFrontage", axis=1)
    y = feature_train_df.LotFrontage
    test_X = feature_test_df.drop("LotFrontage", axis=1)
    
    predictions = LotFrontage_rf_model(X, y, test_X)
        
    p_lot_frontage_df = test_X.copy()
    p_lot_frontage_df['LotFrontage'] = predictions
    p_lot_frontage_df = p_lot_frontage_df[feature_train_df.columns]
    
    test_output = pd.concat([feature_train_df, p_lot_frontage_df], axis=0).sort_index()
    return test_output

In [4]:
final_train_df = preprocess_TRAIN_DF(train_df)
final_test_df = preprocess_test_DF(test_df)

In [5]:
conti_train_df.shape

NameError: name 'conti_train_df' is not defined

In [5]:
X = final_train_df.drop("SalePrice", axis=1)
y = final_train_df.SalePrice
test_X = final_test_df.copy()

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, train_size=0.8, random_state=1)

In [6]:
def rf_rmse(max_depth, X_train, X_valid, y_train, y_valid):
    model = RandomForestRegressor(n_estimators=100, max_depth=max_depth, random_state=1)
    model.fit(X_train, y_train)
    predictions = model.predict(X_valid)
    rmse = np.sqrt(mean_squared_error(predictions, y_valid))
    return rmse
    
def gbm_rmse(X_train, X_valid, y_train, y_valid):
    model = GradientBoostingRegressor(n_estimators=100, random_state=1)
    model.fit(X_train, y_train)
    predictions = model.predict(X_valid)
    rmse = np.sqrt(mean_squared_error(predictions, y_valid))
    return rmse
    
# for depth in range(10, 20):
#     error = rf_rmse(depth, X_train, X_valid, y_train, y_valid)
#     print(f"error: {error} | depth: {depth}")

gbm_error = gbm_rmse(X_train, X_valid, y_train, y_valid)
rf_error = rf_rmse(12, X_train, X_valid, y_train, y_valid)

print(f"Validation RMSE for GradientBoostionRegressor: {gbm_error}")
print(f"Validation RMSE for RandomForestRegressor: {rf_error}")

Validation RMSE for GradientBoostionRegressor: 23169.94000665154
Validation RMSE for RandomForestRegressor: 27243.073498958587


In [7]:
rf_model = RandomForestRegressor(n_estimators=200, max_depth=20, random_state=1)
rf_model.fit(X_train, y_train)
rf_preds = rf_model.predict(test_X)

gbm_model = GradientBoostingRegressor(n_estimators=200, random_state=1)
gbm_model.fit(X_train, y_train)
gbm_preds = gbm_model.predict(test_X)

final_preds = (rf_preds + gbm_preds) / 2

In [8]:
output = pd.DataFrame({'Id': test_df['Id'], 'SalePrice': final_preds})
output.to_csv("Submission.csv", index=False)