In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [15]:
def root_mean_squared_error(predictions, y_valid):
    return np.sqrt(mean_squared_error(predictions, y_valid))

def rf_rmse(max_depth, train_X, val_X, train_y, val_y):
    model = RandomForestRegressor(n_estimators=100, max_depth=max_depth, random_state=42)
    model.fit(train_X, train_y)
    predictions = model.predict(val_X)
    rmse = np.sqrt(mean_squared_error(predictions, val_y))
    return rmse

def gbm_rmse(train_X, val_X, train_y, val_y):
    model = GradientBoostingRegressor(n_estimators=100, random_state=42)
    model.fit(train_X, train_y)
    predictions = model.predict(val_X)
    rmse = np.sqrt(mean_squared_error(predictions, val_y))
    return rmse

In [9]:
train_df = pd.read_csv("E:/Projects/Learning/ML/Housing Prices Prediction/Data/train.csv").drop("Id", axis=1)
test_df = pd.read_csv("E:/Projects/Learning/ML/Housing Prices Prediction/Data/test.csv")
train_df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [24]:
X = train_df.dropna(axis=0, subset='SalePrice')
X = X.drop('SalePrice', axis=1)
X_test_full = test_df.drop("Id", axis=1)
y = train_df.SalePrice

# X_test must have 1459 rows
print("Shape of X:", X.shape, "| NaN Values:", X.isna().sum().sum())
print("Shape of X_test:", X_test_full.shape, "| NaN Values:", X_test_full.isna().sum().sum())
# Checking if both have same Columns
if list(X_test_full.columns) == list(X.columns):
    print("Both DataFrames have same Columns")
else:
    print("There's a difference in columns between the DataFrames")
    
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=42)

# Selecting Categorical columns with low cardinality
categorical_cols = [col for col in X_train_full.columns 
                       if X_train_full[col].dtype=='object' 
                       and X_train_full[col].nunique()<10]

# Select Numerical columns
numerical_cols = [col for col in X_train_full.columns
                 if X_train_full[col].dtype in ['int64', 'float64']]

my_cols = numerical_cols + categorical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()
X_test = X_test_full[my_cols].copy()

Shape of X: (1460, 79) | NaN Values: 7829
Shape of X_test: (1459, 79) | NaN Values: 7878
Both DataFrames have same Columns


In [23]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='most_frequent')

# Preprocessong for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehotencoder', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Define Model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42, max_depth=10)
gbm_model = GradientBoostingRegressor(n_estimators=100, random_state=42)

# Bundle preprocessing and modelling code
def pipeline_model(preprocessor, model):
    my_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', rf_model)
    ])
    return my_pipeline
    
my_pipeline = pipeline_model(preprocessor=preprocessor, model=gbm_model)
my_pipeline.fit(X_train, y_train)
predictions = my_pipeline.predict(X_valid)

rmse = root_mean_squared_error(predictions, y_valid)
print("RMSE:", rmse)

RMSE: 29385.821998319723


In [None]:
# for depth in range(10, 31, 2):
#     rf_model = RandomForestRegressor(n_estimators=100, random_state=42, max_depth=depth)
#     my_pipeline = pipeline_model(preprocessor=preprocessor, model=rf_model)
#     my_pipeline.fit(X_train, y_train)
#     predictions = my_pipeline.predict(X_valid)
#     rmse = root_mean_squared_error(predictions, y_valid)
#     print(rmse)