In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
def rf_rmse(max_depth, train_X, val_X, train_y, val_y):
    """Fit a random forest on the training split and return its validation RMSE.

    Parameters
    ----------
    max_depth : int or None
        Forwarded unchanged to ``RandomForestRegressor(max_depth=...)``.
    train_X, train_y
        Features and target the model is fitted on.
    val_X, val_y
        Held-out features and target the model is scored on.

    Returns
    -------
    float
        Root mean squared error of the predictions made for ``val_X``.
    """
    model = RandomForestRegressor(n_estimators=100, max_depth=max_depth, random_state=1)
    model.fit(train_X, train_y)
    predictions = model.predict(val_X)
    # scikit-learn metrics are declared as (y_true, y_pred). Passing by keyword
    # keeps the roles explicit and stays correct if the metric is ever swapped
    # for one that is not symmetric in its arguments.
    return np.sqrt(mean_squared_error(y_true=val_y, y_pred=predictions))
    
def gbm_rmse(rate, train_X, val_X, train_y, val_y):
    """Fit a gradient boosting model on the training split and return its validation RMSE.

    Parameters
    ----------
    rate : float
        Forwarded unchanged to ``GradientBoostingRegressor(learning_rate=...)``.
    train_X, train_y
        Features and target the model is fitted on.
    val_X, val_y
        Held-out features and target the model is scored on.

    Returns
    -------
    float
        Root mean squared error of the predictions made for ``val_X``.
    """
    model = GradientBoostingRegressor(n_estimators=100, learning_rate=rate, random_state=1)
    model.fit(train_X, train_y)
    predictions = model.predict(val_X)
    # scikit-learn metrics are declared as (y_true, y_pred). Passing by keyword
    # keeps the roles explicit and stays correct if the metric is ever swapped
    # for one that is not symmetric in its arguments.
    return np.sqrt(mean_squared_error(y_true=val_y, y_pred=predictions))

In [3]:
# Both CSVs live in the same folder. Name that folder once so running the
# notebook on another machine only requires editing this single line.
DATA_DIR = "E:/Projects/Learning/ML/Housing Prices Prediction/Data"

# The training frame drops "Id" immediately; test_df keeps its "Id" column.
train_df = pd.read_csv(f"{DATA_DIR}/train.csv").drop("Id", axis=1)
test_df = pd.read_csv(f"{DATA_DIR}/test.csv")
train_df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
# Keep only rows with a known target, then take BOTH the features and the
# target from that same filtered frame. Reading y from the unfiltered train_df
# would leave X and y with different rows whenever a SalePrice is missing.
# (subset is given as a list, which every pandas version accepts.)
train_labeled = train_df.dropna(axis=0, subset=['SalePrice'])
X = train_labeled.drop('SalePrice', axis=1)
X_test = test_df.drop("Id", axis=1)
y = train_labeled.SalePrice

# Drop every object-dtype column that has missing values in the training
# features, and drop the same columns from the test features.
missing_value_ocol_train = [col for col in X.columns if X[col].isna().any() and X[col].dtype == 'object']
X = X.drop(missing_value_ocol_train, axis=1)
object_cols = [col for col in X.columns if X[col].dtype == 'object']
X_test = X_test.drop(missing_value_ocol_train, axis=1)

# There are more columns in test data with missing values
missing_value_ocol_test = [col for col in X_test.columns if X_test[col].isna().any() and X_test[col].dtype == 'object']
# Filling NaN values because dropping isn't an option
for col in missing_value_ocol_test:
    # The list above already contains object columns only, so no dtype re-check.
    most_frequent_value = X_test[col].mode()[0]
    # Assign the filled column back. Calling fillna(inplace=True) on the
    # X_test[col] selection is chained assignment: it warns on pandas 2.x and
    # leaves X_test untouched once Copy-on-Write is active.
    X_test[col] = X_test[col].fillna(most_frequent_value)

# X_test must have 1459 rows
print("Shape of X:", X.shape, "| NaN Values:", X.isna().sum().sum())
print("Shape of X_test:", X_test.shape, "| NaN Values:", X_test.isna().sum().sum())
# Checking if both have same Columns
if list(X_test.columns) == list(X.columns):
    print("Both DataFrames have same Columns")
else:
    print("There's a difference in columns between the DataFrames")

Shape of X: (1460, 63) | NaN Values: 348
Shape of X_test: (1459, 63) | NaN Values: 330
Both DataFrames have same Columns


In [None]:
# Numeric part of the data: every column whose dtype is not 'object'.
num_X = X.select_dtypes(exclude='object')
num_X_test = X_test.select_dtypes(exclude='object')

# Missing entries are replaced with the most frequent value of each column.
# The imputer is fitted on the training columns and then reused for the test columns.
my_imputer = SimpleImputer(strategy='most_frequent')

# The imputer output is wrapped in a new DataFrame, so the column labels are
# supplied directly to the constructor.
imputed_X = pd.DataFrame(my_imputer.fit_transform(num_X), columns=num_X.columns)
imputed_X_test = pd.DataFrame(my_imputer.transform(num_X_test), columns=num_X_test.columns)

In [None]:
# Columns with good labels from Training: every category that appears in the
# test column also appears in the training column.
good_label_cols = [col for col in object_cols if set(X_test[col]).issubset(set(X[col]))]
# Columns where the test data contains at least one category unseen in training
# (sorted so the list is the same on every run).
bad_label_cols = sorted(set(object_cols) - set(good_label_cols))

# A default OrdinalEncoder raises on categories it did not see during fit,
# which would abort the notebook if bad_label_cols is non-empty. Encode such
# values as -1 instead; known categories receive the same codes as before.
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

object_X = X[object_cols].copy()
object_X_test = X_test[object_cols].copy()

# Fit on the training categories only, then apply the same mapping to test.
ord_X = pd.DataFrame(ordinal_encoder.fit_transform(object_X))
ord_X_test = pd.DataFrame(ordinal_encoder.transform(object_X_test))

# The frames above have integer column labels (0, 1, ...); rename them to
# "col_1", "col_2", ... so they are strings like the numeric column names.
col_names_to_replace_with = {col: "col_"+str(col+1) for col in ord_X.columns}
ord_X = ord_X.rename(columns=col_names_to_replace_with)
ord_X_test = ord_X_test.rename(columns=col_names_to_replace_with)

# Final feature matrices: imputed numeric columns followed by encoded categoricals.
label_X = pd.concat([imputed_X, ord_X], axis=1)
label_X_test = pd.concat([imputed_X_test, ord_X_test], axis=1)

In [8]:
def final_pred_result(X_train, y_train, X_test):
    """Train a random forest and a gradient boosting model and blend their predictions.

    Both regressors are fitted on ``(X_train, y_train)`` and asked to predict
    ``X_test``; the return value is the element-wise average of the two
    prediction arrays.
    """
    forest = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=1)
    booster = GradientBoostingRegressor(n_estimators=100, random_state=1)

    # fit() returns the estimator itself, so fitting and predicting chain.
    forest_preds = forest.fit(X_train, y_train).predict(X_test)
    booster_preds = booster.fit(X_train, y_train).predict(X_test)

    # Equal-weight blend of the two models.
    return (forest_preds + booster_preds) / 2

# Blended RF + GBM predictions for the test set, using the feature frames
# label_X / label_X_test (imputed numeric + ordinal-encoded categorical columns).
Label_final_preds = final_pred_result(label_X, y, label_X_test)
# One-hot variant, left disabled: OH_X_train, y_train and OH_X_test are not
# created anywhere in the cells visible here.
# OH_final_preds = final_pred_result(OH_X_train, y_train, OH_X_test)

In [9]:
# Submission export, currently disabled. When uncommented it pairs each test
# 'Id' with its blended prediction and writes the result to "Submission.csv".
# output = pd.DataFrame({'Id': test_df['Id'], 'SalePrice': Label_final_preds})
# output.to_csv("Submission.csv", index=False)