In [None]:
import os
from pathlib import Path

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt

# Folder that holds train.csv and test.csv. Set the HOUSING_DATA_DIR
# environment variable to point somewhere else; the fallback is the
# original hard-coded location, so existing setups keep working.
DATA_DIR = Path(
    os.environ.get(
        "HOUSING_DATA_DIR",
        "E:/Projects/Learning/ML/Housing Prices Prediction/Data",
    )
)

# "Id" is removed from the training frame right away; the test frame keeps
# it at this point.
train_df = pd.read_csv(DATA_DIR / "train.csv").drop("Id", axis=1)
test_df = pd.read_csv(DATA_DIR / "test.csv")
train_df.head()

In [3]:
# Keep only rows with a known target, then split features from target.
# y is read from the same filtered frame as X so the two always stay
# row-aligned, even when rows with a missing SalePrice are dropped.
train_labeled = train_df.dropna(axis=0, subset=['SalePrice'])
X = train_labeled.drop('SalePrice', axis=1)
y = train_labeled.SalePrice
X_test = test_df.drop("Id", axis=1)

# Object columns that contain NaN in the training data are removed from
# both the training and the test features.
missing_value_ocol_train = [col for col in X.columns if X[col].isna().any() and X[col].dtype == 'object']
X = X.drop(missing_value_ocol_train, axis=1)
object_cols = [col for col in X.columns if X[col].dtype == 'object']
X_test = X_test.drop(missing_value_ocol_train, axis=1)

# There are more columns in test data with missing values
missing_value_ocol_test = [col for col in X_test.columns if X_test[col].isna().any() and X_test[col].dtype == 'object']
# Filling NaN values because dropping isn't an option
for col in missing_value_ocol_test:
    most_frequent_value = X_test[col].mode()[0]
    # Assign the filled column back instead of calling fillna(inplace=True)
    # on X_test[col]: the chained in-place form is deprecated and does not
    # update X_test once pandas Copy-on-Write is active.
    X_test[col] = X_test[col].fillna(most_frequent_value)

# X_test must have 1459 rows
print("Shape of X:", X.shape, "| NaN Values:", X.isna().sum().sum())
print("Shape of X_test:", X_test.shape, "| NaN Values:", X_test.isna().sum().sum())
# Checking if both have same Columns
if list(X_test.columns) == list(X.columns):
    print("Both DataFrames have same Columns")
else:
    print("There's a difference in columns between the DataFrames")

Shape of X: (1460, 63) | NaN Values: 348
Shape of X_test: (1459, 63) | NaN Values: 330
Both DataFrames have same Columns


In [4]:
# Numeric (non-object) feature columns of the training and test sets.
num_X = X.select_dtypes(exclude='object')
num_X_test = X_test.select_dtypes(exclude='object')

# Fill missing numeric values with each column's most frequent value.
# The imputer is fitted on the training data only and reused on the test data.
my_imputer = SimpleImputer(strategy='most_frequent')

# The imputer returns bare arrays, so restore both the column labels and the
# original row index. Keeping the index matters because these frames are
# later concatenated column-wise with frames that carry X / X_test's index;
# a fresh 0..n-1 index would misalign rows whenever the source index differs.
imputed_X = pd.DataFrame(
    my_imputer.fit_transform(num_X),
    columns=num_X.columns,
    index=num_X.index,
)
imputed_X_test = pd.DataFrame(
    my_imputer.transform(num_X_test),
    columns=num_X_test.columns,
    index=num_X_test.index,
)

In [5]:
# Count the distinct values of every remaining object column ...
nunique_dict = {}
for column_name in object_cols:
    nunique_dict[column_name] = X[column_name].nunique()

# ... and list the (column, count) pairs from lowest to highest count.
nunique_dict_sorted = sorted(nunique_dict.items(), key=lambda pair: pair[1])

In [6]:
# Object columns with fewer than 10 distinct training values are one-hot
# encoded. The remaining object columns are only listed here (in their
# original, deterministic order); they are not part of OH_X / OH_X_test.
low_cardinal_col = [col for col in object_cols if X[col].nunique() < 10]
high_cardinal_col = [col for col in object_cols if col not in low_cardinal_col]

# handle_unknown='ignore': categories that appear only in the test data are
# encoded as all zeros instead of raising an error.
OH_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

low_cardinality_X = X[low_cardinal_col].copy()
low_cardinality_X_test = X_test[low_cardinal_col].copy()

# The encoder returns bare arrays; put the original row index back so the
# column-wise concat below lines rows up by label.
ohe_X = pd.DataFrame(OH_encoder.fit_transform(low_cardinality_X), index=low_cardinality_X.index)
ohe_X_test = pd.DataFrame(OH_encoder.transform(low_cardinality_X_test), index=low_cardinality_X_test.index)

# Map each positional column (0, 1, ...) to the encoder's own
# "<feature>_<category>" name so every encoded column stays traceable to
# its source feature. String names are also needed because the combined
# frame must not mix integer and string column labels.
OH_col_names_to_replace_with = {
    position: str(feature_name)
    for position, feature_name in enumerate(OH_encoder.get_feature_names_out())
}
ohe_X = ohe_X.rename(columns=OH_col_names_to_replace_with)
ohe_X_test = ohe_X_test.rename(columns=OH_col_names_to_replace_with)

# Final model inputs: imputed numeric columns followed by the one-hot columns.
OH_X = pd.concat([imputed_X, ohe_X], axis=1)
OH_X_test = pd.concat([imputed_X_test, ohe_X_test], axis=1)

In [7]:
def final_pred_result(X_train, y_train, X_test):
    """Return the mean of a random-forest and a gradient-boosting prediction.

    Both regressors are fitted on ``X_train`` / ``y_train`` with fixed
    ``random_state=1`` and then asked to predict ``X_test``.

    Parameters
    ----------
    X_train : training feature matrix passed to ``fit``.
    y_train : training target values passed to ``fit``.
    X_test : feature matrix passed to ``predict``.

    Returns
    -------
    Element-wise average of the two models' predictions for ``X_test``.
    """
    forest = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=1)
    booster = GradientBoostingRegressor(n_estimators=100, learning_rate=0.15, random_state=1)

    # fit() returns the fitted estimator, so fitting and predicting can be chained.
    forest_preds, booster_preds = (
        estimator.fit(X_train, y_train).predict(X_test)
        for estimator in (forest, booster)
    )

    # Equal-weight blend of the two models.
    return (forest_preds + booster_preds) / 2

# Disabled alternative: the same two-model blend on label-encoded features.
# NOTE(review): label_X / label_X_test are not defined in the visible cells —
# confirm they exist before re-enabling this line.
# Label_final_preds = final_pred_result(label_X, y, label_X_test)
# Blended test-set predictions using the one-hot encoded feature frames.
OH_final_preds = final_pred_result(OH_X, y, OH_X_test)

In [8]:
# Disabled export step: uncomment both lines to pair each test "Id" with its
# blended prediction and write the result to "Submission.csv" (no index column).
# output = pd.DataFrame({'Id': test_df['Id'], 'SalePrice': OH_final_preds})
# output.to_csv("Submission.csv", index=False)