In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,mean_absolute_error
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.model_selection import train_test_split
import pprint
import warnings
warnings.filterwarnings('ignore')

In [2]:
df1 = pd.read_csv('train.csv')
df = df1.copy()

In [3]:
df['PoolQC'] = df['PoolQC'].fillna(value='NP')
df['MiscFeature'] = df['MiscFeature'].fillna(value='Not_present')
df['Alley'] = df['Alley'].fillna(value='No_access')
df['Fence'] = df['Fence'].fillna(value='No_fence')
df['FireplaceQu'] = df['FireplaceQu'].fillna(value='No_fire')
df['LotFrontage'] = df['LotFrontage'].fillna(value=0.0)
df['GarageFinish'] = df['GarageFinish'].fillna(value='No_garage')
df['GarageYrBlt'] = df['GarageYrBlt'].fillna(value=df['GarageYrBlt'].mode()[0])
df['GarageQual'] = df['GarageQual'].fillna(value='No_garage')
df['GarageType'] = df['GarageType'].fillna(value='No_garage')
df['GarageCond'] = df['GarageCond'].fillna(value='No_garage')
df['BsmtFinType2'] = df['BsmtFinType2'].fillna(value='No_basement')
df['BsmtExposure'] = df['BsmtExposure'].fillna(value='No_basement')
df['BsmtFinType1'] = df['BsmtFinType1'].fillna(value='No_basement')
df['BsmtQual'] = df['BsmtQual'].fillna(value='No_basement')
df['BsmtCond'] = df['BsmtCond'].fillna(value='No_basement')
df['MasVnrArea'] = df['MasVnrArea'].fillna(value=0.0)
df['MasVnrType'] = df['MasVnrType'].fillna(value='No_masonary')
df['Electrical'] = df['Electrical'].fillna(value=df['GarageYrBlt'].mode()[0])
df['LotFrontage'] = df['LotFrontage'].astype(int)
df['MasVnrArea'] = df['MasVnrArea'].astype(int)
df['GarageYrBlt'] = df['GarageYrBlt'].astype(int)
dtypes = pd.DataFrame(df.dtypes)
dtypes = dtypes.reset_index(drop=False)
dtypes = dtypes.rename(columns={'index':'Feature',0:'dtype'})
cols = list(dtypes['Feature'][dtypes['dtype'] == 'int64'])
to_remove = ['Id','LowQualFinSF','SalePrice','BsmtHalfBath','EnclosedPorch','3SsnPorch','ScreenPorch',
             'PoolArea','MiscVal','BsmtFinSF2','KitchenAbvGr']

for i in to_remove:
    if i in cols:
        cols.remove(i)
for i in cols:
    df = df.sort_values(by=i, ascending=True)
    Q1 = df[i].quantile(0.25)
    Q3 = df[i].quantile(0.75)
    IQR = Q3 - Q1
    ll = Q1 - 1.5*IQR
    ul = Q3 + 1.5*IQR
    for j in range(len(df)):
        if df[i][j] < ll:
            df[i][j] = ll
        elif df[i][j] > ul:
            df[i][j] = ul



In [4]:
df_dt = pd.DataFrame(df.dtypes)
df_dt = df_dt.reset_index(drop=False)
df_dt = df_dt.rename(columns = {'index':'Name',0:'Type'})
cols2 = list(df_dt['Name'][df_dt['Type'] == 'object'])
df_res = pd.get_dummies(df, columns=cols2,drop_first=True)

In [5]:
ss = StandardScaler()
y = np.log(df_res['SalePrice'])
X = df_res.drop(columns='SalePrice', axis=1)
X_Scaled = ss.fit_transform(X)

In [6]:
pca = PCA(n_components=10)
X_pca = pca.fit_transform(X_Scaled)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X_pca,y, test_size=0.2, random_state=30)

In [8]:
# Define models and hyperparameters
models = {
    'LinearRegression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'SVR': SVR(),
    'RandomForestRegressor': RandomForestRegressor(),
    'GradientBoostingRegressor': GradientBoostingRegressor(),
    'XGBoostRegressor': xgb.XGBRegressor()
}

param_grid = {
    'LinearRegression': {},
    'Lasso': {'alpha': [0.1, 0.5, 1.0]},
    'Ridge': {'alpha': [0.1, 0.5, 1.0]},
    'SVR': {'C': [1, 10, 100], 'kernel': ['linear', 'rbf']},
    'RandomForestRegressor': {'n_estimators': [10, 50, 100], 'max_depth': [None, 10, 20]},
    'GradientBoostingRegressor': {'n_estimators': [10, 50, 100], 'learning_rate': [0.01, 0.1, 1.0]},
    'XGBoostRegressor': {'n_estimators': [10, 50, 100], 'learning_rate': [0.01, 0.1, 1.0]}
}

# Create a function to calculate RMSE
def rmse(y_true, y_pred):
    return sqrt(mean_squared_error(y_true, y_pred))

results = {}
for model_name, model in models.items():
    grid_search = GridSearchCV(model, param_grid[model_name], scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_
    y_pred = np.log(best_model.predict(X_test))
    
    rmse_score = rmse(y_test, y_pred)
    
    results[model_name] = {
        'best_estimator': best_model,
        'best_params': best_params,
        'RMSE': rmse_score
    }


In [9]:
pprint.pprint(results)

{'GradientBoostingRegressor': {'RMSE': 9.554918380179512,
                               'best_estimator': GradientBoostingRegressor(),
                               'best_params': {'learning_rate': 0.1,
                                               'n_estimators': 100}},
 'Lasso': {'RMSE': 9.55509918134933,
           'best_estimator': Lasso(alpha=0.1),
           'best_params': {'alpha': 0.1}},
 'LinearRegression': {'RMSE': 9.555190514576992,
                      'best_estimator': LinearRegression(),
                      'best_params': {}},
 'RandomForestRegressor': {'RMSE': 9.55515385010107,
                           'best_estimator': RandomForestRegressor(max_depth=10),
                           'best_params': {'max_depth': 10,
                                           'n_estimators': 100}},
 'Ridge': {'RMSE': 9.55519045043781,
           'best_estimator': Ridge(),
           'best_params': {'alpha': 1.0}},
 'SVR': {'RMSE': 9.554349782575017,
         'best_estimator': SVR(C