<a href="https://www.kaggle.com/code/feezakhankhanzada/feature-engineering-and-hyperparameter-tuning?scriptVersionId=111319646" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder 
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

In [2]:
train = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

In [3]:
totalCount = train.shape[0]

for column in train.columns:
    totalNullCount = train[column].isna().sum()
    nullPercentage = totalNullCount / totalCount * 100
    
    print(column , ' have ' , nullPercentage , '% Null Values')

Id  have  0.0 % Null Values
MSSubClass  have  0.0 % Null Values
MSZoning  have  0.0 % Null Values
LotFrontage  have  17.73972602739726 % Null Values
LotArea  have  0.0 % Null Values
Street  have  0.0 % Null Values
Alley  have  93.76712328767123 % Null Values
LotShape  have  0.0 % Null Values
LandContour  have  0.0 % Null Values
Utilities  have  0.0 % Null Values
LotConfig  have  0.0 % Null Values
LandSlope  have  0.0 % Null Values
Neighborhood  have  0.0 % Null Values
Condition1  have  0.0 % Null Values
Condition2  have  0.0 % Null Values
BldgType  have  0.0 % Null Values
HouseStyle  have  0.0 % Null Values
OverallQual  have  0.0 % Null Values
OverallCond  have  0.0 % Null Values
YearBuilt  have  0.0 % Null Values
YearRemodAdd  have  0.0 % Null Values
RoofStyle  have  0.0 % Null Values
RoofMatl  have  0.0 % Null Values
Exterior1st  have  0.0 % Null Values
Exterior2nd  have  0.0 % Null Values
MasVnrType  have  0.547945205479452 % Null Values
MasVnrArea  have  0.547945205479452 % Null Va

Identifying the Null Values in Training set and the percentile score of Null Values of each feature

In [4]:
totalCount = train.shape[0]

for column in train.columns:
    totalNullCount = train[column].isna().sum()
    nullPercentage = totalNullCount / totalCount * 100
    
    if nullPercentage >= 50:
        train = train.drop(column , axis = 1)
        #print(column , ' have ' , nullPercentage , '% Null Values')
    elif (nullPercentage > 0) and (nullPercentage < 50):
        print(column , ' have ' , nullPercentage , '% Null Values')
        #pass

LotFrontage  have  17.73972602739726 % Null Values
MasVnrType  have  0.547945205479452 % Null Values
MasVnrArea  have  0.547945205479452 % Null Values
BsmtQual  have  2.5342465753424657 % Null Values
BsmtCond  have  2.5342465753424657 % Null Values
BsmtExposure  have  2.6027397260273974 % Null Values
BsmtFinType1  have  2.5342465753424657 % Null Values
BsmtFinType2  have  2.6027397260273974 % Null Values
Electrical  have  0.0684931506849315 % Null Values
FireplaceQu  have  47.26027397260274 % Null Values
GarageType  have  5.5479452054794525 % Null Values
GarageYrBlt  have  5.5479452054794525 % Null Values
GarageFinish  have  5.5479452054794525 % Null Values
GarageQual  have  5.5479452054794525 % Null Values
GarageCond  have  5.5479452054794525 % Null Values


From above results, it can be clearly seen that 5.5% of the house does not have grage facility.

In [5]:
train.loc[train['GarageCond'].isna() == True , ['GarageType' , 'GarageYrBlt' , 'GarageFinish' , 'GarageQual' , 'GarageCond' , 'GarageArea' , 'GarageCars']]

for item in ['GarageType' , 'GarageYrBlt' , 'GarageFinish' , 'GarageQual' , 'GarageCond' , 'GarageArea' , 'GarageCars']:
    if train[item].dtype == 'O':
        train.loc[train['GarageCond'].isna() == True , item] = 'No Garage'


Around 2.6% of the houses does not contains the Basement.

In [6]:
train.loc[(train['BsmtExposure'].isna() == True) & (train['BsmtQual'].isna() == False) , 
          ['BsmtQual' , 'BsmtCond' , 'BsmtExposure' , 'BsmtFinType1' , 'BsmtFinSF1' , 'BsmtFinType2' , 'BsmtFinSF2' , 'BsmtUnfSF' , 'TotalBsmtSF']]


for item in ['BsmtQual' , 'BsmtCond' , 'BsmtExposure' , 'BsmtFinType1' , 'BsmtFinSF1' , 'BsmtFinType2' , 'BsmtFinSF2' , 'BsmtUnfSF' , 'TotalBsmtSF']:
    if train[item].dtype == 'O':
        train.loc[train[item].isna() == True , item] = 'No Basement'

A very small percentage of null values in Masonry Veneer 

In [7]:
train.loc[(train['MasVnrType'].isna() == True) , ['MasVnrType' , 'MasVnrArea']]


for item in ['MasVnrType' , 'MasVnrArea']:
    if train[item].dtype == 'O':
        train.loc[train[item].isna() == True , item] = 'No Masonry'

In [8]:
train.loc[(train['FireplaceQu'].isna() == True) , ['FireplaceQu']]

for item in ['FireplaceQu']:
    if train[item].dtype == 'O':
        train.loc[train[item].isna() == True , item] = 'No FirePlace'

In [9]:
train = train.drop('Electrical' , axis = 1)
train = train.drop('GarageYrBlt' , axis = 1)

In [10]:
train['Bathrooms'] = train['BsmtFullBath'] + 0.5 * train['BsmtHalfBath'] + train['FullBath'] + 0.5 * train['HalfBath']

train = train.drop(['BsmtFullBath' , 'BsmtHalfBath' , 'FullBath' , 'HalfBath'] , axis = 1)

In [11]:
train['OverallRating'] = (train['OverallQual'] + train['OverallCond'])/2

train = train.drop(['OverallQual' , 'OverallCond'] , axis = 1)

In [12]:
ratingDict = {"ExterQual": {'Ex' : 5 , 'Gd' : 4 , 'TA' : 3 , 'Fa' : 2 , 'Po' : 1} , 
              "ExterCond": {'Ex' : 5 , 'Gd' : 4 , 'TA' : 3 , 'Fa' : 2 , 'Po' : 1} , 
              "BsmtQual": {'Ex' : 5 , 'Gd' : 4 , 'TA' : 3 , 'Fa' : 2 , 'Po' : 1 , 'No Basement' : 0} ,
              "BsmtCond": {'Ex' : 5 , 'Gd' : 4 , 'TA' : 3 , 'Fa' : 2 , 'Po' : 1 , 'No Basement' : 0} , 
              "GarageQual": {'Ex' : 5 , 'Gd' : 4 , 'TA' : 3 , 'Fa' : 2 , 'Po' : 1 , 'No Garage' : 0} ,
              "GarageCond": {'Ex' : 5 , 'Gd' : 4 , 'TA' : 3 , 'Fa' : 2 , 'Po' : 1 , 'No Garage' : 0} 
             }
              

train = train.replace(ratingDict)
train['OverallExternalRating'] = (train['ExterQual'] + train['ExterCond']) / 2 
train = train.drop(['ExterQual' , 'ExterCond'] , axis = 1)

train['OverallBasementRating'] = (train['BsmtQual'] + train['BsmtCond']) / 2
train = train.drop(['BsmtQual' , 'BsmtCond'] , axis = 1)

train['OverallGarageRating'] = (train['GarageQual'] + train['GarageCond']) / 2
train = train.drop(['GarageQual' , 'GarageCond'] , axis = 1)

In [13]:
train.loc[train['MasVnrArea'].isna() == True , 'MasVnrArea'] = 0

imputer = SimpleImputer(strategy='median', missing_values=np.nan)

imputer = imputer.fit(train[['LotFrontage']])
train['LotFrontage'] = imputer.transform(train[['LotFrontage']])

Find out Categorical and Numerical Features

In [14]:
categoricalFeatures = []
numericalFeatures = []

for columns in train.columns:
#     if train[columns].nunique() <= 10:
#         categoricalFeatures.append(columns)
        
    if train[columns].dtype == 'object':
        categoricalFeatures.append(columns)
        
    elif train[columns].dtype in ['int64', 'float64']:
        numericalFeatures.append(columns)

In [15]:
train = pd.get_dummies(train, columns = categoricalFeatures)

In [16]:
'''
def target_encoding(data, column, target):
    
    grouped = data[[column,target]].groupby(column,as_index=False).mean()
    empty_dict = {}
    for i in range(len(grouped)):
        empty_dict[grouped.iloc[i,0]]=grouped.iloc[i,1]
    data[column]=data[column].map(lambda x: empty_dict[x])
    
    return data
    
for col in train.select_dtypes(include='O').columns:
    target_encoding(train , col , 'SalePrice')
    
'''

"\ndef target_encoding(data, column, target):\n    \n    grouped = data[[column,target]].groupby(column,as_index=False).mean()\n    empty_dict = {}\n    for i in range(len(grouped)):\n        empty_dict[grouped.iloc[i,0]]=grouped.iloc[i,1]\n    data[column]=data[column].map(lambda x: empty_dict[x])\n    \n    return data\n    \nfor col in train.select_dtypes(include='O').columns:\n    target_encoding(train , col , 'SalePrice')\n    \n"

In [17]:
pd.set_option('display.max_colwidth', None)
corr_matrix = train.corr()
corr_matrix.loc[corr_matrix.SalePrice >= 0.0 , 'SalePrice'].to_dict().keys()

dict_keys(['LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'BedroomAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MoSold', 'SalePrice', 'Bathrooms', 'OverallRating', 'OverallExternalRating', 'OverallBasementRating', 'OverallGarageRating', 'MSZoning_FV', 'MSZoning_RL', 'Street_Pave', 'LotShape_IR1', 'LotShape_IR2', 'LotShape_IR3', 'LandContour_HLS', 'LandContour_Low', 'Utilities_AllPub', 'LotConfig_Corner', 'LotConfig_CulDSac', 'LotConfig_FR3', 'LandSlope_Mod', 'LandSlope_Sev', 'Neighborhood_Blmngtn', 'Neighborhood_ClearCr', 'Neighborhood_CollgCr', 'Neighborhood_Crawfor', 'Neighborhood_Gilbert', 'Neighborhood_NWAmes', 'Neighborhood_NoRidge', 'Neighborhood_NridgHt', 'Neighborhood_SawyerW', 'Neighborhood_Somerst', 'Neighborhood_StoneBr', 'Neighborhood_Timber', 'Neighborhood_Veenker', 'Condition1_Norm', 

In [18]:
features = corr_matrix.loc[corr_matrix.SalePrice >= -1 , 'SalePrice'].to_dict().keys()

X = train[features]
X = X.drop('SalePrice' , axis = 1)
y = train['SalePrice']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)

X_train = MinMaxScaler().fit(X_train).transform(X_train)
X_test = MinMaxScaler().fit(X_test).transform(X_test)

In [19]:
models = {
    'linearRegression' : LinearRegression() , 
    'decisionTreeRegression' : DecisionTreeRegressor() , 
    'randomForestRegressor' : RandomForestRegressor() , 
    'xgboostRegressor' :  XGBRegressor() , 
    'SVR' : SVR()
}

In [20]:
def training(model , X_train , y_train , X_test , y_test):
    modelFit = model.fit(X_train , y_train)
    predictions = model.predict(X_test)
    
    MSE = mean_squared_error(y_test , predictions)
    RMSE = np.sqrt(MSE)
    
    print('MSE: ' , MSE , ", RMSE: " , RMSE)
    
    scores = cross_val_score(model , X_train , y_train , scoring = "neg_mean_squared_error" , cv = 5)
    rmseScores = np.sqrt(-scores)
    
    print('RMSE Score: ' , rmseScores , ',  RMSE Mean: ' , rmseScores.mean() , ', RMSE STD: ' , rmseScores.std())

In [21]:
training(models['linearRegression'] , X_train , y_train  ,X_test , y_test)

MSE:  7.557372319901549e+28 , RMSE:  274906753643877.38
RMSE Score:  [2.57751026e+04 3.52894535e+14 5.96341251e+14 2.06329851e+15
 4.03639720e+14] ,  RMSE Mean:  683234802621185.9 , RMSE STD:  716412481119483.2


In [22]:
training(models['decisionTreeRegression'] , X_train , y_train  ,X_test , y_test)

MSE:  3388251241.4965754 , RMSE:  58208.68699340826
RMSE Score:  [38050.07586445 39833.09526391 49528.24374234 44144.62846622
 43983.62572204] ,  RMSE Mean:  43107.93381179252 , RMSE STD:  3984.179872011331


In [23]:
training(models['randomForestRegressor'] , X_train , y_train  ,X_test , y_test)

MSE:  1830798978.6744156 , RMSE:  42787.836807607084
RMSE Score:  [23897.87650693 27981.92512495 37684.16224241 31487.77804715
 25660.15820334] ,  RMSE Mean:  29342.380024958657 , RMSE STD:  4883.447079131409


In [24]:
training(models['xgboostRegressor'] , X_train , y_train  ,X_test , y_test)

MSE:  2411102343.522434 , RMSE:  49102.97693136776
RMSE Score:  [24999.15528702 30562.01179708 42389.55435879 32885.12993285
 26421.86115613] ,  RMSE Mean:  31451.542506372538 , RMSE STD:  6154.957942816883


In [25]:
training(models['SVR'] , X_train , y_train  ,X_test , y_test)

MSE:  5950645618.5595665 , RMSE:  77140.42791273307
RMSE Score:  [78452.60862438 86027.40432392 78299.42664428 92324.33748891
 76015.97001272] ,  RMSE Mean:  82223.94941883968 , RMSE STD:  6076.973047229648


In [26]:
paramGridRandomForestRegressor = [
    {
        'n_estimators' : [3 , 10 , 30] , 
        'max_features' : [2 , 4 , 6 , 8] , 
    } , 
    {
        'bootstrap' : [False , True] , 
        'n_estimators' : [3 , 10 , 30] , 
        'max_features' : [1 , 3 , 5 , 7]
    } ,
]

In [27]:
def grid_training(paramGrid , model , X_train , y_train , X_test , y_test):
    gridSearch = GridSearchCV(model , 
                              paramGrid , 
                              cv = 5 , 
                              scoring = 'neg_mean_squared_error' ,
                              return_train_score = True)
    
    print(gridSearch.estimator.get_params().keys())
    
    gridSearch.fit(X_train , y_train)
    
    print('Best Parameters: ' , gridSearch.best_params_)
    print('Best Estimators: ' , gridSearch.best_estimator_)
    
    cvres = gridSearch.cv_results_
    
    for mean_score , params in zip(cvres["mean_test_score"] , cvres["params"]):
        print(np.sqrt(-mean_score) , params)
        
#     #feature_importances = gridSearch.best_estimator_.feature_importances_
#     feature_importances = gridSearch.feature_importances_
#     return feature_importances

In [28]:
grid_training(paramGridRandomForestRegressor , models['randomForestRegressor'] , X_train , y_train , X_test , y_test)

dict_keys(['bootstrap', 'ccp_alpha', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'max_samples', 'min_impurity_decrease', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'])
Best Parameters:  {'bootstrap': False, 'max_features': 7, 'n_estimators': 30}
Best Estimators:  RandomForestRegressor(bootstrap=False, max_features=7, n_estimators=30)
52100.60824972954 {'max_features': 2, 'n_estimators': 3}
40309.65283579898 {'max_features': 2, 'n_estimators': 10}
37500.21044739592 {'max_features': 2, 'n_estimators': 30}
46939.25471915159 {'max_features': 4, 'n_estimators': 3}
38863.19867790006 {'max_features': 4, 'n_estimators': 10}
36414.786413985625 {'max_features': 4, 'n_estimators': 30}
45335.80359947499 {'max_features': 6, 'n_estimators': 3}
37571.314750532365 {'max_features': 6, 'n_estimators': 10}
34794.38596911319 {'max_features': 6, 'n_estimators': 30}
41049.33999040778 {'max_

In [29]:
paramGridSVM = [
    {
        'kernel': ['linear'], 
        'C': [10., 30., 100., 300., 1000., 3000., 10000., 30000.0]
    },
    {
        'kernel': ['rbf'], 
        'C': [1.0, 3.0, 10., 30., 100., 300., 1000.0],
        'gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0]
    },
]

In [30]:
grid_training(paramGridSVM , models['SVR'] , X_train , y_train , X_test , y_test)

dict_keys(['C', 'cache_size', 'coef0', 'degree', 'epsilon', 'gamma', 'kernel', 'max_iter', 'shrinking', 'tol', 'verbose'])
Best Parameters:  {'C': 10000.0, 'kernel': 'linear'}
Best Estimators:  SVR(C=10000.0, kernel='linear')
74533.66272358876 {'C': 10.0, 'kernel': 'linear'}
65931.39416511654 {'C': 30.0, 'kernel': 'linear'}
54741.54258989652 {'C': 100.0, 'kernel': 'linear'}
44335.37541364882 {'C': 300.0, 'kernel': 'linear'}
35533.23482967221 {'C': 1000.0, 'kernel': 'linear'}
31628.882924422604 {'C': 3000.0, 'kernel': 'linear'}
30011.49236924312 {'C': 10000.0, 'kernel': 'linear'}
30134.519519573514 {'C': 30000.0, 'kernel': 'linear'}
82460.59239085398 {'C': 1.0, 'gamma': 0.01, 'kernel': 'rbf'}
82448.79068808889 {'C': 1.0, 'gamma': 0.03, 'kernel': 'rbf'}
82459.50392603777 {'C': 1.0, 'gamma': 0.1, 'kernel': 'rbf'}
82474.76323487231 {'C': 1.0, 'gamma': 0.3, 'kernel': 'rbf'}
82475.91399645223 {'C': 1.0, 'gamma': 1.0, 'kernel': 'rbf'}
82475.9342794249 {'C': 1.0, 'gamma': 3.0, 'kernel': 'rbf'}

In [31]:
paramGridXGB = {
    'max_depth': [3, 5] ,
    'learning_rate': [0.01, 0.1] ,
    'subsample': np.arange(0.5) ,
    'colsample_bytree': np.arange(0.4) ,
    'colsample_bylevel': np.arange(0.4) ,       
    'n_estimators': [100, 500]
}

In [32]:
grid_training(paramGridXGB , models['xgboostRegressor'] , X_train , y_train , X_test , y_test)

dict_keys(['objective', 'base_score', 'booster', 'callbacks', 'colsample_bylevel', 'colsample_bynode', 'colsample_bytree', 'early_stopping_rounds', 'enable_categorical', 'eval_metric', 'gamma', 'gpu_id', 'grow_policy', 'importance_type', 'interaction_constraints', 'learning_rate', 'max_bin', 'max_cat_to_onehot', 'max_delta_step', 'max_depth', 'max_leaves', 'min_child_weight', 'missing', 'monotone_constraints', 'n_estimators', 'n_jobs', 'num_parallel_tree', 'predictor', 'random_state', 'reg_alpha', 'reg_lambda', 'sampling_method', 'scale_pos_weight', 'subsample', 'tree_method', 'validate_parameters', 'verbosity'])
Best Parameters:  {'colsample_bylevel': 0.0, 'colsample_bytree': 0.0, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.0}
Best Estimators:  XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=0.0, colsample_bynode=1, colsample_bytree=0.0,
             early_stopping_rounds=None, enable_categorical=False,
     

In [33]:
xgbRegressor = XGBRegressor(colsample_bylevel = 0.0, 
                           colsample_bytree = 0.0,
                           learning_rate = 0.01,
                           max_depth = 3,
                           n_estimators = 100,
                           subsample = 0.0)

training(xgbRegressor , X_train , y_train  ,X_test , y_test)

MSE:  38144859773.90753 , RMSE:  195307.09094630316
RMSE Score:  [196946.29124134 201017.29935574 193837.50047919 202485.27896196
 196337.79744496] ,  RMSE Mean:  198124.83349663758 , RMSE STD:  3173.1223582500043


In [34]:
def preprocessing(data):
    data.loc[data['GarageCond'].isna() == True , ['GarageType' , 'GarageYrBlt' , 'GarageFinish' , 'GarageQual' , 'GarageCond' , 'GarageArea' , 'GarageCars']]

    for item in ['GarageType' , 'GarageYrBlt' , 'GarageFinish' , 'GarageQual' , 'GarageCond' , 'GarageArea' , 'GarageCars']:
        if data[item].dtype == 'O':
            data.loc[data['GarageCond'].isna() == True , item] = 'No Garage'


    data.loc[(data['BsmtExposure'].isna() == True) & (data['BsmtQual'].isna() == False) , 
             ['BsmtQual' , 'BsmtCond' , 'BsmtExposure' , 'BsmtFinType1' , 'BsmtFinSF1' , 'BsmtFinType2' , 'BsmtFinSF2' , 'BsmtUnfSF' , 'TotalBsmtSF']]


    for item in ['BsmtQual' , 'BsmtCond' , 'BsmtExposure' , 'BsmtFinType1' , 'BsmtFinSF1' , 'BsmtFinType2' , 'BsmtFinSF2' , 'BsmtUnfSF' , 'TotalBsmtSF']:
        if data[item].dtype == 'O':
            data.loc[data[item].isna() == True , item] = 'No Basement'
        
    data.loc[(data['MasVnrType'].isna() == True) , ['MasVnrType' , 'MasVnrArea']]


    for item in ['MasVnrType' , 'MasVnrArea']:
        if data[item].dtype == 'O':
            data.loc[data[item].isna() == True , item] = 'No Masonry'
        
     
    data.loc[(data['FireplaceQu'].isna() == True) , ['FireplaceQu']]

    for item in ['FireplaceQu']:
        if data[item].dtype == 'O':
            data.loc[data[item].isna() == True , item] = 'No FirePlace'
        
    data = data.drop('GarageYrBlt' , axis = 1)

    data['Bathrooms'] = data['BsmtFullBath'] + 0.5 * data['BsmtHalfBath'] + data['FullBath'] + 0.5 * data['HalfBath']

    data = data.drop(['BsmtFullBath' , 'BsmtHalfBath' , 'FullBath' , 'HalfBath'] , axis = 1)

    data['OverallRating'] = (data['OverallQual'] + data['OverallCond'])/2

    data = data.drop(['OverallQual' , 'OverallCond'] , axis = 1)

    ratingDict = {"ExterQual": {'Ex' : 5 , 'Gd' : 4 , 'TA' : 3 , 'Fa' : 2 , 'Po' : 1} , 
                  "ExterCond": {'Ex' : 5 , 'Gd' : 4 , 'TA' : 3 , 'Fa' : 2 , 'Po' : 1} , 
                  "BsmtQual": {'Ex' : 5 , 'Gd' : 4 , 'TA' : 3 , 'Fa' : 2 , 'Po' : 1 , 'No Basement' : 0} ,
                  "BsmtCond": {'Ex' : 5 , 'Gd' : 4 , 'TA' : 3 , 'Fa' : 2 , 'Po' : 1 , 'No Basement' : 0} , 
                  "GarageQual": {'Ex' : 5 , 'Gd' : 4 , 'TA' : 3 , 'Fa' : 2 , 'Po' : 1 , 'No Garage' : 0} ,
                  "GarageCond": {'Ex' : 5 , 'Gd' : 4 , 'TA' : 3 , 'Fa' : 2 , 'Po' : 1 , 'No Garage' : 0} 
                 }
              
    data = data.replace(ratingDict)
    data['OverallExternalRating'] = (data['ExterQual'] + data['ExterCond']) / 2 
    data = data.drop(['ExterQual' , 'ExterCond'] , axis = 1)

    data['OverallBasementRating'] = (data['BsmtQual'] + data['BsmtCond']) / 2
    data = data.drop(['BsmtQual' , 'BsmtCond'] , axis = 1)

    data['OverallGarageRating'] = (data['GarageQual'] + data['GarageCond']) / 2
    data = data.drop(['GarageQual' , 'GarageCond'] , axis = 1)

    data.loc[data['MasVnrArea'].isna() == True , 'MasVnrArea'] = 0

    imputer = SimpleImputer(strategy='median', missing_values=np.nan)

    imputer = imputer.fit(data[['LotFrontage']])
    data['LotFrontage'] = imputer.transform(data[['LotFrontage']])

    categoricalFeatures = []
    numericalFeatures = []

    for columns in data.columns:
        if data[columns].nunique() <= 10:
            categoricalFeatures.append(columns)
        
        elif data[columns].dtype == 'object':
            categoricalFeatures.append(columns)
        
        elif data[columns].dtype in ['int64', 'float64']:
            numericalFeatures.append(columns)
        
    data = pd.get_dummies(data, columns = categoricalFeatures)
        
    return data

In [35]:
test = preprocessing(test)

In [36]:
for item in test.isna().sum().to_dict():
    if test.isna().sum().to_dict()[item] != 0:
        test.loc[test[item].isna() == True , item] = 0

In [37]:
test

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,OverallBasementRating_4.5,OverallGarageRating_0.0,OverallGarageRating_1.0,OverallGarageRating_1.5,OverallGarageRating_2.0,OverallGarageRating_2.5,OverallGarageRating_3.0,OverallGarageRating_3.5,OverallGarageRating_4.0,OverallGarageRating_4.5
0,1461,20,80.0,11622,1961,1961,0.0,468.0,144.0,270.0,...,0,0,0,0,0,0,1,0,0,0
1,1462,20,81.0,14267,1958,1958,108.0,923.0,0.0,406.0,...,0,0,0,0,0,0,1,0,0,0
2,1463,60,74.0,13830,1997,1998,0.0,791.0,0.0,137.0,...,0,0,0,0,0,0,1,0,0,0
3,1464,60,78.0,9978,1998,1998,20.0,602.0,0.0,324.0,...,0,0,0,0,0,0,1,0,0,0
4,1465,120,43.0,5005,1992,1992,0.0,263.0,0.0,1017.0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,21.0,1936,1970,1970,0.0,0.0,0.0,546.0,...,0,1,0,0,0,0,0,0,0,0
1455,2916,160,21.0,1894,1970,1970,0.0,252.0,0.0,294.0,...,0,0,0,0,0,0,1,0,0,0
1456,2917,20,160.0,20000,1960,1996,0.0,1224.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0
1457,2918,85,62.0,10441,1992,1992,0.0,337.0,0.0,575.0,...,0,1,0,0,0,0,0,0,0,0


In [38]:
for item in list(set(features) - set(test.columns)):
    test[item] = 0
#test = test.drop('SalePrice' , axis = 1)

In [39]:
testFeatures = test[features]
testFeatures = testFeatures.drop('SalePrice' , axis = 1)
testFeaturesTransformed = MinMaxScaler().fit(testFeatures).transform(testFeatures)

X = train[features]
X = X.drop('SalePrice' , axis = 1)
X_train = MinMaxScaler().fit(X).transform(X)

y_train = train['SalePrice']

In [40]:
xgbRegressor = XGBRegressor(colsample_bylevel = 0.0, 
                           colsample_bytree = 0.0,
                           learning_rate = 0.01,
                           max_depth = 3,
                           n_estimators = 100,
                           subsample = 0.0)

def prediction(model , X_train , y_train , testFeaturesTransformed):
    modelFit = model.fit(X_train , y_train)
    
    predictions = model.predict(testFeaturesTransformed)
    
    return predictions

In [41]:
len(testFeaturesTransformed)

1459

In [42]:
prediction = prediction(xgbRegressor , X_train , y_train , testFeaturesTransformed)

In [43]:
xgbRegressor = XGBRegressor(learning_rate = 0.01,
                           max_depth = 3,
                           n_estimators = 100)

In [44]:
svRegressor = SVR()
svRegressor.fit(X_train , y_train)
prediction = svRegressor.predict(testFeaturesTransformed)

In [45]:
xgbRegressor = XGBRegressor()
xgbRegressor.fit(X_train , y_train)
prediction = xgbRegressor.predict(testFeaturesTransformed)

In [46]:
pd.DataFrame(prediction)

Unnamed: 0,0
0,110089.687500
1,127376.828125
2,171541.937500
3,193473.781250
4,177459.359375
...,...
1454,84961.117188
1455,64641.324219
1456,120578.585938
1457,74671.718750


In [47]:
submission = pd.read_csv('../input/house-prices-advanced-regression-techniques/sample_submission.csv')

In [48]:
submission = submission.join(pd.DataFrame(prediction))

In [49]:
submission = submission.drop('SalePrice' , axis = 1)

In [50]:
submission.columns = ['Id' , 'SalePrice']

In [51]:
submission.to_csv('submission.csv')