# Imports

In [1]:
import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
from plotly import express as px, graph_objects as go

from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.feature_selection import SelectFromModel

from sklearn.linear_model import (
    BayesianRidge,
    ElasticNet,
    LinearRegression,
    RidgeCV,
    SGDRegressor,
)

from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

%matplotlib inline

# Reading Data

In [2]:
df = pd.read_csv('train.csv', index_col='Id')
df.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 1 to 1460
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotFrontage    1201 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   object 
 5   Alley          91 non-null     object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuil

# Exploratory Data Analysis

### Splitting features and target

In [4]:
X = df.drop('SalePrice', axis=1)
y = df['SalePrice']

### Defining numerical and categorical features

In [5]:
nominal_features = [
    'MSSubClass', 'MSZoning', 'Street', 'LandContour', 'LotConfig', 'Neighborhood', 
    'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 
    'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation', 'Heating', 'SaleType', 
    'SaleCondition','GarageType', 'Alley', 'Fence', 'MiscFeature'
]

ordinal_features = [
    'LotShape', 'Utilities', 'LandSlope', 'OverallQual', 'OverallCond', 'ExterQual', 
    'ExterCond', 'HeatingQC', 'CentralAir', 'KitchenQual', 'Functional', 'PavedDrive', 
    'Electrical', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 
    'FireplaceQu', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC'
]

continuous_features = [
    'LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 
    'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'GarageArea', 
    'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 
    'MiscVal'
]

discrete_features = [
    'YearBuilt', 'YearRemodAdd', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 
    'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 
    'MoSold', 'YrSold'
]

### Checking for missing values

In [6]:
def null_pct(df: pd.DataFrame, thold=0):
    """Get the percentage of nulls in a DataFrame above given threshold"""
    pct = df.isna().mean()
    return pct[pct > thold]

In [7]:
null_pct(y)

array([], dtype=float64)

In [8]:
null_pct(X)

LotFrontage     0.177397
Alley           0.937671
MasVnrType      0.005479
MasVnrArea      0.005479
BsmtQual        0.025342
BsmtCond        0.025342
BsmtExposure    0.026027
BsmtFinType1    0.025342
BsmtFinType2    0.026027
Electrical      0.000685
FireplaceQu     0.472603
GarageType      0.055479
GarageYrBlt     0.055479
GarageFinish    0.055479
GarageQual      0.055479
GarageCond      0.055479
PoolQC          0.995205
Fence           0.807534
MiscFeature     0.963014
dtype: float64

Based on data description null values for most of these features have a meaning

Why do MasVnrType and MasVnrArea have missing values?

In [9]:
X.loc[X['MasVnrType'].isna(), ['MasVnrType', 'MasVnrArea']]

Unnamed: 0_level_0,MasVnrType,MasVnrArea
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
235,,
530,,
651,,
937,,
974,,
978,,
1244,,
1279,,


Apparently there was no masonry veneer for those houses.

#### Filling missing values for MasVnrType (MasVnrArea will be filled with continuous variables)

In [10]:
X['MasVnrType'] = X['MasVnrType'].fillna('None')

### Filling missing values

#### Filling categorical or discrete missing values with mode

In [11]:
for col in (nominal_features + ordinal_features + discrete_features):
    X[col] = X[col].fillna(X[col].mode())

#### Filling numerical missing values with 0 (because in most cases 0 means not applicable)

In [12]:
for col in (continuous_features + discrete_features):
    X[col] = X[col].fillna(0)

### Encoding Categorical Features

#### One-hot encoding

In [13]:
dummies = pd.get_dummies(X[nominal_features]).sort_index()
X = pd.concat([X, dummies], axis=1)
X = X.drop(nominal_features, axis=1)

#### Ordinal encoding

In [14]:
rating = {np.nan: 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}

ordinal_encoding = {
    'LotShape': {np.nan: 0, 'Reg': 1, 'IR1': 2, 'IR2': 3, 'IR3': 4}, 
    'Utilities': {np.nan: 0, 'ElO': 1, 'NoSeWa': 2, 'NoSeWr': 3, 'AllPub': 4}, 
    'LandSlope': {np.nan: 0, 'Gtl': 1, 'Mod': 2, 'Sev': 3}, 
    'ExterQual': rating, 
    'ExterCond': rating, 
    'BsmtQual': rating, 
    'BsmtCond': rating, 
    'BsmtExposure': {np.nan: 0, 'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4}, 
    'BsmtFinType1': {np.nan: 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6}, 
    'BsmtFinType2': {np.nan: 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6}, 
    'HeatingQC': rating, 
    'CentralAir': {np.nan: 0, 'N': 1, 'Y': 2}, 
    'Electrical': {np.nan: 0, 'Mix': 1, 'FuseP': 2, 'FuseF': 3, 'FuseA': 4, 'SBrkr': 5}, 
    'KitchenQual': rating, 
    'Functional': {np.nan: 0, 'Sal': 1, 'Sev': 2, 'Maj2': 3, 'Maj1': 4, 'Mod': 5, 'Min2': 6, 'Min1': 7, 'Typ': 8}, 
    'FireplaceQu': rating, 
    'GarageFinish': {np.nan: 0, 'Unf': 1, 'RFn': 2, 'Fin': 3}, 
    'GarageQual': rating, 
    'GarageCond': rating, 
    'PavedDrive': {np.nan: 0, 'N': 1, 'P': 2, 'Y': 3}, 
    'PoolQC': rating
}

In [15]:
X = X.replace(ordinal_encoding)
X.head()

Unnamed: 0_level_0,LotFrontage,LotArea,LotShape,Utilities,LandSlope,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,...,Alley_Grvl,Alley_Pave,Fence_GdPrv,Fence_GdWo,Fence_MnPrv,Fence_MnWw,MiscFeature_Gar2,MiscFeature_Othr,MiscFeature_Shed,MiscFeature_TenC
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,65.0,8450,1,4,1,7,5,2003,2003,196.0,...,0,0,0,0,0,0,0,0,0,0
2,80.0,9600,1,4,1,6,8,1976,1976,0.0,...,0,0,0,0,0,0,0,0,0,0
3,68.0,11250,2,4,1,7,5,2001,2002,162.0,...,0,0,0,0,0,0,0,0,0,0
4,60.0,9550,2,4,1,7,5,1915,1970,0.0,...,0,0,0,0,0,0,0,0,0,0
5,84.0,14260,2,4,1,8,5,2000,2000,350.0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 1 to 1460
Columns: 219 entries, LotFrontage to MiscFeature_TenC
dtypes: float64(3), int64(53), uint8(163)
memory usage: 882.6 KB


### Scaling features

In [17]:
X[:] = StandardScaler().fit_transform(X)
X.head()

Unnamed: 0_level_0,LotFrontage,LotArea,LotShape,Utilities,LandSlope,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,...,Alley_Grvl,Alley_Pave,Fence_GdPrv,Fence_GdWo,Fence_MnPrv,Fence_MnWw,MiscFeature_Gar2,MiscFeature_Othr,MiscFeature_Shed,MiscFeature_TenC
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.212877,-0.207142,-0.701291,0.02618,-0.225716,0.651479,-0.5172,1.050994,0.878668,0.514104,...,-0.188311,-0.169981,-0.205214,-0.195977,-0.347118,-0.087129,-0.037037,-0.037037,-0.186352,-0.02618
2,0.645747,-0.091886,-0.701291,0.02618,-0.225716,-0.071836,2.179628,0.156734,-0.429577,-0.57075,...,-0.188311,-0.169981,-0.205214,-0.195977,-0.347118,-0.087129,-0.037037,-0.037037,-0.186352,-0.02618
3,0.299451,0.07348,1.016637,0.02618,-0.225716,0.651479,-0.5172,0.984752,0.830215,0.325915,...,-0.188311,-0.169981,-0.205214,-0.195977,-0.347118,-0.087129,-0.037037,-0.037037,-0.186352,-0.02618
4,0.068587,-0.096897,1.016637,0.02618,-0.225716,0.651479,-0.5172,-1.863632,-0.720298,-0.57075,...,-0.188311,-0.169981,-0.205214,-0.195977,-0.347118,-0.087129,-0.037037,-0.037037,-0.186352,-0.02618
5,0.761179,0.375148,1.016637,0.02618,-0.225716,1.374795,-0.5172,0.951632,0.733308,1.366489,...,-0.188311,-0.169981,-0.205214,-0.195977,-0.347118,-0.087129,-0.037037,-0.037037,-0.186352,-0.02618


In [18]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 1 to 1460
Columns: 219 entries, LotFrontage to MiscFeature_TenC
dtypes: float64(219)
memory usage: 2.5 MB


# Data Preparation Pipeline

### Prepare features function (Pipeline for all data preparation above)

In [19]:
def prepare_features(df: pd.DataFrame, feature_names: list):
    """Preparing features for training"""
    
    # Creating DF
    
    X = pd.concat([pd.DataFrame(columns=feature_names), df])
    
    # Defining numerical and categorical features
    
    nominal_features = [
        'MSSubClass', 'MSZoning', 'Street', 'LandContour', 'LotConfig', 'Neighborhood', 
        'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 
        'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation', 'Heating', 'SaleType', 
        'SaleCondition','GarageType', 'Alley', 'Fence', 'MiscFeature'
    ]

    ordinal_features = [
        'LotShape', 'Utilities', 'LandSlope', 'OverallQual', 'OverallCond', 'ExterQual', 
        'ExterCond', 'HeatingQC', 'CentralAir', 'KitchenQual', 'Functional', 'PavedDrive', 
        'Electrical', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 
        'FireplaceQu', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC'
    ]

    continuous_features = [
        'LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 
        'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'GarageArea', 
        'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 
        'MiscVal'
    ]

    discrete_features = [
        'YearBuilt', 'YearRemodAdd', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 
        'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 
        'MoSold', 'YrSold'
    ]
    
    # Filling missing values for MasVnrType

    if 'MasVnrType' in X.columns:
        X['MasVnrType'] = X['MasVnrType'].fillna('None')
    
    # Filling categorical or discrete missing values with mode

    for col in (nominal_features + ordinal_features + discrete_features):
        X[col] = X[col].fillna(X[col].mode())

    # Filling numerical missing values with 0 (because in most cases 0 means not applicable)

    for col in (continuous_features + discrete_features):
        X[col] = X[col].fillna(0)
    
    # One Hot Encoding

    dummies = pd.get_dummies(X[nominal_features]).sort_index()
    dummies_cols = list(set(dummies.columns) & set(X.columns))
    X[dummies_cols] = dummies[dummies_cols]
    
    # Ordinal Encoding

    rating = {np.nan: 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
    
    ordinal_encoding = {
        'LotShape': {np.nan: 0, 'Reg': 1, 'IR1': 2, 'IR2': 3, 'IR3': 4}, 
        'Utilities': {np.nan: 0, 'ElO': 1, 'NoSeWa': 2, 'NoSeWr': 3, 'AllPub': 4}, 
        'LandSlope': {np.nan: 0, 'Gtl': 1, 'Mod': 2, 'Sev': 3}, 
        'ExterQual': rating, 
        'ExterCond': rating, 
        'BsmtQual': rating, 
        'BsmtCond': rating, 
        'BsmtExposure': {np.nan: 0, 'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4}, 
        'BsmtFinType1': {np.nan: 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6}, 
        'BsmtFinType2': {np.nan: 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6}, 
        'HeatingQC': rating, 
        'CentralAir': {np.nan: 0, 'N': 1, 'Y': 2}, 
        'Electrical': {np.nan: 0, 'Mix': 1, 'FuseP': 2, 'FuseF': 3, 'FuseA': 4, 'SBrkr': 5}, 
        'KitchenQual': rating, 
        'Functional': {np.nan: 0, 'Sal': 1, 'Sev': 2, 'Maj2': 3, 'Maj1': 4, 'Mod': 5, 'Min2': 6, 'Min1': 7, 'Typ': 8}, 
        'FireplaceQu': rating, 
        'GarageFinish': {np.nan: 0, 'Unf': 1, 'RFn': 2, 'Fin': 3}, 
        'GarageQual': rating, 
        'GarageCond': rating, 
        'PavedDrive': {np.nan: 0, 'N': 1, 'P': 2, 'Y': 3}, 
        'PoolQC': rating
    }

    X = X.replace(ordinal_encoding)
    
    # Feature selection and filling NAs
    
    X = X[feature_names].fillna(0)
    
    # Scaling features
    
    X[:] = StandardScaler().fit_transform(X)
    
    return X


# Plot Function

In [20]:
def plot_actual_vs_pred(model, X, y):
    """Plotting actual vs predicted label"""
    
    y_pred = model.predict(X)
    
    plot_data = pd.concat([y, pd.Series(y_pred, name='PredictedPrice', index=y.index)], axis=1)
    plot_data = plot_data.sort_values('SalePrice')
    plot_data.index = y.index
    plot_data = plot_data.reset_index()
    
    fig = px.scatter(plot_data, x='Id', y='SalePrice')
    fig.add_trace(go.Scatter(x=plot_data['Id'], y=plot_data['PredictedPrice'], name='Prediction'))
    fig.show()

# Model Training

In [21]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [22]:
print("Number of GPUs available: ", len(tf.config.list_physical_devices('GPU')))

Number of GPUs available:  1


2022-04-13 04:39:37.357417: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-04-13 04:39:37.369399: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-04-13 04:39:37.369633: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero


In [39]:
model_tf = keras.Sequential([
    layers.Dense(128, input_shape=(X.shape[1],), activation='relu'),
    layers.Dense(256, activation='relu'),
    layers.Dense(256, activation='relu'),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(1)
])

model_tf.compile(loss='mean_squared_error', optimizer=tf.keras.optimizers.Adam(0.001), metrics=[tf.keras.metrics.RootMeanSquaredError()])


In [40]:
model_tf.fit(X,y, validation_split=0.3, epochs=60, verbose=2)

Epoch 1/60
32/32 - 0s - loss: 39310364672.0000 - root_mean_squared_error: 198268.4062 - val_loss: 37708562432.0000 - val_root_mean_squared_error: 194186.9219 - 421ms/epoch - 13ms/step
Epoch 2/60
32/32 - 0s - loss: 30215135232.0000 - root_mean_squared_error: 173825.0156 - val_loss: 9714087936.0000 - val_root_mean_squared_error: 98560.0703 - 69ms/epoch - 2ms/step
Epoch 3/60
32/32 - 0s - loss: 3851464192.0000 - root_mean_squared_error: 62060.1680 - val_loss: 2660598784.0000 - val_root_mean_squared_error: 51580.9922 - 67ms/epoch - 2ms/step
Epoch 4/60
32/32 - 0s - loss: 1188211584.0000 - root_mean_squared_error: 34470.4453 - val_loss: 2229611520.0000 - val_root_mean_squared_error: 47218.7617 - 67ms/epoch - 2ms/step
Epoch 5/60
32/32 - 0s - loss: 806231040.0000 - root_mean_squared_error: 28394.2070 - val_loss: 2024136576.0000 - val_root_mean_squared_error: 44990.4062 - 66ms/epoch - 2ms/step
Epoch 6/60
32/32 - 0s - loss: 632345344.0000 - root_mean_squared_error: 25146.4785 - val_loss: 19270199

<keras.callbacks.History at 0x7fe2e47c8dc0>

# Testing XGBoost

In [131]:
import xgboost

  from pandas import MultiIndex, Int64Index


In [133]:
from xgboost import XGBRegressor
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score

model_xgb = XGBRegressor(n_estimators=1500, max_depth=10, eta=0.07, subsample=0.7, 
                     colsample_bytree=0.8, tree_method='gpu_hist', n_jobs=20, eval_metric='rmse')

In [140]:
model_xgb.fit(X,y)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8, enable_categorical=False,
             eta=0.07, eval_metric='rmse', gamma=0, gpu_id=0,
             importance_type=None, interaction_constraints='',
             learning_rate=0.0700000003, max_delta_step=0, max_depth=10,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=1500, n_jobs=20, num_parallel_tree=1,
             predictor='auto', random_state=0, reg_alpha=0, reg_lambda=1,
             scale_pos_weight=1, subsample=0.7, tree_method='gpu_hist',
             validate_parameters=1, verbosity=None)

In [141]:
score = np.mean(cross_val_score(model_xgb, X, y))

print('Score:', score)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Score: 0.8846759538159219


# Submission

### Reading test data

In [142]:
test_df = pd.read_csv('test.csv', index_col="Id")
test_df.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,Inside,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,Gar2,12500,6,2010,WD,Normal
1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,MnPrv,,0,3,2010,WD,Normal
1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,6,2010,WD,Normal
1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,Inside,...,144,0,,,,0,1,2010,WD,Normal


### Preparing test data for model

In [143]:
X_test = prepare_features(test_df, X.columns)
X_test.head()

Unnamed: 0,LotFrontage,LotArea,LotShape,Utilities,LandSlope,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,...,Alley_Grvl,Alley_Pave,Fence_GdPrv,Fence_GdWo,Fence_MnPrv,Fence_MnWw,MiscFeature_Gar2,MiscFeature_Othr,MiscFeature_Shed,MiscFeature_TenC
1461,0.684849,0.363929,-0.703009,0.03705,-0.207992,-0.751101,0.400766,-0.340945,-1.072885,-0.563316,...,-0.22449,-0.161306,-0.205287,-0.203467,2.735427,-0.026189,-0.045392,-0.03705,-0.18043,0.0
1462,0.715852,0.897861,1.090156,0.03705,-0.207992,-0.054877,0.400766,-0.439695,-1.214908,0.047057,...,-0.22449,-0.161306,-0.205287,-0.203467,-0.365574,-0.026189,22.030282,-0.03705,-0.18043,0.0
1463,0.498831,0.809646,1.090156,0.03705,-0.207992,-0.751101,-0.497418,0.844059,0.678742,-0.563316,...,-0.22449,-0.161306,-0.205287,-0.203467,2.735427,-0.026189,-0.045392,-0.03705,-0.18043,0.0
1464,0.622843,0.032064,1.090156,0.03705,-0.207992,-0.054877,0.400766,0.876976,0.678742,-0.450284,...,-0.22449,-0.161306,-0.205287,-0.203467,-0.365574,-0.026189,-0.045392,-0.03705,-0.18043,0.0
1465,-0.462261,-0.971808,1.090156,0.03705,-0.207992,1.337571,-0.497418,0.679475,0.394694,-0.563316,...,-0.22449,-0.161306,-0.205287,-0.203467,-0.365574,-0.026189,-0.045392,-0.03705,-0.18043,0.0


### Making predictions

In [145]:
test_preds = pd.DataFrame.from_dict({'Id': test_df.index,'SalePrice': (model_xgb.predict(X_test))})
test_preds.head()

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Unnamed: 0,Id,SalePrice
0,1461,123998.945312
1,1462,143914.90625
2,1463,169237.671875
3,1464,177276.375
4,1465,154895.578125


### Saving submission file

In [146]:
test_preds.to_csv('submission.csv', index=False)
pd.read_csv('submission.csv').head()

Unnamed: 0,Id,SalePrice
0,1461,123998.945
1,1462,143914.9
2,1463,169237.67
3,1464,177276.38
4,1465,154895.58
