In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('data/train.csv')

# Missing Values

In [3]:
replacement_dictionary = {
                            'PoolQC'      : 'No Pool',
                            'MiscFeature' : 'No MiscFeatures',  
                            'Alley'       : 'No Alley Access',
                            'Fence'       : 'No Fence',
                            'FireplaceQu' : 'No FirePlace', 
                            'GarageFinish': 'No Garage',
                            'GarageQual'  : 'No Garage',
                            'GarageCond'  : 'No Garage',
                            'GarageType'  : 'No Garage',
                            'BsmtExposure': 'No Basement',
                            'BsmtCond'    : 'No Basement',
                            'BsmtQual'    : 'No Basement',
                            'BsmtFinType2': 'No Basement',
                            'BsmtFinType1': 'No Basement',
                            'Electrical'  : 'No Electrical',
                            'MasVnrType'  : 'No MasVnr',
                            'MasVnrArea'  : 0,
                            'LotFrontage' : 0,
                            'GarageYrBlt' : 0,
                            'BsmtFullBath':0, 
                            'BsmtHalfBath':0, 
                            'BsmtFinSF2':0, 
                            'GarageArea':0, 
                            'BsmtFinSF1':0, 
                            'GarageCars':0, 
                            'TotalBsmtSF':0, 
                            'BsmtUnfSF':0}

In [4]:
df.fillna(replacement_dictionary, inplace=True)

In [5]:
df.isnull().sum().sum()

0

# Feature Engineering

In [6]:
# Bathrooms
df['Total_Bathrooms'] = df['BsmtFullBath'] + df['FullBath'] + 0.5*(df['BsmtHalfBath'] + df['HalfBath'])

# Age - years
df['Age_of_Property'] = df['YearBuilt'].max() - df['YearBuilt']
df['Age_of_Renovation'] = df['YearBuilt'].max() - df['YearRemodAdd']

# Neighborhood
n_median = df.groupby('Neighborhood')['SalePrice'].median().reset_index()
n_median.columns = ['Neighborhood', 'MedianHousePrice']
df = pd.merge(df, n_median, on='Neighborhood', how='left')

In [7]:
# Basement
# Define different mappings for each basement features
mappings = {
    'BsmtQual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'No Basement': 0},
    'BsmtCond': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'No Basement': 0},
    'BsmtExposure': {'Gd': 4, 'Av': 3, 'Mn': 2, 'No': 1, 'No Basement': 0},
    'BsmtFinType1': {'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 'No Basement': 0},
    'BsmtFinType2': {'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 'No Basement': 0}
}

# Convert the categorical str values into grade points by mapping function
for col, mapping in mappings.items():
    df[col] = df[col].map(mapping)

# FinType has a numeric grade and a corresponding sqfoot, combine them
df['BsmtFinGrade'] = (df['BsmtFinType1'] * df.BsmtFinSF1 / df.TotalBsmtSF) + \
                      (df['BsmtFinType2'] * df.BsmtFinSF2 / df.TotalBsmtSF) + \
                      (df['BsmtFinType1'] * df.BsmtUnfSF / df.TotalBsmtSF)

# Because of zero division there will be Nan values where there are no basement, since TotalBsmtSf == 0
df['BsmtFinGrade'].fillna(0.0, inplace=True)

# Combine into one single 'BsmtOverallGrade'
df['BsmtOverallGrade'] = round((df['BsmtQual'] + df['BsmtCond'] + df['BsmtExposure'] + df['BsmtFinGrade']), 2)

In [8]:
# Terrace
terrace_columns = ['WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch']
terrace_weights = [1, 1, 2, 2, 2]

df['Terrace_Grade'] = np.dot(df[terrace_columns], terrace_weights)

In [9]:
# these columns need to be dropped
col_drop_feat_eng = ['1stFlrSF', '2ndFlrSF', 'BedroomAbvGr',
                     'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 
                    'YearBuilt', 'YearRemodAdd', 
                    'GarageType','GarageYrBlt','GarageFinish','GarageCars','GarageQual','GarageCond',
                    'Neighborhood', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1','BsmtFinType2', 
                    'BsmtFinSF2','BsmtUnfSF','BsmtFinGrade',
                    'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch']

In [10]:
df.drop(columns=col_drop_feat_eng, inplace=True)

# High and Low Cardinality

In [11]:
cardinality_drop_columns = ['LowQualFinSF',
                             'PoolArea',
                             'MiscVal',
                             'Id',
                             'Street',
                             'Utilities',
                             'Condition2',
                             'RoofMatl',
                             'Heating',
                             'PoolQC']

In [12]:
df.drop(columns=cardinality_drop_columns, inplace=True)

# Outliers

In [13]:
outlier_columns = ['LotArea', 'MasVnrArea', 'GarageArea', 'GrLivArea', 'KitchenAbvGr', 'LotFrontage', 'OverallCond',
        'Terrace_Grade', 'TotRmsAbvGrd', 'TotalBsmtSF',
       'Total_Bathrooms']

In [14]:
df.drop(columns=outlier_columns, inplace=True)

In [18]:
from data.fatih_eda import FatihOutliers
outliers = FatihOutliers(df)

In [21]:
outliers.show_outlier_stats()

Unnamed: 0,OutlierCount
SalePrice,1
Total,1


In [22]:
df.drop(outliers.show_outliers_for_features('SalePrice').index, inplace=True)

# Transformations

In [23]:
small_constant = 0.000001 
df['SalePrice'] = np.log(df['SalePrice'] + small_constant)

In [24]:
categorical_columns = df.select_dtypes(include='object').columns

ordinal_mapping = {
    'Street': {'Grvl':1, 'Pave':2},
    'Alley' : {'Grvl':1, 'Pave':2, 'No Alley Access':0},
    'LotShape' : {'IR3':1, 'IR2':2, 'IR1':3, 'Reg':4 },
    'LandContour' : {'Low':1, 'HLS':2, 'Bnk':3, 'Lvl':4 },
    'LandSlope' : {'Sev':1, 'Mod':2, 'Gtl':3},
    'ExterQual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1},
    'ExterCond': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1},
    'BsmtQual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'No Basement': 0},
    'BsmtCond': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'No Basement': 0},
    'BsmtExposure': {'Gd': 4, 'Av': 3, 'Mn': 2, 'No':1, 'No Basement': 0},
    'BsmtFinType2': {'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 'No Basement': 0},
    'BsmtFinType1': {'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 'No Basement': 0},    
    'HeatingQC': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1},
    'KitchenQual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1},
    'FireplaceQu': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'No FirePlace': 0},
    'GarageQual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'No Garage': 0},
    'GarageCond': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'No Garage': 0},
    'GarageFinish': {'Fin': 3, 'RFn': 2, 'Unf': 1, 'No Garage': 0},
    'PavedDrive': {'Y': 3, 'P': 2, 'N': 1},
    'PoolQC': {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'No Pool': 0},
    'Fence': {'GdPrv': 4, 'MnPrv': 3, 'GdWo': 2, 'MnWw': 1, 'No Fence': 0},        
    'Functional': {'Typ': 8, 'Min1': 7, 'Min2': 6, 'Mod': 5, 'Maj1': 4, 'Maj2': 3, 'Sev': 2, 'Sal': 1},
    'CentralAir': {'Y': 1, 'N': 0}
}

In [25]:
# Apply Ordinal Encoding
for var in categorical_columns:
    if var in ordinal_mapping.keys():
        df[var] = df[var].map(ordinal_mapping[var])

In [26]:
# Apply Nominal Encoding
nominal_categorical_vars = [col for col in categorical_columns if col not in ordinal_mapping.keys()]
df_encoded = pd.get_dummies(df[nominal_categorical_vars], columns=nominal_categorical_vars)
df = pd.concat([df, df_encoded], axis=1)
df = df.drop(columns=nominal_categorical_vars)

In [27]:
df.shape

(1447, 129)

## Final X and y 

In [28]:
X = df.drop(columns = ['SalePrice'])
y = df['SalePrice']

print(f"X shape is: {X.shape}")
print(f"y shape is: {y.shape}")

X shape is: (1447, 128)
y shape is: (1447,)


# Model Building

In [29]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate

In [39]:
pipeline = Pipeline(
    [
        #('scaler', StandardScaler()),
        ('regressor', LinearRegression())
    ]
)
pipeline

In [40]:
pipeline.fit(X, y)

In [41]:
linear_cv_results = cross_validate(pipeline, X, y, cv=10, scoring=('r2', 'neg_mean_squared_error'))

# Extract and print the results
linear_cv_results = pd.DataFrame(linear_cv_results).iloc[:, 2:]
print(f"r2 Mean: {round(linear_cv_results['test_r2'].mean(), 2)}")
print(f"-RMSE Mean: {round(linear_cv_results['test_neg_mean_squared_error'].mean(), 4)}")
linear_cv_results

r2 Mean: 0.81
-RMSE Mean: -0.0271


Unnamed: 0,test_r2,test_neg_mean_squared_error
0,0.850362,-0.021438
1,0.854072,-0.020496
2,0.827675,-0.031123
3,0.804423,-0.033272
4,0.815326,-0.035754
5,0.774124,-0.023539
6,0.802058,-0.028029
7,0.851215,-0.018947
8,0.75995,-0.026702
9,0.792951,-0.032172


In [42]:
X

Unnamed: 0,MSSubClass,Alley,LotShape,LandContour,LandSlope,OverallQual,ExterQual,ExterCond,HeatingQC,CentralAir,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,60,0,4,4,3,7,4,3,5,1,...,0,0,0,1,0,0,0,0,1,0
1,20,0,4,4,3,6,3,3,5,1,...,0,0,0,1,0,0,0,0,1,0
2,60,0,3,4,3,7,4,3,5,1,...,0,0,0,1,0,0,0,0,1,0
3,70,0,3,4,3,7,3,3,4,1,...,0,0,0,1,1,0,0,0,0,0
4,60,0,3,4,3,8,4,3,5,1,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,0,4,4,3,6,3,3,5,1,...,0,0,0,1,0,0,0,0,1,0
1456,20,0,4,4,3,6,3,3,3,1,...,0,0,0,1,0,0,0,0,1,0
1457,70,0,4,4,3,7,5,4,5,1,...,0,0,0,1,0,0,0,0,1,0
1458,20,0,4,4,3,5,3,3,4,1,...,0,0,0,1,0,0,0,0,1,0
