In [38]:
import pandas as pd
import numpy as np

In [39]:
df = pd.read_csv('data/train.csv')

# Missing Values

In [40]:
replacement_dictionary = {
                            'PoolQC'      : 'No Pool',
                            'MiscFeature' : 'No MiscFeatures',  
                            'Alley'       : 'No Alley Access',
                            'Fence'       : 'No Fence',
                            'FireplaceQu' : 'No FirePlace', 
                            'GarageFinish': 'No Garage',
                            'GarageQual'  : 'No Garage',
                            'GarageCond'  : 'No Garage',
                            'GarageType'  : 'No Garage',
                            'BsmtExposure': 'No Basement',
                            'BsmtCond'    : 'No Basement',
                            'BsmtQual'    : 'No Basement',
                            'BsmtFinType2': 'No Basement',
                            'BsmtFinType1': 'No Basement',
                            'Electrical'  : 'No Electrical',
                            'MasVnrType'  : 'No MasVnr',
                            'MasVnrArea'  : 0,
                            'LotFrontage' : 0,
                            'GarageYrBlt' : 0,
                            'BsmtFullBath':0, 
                            'BsmtHalfBath':0, 
                            'BsmtFinSF2':0, 
                            'GarageArea':0, 
                            'BsmtFinSF1':0, 
                            'GarageCars':0, 
                            'TotalBsmtSF':0, 
                            'BsmtUnfSF':0}

In [41]:
df.fillna(replacement_dictionary, inplace=True)

In [42]:
df.isnull().sum().sum()

0

# Feature Engineering

In [43]:
# Bathrooms
df['Total_Bathrooms'] = df['BsmtFullBath'] + df['FullBath'] + 0.5*(df['BsmtHalfBath'] + df['HalfBath'])

# Age - years
df['Age_of_Property'] = df['YearBuilt'].max() - df['YearBuilt']
df['Age_of_Renovation'] = df['YearBuilt'].max() - df['YearRemodAdd']

# Neighborhood
n_median = df.groupby('Neighborhood')['SalePrice'].median().reset_index()
n_median.columns = ['Neighborhood', 'MedianHousePrice']
df = pd.merge(df, n_median, on='Neighborhood', how='left')

In [44]:
# Basement
# Define different mappings for each basement features
mappings = {
    'BsmtQual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'No Basement': 0},
    'BsmtCond': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'No Basement': 0},
    'BsmtExposure': {'Gd': 4, 'Av': 3, 'Mn': 2, 'No': 1, 'No Basement': 0},
    'BsmtFinType1': {'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 'No Basement': 0},
    'BsmtFinType2': {'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 'No Basement': 0}
}

# Convert the categorical str values into grade points by mapping function
for col, mapping in mappings.items():
    df[col] = df[col].map(mapping)

# FinType has a numeric grade and a corresponding sqfoot, combine them
df['BsmtFinGrade'] = (df['BsmtFinType1'] * df.BsmtFinSF1 / df.TotalBsmtSF) + \
                      (df['BsmtFinType2'] * df.BsmtFinSF2 / df.TotalBsmtSF) + \
                      (df['BsmtFinType1'] * df.BsmtUnfSF / df.TotalBsmtSF)

# Because of zero division there will be Nan values where there are no basement, since TotalBsmtSf == 0
df['BsmtFinGrade'].fillna(0.0, inplace=True)

# Combine into one single 'BsmtOverallGrade'
df['BsmtOverallGrade'] = round((df['BsmtQual'] + df['BsmtCond'] + df['BsmtExposure'] + df['BsmtFinGrade']), 2)

In [45]:
# Terrace
terrace_columns = ['WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch']
terrace_weights = [1, 1, 2, 2, 2]

df['Terrace_Grade'] = np.dot(df[terrace_columns], terrace_weights)

In [46]:
# these columns need to be dropped
col_drop_feat_eng = ['1stFlrSF', '2ndFlrSF', 'BedroomAbvGr',
                     'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 
                    'YearBuilt', 'YearRemodAdd', 
                    'GarageType','GarageYrBlt','GarageFinish','GarageCars','GarageQual','GarageCond',
                    'Neighborhood', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1','BsmtFinType2', 
                    'BsmtFinSF2','BsmtUnfSF','BsmtFinGrade',
                    'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch']

In [47]:
df.drop(columns=col_drop_feat_eng, inplace=True)

# High and Low Cardinality

In [48]:
cardinality_drop_columns = ['LowQualFinSF',
                             'PoolArea',
                             'MiscVal',
                             'Id',
                             'Street',
                             'Utilities',
                             'Condition2',
                             'RoofMatl',
                             'Heating',
                             'PoolQC']

In [49]:
df.drop(columns=cardinality_drop_columns, inplace=True)

# Outliers

In [50]:
outlier_columns = ['LotArea', 'MasVnrArea']

In [51]:
df.drop(columns=outlier_columns, inplace=True)

In [52]:
from data.fatih_eda import FatihOutliers
outliers = FatihOutliers(df)

In [53]:
outliers.show_outlier_stats()

Unnamed: 0,OutlierCount
GarageArea,3
GrLivArea,4
KitchenAbvGr,68
LotFrontage,2
OverallCond,1
SalePrice,12
Terrace_Grade,1
TotRmsAbvGrd,1
TotalBsmtSF,5
Total_Bathrooms,9


# Transformations

In [54]:
small_constant = 0.000001 
df['SalePrice'] = np.log(df['SalePrice'] + small_constant)

In [56]:
categorical_columns = df.select_dtypes(include='object').columns

ordinal_mapping = {
    'Street': {'Grvl':1, 'Pave':2},
    'Alley' : {'Grvl':1, 'Pave':2, 'No Alley Access':0},
    'LotShape' : {'IR3':1, 'IR2':2, 'IR1':3, 'Reg':4 },
    'LandContour' : {'Low':1, 'HLS':2, 'Bnk':3, 'Lvl':4 },
    'LandSlope' : {'Sev':1, 'Mod':2, 'Gtl':3},
    'ExterQual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1},
    'ExterCond': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1},
    'BsmtQual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'No Basement': 0},
    'BsmtCond': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'No Basement': 0},
    'BsmtExposure': {'Gd': 4, 'Av': 3, 'Mn': 2, 'No':1, 'No Basement': 0},
    'BsmtFinType2': {'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 'No Basement': 0},
    'BsmtFinType1': {'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 'No Basement': 0},    
    'HeatingQC': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1},
    'KitchenQual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1},
    'FireplaceQu': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'No FirePlace': 0},
    'GarageQual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'No Garage': 0},
    'GarageCond': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'No Garage': 0},
    'GarageFinish': {'Fin': 3, 'RFn': 2, 'Unf': 1, 'No Garage': 0},
    'PavedDrive': {'Y': 3, 'P': 2, 'N': 1},
    'PoolQC': {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'No Pool': 0},
    'Fence': {'GdPrv': 4, 'MnPrv': 3, 'GdWo': 2, 'MnWw': 1, 'No Fence': 0},        
    'Functional': {'Typ': 8, 'Min1': 7, 'Min2': 6, 'Mod': 5, 'Maj1': 4, 'Maj2': 3, 'Sev': 2, 'Sal': 1},
    'CentralAir': {'Y': 1, 'N': 0}
}

In [58]:
# Apply Ordinal Encoding
for var in categorical_columns:
    if var in ordinal_mapping.keys():
        df[var] = df[var].map(ordinal_mapping[var])

In [59]:
# Apply Nominal Encoding
nominal_categorical_vars = [col for col in categorical_columns if col not in ordinal_mapping.keys()]
df_encoded = pd.get_dummies(df[nominal_categorical_vars], columns=nominal_categorical_vars)
df = pd.concat([df, df_encoded], axis=1)
df = df.drop(columns=nominal_categorical_vars)

In [60]:
df.shape

(1460, 138)

## Final X and y 

In [61]:
X = df.drop(columns = ['SalePrice'])
y = df['SalePrice']

print(f"X shape is: {X.shape}")
print(f"y shape is: {y.shape}")

X shape is: (1460, 137)
y shape is: (1460,)


# Model Building

In [16]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate

In [17]:
pipeline = Pipeline(
    [
        ('onehot', OneHotEncoder()),
        ('scaler', StandardScaler()),
        ('regressor', LinearRegression())
    ]
)
pipeline

In [19]:
pipeline.fit(X, y)

ValueError: Cannot center sparse matrices: pass `with_mean=False` instead. See docstring for motivation and alternatives.

In [18]:
linear_cv_results = cross_validate(pipeline, X, y, cv=10, scoring=('r2', 'neg_mean_squared_error'))

ValueError: 
All the 10 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\fcali\anaconda3\envs\techpro\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\fcali\anaconda3\envs\techpro\lib\site-packages\sklearn\pipeline.py", line 378, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "C:\Users\fcali\anaconda3\envs\techpro\lib\site-packages\sklearn\pipeline.py", line 336, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "C:\Users\fcali\anaconda3\envs\techpro\lib\site-packages\joblib\memory.py", line 353, in __call__
    return self.func(*args, **kwargs)
  File "C:\Users\fcali\anaconda3\envs\techpro\lib\site-packages\sklearn\pipeline.py", line 870, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "C:\Users\fcali\anaconda3\envs\techpro\lib\site-packages\sklearn\base.py", line 870, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
  File "C:\Users\fcali\anaconda3\envs\techpro\lib\site-packages\sklearn\preprocessing\_data.py", line 809, in fit
    return self.partial_fit(X, y, sample_weight)
  File "C:\Users\fcali\anaconda3\envs\techpro\lib\site-packages\sklearn\preprocessing\_data.py", line 872, in partial_fit
    raise ValueError(
ValueError: Cannot center sparse matrices: pass `with_mean=False` instead. See docstring for motivation and alternatives.
