# Data Cleaning

In [2]:
import pandas as pd

# transformers
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import RobustScaler

# one modeling class
from sklearn import svm
from sklearn import linear_model

# two composition functions
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

In [3]:
# create DataFrame with training & test data
df_new = pd.read_csv('./data/test.csv', index_col='Id',)  # nrows=10)
df_train = pd.read_csv('./data/train.csv', index_col='Id',)  # nrows=10)

# concat test and training sets while dropping the target
all_train = pd.concat([df_train, df_new]).drop('SalePrice', axis=1)
y_train = df_train.loc[:, 'SalePrice']

print(f'{all_train.shape = }')
print(f'{y_train.shape = }')

all_train.shape = (2919, 79)
y_train.shape = (1460,)


## Add Features

In [4]:
# add total square footage  
all_train['TotalSF'] = all_train['TotalBsmtSF'] + all_train['1stFlrSF'] + all_train['2ndFlrSF']

In [5]:
# add 'TotalBaths'
all_train['TotalBaths'] = (all_train['BsmtFullBath'] + (all_train['BsmtHalfBath'] / 2) + 
                         all_train['FullBath'] + (all_train['HalfBath'] / 2)
)

## Missing Values

In [6]:
all_train["PoolQC"] = all_train["PoolQC"].fillna("None")
all_train["MiscFeature"] = all_train["MiscFeature"].fillna("None")
all_train["Alley"] = all_train["Alley"].fillna("None")
all_train["Fence"] = all_train["Fence"].fillna("None")
all_train["FireplaceQu"] = all_train["FireplaceQu"].fillna("None")
all_train["MasVnrType"] = all_train["MasVnrType"].fillna("None")
all_train["MasVnrArea"] = all_train["MasVnrArea"].fillna(0)
all_train['MSSubClass'] = all_train['MSSubClass'].fillna("None")


# garage
for col in ('GarageType', 'GarageFinish', 'GarageQual', 'GarageCond'):
    all_train[col] = all_train[col].fillna('None')

# basement
for col in ('BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath'):
    all_train[col] = all_train[col].fillna(0)

# basement (categorical)
for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
    all_train[col] = all_train[col].fillna('None')

In [8]:
# features to keep
cols = [
    'YearBuilt',
    'LotArea',
    'LandContour',
    'LotConfig',
    'Neighborhood',
    'Condition1',
    'BldgType',
    'HouseStyle',
    'Fireplaces',
    'GarageArea',
    'OverallCond',
    'GrLivArea',
    'TotalBsmtSF',
    'TotRmsAbvGrd',
    'TotalBaths',
    'TotalSF',
    'MoSold',
    'YrSold',
]


In [9]:
# categorical data
cat = [
    'YearBuilt',
    'LandContour',
    'LotConfig',
    'Neighborhood',
    'Condition1',
    'OverallCond',
    'BldgType',
    'HouseStyle',
    'Fireplaces',
    'MoSold',
    'YrSold',
]

# remainder data
rem = [
    'TotalBaths',
    'LotArea',
    'GarageArea',
    'GrLivArea',
    'TotalBsmtSF',
    'TotRmsAbvGrd',
    'TotalSF',
]

## Scikit-Learn Pipeline

In [10]:
# filter selected features
filtered_train = all_train[cols]

# create final feature matrices
X_train = filtered_train.iloc[:df_train.shape[0]]
X_test = filtered_train.iloc[df_train.shape[0]:]

In [11]:
# todo: impute missing values
filtered_train.isna().sum()

YearBuilt       0
LotArea         0
LandContour     0
LotConfig       0
Neighborhood    0
Condition1      0
BldgType        0
HouseStyle      0
Fireplaces      0
GarageArea      1
OverallCond     0
GrLivArea       0
TotalBsmtSF     0
TotRmsAbvGrd    0
TotalBaths      2
TotalSF         1
MoSold          0
YrSold          0
dtype: int64

In [8]:
ohe = OneHotEncoder(handle_unknown='ignore',)  # sparse_output=False)
scaler = RobustScaler()

ct = make_column_transformer(
    (ohe, cat),
    # ('passthrough', pas)
    remainder=scaler,
    n_jobs=-1
)

In [9]:
linear_svc = svm.SVC(kernel='linear', random_state=1)

# two-step modeling pipeline
pipe = make_pipeline(ct, linear_svc)
pipe.fit(X_train, y_train)
# pipe.predict(X_test)

In [10]:
predictions = pd.Series(pipe.predict(X_test), index=X_test.index,
                        name='Prediction')

pd.concat([X_test, predictions], axis='columns')

ValueError: Input X contains NaN.
SVC does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [11]:
X_test.isna().sum()

YearBuilt       0
LotArea         0
LandContour     0
LotConfig       0
Neighborhood    0
Condition1      0
BldgType        0
HouseStyle      0
Fireplaces      0
GarageArea      1
OverallCond     0
GrLivArea       0
TotalBsmtSF     1
TotRmsAbvGrd    0
TotalBaths      2
TotalSF         1
MoSold          0
YrSold          0
dtype: int64