In [1]:
import pandas as pd
import numpy as np

In [2]:
train = pd.read_csv("./data/train.csv")
test = pd.read_csv("./data/test.csv")

In [3]:
train = train.drop( columns = {"Id"})

There are **3** types of attributes:
- numerical
- categorical
- date

And **categorical** is further divided into two categories:
- Relative Categories
- Unrelative Categories

## Attributes Separation

In [4]:
numeric = [ var for var in train.columns if train[var].dtype == "int64" or "float64"]
categorical = [ var for var in train.columns if train[var].dtype == "O"]

In [5]:
numeric

['MSSubClass',
 'MSZoning',
 'LotFrontage',
 'LotArea',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'MasVnrArea',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinSF1',
 'BsmtFinType2',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'KitchenQual',
 'TotRmsAbvGrd',
 'Functional',
 'Fireplaces',
 'FireplaceQu',
 'GarageType',
 'GarageYrBlt',
 'GarageFinish',
 'GarageCars',
 'GarageArea',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'WoodDeckSF',
 'OpenPorchSF',
 'Enc

Among all these numeric attributes:
- "SalePrice" is the target variable
- "YearBuilt", "YearRemodAdd", "MoSold", "YrSold","GarageYrBlt" are date variables
- "MSSubClass", "OverallQual", "OverallCond" are categorical variables

In [6]:
date_var = ["YearBuilt", "YearRemodAdd", "MoSold", "YrSold","GarageYrBlt"]
categorical.extend(["MSSubClass", "OverallQual", "OverallCond"])
numeric = list(set(numeric) - set(categorical) - set(date_var))
len(numeric)

29

In [7]:
print(len(categorical))
print(len(date_var))

46
5


All the attributes are now seperated into their correct list.
Categorical will be further divided later.

## Data Preprocessing

### Handling Missing Values

In [8]:
train[numeric].isnull().sum()

BedroomAbvGr       0
BsmtFinSF2         0
2ndFlrSF           0
3SsnPorch          0
BsmtFullBath       0
EnclosedPorch      0
WoodDeckSF         0
BsmtUnfSF          0
LotFrontage      259
1stFlrSF           0
GrLivArea          0
MiscVal            0
TotalBsmtSF        0
KitchenAbvGr       0
BsmtFinSF1         0
SalePrice          0
TotRmsAbvGrd       0
PoolArea           0
HalfBath           0
OpenPorchSF        0
LotArea            0
MasVnrArea         8
GarageArea         0
LowQualFinSF       0
Fireplaces         0
BsmtHalfBath       0
FullBath           0
GarageCars         0
ScreenPorch        0
dtype: int64

In the numeric attributes only lot frontage had missing values and corresponding rows are dropped.

In [9]:
train = train.dropna(subset = numeric,ignore_index = True)

In [10]:
train[date_var].isnull().sum()

YearBuilt        0
YearRemodAdd     0
MoSold           0
YrSold           0
GarageYrBlt     74
dtype: int64

In [11]:
train[categorical].isnull().sum()

MSZoning            0
Street              0
Alley            1110
LotShape            0
LandContour         0
Utilities           0
LotConfig           0
LandSlope           0
Neighborhood        0
Condition1          0
Condition2          0
BldgType            0
HouseStyle          0
RoofStyle           0
RoofMatl            0
Exterior1st         0
Exterior2nd         0
MasVnrType        728
ExterQual           0
ExterCond           0
Foundation          0
BsmtQual           31
BsmtCond           31
BsmtExposure       32
BsmtFinType1       31
BsmtFinType2       32
Heating             0
HeatingQC           0
CentralAir          0
Electrical          1
KitchenQual         0
Functional          0
FireplaceQu       596
GarageType         74
GarageFinish       74
GarageQual         74
GarageCond         74
PavedDrive          0
PoolQC           1189
Fence             967
MiscFeature      1156
SaleType            0
SaleCondition       0
MSSubClass          0
OverallQual         0
OverallCon

All the garage related attributes have the same missing rows so they are dropped.

In [12]:
train = train.dropna(subset=date_var, ignore_index=True)

As for Categorical Attributes, all missing values signify no existence of that particular item.
Even missing values in garage were related to that. Maybe they also need imputation instead of dropping.
If garage related features were imputed, the problem will 

In [13]:
# Electrical had 1 missing value which can't signify no existence. so that is dropped and other values are imputed as None.

train = train.dropna( subset =["Electrical"], ignore_index = True)
train = train.fillna("None")
train.isnull().sum()

MSSubClass       0
MSZoning         0
LotFrontage      0
LotArea          0
Street           0
                ..
MoSold           0
YrSold           0
SaleType         0
SaleCondition    0
SalePrice        0
Length: 80, dtype: int64

### Handling Numerical Attributes

In [14]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import MinMaxScaler

- Almost all numerical attributes have power law distribution except for attributes that signify actual units like bathrooms, fireplaces and kitchens.
- These attributes are "FullBath", "GarageCars", "BsmtHalfBath", "BsmtFullBath", "KitchenAbvGr", "Fireplaces", "TotRmsAbvGrd", "HalfBath", "BedroomAbvGr"
- These attributes are seperated and others are passed to a log transformer and a min max scaler while the seperated ones are left untouched.

In [15]:
unit_numeric = ["FullBath", "GarageCars", "BsmtHalfBath", "BsmtFullBath", "KitchenAbvGr", "Fireplaces", "TotRmsAbvGrd", "HalfBath", "BedroomAbvGr"]
numeric = list(set(numeric) - set(unit_numeric))
len(numeric)

20

In [16]:
log_transformer = FunctionTransformer(np.log1p, inverse_func = np.expm1) 

In [17]:
min_max_scaler = MinMaxScaler()

### Handling Categorical Attributes

In [18]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

Three types of categories are identified to be **relative**:
- "None", "No", "Mn", "Av", "Gd"
- "None", "Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"
- "None","Po","Fa","TA","Gd","Ex"

The relative attributes either have all of the categories from one of the list or it's subset.

They will be **ordinally encoded** from 0 to n such that the closer they are, the more similar they are.
Meanwhile, other unrelative categories would be encoded using **OneHotEncoder**

In [19]:
rating_1 = ["None", "No", "Mn", "Av", "Gd"]
rating_2 = ["None", "Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"]
rating_3 = ["None","Po","Fa","TA","Gd","Ex"]

rating_3_cat = list()
rating_2_cat = list()
rating_1_cat = list()

for i in categorical:
    if set(train[i].value_counts().index.tolist()).issubset(set(rating_3)):
        print(i)
        rating_3_cat.append(i)

for i in categorical:
    if set(train[i].value_counts().index.tolist()).issubset(set(rating_2)):
        print(i)
        rating_2_cat.append(i)

for i in categorical:
    if set(train[i].value_counts().index.tolist()).issubset(set(rating_1)):
        print(i)
        rating_1_cat.append(i)

ExterQual
ExterCond
BsmtQual
BsmtCond
HeatingQC
KitchenQual
FireplaceQu
GarageQual
GarageCond
PoolQC
BsmtFinType1
BsmtFinType2
BsmtExposure


In [20]:
rating_1_encoder = OrdinalEncoder(categories = [rating_1 for i in rating_1_cat])
rating_2_encoder = OrdinalEncoder(categories = [rating_2 for i in rating_2_cat])
rating_3_encoder = OrdinalEncoder(categories = [rating_3 for i in rating_3_cat])

In [21]:
unrelative_cat = list((((set(categorical) - set(rating_1_cat)) - set(rating_2_cat)) - set(rating_3_cat))) 

In [22]:
unrelative_encoder = OneHotEncoder()

## Final Data Preparation before Pipeline and Training

In [23]:
train[date_var].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1120 entries, 0 to 1119
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   YearBuilt     1120 non-null   int64  
 1   YearRemodAdd  1120 non-null   int64  
 2   MoSold        1120 non-null   int64  
 3   YrSold        1120 non-null   int64  
 4   GarageYrBlt   1120 non-null   float64
dtypes: float64(1), int64(4)
memory usage: 43.9 KB


In [24]:
train["GarageYrBlt"] = train["GarageYrBlt"].astype("int64")

In [27]:
X_train = train.drop( columns = {"SalePrice"})
y_train = train["SalePrice"]
numeric.remove("SalePrice")

In [28]:
y_train = log_transformer.transform(y_train)

## Pipeline

In [30]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [32]:
num_pipeline = Pipeline([
    ("Logarithm", log_transformer),
    ("Scaling", min_max_scaler),
])
rating_1_pipeline = Pipeline([
    ("rating1", rating_1_encoder)
])
rating_2_pipeline = Pipeline([
    ("rating2", rating_2_encoder)
])
rating_3_pipeline = Pipeline([
    ("rating3", rating_3_encoder)
])
unrelative_cat_pipeline = Pipeline([
    ("onehot", unrelative_encoder)
])

In [33]:
preprocessing = ColumnTransformer([
     ("numeric", num_pipeline, numeric),
    ("ordinal1", rating_1_pipeline, rating_1_cat),
    ("ordinal2",rating_2_pipeline, rating_2_cat),
    ("ordinal3",rating_3_pipeline, rating_3_cat),
    ("unrelative", unrelative_cat_pipeline, unrelative_cat),
], remainder = "passthrough")

In [34]:
pipeline = Pipeline(steps=[('preprocessor', preprocessing)])

In [35]:
X_train_transformed = pipeline.fit_transform(X_train)
print(type(X_train_transformed))

<class 'scipy.sparse._csr.csr_matrix'>


In [38]:
X_train_transformed.toarray().shape

(1120, 264)

## Training Model

In [40]:
y_train

0       12.247699
1       12.109016
2       12.317171
3       11.849405
4       12.429220
          ...    
1115    12.072547
1116    12.254868
1117    12.493133
1118    11.864469
1119    11.901590
Name: SalePrice, Length: 1120, dtype: float64

In [42]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

linear = LinearRegression()
linear.fit(X_train_transformed, y_train)
y_pred = linear.predict(X_train_transformed)

root_mean_squared_error(y_train, y_pred)

0.08912741487654186