In [41]:
import pandas as pd
import numpy as np

In [42]:
train = pd.read_csv("./data/train.csv")
test = pd.read_csv("./data/test.csv")

In [43]:
train = train.drop( columns = {"Id"})

There are **3** types of attributes:
- numerical
- categorical
- date

And **categorical** is further divided into two categories:
- Relative Categories
- Unrelative Categories

## Attributes Separation

In [44]:
numeric = [ var for var in train.columns if train[var].dtype == "int64" or "float64"]
categorical = [ var for var in train.columns if train[var].dtype == "O"]

In [45]:
numeric

['MSSubClass',
 'MSZoning',
 'LotFrontage',
 'LotArea',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'MasVnrArea',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinSF1',
 'BsmtFinType2',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'KitchenQual',
 'TotRmsAbvGrd',
 'Functional',
 'Fireplaces',
 'FireplaceQu',
 'GarageType',
 'GarageYrBlt',
 'GarageFinish',
 'GarageCars',
 'GarageArea',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'WoodDeckSF',
 'OpenPorchSF',
 'Enc

Among all these numeric attributes:
- "SalePrice" is the target variable
- "YearBuilt", "YearRemodAdd", "MoSold", "YrSold","GarageYrBlt" are date variables
- "MSSubClass", "OverallQual", "OverallCond" are categorical variables

In [46]:
date_var = ["YearBuilt", "YearRemodAdd", "MoSold", "YrSold","GarageYrBlt"]
categorical.extend(["MSSubClass", "OverallQual", "OverallCond"])
numeric = list(set(numeric) - set(categorical) - set(date_var))
len(numeric)

29

In [47]:
print(len(categorical))
print(len(date_var))

46
5


All the attributes are now seperated into their correct list.
Categorical will be further divided later.

## Data Preprocessing

### Handling Missing Values

In [48]:
train[numeric].isnull().sum()

BsmtHalfBath       0
GarageCars         0
OpenPorchSF        0
GarageArea         0
ScreenPorch        0
LotFrontage      259
2ndFlrSF           0
LotArea            0
SalePrice          0
LowQualFinSF       0
KitchenAbvGr       0
BsmtFinSF1         0
EnclosedPorch      0
TotRmsAbvGrd       0
TotalBsmtSF        0
BsmtFinSF2         0
Fireplaces         0
BsmtUnfSF          0
FullBath           0
BedroomAbvGr       0
1stFlrSF           0
WoodDeckSF         0
3SsnPorch          0
HalfBath           0
GrLivArea          0
PoolArea           0
MiscVal            0
BsmtFullBath       0
MasVnrArea         8
dtype: int64

In the numeric attributes only lot frontage had missing values and corresponding rows are dropped.

In [49]:
train = train.dropna(subset = numeric,ignore_index = True)

In [50]:
train[date_var].isnull().sum()

YearBuilt        0
YearRemodAdd     0
MoSold           0
YrSold           0
GarageYrBlt     74
dtype: int64

In [51]:
train[categorical].isnull().sum()

MSZoning            0
Street              0
Alley            1110
LotShape            0
LandContour         0
Utilities           0
LotConfig           0
LandSlope           0
Neighborhood        0
Condition1          0
Condition2          0
BldgType            0
HouseStyle          0
RoofStyle           0
RoofMatl            0
Exterior1st         0
Exterior2nd         0
MasVnrType        728
ExterQual           0
ExterCond           0
Foundation          0
BsmtQual           31
BsmtCond           31
BsmtExposure       32
BsmtFinType1       31
BsmtFinType2       32
Heating             0
HeatingQC           0
CentralAir          0
Electrical          1
KitchenQual         0
Functional          0
FireplaceQu       596
GarageType         74
GarageFinish       74
GarageQual         74
GarageCond         74
PavedDrive          0
PoolQC           1189
Fence             967
MiscFeature      1156
SaleType            0
SaleCondition       0
MSSubClass          0
OverallQual         0
OverallCon

All the garage related attributes have the same missing rows so they are dropped.

In [52]:
train = train.dropna(subset=date_var, ignore_index=True)

As for Categorical Attributes, all missing values signify no existence of that particular item.
Even missing values in garage were related to that. Maybe they also need imputation instead of dropping.
If garage related features were imputed, the problem will 

In [53]:
# Electrical had 1 missing value which can't signify no existence. so that is dropped and other values are imputed as None.

train = train.dropna( subset =["Electrical"], ignore_index = True)
train = train.fillna("None")
train.isnull().sum()

MSSubClass       0
MSZoning         0
LotFrontage      0
LotArea          0
Street           0
                ..
MoSold           0
YrSold           0
SaleType         0
SaleCondition    0
SalePrice        0
Length: 80, dtype: int64

### Handling Numerical Attributes

In [54]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import MinMaxScaler

- Almost all numerical attributes have power law distribution except for attributes that signify actual units like bathrooms, fireplaces and kitchens.
- These attributes are "FullBath", "GarageCars", "BsmtHalfBath", "BsmtFullBath", "KitchenAbvGr", "Fireplaces", "TotRmsAbvGrd", "HalfBath", "BedroomAbvGr"
- These attributes are seperated and others are passed to a log transformer and a min max scaler while the seperated ones are left untouched.

In [55]:
unit_numeric = ["FullBath", "GarageCars", "BsmtHalfBath", "BsmtFullBath", "KitchenAbvGr", "Fireplaces", "TotRmsAbvGrd", "HalfBath", "BedroomAbvGr"]
numeric = list(set(numeric) - set(unit_numeric))
len(numeric)

20

In [56]:
log_transformer = FunctionTransformer(np.log1p, inverse_func = np.expm1) 

In [57]:
min_max_scaler = MinMaxScaler()

### Handling Categorical Attributes

In [58]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

Three types of categories are identified to be **relative**:
- "None", "No", "Mn", "Av", "Gd"
- "None", "Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"
- "None","Po","Fa","TA","Gd","Ex"

The relative attributes either have all of the categories from one of the list or it's subset.

They will be **ordinally encoded** from 0 to n such that the closer they are, the more similar they are.
Meanwhile, other unrelative categories would be encoded using **OneHotEncoder**

In [59]:
rating_1 = ["None", "No", "Mn", "Av", "Gd"]
rating_2 = ["None", "Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"]
rating_3 = ["None","Po","Fa","TA","Gd","Ex"]

rating_3_cat = list()
rating_2_cat = list()
rating_1_cat = list()

for i in categorical:
    if set(train[i].value_counts().index.tolist()).issubset(set(rating_3)):
        print(i)
        rating_3_cat.append(i)

for i in categorical:
    if set(train[i].value_counts().index.tolist()).issubset(set(rating_2)):
        print(i)
        rating_2_cat.append(i)

for i in categorical:
    if set(train[i].value_counts().index.tolist()).issubset(set(rating_1)):
        print(i)
        rating_1_cat.append(i)

ExterQual
ExterCond
BsmtQual
BsmtCond
HeatingQC
KitchenQual
FireplaceQu
GarageQual
GarageCond
PoolQC
BsmtFinType1
BsmtFinType2
BsmtExposure


In [60]:
rating_1_encoder = OrdinalEncoder(categories = [rating_1 for i in rating_1_cat])
rating_2_encoder = OrdinalEncoder(categories = [rating_2 for i in rating_2_cat])
rating_3_encoder = OrdinalEncoder(categories = [rating_3 for i in rating_3_cat])

In [61]:
unrelative_cat = list((((set(categorical) - set(rating_1_cat)) - set(rating_2_cat)) - set(rating_3_cat))) 

In [62]:
unrelative_encoder = OneHotEncoder()

## Final Data Preparation before Pipeline and Training

In [63]:
train[date_var].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1120 entries, 0 to 1119
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   YearBuilt     1120 non-null   int64  
 1   YearRemodAdd  1120 non-null   int64  
 2   MoSold        1120 non-null   int64  
 3   YrSold        1120 non-null   int64  
 4   GarageYrBlt   1120 non-null   float64
dtypes: float64(1), int64(4)
memory usage: 43.9 KB


In [64]:
train["GarageYrBlt"] = train["GarageYrBlt"].astype("int64")

In [65]:
X_train = train.drop( columns = {"SalePrice"})
y_train = train["SalePrice"]
numeric.remove("SalePrice")

In [66]:
# y_train = log_transformer.transform(y_train)

## Pipeline

In [67]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [68]:
num_pipeline = Pipeline([
    ("Logarithm", log_transformer),
    ("Scaling", min_max_scaler),
])
rating_1_pipeline = Pipeline([
    ("rating1", rating_1_encoder)
])
rating_2_pipeline = Pipeline([
    ("rating2", rating_2_encoder)
])
rating_3_pipeline = Pipeline([
    ("rating3", rating_3_encoder)
])
unrelative_cat_pipeline = Pipeline([
    ("onehot", unrelative_encoder)
])

In [69]:
preprocessing = ColumnTransformer([
     ("numeric", num_pipeline, numeric),
    ("ordinal1", rating_1_pipeline, rating_1_cat),
    ("ordinal2",rating_2_pipeline, rating_2_cat),
    ("ordinal3",rating_3_pipeline, rating_3_cat),
    ("unrelative", unrelative_cat_pipeline, unrelative_cat),
], remainder = "passthrough")

In [70]:
pipeline = Pipeline(steps=[('preprocessor', preprocessing)])

In [71]:
X_train_transformed = pipeline.fit_transform(X_train)
print(type(X_train_transformed))

<class 'scipy.sparse._csr.csr_matrix'>


In [72]:
X_train_transformed.toarray().shape

(1120, 264)

## Training Model

In [73]:
y_train

0       208500
1       181500
2       223500
3       140000
4       250000
         ...  
1115    175000
1116    210000
1117    266500
1118    142125
1119    147500
Name: SalePrice, Length: 1120, dtype: int64

In [74]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

linear = LinearRegression()
linear.fit(X_train_transformed, y_train)
y_pred = linear.predict(X_train_transformed)

# y_pred_real = log_transformer.inverse_transform(y_pred)
# y_train_real = log_transformer.inverse_transform(y_train)
# root_mean_squared_error(y_train_real, y_pred_real)

root_mean_squared_error(y_train, y_pred)

22934.713139517477

In [75]:
print(y_train_real.min())
print(y_train_real.max())

35311.00000000002
755000.0000000005


After the data processing, a simple linear model was fitted to see the baseline prediction capability within the features. The RMSE was 18,744 where the range of the salesprice goes from 35311 to 755000. The score isn't great but it could have been worst.

In [76]:
from sklearn.tree import DecisionTreeRegressor

decision_tree = DecisionTreeRegressor()
decision_tree.fit(X_train_transformed,y_train)
y_pred = decision_tree.predict(X_train_transformed)

# y_train_real = log_transformer.inverse_transform(y_train)
# y_pred_real = log_transformer.inverse_transform(y_pred)
root_mean_squared_error(y_train, y_pred)

0.0

HAHAHA!!! Exactly similar to the case from the book, the decision tree has a low rmse but it is definitely overfitting. Will do Cross Validation later.

In [77]:
from sklearn.ensemble import RandomForestRegressor

random_forest = RandomForestRegressor()
random_forest.fit(X_train_transformed, y_train)
y_pred = random_forest.predict(X_train_transformed)

# y_train_real = log_transformer.inverse_transform(y_train)
# y_pred_real = log_transformer.inverse_transform(y_pred)

root_mean_squared_error(y_train, y_pred)

11844.37157480636

okay this is better than linear regression.

## Evaluation of models using Cross-validation

In [78]:
from sklearn.model_selection import cross_val_score

linear_rmse = -cross_val_score(linear, X_train_transformed, y_train, scoring="neg_root_mean_squared_error", cv = 10)
decision_rmse = -cross_val_score(decision_tree, X_train_transformed, y_train, scoring="neg_root_mean_squared_error", cv = 10)
random_rmse = -cross_val_score(random_forest, X_train_transformed, y_train, scoring="neg_root_mean_squared_error", cv = 10)

print(linear_rmse)
print(decision_rmse)
print(random_rmse)

[26115.65479142 30182.03147336 31045.00779374 39183.2252978
 43289.82206029 45060.40064627 30057.10114026 27524.76926888
 57974.9140693  30314.90353985]
[54580.7531688  41209.91761418 59195.32286347 50025.85993014
 47936.45669008 37656.17796282 36427.96008955 32707.36761146
 49744.29924822 36521.31554923]
[21453.33992232 27731.15709151 27043.60987082 47247.78001184
 41352.40742818 28467.09951392 25415.35557511 24292.14087295
 42872.71722387 23224.25683696]


In [80]:
pd.Series(linear_rmse).describe()

count       10.000000
mean     36074.783008
std      10131.222930
min      26115.654791
25%      30088.333724
50%      30679.955667
75%      42263.172870
max      57974.914069
dtype: float64

In [81]:
pd.Series(decision_rmse).describe()

count       10.000000
mean     44600.543073
std       8899.850194
min      32707.367611
25%      36805.031153
50%      44573.187152
75%      49955.469760
max      59195.322863
dtype: float64

In [83]:
pd.Series(random_rmse).describe()

count       10.000000
mean     30909.986435
std       9265.117102
min      21453.339922
25%      24572.944548
50%      27387.383481
75%      38131.080450
max      47247.780012
dtype: float64

It seems that random forest is just slightly better than linear regression with an mean RMSE of 30,909 and std of 9265.