# Import and Explore Data

In [1]:
import pandas as pd
import numpy as np

In [2]:
train_set = pd.read_csv("../../datasets/housing/train.csv")

In [4]:
train_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [5]:
train_set.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [6]:
# Before working on data, the labels have to dropped

X = train_set.drop(columns="SalePrice", axis=0)
y = train_set["SalePrice"].copy()

# Preparing Data to Training

In [7]:
from sklearn.base import BaseEstimator, TransformerMixin

<b>DropNanValues</b> transformer drops columns which number of nan values of columns is greater than or equal the value that specified with the `threshold` hyperparameter. It has also two additional hyperparameter to control dropping id column if data has id column.

In [8]:
class DropNanValues(BaseEstimator, TransformerMixin):
    def __init__(self, nan_value_threshold, has_id=False, id_column_name=None):
        self.threshold = nan_value_threshold
        self.has_id = has_id
        self.name = id_column_name
        
    def fit(self, X, y=None):
        self.columns = list()
        
        for (column, value) in zip(X.keys(), X.isna().sum()):
            if value >= self.threshold:
                self.columns.append(column)
        
        return self

    def transform(self, X):
        if self.has_id:
            self.columns.append(self.name)
            
        return X.drop(columns=self.columns, axis=0)

<b>FeatureDecomposition</b> transformer returns the columns that composed of numbers or the columns that composed of objects. This property can control with the `return_num_features` hyperparameter.

In [9]:
class FeatureDecomposition(BaseEstimator, TransformerMixin):
    def __init__(self, return_num_features=True):
        self.columns = list()
        self.return_type = return_num_features
        
    def fit(self, X, y=None):
        compare_type = np.object
        
        if not self.return_type:
            for (column, dtype) in zip(X.keys(), X.dtypes):
                if dtype != compare_type:
                    self.columns.append(column)
                    
        else:
            for (column, dtype) in zip(X.keys(), X.dtypes):
                if dtype == compare_type:
                    self.columns.append(column)
                    
        return self
                    
    def transform(self, X):
        return X.drop(columns=self.columns, axis=0)

In [11]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA

In [12]:
num_pipeline = Pipeline([
    ("num_columns", FeatureDecomposition(return_num_features=True)), # explained above
    ("imputer", SimpleImputer(strategy="median")) # this transformer fill nan values according to the strategy
])

In [13]:
cat_pipeline = Pipeline([
    ("cat_columns", FeatureDecomposition(return_num_features=False)), # explained above
    ("imputer", SimpleImputer(strategy="most_frequent")), # explained above
    ("encoder", OneHotEncoder(sparse=False)) # this transformer categorizes columns
])

In [14]:
# This pipeline merges the numeric columns with the object columns

merge_columns = FeatureUnion([
    ("num", num_pipeline),
    ("cat", cat_pipeline)
])

In [15]:
full_pipeline = Pipeline([
    ("drop_nan", DropNanValues(nan_value_threshold=300, has_id=True, id_column_name="Id")),
    ("merge", merge_columns),
    ("std_scaler", StandardScaler()), 
    ("pca", PCA(n_components=0.95)) # since the dataseth as unnecessary columns, dimensionality reduction
])                                  # might be applied.
                                    # n_components=0.95 indicates that amount of preserving variance while doing 
                                    # dimensionality reduction.

In [16]:
X_prepared = full_pipeline.fit_transform(X)

# Selecting and Training the models 

Let's train a `RandomForestRegressor` and see how does it perform on the dataset

In [17]:
from sklearn.ensemble import RandomForestRegressor

In [18]:
forest_reg = RandomForestRegressor(random_state=42)
forest_reg.fit(X_prepared, y)
y_pred = forest_reg.predict(X_prepared)

In [19]:
from sklearn.metrics import mean_squared_error

In [22]:
score = mean_squared_error(y, y_pred)
print("Error rate:", np.sqrt(score))
print("Mean accuracy score:", forest_reg.score(X_prepared, y))

Error rate: 12525.4561462011
Mean accuracy score: 0.9751240984936682


It seems good. But how about `GradientBoostingRegressor`?

In [21]:
from sklearn.ensemble import GradientBoostingRegressor

gbt_reg = GradientBoostingRegressor(random_state=42)
gbt_reg.fit(X_prepared, y)
y_boost_pred = gbt_reg.predict(X_prepared)

In [24]:
gbt_score = mean_squared_error(y, y_boost_pred)
print("Error rate:", np.sqrt(gbt_score))
print("Mean accuracy score:", gbt_reg.score(X_prepared, y))

Error rate: 14183.693611779376
Mean accuracy score: 0.9681014890938049


Almost same, but the `RandomForestRegressor` is little bit more precise than `GradientBoostingRegressor`.
So let's continue with it.

# Fine tune model

Due to that the `RandomForestRegressor` was selected, let's find the best hyperparameter. At this point I searched just for the best number of estimator, because the model didn't make overfitting. You can also search the other hyperparameters just adjust the `param_grid` variable.

In [25]:
from sklearn.model_selection import GridSearchCV

In [26]:
param_grid = [
    {"n_estimators": [100 ,300, 500]}
]

In [27]:
grid_search = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, scoring="neg_mean_squared_error", cv=3)

In [28]:
grid_search.fit(X_prepared, y)

GridSearchCV(cv=3, estimator=RandomForestRegressor(random_state=42),
             param_grid=[{'n_estimators': [100, 300, 500]}],
             scoring='neg_mean_squared_error')

In [36]:
y_grid_search = grid_search.predict(X_prepared)
print("Error rate:", np.sqrt(mean_squared_error(y, y_grid_search)))
print("Mean accuracy score:", -grid_search.score(X_prepared, y))

Error rate: 11990.53815715938
Mean accuracy score: 143773005.29829508


In [31]:
grid_search.best_params_ # since the number of estimators is the biggest value in the parameter space of it
                         # (i.e., the biggest value of the number of n_estimators), you can increase the 
                         # parameter space of n_estimators (e.g., [100, 1000, 10000]). But it may overfitting.

{'n_estimators': 500}

In [32]:
estimator = grid_search.best_estimator_

Last but not least. Don't forget the evaluating the model on the test set, and then send its predicts.

In [33]:
test_set = pd.read_csv("../../datasets/housing/test.csv")

In [34]:
prepared_test_set = full_pipeline.transform(test_set)
y_test = estimator.predict(prepared_test_set)

In [35]:
submission = pd.DataFrame(y_test, index=test_set["Id"], columns=["SalePrice"])
submission.to_csv("submission.csv")