## 1. Data reading & splitting

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

import os

# Load the housing classification dataset.
# The data folder can be overridden through the HOUSING_DATA_DIR environment
# variable so the notebook is not tied to a single machine; when the variable
# is not set, the original local folder is used as the fallback.
title = 'housing-classification-iter3.csv'
data_dir = os.environ.get(
    'HOUSING_DATA_DIR',
    '/Users/G/Desktop/Documents/Formazione in Data Science/WBS/WBS Bootcamp/7. Supervised Machine Learning/Data/housing-iter-0-2',
)
url = os.path.join(data_dir, title)
data = pd.read_csv(url)

# X and y creation: `pop` removes the target column from `data` in place,
# so afterwards `data` (and X, which is the same object) holds only the features.
y = data.pop('Expensive')
X = data

# Data splitting: hold out 20% of the rows for testing; the fixed seed keeps
# the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

## 2. Categorical encoding - "MANUAL" approach  (Without Pipelines)

To encode the categorical columns numerically, we follow these steps:

1. Select the categorical columns.
2. Fit a `OneHotEncoder` to them.
3. Transform the categorical columns with the encoder.
4. Convert the sparse matrix into a dataframe.
5. Recover the names of the columns.
6. Concatenate the one-hot columns with the numerical columns.

In [7]:
from sklearn.impute import SimpleImputer

from sklearn.preprocessing import OneHotEncoder

# Separate the features by dtype, for both the train and the test set
X_train_num = X_train.select_dtypes(include="number")
X_test_num = X_test.select_dtypes(include="number")
X_train_cat = X_train.select_dtypes(exclude="number")
X_test_cat = X_test.select_dtypes(exclude="number")

# ---------------- NUMERICAL FEATURES ----------------

# Imputer that replaces NaNs with the column mean and returns a DataFrame
num_imputer = SimpleImputer(strategy="mean")
num_imputer.set_output(transform='pandas')

# Fit on the training data only, then apply the same imputation to the test data
X_train_imputed_num = num_imputer.fit_transform(X_train_num)
X_test_imputed_num = num_imputer.transform(X_test_num)

# ---------------- CATEGORICAL FEATURES ----------------

# Imputer that replaces NaNs with the constant label "unknown"
cat_imputer = SimpleImputer(strategy="constant", fill_value="unknown")
cat_imputer.set_output(transform='pandas')

# Fit on the training data only, then apply the same imputation to the test data
X_train_imputed_cat = cat_imputer.fit_transform(X_train_cat)
X_test_imputed_cat = cat_imputer.transform(X_test_cat)

# One-hot encoder: dense output (required for pandas output), dropping the
# first category of every feature
my_onehot = OneHotEncoder(drop="first", sparse_output=False)
my_onehot.set_output(transform='pandas')

# Fit the encoder on the imputed training categories and encode them in one step
X_cat_imputed_onehot_train = my_onehot.fit_transform(X_train_imputed_cat)

# ---------------- RECOMBINE ----------------

# Place the one-hot columns side by side with the imputed numerical columns
X_imputed_train = pd.concat(
    [X_cat_imputed_onehot_train, X_train_imputed_num],
    axis=1,
)
X_imputed_train.head()

Unnamed: 0,MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Condition1_Feedr,Condition1_Norm,Condition1_PosA,Condition1_PosN,Condition1_RRAe,Condition1_RRAn,...,Foundation_Wood,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch
318,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,9900.0,90.0,1347.0,4.0,1.0,0.0,3.0,340.0,0.0
580,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,14585.0,69.58427,1144.0,3.0,2.0,0.0,2.0,216.0,0.0
961,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,12227.0,69.58427,1330.0,4.0,1.0,0.0,2.0,550.0,0.0
78,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,10778.0,72.0,1768.0,4.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,14115.0,85.0,796.0,1.0,0.0,0.0,2.0,40.0,0.0


NOTE: If we leave the default `sparse_output=True`, the result will be a "sparse matrix": a memory-efficient format, useful when a matrix contains mostly zeros, that stores only the non-zero entries. In that case we would not be able to use `.set_output(transform='pandas')`, because pandas output requires a dense result.

## 3. Categorical encoding - "Automated" approach (Using Pipelines)

All the steps involved in the previous "manual" approach can be condensed by using Scikit-Learn Pipelines and, specifically, a `ColumnTransformer`, which allows us to apply different transformations to two or more groups of columns: in our case, categorical and numerical columns.

This process is also called creating "branches" in the pipeline. One branch for the categorical columns and another for the numerical columns. Each branch will contain as many transformers as we want. Then, the branches will meet again, and the transformed columns will be automatically concatenated. Let's see the process in action:

In [16]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier

# Build the two preprocessing branches (numeric and categoric) and the full pipeline.

# Column names for each branch, selected by dtype
X_cat_columns = X.select_dtypes(exclude="number").columns
X_num_columns = X.select_dtypes(include="number").columns

# Numerical branch: only a SimpleImputer(strategy="mean")
numeric_pipe = make_pipeline(SimpleImputer(strategy="mean"))

# Categorical branch: fill NaNs with the constant "N_A", then one-hot encode.
# handle_unknown='ignore' lets the encoder transform categories it did not see
# during fit (e.g. in a cross-validation fold or in the test set) without raising.
categoric_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="N_A"),
    OneHotEncoder(sparse_output=False, handle_unknown='ignore')
)

# ColumnTransformer with the 2 branches (the preprocessor); the outputs of the
# branches are concatenated column-wise
preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipe", numeric_pipe, X_num_columns),
        ("cat_pipe", categoric_pipe, X_cat_columns),
    ]
)

# Full pipeline: preprocessor + Decision Tree.
# random_state is fixed (same seed as the train/test split) so that fitting the
# tree, and therefore the grid-search results, are reproducible between runs.
full_pipeline = make_pipeline(preprocessor,
                              DecisionTreeClassifier(random_state=123)).set_output(transform='pandas')

In [21]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Tune the full pipeline (preprocessing + DecisionTree) with a cross-validated grid search

# Candidate values, grouped by the pipeline step they belong to
imputer_strategies = ['mean', 'median']
tree_depths = range(2, 12)
tree_min_leaf = range(3, 10, 2)
tree_min_split = range(3, 40, 5)
tree_criteria = ['gini', 'entropy']

# Keys follow the <step>__<parameter> naming of the pipeline
param_grid = {
    'columntransformer__num_pipe__simpleimputer__strategy': imputer_strategies,
    'decisiontreeclassifier__max_depth': tree_depths,
    'decisiontreeclassifier__min_samples_leaf': tree_min_leaf,
    'decisiontreeclassifier__min_samples_split': tree_min_split,
    'decisiontreeclassifier__criterion': tree_criteria,
}

# 5-fold cross validation over every combination in the grid
search = GridSearchCV(
    estimator=full_pipeline,
    param_grid=param_grid,
    cv=5,
    verbose=0,
)

# The raw X_train is passed here (not an imputed copy): imputation and
# encoding happen inside the pipeline
search.fit(X_train, y_train)

print(f"The best parameters are {search.best_params_}")
print("")
print(f"The average accuracy is {search.best_score_}")

# Accuracy on the data the search was fitted on
print(f"The training accuracy is {accuracy_score(y_train, search.predict(X_train))}")

# Accuracy on the held-out test data
print(f"The testing accuracy is {accuracy_score(y_test, search.predict(X_test))}")

The best parameters are {'columntransformer__num_pipe__simpleimputer__strategy': 'mean', 'decisiontreeclassifier__criterion': 'entropy', 'decisiontreeclassifier__max_depth': 6, 'decisiontreeclassifier__min_samples_leaf': 3, 'decisiontreeclassifier__min_samples_split': 38}

The average accuracy is 0.9212317963390925
The training accuracy is 0.9409246575342466
The testing accuracy is 0.9075342465753424
