In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy import stats
import seaborn as sb
from scipy.stats import entropy
import math
import sys
from pandas.api.types import CategoricalDtype
from itertools import groupby
import re
from sklearn.preprocessing import StandardScaler 
from pandas.api.types import is_categorical_dtype
from pandas.api.types import is_integer_dtype
plt.style.use('ggplot')


# Target columns: the class label, the raw numeric price kept beside it,
# and the class values in ascending price order.
TARGET_FEATURE = 'SalePrice'
TARGET_FEATURE_CONTI = 'salePriceNum'
TARGET_FEATURE_VALUES = ['LOW', 'MEDIUM', 'HIGH']

# Plot styling: legend order and one colour per price class.
hue_order = ["HIGH", "MEDIUM", "LOW"]
color = ["#BF4E30", "#2E86AB", "#E7A012"]
palette = dict(zip(('LOW', 'MEDIUM', 'HIGH'), color))

#### Utils

###### Data quality

In [3]:
def total_square_feet(dataf: pd.DataFrame):
    """Compute a total-surface value for every row of ``dataf``.

    The total is ``TotalBsmtSF + GarageArea + GrLivArea``, where the garage
    area is left out for rows whose ``GarageType`` equals ``'Basment'``.

    Parameters
    ----------
    dataf : pd.DataFrame
        Must contain the columns ``TotalBsmtSF``, ``GarageArea``,
        ``GrLivArea`` and ``GarageType``.

    Returns
    -------
    list
        One total per row, in row order (empty list for an empty frame).
    """
    # Vectorised replacement for the former row-by-row ``iterrows`` loop:
    # zero the garage area wherever the garage type is 'Basment'.
    garage_area = dataf['GarageArea'].where(dataf['GarageType'] != 'Basment', 0)
    # Same addition order as before: basement, garage, living area.
    total_sf = dataf['TotalBsmtSF'] + garage_area + dataf['GrLivArea']
    return total_sf.tolist()


In [4]:
def inconsistencies(df):
    """Print a data-quality report of cross-attribute inconsistencies in ``df``.

    The report covers garage, basement, floor surfaces, house style, room
    count and masonry veneer attributes, and finally names every
    integer-typed column that holds a negative value. Nothing is returned
    and ``df`` is not modified.
    """
    def count(mask):
        # Number of rows selected by a boolean mask.
        return len(df[mask])

    print("Incons. for Garage:")
    no_garage = df["GarageType"] == "NA"
    print(f'If no garage, then no garageFinish {count(no_garage & (df["GarageFinish"] != "NA"))}')
    print(f'If no garage, then no garageArea {count(no_garage & (df["GarageArea"] != 0))}')
    print()

    print("Incons. for Basment:")
    # Each pair of "no basement" markers must agree; XOR selects the rows
    # where exactly one of the two markers is set.
    no_bsmt_qual = df["BsmtQual"] == "NA"
    no_bsmt_fin = df["BsmtFinType1"] == "NA"
    print(f'BsmtFinType1 <=> bsmtQual {count(no_bsmt_qual ^ no_bsmt_fin)}')
    no_bsmt_area = df["TotalBsmtSF"] == 0
    print(f'TotalBsmtSf <=> bsmtQual {count(no_bsmt_qual ^ no_bsmt_area)}')
    print(f'BsmtFinType1 <=> TotalBsmtSF {count(no_bsmt_fin ^ no_bsmt_area)}')
    print()

    print("Incons. for floors:")
    floors_total = df["1stFlrSF"] + df["2ndFlrSF"]
    print(f'Sum of 1st and 2nd floor is consistent: {count(df["GrLivArea"] < floors_total)}')
    print(f'1st floor <= 0: {count(df["1stFlrSF"] <= 0)}')
    print()

    print("Incons. for Housestyle:")
    # One-storey houses that nevertheless report a second-floor surface.
    one_story_with_second_floor = (df["HouseStyle"] == "1Story") & (df["2ndFlrSF"] > 0)
    report_columns = ["Id", "HouseStyle", "2ndFlrSF", "1stFlrSF", "salePriceNum", "SalePrice", "GrLivArea"]
    print(f'{df.loc[one_story_with_second_floor, report_columns]}')
    print()

    print("Incons. for Rooms:")
    print(f'At least a room: {count(df["TotRmsAbvGrd"] == 0)}')
    print()

    print("Incons. for MasVnrType:")
    df_mas = df.loc[:, ["Id", "MasVnrType", "MasVnrArea", "SalePrice"]]
    # "NA" areas count as 0 so the column can be compared numerically.
    df_mas["MasVnrArea"] = df_mas["MasVnrArea"].replace("NA", 0).astype("int")
    has_veneer_area = df_mas["MasVnrArea"] != 0
    # The 3 rows with a clearly different area should be dropped; the two rows with an area of 1 should be set to 0.
    print(f'No MasVnr then Area must be 0: {(df_mas[(df_mas["MasVnrType"] == "None") & has_veneer_area])}')
    # These should be changed to "None".
    print(f'NA MasVnr: {(df_mas[(df_mas["MasVnrType"] == "NA") & has_veneer_area])}')
    print()

    # Integer columns are not expected to hold negative values.
    for name in df.columns:
        column = df[name]
        if is_integer_dtype(column.dtype) and len(df[column < 0]) > 0:
            print(f"Values < 0 for {name}")


In [5]:
def get_common_category(df: pd.DataFrame, category: str,  attribute: str, exclude:list, target_feature = TARGET_FEATURE):
    """Return the most frequent value of ``attribute`` within one target class.

    Only rows whose ``target_feature`` equals ``category`` are considered,
    and values listed in ``exclude`` are ignored. The first entry of the
    resulting ``mode()`` is returned, so an ``IndexError`` is raised when no
    row is left after filtering.
    """
    in_class = df[target_feature] == category
    allowed = ~df[attribute].isin(exclude)
    candidates = df.loc[in_class & allowed, attribute]
    return candidates.mode().iloc[0]
    

###### Data Cleaning and Reduction

In [6]:
def data_cleaning_reduction(df: pd.DataFrame, selected_att : dict):
    """Select, cast and clean the attributes listed in ``selected_att``.

    Parameters
    ----------
    df : pd.DataFrame
        Raw frame; it is left untouched, a new frame is returned.
    selected_att : dict
        Maps each attribute to keep to its casting spec: ``[dtype]`` for a
        plain cast, ``["ordinal integer"]`` for an ordered categorical built
        from the observed values, or ``["ordinal category", levels]`` for an
        ordered categorical with the given level order. The result's columns
        follow the key order.

    Returns
    -------
    pd.DataFrame
        The reduced, cast and cleaned copy.
    """
    # Feature selection (originally part of data reduction; kept here for convenience).
    df = df.loc[:, list(selected_att)].copy(deep=True)

    # Missing values: there are no nulls, only the literal string "NA".
    df["MasVnrArea"] = df["MasVnrArea"].replace("NA", 0)
    df["MasVnrType"] = df["MasVnrType"].replace("NA", "None")

    # Casting
    for name, spec in selected_att.items():
        kind = spec[0]
        if kind == "ordinal category":
            df[name] = df[name].astype("category").cat.set_categories(spec[1], ordered=True)
        elif kind == "ordinal integer":
            df[name] = df[name].astype(CategoricalDtype(ordered=True))
        else:
            df[name] = df[name].astype(kind)

    df["MasVnrArea"] = df["MasVnrArea"].replace("NA", 0).astype("int")

    # Noisy data
    no_veneer = df["MasVnrType"] == "None"
    # A "1Story" house with a second-floor surface is relabelled "2Story".
    df.loc[(df["HouseStyle"] == "1Story") & (df["2ndFlrSF"] > 0), "HouseStyle"] = "2Story"
    # Without a veneer type, an area of at most 5 is reset to 0 ...
    df.loc[no_veneer & (df["MasVnrArea"] <= 5), "MasVnrArea"] = 0

    # ... while a larger area keeps its value and gets the most common veneer
    # type among the rows of the same SalePrice class.
    to_impute = df[no_veneer & (df["MasVnrArea"] > 5)]
    for index, row in to_impute.iterrows():
        df.at[index, "MasVnrType"] = get_common_category(df, category=row["SalePrice"], attribute="MasVnrType", exclude = ["None", "NA"])
    return df
    

###### Data transformation

In [7]:
def data_transformation(df : pd.DataFrame, aggregate_values):
    """Normalise, aggregate and extend a cleaned frame; the input is not modified.

    Parameters
    ----------
    df : pd.DataFrame
        Cleaned frame; it must provide the columns read by
        ``total_square_feet``.
    aggregate_values : dict
        ``{attribute: {old value: replacement}}``; every old value of the
        attribute is replaced by its replacement.

    Returns
    -------
    pd.DataFrame
        A deep copy with standardised numeric columns, aggregated values and
        an extra ``TotalSF`` column.
    """
    df = df.copy(deep=True)

    # Normalization: standardise every numeric column.
    numeric_dtypes = ["float64", "int" ,"int64", "float"]
    numeric_cols = df.select_dtypes(include = numeric_dtypes).columns
    df[numeric_cols] = StandardScaler().fit_transform(df[numeric_cols])

    # Aggregation
    for attribute, mapping in aggregate_values.items():
        for old_value, new_value in mapping.items():
            df[attribute] = df[attribute].replace(old_value, new_value)

    # Feature creation
    # NOTE(review): the surface columns read here have already been
    # standardised above whenever they are numeric, so TotalSF is a sum of
    # scaled values and is not scaled itself - confirm this is intended.
    df["TotalSF"] = total_square_feet(df)

    return df

###### Feature selection with wrapper approach

In [8]:
def k_fold_evaluation(df : pd.DataFrame, model, n_splits):
    """Print the accuracy of ``model`` under shuffled K-fold cross-validation.

    ``df`` must contain the ``SalePrice`` target; all remaining columns are
    used as predictors. The per-fold scores, their mean and their count are
    printed; nothing is returned.
    """
    predictors = df.drop(columns=['SalePrice'])
    target = df['SalePrice']

    # Shuffling is unseeded, so the folds differ from one call to the next.
    splitter = KFold(n_splits, shuffle=True)
    scores = cross_val_score(model, predictors, target, cv = splitter, scoring='accuracy')

    print("Cross Validation Scores: ", scores)
    print("Average CV Score: ", scores.mean())
    print("Number of CV Scores used in Average: ", len(scores))

def bootstrapping_evaluation(df : pd.DataFrame, model, n_iterations, silent: bool=False):
    """Estimate the accuracy of ``model`` by repeated bootstrap resampling.

    In each iteration half as many rows as ``df`` holds are drawn with
    replacement for training, and every row that was not drawn forms the
    test set. ``model`` is refitted in each iteration.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the ``SalePrice`` target; all other columns are predictors.
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    n_iterations : int
        Number of bootstrap rounds.
    silent : bool
        When False, the accuracy of each round is printed.

    Returns
    -------
    list
        One accuracy score per iteration.
    """
    n_rows = df.shape[0]
    n_size = n_rows // 2
    predictors = df.drop(columns=['SalePrice'])
    target = df['SalePrice']
    # Rows are sampled by position so the result does not depend on the
    # frame's index labels (labels were previously fed to ``.iloc``).
    positions = np.arange(n_rows)

    # Named ``scores`` so the module-level ``stats`` import is not shadowed.
    scores = []
    for _ in range(n_iterations):
        # Training rows: sampled with replacement; test rows: everything
        # that was never drawn, in ascending row order.
        train_pos = resample(positions, n_samples = n_size)
        test_pos = np.setdiff1d(positions, train_pos)

        model.fit(predictors.iloc[train_pos].to_numpy(), target.iloc[train_pos].to_numpy())
        predictions = model.predict(predictors.iloc[test_pos].to_numpy())

        # Caution: overall accuracy can mislead when classes are imbalanced.
        score = accuracy_score(target.iloc[test_pos].to_numpy(), predictions)

        if not silent: print(score)
        scores.append(score)

    return scores

def feature_selection_wrapper_(df : pd.DataFrame, selected_att : list)->list:
    """Placeholder for a wrapper-based feature selection step.

    Not implemented: the body is empty, so a call returns ``None`` rather
    than the ``list`` announced by the annotation.
    """
    #
    pass

#### Read data

In [9]:
# Load the raw data; keep_default_na=False leaves "NA" entries as plain strings.
df = pd.read_csv(
    'house-prices-advanced-regression-techniques/train.csv',
    keep_default_na=False,
    low_memory=False,
)
# Keep the numeric price in its own column, then turn SalePrice into a
# three-class label: up to 150000, up to 299999, and above.
df[TARGET_FEATURE_CONTI] = df[TARGET_FEATURE]
df[TARGET_FEATURE] = pd.cut(
    df[TARGET_FEATURE_CONTI],
    bins=[0, 150000, 300000 - 1, int(sys.maxsize)],
    labels=TARGET_FEATURE_VALUES,
)
# Untouched snapshot of the loaded frame.
df_copy = df.copy(deep=True)

### Data Preparation

#### Selected features

In [10]:
# Features regarded as selected; used further down to split the columns into
# staged features (backward validation) and unstaged candidates (forward validation).
# NOTE(review): 'LotArea' is listed here but has no entry in `selected_att`, and
# 'TotalSF' only exists after data_transformation - confirm both are intended.
SELECTED_FEATURES = ['MSSubClass', 'LotArea', 'Neighborhood', 'HouseStyle', 'OverallQual',
                     'YearBuilt', 'YearRemodAdd', 'Exterior1st', 'MasVnrType', 'MasVnrArea',
                     "ExterQual", "Foundation", "BsmtQual", "BsmtFinType1", "TotalBsmtSF",
                     "1stFlrSF", "2ndFlrSF", "KitchenQual", "GrLivArea", "FullBath", "TotRmsAbvGrd",
                     "Fireplaces", "GarageType", "GarageFinish", "GarageArea", 'TotalSF']

#### Checking for inconsistencies

In [11]:
# Print the data-quality report for the raw frame (output below).
inconsistencies(df)

Incons. for Garage:
If no garage, then no garageFinish 0
If no garage, then no garageArea 0

Incons. for Basment:
BsmtFinType1 <=> bsmtQual 0
TotalBsmtSf <=> bsmtQual 0
BsmtFinType1 <=> TotalBsmtSF 0

Incons. for floors:
Sum of 1st and 2nd floor is consistent: 0
1st floor <= 0: 0

Incons. for Housestyle:
        Id HouseStyle  2ndFlrSF  1stFlrSF  salePriceNum SalePrice  GrLivArea
164    165     1Story       467      1149        152000    MEDIUM       1616
1270  1271     1Story       192      1332        260000    MEDIUM       1524

Incons. for Rooms:
At least a room: 0

Incons. for MasVnrType:
No MasVnr then Area must be 0:         Id MasVnrType  MasVnrArea SalePrice
624    625       None         288    MEDIUM
773    774       None           1       LOW
1230  1231       None           1    MEDIUM
1300  1301       None         344    MEDIUM
1334  1335       None         312       LOW
NA MasVnr: Empty DataFrame
Columns: [Id, MasVnrType, MasVnrArea, SalePrice]
Index: []



#### Data Cleaning and Reduction

In [12]:
# Casting spec consumed by data_cleaning_reduction, one entry per attribute to keep:
#   [dtype]                         -> plain cast to that dtype
#   ["ordinal integer"]             -> ordered categorical built from the observed values
#   ["ordinal category", [levels]]  -> ordered categorical with the given level order
# The key order is also the column order of the cleaned frame.
selected_att = {"MSSubClass": ["ordinal integer"],
                "Neighborhood": ["category"],
                "HouseStyle": ["category"],
                "OverallQual": ["ordinal integer"],
                "YearBuilt": ["int64"],
                "YearRemodAdd": ["int64"],
                "Exterior1st":["category"],
                "MasVnrType":["category"],
                "MasVnrArea":["int64"],
                "ExterQual":["ordinal category", ["Po", "Fa","TA","Gd","Ex"]],
                "Foundation":["category"],
                "BsmtQual":["ordinal category", ["NA","Po","Fa","TA","Gd","Ex"]],
                "BsmtFinType1":["ordinal category", ["NA","Unf","LwQ","Rec","BLQ","ALQ","GLQ"]],
                "TotalBsmtSF":["int64"],
                "1stFlrSF":["int64"],
                "2ndFlrSF":["int64"],
                "GrLivArea":["int64"],
                "FullBath":["int64"],
                "KitchenQual":["ordinal category", ["Po","Fa","TA","Gd","Ex"]],
                "TotRmsAbvGrd":["int64"],
                "Fireplaces":["int64"],
                "GarageType":["category"],
                "GarageFinish":["ordinal category",["NA","Unf","RFn","Fin"]],
                "GarageArea":["int64"],
                "SalePrice":["category"]
                }

# Clean a copy of the raw frame; `df` itself is left unchanged.
df_cleaned = data_cleaning_reduction(df , selected_att=selected_att)
print(df_cleaned)

     MSSubClass Neighborhood HouseStyle OverallQual  YearBuilt  YearRemodAdd  \
0            60      CollgCr     2Story           7       2003          2003   
1            20      Veenker     1Story           6       1976          1976   
2            60      CollgCr     2Story           7       2001          2002   
3            70      Crawfor     2Story           7       1915          1970   
4            60      NoRidge     2Story           8       2000          2000   
...         ...          ...        ...         ...        ...           ...   
1455         60      Gilbert     2Story           6       1999          2000   
1456         20       NWAmes     1Story           6       1978          1988   
1457         70      Crawfor     2Story           7       1941          2006   
1458         20        NAmes     1Story           5       1950          1996   
1459         20      Edwards     1Story           5       1965          1965   

     Exterior1st MasVnrType  MasVnrArea

#### Data Transformation

In [13]:

# Value aggregation applied by data_transformation:
#   {attribute: {old value: replacement}}
# Only OverallQual and Foundation are aggregated; the groupings tried for the
# other attributes were disabled and have been removed from this cell.
aggregate_values = {
    # Collapse grades 1-3 into 4 and grade 10 into 9.
    "OverallQual": {
        1: 4,
        2: 4,
        3: 4,
        10: 9,
    },
    # Keys must match the dataset spelling exactly: the previous lowercase
    # "wood" / "slab" / "stone" keys never matched any value, so those
    # foundations were silently left un-aggregated.
    "Foundation": {
        "Wood": "other",
        "Slab": "other",
        "Stone": "other",
        "BrkTil": "CBlock",
    },
}

df_transformed = data_transformation(df_cleaned, aggregate_values)

### Modelling

In [14]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import classification_report, accuracy_score
from sklearn.utils import resample

In [15]:
from sklearn.model_selection import train_test_split
import numpy as np
print(df_transformed.columns)
cat_attributes = list(df_transformed.select_dtypes(include = ['category', 'object']).columns)
print(cat_attributes)
# The target stays a single label column; every other categorical is one-hot encoded.
cat_attributes.remove("SalePrice")
df_transformed_dummies = pd.get_dummies(df_transformed , columns = cat_attributes)
new_attr_list = list(df_transformed_dummies.columns)

# Predictors are every encoded column except the target. The target was
# previously left in this list, so it leaked into x_train / x_test.
attributes = [col for col in new_attr_list if col != 'SalePrice']

# Fixed seed so the split and the seeded models are reproducible.
seed = 6334
x_train, x_test, y_train, y_test = train_test_split(
    np.array(df_transformed_dummies[attributes]),
    np.array(df_transformed_dummies['SalePrice']),
    test_size=0.5,
    random_state=seed,
)
# Optional validation set: split the training part once more.
#x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=seed)

Index(['MSSubClass', 'Neighborhood', 'HouseStyle', 'OverallQual', 'YearBuilt',
       'YearRemodAdd', 'Exterior1st', 'MasVnrType', 'MasVnrArea', 'ExterQual',
       'Foundation', 'BsmtQual', 'BsmtFinType1', 'TotalBsmtSF', '1stFlrSF',
       '2ndFlrSF', 'GrLivArea', 'FullBath', 'KitchenQual', 'TotRmsAbvGrd',
       'Fireplaces', 'GarageType', 'GarageFinish', 'GarageArea', 'SalePrice',
       'TotalSF'],
      dtype='object')
['MSSubClass', 'Neighborhood', 'HouseStyle', 'OverallQual', 'Exterior1st', 'MasVnrType', 'ExterQual', 'Foundation', 'BsmtQual', 'BsmtFinType1', 'KitchenQual', 'GarageType', 'GarageFinish', 'SalePrice']


In [16]:
# Candidate classifiers as (label, estimator) pairs; every estimator that
# accepts a random state is seeded with `seed`.
mymodels = [
    ('C45',  DecisionTreeClassifier(criterion='entropy', random_state=seed)),
    ('CART', DecisionTreeClassifier(criterion='gini', random_state=seed)),
    ('FRST', RandomForestClassifier(criterion='entropy', random_state=seed)),
    ('ADAB', AdaBoostClassifier( random_state=seed)),
    ('GRDB', GradientBoostingClassifier(criterion='squared_error', random_state=seed)),
    ('GaussianNaiveBayes', GaussianNB()),
    #('NeuralNetwork', MLPClassifier(hidden_layer_sizes=(50, 10, ), max_iter=500, verbose=False,  random_state=seed)),
]

In [17]:
# Disabled cell: the evaluation loop below is wrapped in a string literal, so
# nothing is executed and the cell merely echoes the string as its output.
"""
# evaluate each model in turn

for name, model in mymodels:
    print(name)
    
    #model.fit(x_train, y_train)
    #y_test_pred = model.predict(x_test)
    #print('Classification metrics: ')
    #print(classification_report(y_test, y_test_pred))
    #k_fold_evaluation(df = df_transformed_dummies, model = model, n_splits = 1000)
    bootstrapping_evaluation(df=df_transformed_dummies, model=model, n_iterations=3)
"""

"\n# evaluate each model in turn\n\nfor name, model in mymodels:\n    print(name)\n    \n    #model.fit(x_train, y_train)\n    #y_test_pred = model.predict(x_test)\n    #print('Classification metrics: ')\n    #print(classification_report(y_test, y_test_pred))\n    #k_fold_evaluation(df = df_transformed_dummies, model = model, n_splits = 1000)\n    bootstrapping_evaluation(df=df_transformed_dummies, model=model, n_iterations=3)\n"

In [25]:
#
# validate selected/discarded features
#
from sklearn.model_selection import KFold
from sklearn.metrics import balanced_accuracy_score
from statistics import mean


class FeatureSelectionValidator:
    """Validate feature additions and removals against a staged data frame.

    A decision tree is cross-validated on the staged frame to obtain a
    reference sample of balanced-accuracy scores. A candidate change (adding
    or removing a feature) is judged by comparing its own score sample with
    the reference through a one-sided Welch t-test at significance ``alpha``.
    """

    def __init__(self, staged_dataf: pd.DataFrame, niterations: int=5, silent: bool=False, alpha: float=0.05) -> None:
        """Store the settings and compute the reference score sample.

        Parameters
        ----------
        staged_dataf : pd.DataFrame
            Model-ready frame (predictors plus the ``TARGET_FEATURE`` column).
        niterations : int
            Number of times the K-fold procedure is repeated per score sample.
        silent : bool
            When True, progress messages are suppressed.
        alpha : float
            Significance level shared by the forward and backward tests.
        """
        self.staged_dataf = staged_dataf
        self.niterations = niterations
        self.silent = silent
        self.alpha = alpha
        self.staged_score_sample = self.compute_score_sample(staged_dataf)
        self.print("Staged dataframe score (avg): %f" % mean(self.staged_score_sample), "\n")


    def compute_score_sample(self, dataf: pd.DataFrame) -> list:
        """Return the balanced-accuracy scores of repeated shuffled K-fold CV.

        K-fold (with KFold's default number of folds) is repeated
        ``niterations`` times; a fresh entropy decision tree is trained on
        every fold. The shuffling is unseeded, so samples vary between calls.
        """
        k_fold = KFold(shuffle=True)  # TODO: fix k
        score_sample = []

        for _ in range(self.niterations):
            for train_index, test_index in k_fold.split(dataf):
                train_dataf = dataf.iloc[train_index]
                test_dataf = dataf.iloc[test_index]

                X_train = train_dataf.drop(columns=[TARGET_FEATURE])
                X_test  = test_dataf.drop(columns=[TARGET_FEATURE])
                y_train = train_dataf[TARGET_FEATURE]
                y_test  = test_dataf[TARGET_FEATURE]

                model = DecisionTreeClassifier(criterion='entropy', random_state=seed)  # validation model: decision tree
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)

                score = balanced_accuracy_score(y_true=y_test, y_pred=y_pred)  # balanced accuracy as measure
                score_sample.append(score)
        return score_sample


    def validate_forward(self, candidate_features: list[str], source_dataf: pd.DataFrame = None) -> list:
        """Return the candidates whose addition significantly improves the score.

        Parameters
        ----------
        candidate_features : list[str]
            Column names to try; ``TARGET_FEATURE`` is skipped.
        source_dataf : pd.DataFrame, optional
            Frame the candidate columns are read from. Defaults to the
            notebook-level raw frame ``df``, as before.

        Returns
        -------
        list
            Candidates for which the one-sided test ("candidate scores are
            greater") is significant at ``alpha``.
        """
        if source_dataf is None:
            source_dataf = df  # previous behaviour: the original (raw) data frame

        include_features = []

        for candidate_feature in candidate_features:
            if candidate_feature == TARGET_FEATURE: continue

            # candidate data frame (construction + score sample)
            candidate_dataf = self.staged_dataf.copy(deep=True)
            candidate_dataf[candidate_feature] = source_dataf[candidate_feature]
            candidate_feature_dtype = str(candidate_dataf[candidate_feature].dtype)

            # TODO: add assert for candidate_feature
            if candidate_feature_dtype == 'object':
                candidate_dataf = pd.get_dummies(candidate_dataf, columns=[candidate_feature])  # TODO: check null

            elif 'int' in candidate_feature_dtype or 'float' in candidate_feature_dtype:
                # normalize in case of numeric dtype
                scaler = StandardScaler()
                candidate_dataf[[candidate_feature]] = scaler.fit_transform(candidate_dataf[[candidate_feature]])

            candidate_score_sample = self.compute_score_sample(candidate_dataf)

            # perform t-test for the means
            _, pvalue = stats.ttest_ind(candidate_score_sample, self.staged_score_sample, equal_var=False, alternative='greater')
            outcome: bool = pvalue < self.alpha

            if outcome:  # evidence to an increase in performance
                include_features.append(candidate_feature)

            self.print("%s selection - Validation outcome:" % candidate_feature)
            self.print("\t- %s candidate" % ('include' if outcome else 'discard'))
            self.print("\t- pvalue: %f" % pvalue, "\n")

        return include_features


    def validate_backward(self, candidate_features: list[str]) -> list:
        """Return the staged features that can be removed without a significant loss.

        For each candidate, its own column and its one-hot columns
        (``<feature>_...``) are removed from the staged frame and the scores
        are compared with the reference. A feature is kept only when the
        one-sided test ("scores without it are lower") is significant at
        ``alpha``; otherwise it is reported as droppable. Candidates with no
        column in the staged frame are skipped.
        """
        drop_features = []

        for candidate_feature in candidate_features:
            if candidate_feature == TARGET_FEATURE: continue

            # Match the exact column or its dummy columns only; a bare prefix
            # test would also catch unrelated features sharing the prefix.
            dummy_prefix = candidate_feature + "_"
            cols_to_drop = [
                col for col in self.staged_dataf.columns
                if col == candidate_feature or col.startswith(dummy_prefix)
            ]
            if not cols_to_drop:
                # Nothing to remove: the comparison would be staged vs. staged.
                self.print("%s selection - not in staged data frame, skipped" % candidate_feature, "\n")
                continue

            # candidate data frame (construction + score sample)
            candidate_dataf = self.staged_dataf.drop(columns=cols_to_drop)
            candidate_score_sample = self.compute_score_sample(candidate_dataf)

            # perform t-test for the means
            _, pvalue = stats.ttest_ind(candidate_score_sample, self.staged_score_sample, equal_var=False, alternative='less')
            # Same significance level as the forward test (the threshold used
            # to be 0.5, which kept a feature on any below-average sample).
            outcome: bool = pvalue < self.alpha

            if not outcome:  # no evidence to a decrease in performance
                drop_features.append(candidate_feature)

            self.print("%s selection - Validation outcome:" % candidate_feature)
            self.print("\t- %s candidate" % ('keep' if outcome else 'drop'))
            self.print("\t- pvalue: %f" % pvalue, "\n")

        return drop_features


    def print(self, *msg: str):
        """Print the space-joined message parts unless the validator is silent."""
        if not self.silent: print(' '.join(msg))



# Candidates for forward validation: raw columns that are neither already
# selected nor a form of the target. `salePriceNum` is the numeric price the
# class label is cut from, so offering it as a feature would leak the target.
target_columns = {TARGET_FEATURE, TARGET_FEATURE_CONTI}
unstaged_features = [
    f for f in df.columns
    if f not in SELECTED_FEATURES and f not in target_columns
]
staged_dataf = df_transformed_dummies

validator = FeatureSelectionValidator(staged_dataf, niterations=20)
include_features = validator.validate_forward(unstaged_features)
drop_features = validator.validate_backward(SELECTED_FEATURES)

print('Include:', ', '.join(include_features))
print('Drop:', ', '.join(drop_features))

Staged dataframe score (avg): 0.781220 

Id selection - Validation outcome:
	- discard candidate
	- pvalue: 0.902870 

MSZoning selection - Validation outcome:
	- discard candidate
	- pvalue: 0.663809 

LotFrontage selection - Validation outcome:
	- discard candidate
	- pvalue: 0.268348 

Street selection - Validation outcome:
	- discard candidate
	- pvalue: 0.587299 

Alley selection - Validation outcome:
	- discard candidate
	- pvalue: 0.184364 

LotShape selection - Validation outcome:
	- discard candidate
	- pvalue: 0.534127 

LandContour selection - Validation outcome:
	- discard candidate
	- pvalue: 0.595425 

Utilities selection - Validation outcome:
	- discard candidate
	- pvalue: 0.539548 

LotConfig selection - Validation outcome:
	- discard candidate
	- pvalue: 0.324106 

LandSlope selection - Validation outcome:
	- discard candidate
	- pvalue: 0.186615 

Condition1 selection - Validation outcome:
	- discard candidate
	- pvalue: 0.229258 

Condition2 selection - Validation o

In [24]:
#
# Sequential (wrapped) feature selection (from sklearn)
#

from sklearn.feature_selection import SequentialFeatureSelector

pred = DecisionTreeClassifier(criterion='entropy', random_state=seed)
# `tol` only takes effect with n_features_to_select="auto"; with a fixed count
# of 1 it was ignored and the selector eliminated features all the way down to
# a single one.
# NOTE(review): for direction='backward' a negative tol is what allows a small
# score decrease per removal - confirm whether -0.02 was meant.
sfs = SequentialFeatureSelector(pred, n_features_to_select="auto", scoring="accuracy", tol=0.02, direction='backward')

# X and y are taken from the same frame so their rows are guaranteed to line up.
X = df_transformed_dummies.drop(columns=['SalePrice'])
y = df_transformed_dummies['SalePrice']
sfs.fit(X, y)

feature_names = X.columns
print('Selected features:', ', '.join(feature_names[sfs.get_support()].tolist()))
sfs.get_params()

KeyboardInterrupt: 