In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the read-only Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


In [2]:
from xgboost import XGBRegressor

from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from sklearn.cluster import KMeans

from sklearn.metrics import mean_squared_error

from sklearn.model_selection import StratifiedKFold

from collections import Counter

In [3]:
# Raw competition data; these two frames are kept untouched for later reference.
train_data = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
test_data = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

# `Id` is dropped from the working feature frames; drop() returns a new frame,
# so the raw frames above are not modified.
X = train_data.drop(columns=['Id'])
X_test = test_data.drop(columns=['Id'])

# Split the prediction target off the training features.
y = X.pop('SalePrice')

#fill the NaNs that can be resolved from other columns of the same row; this is
#fold-independent, so it can be done before cross validation
def filling_zero_na_fn(df):
    """Fill NaNs that stand for "the house does not have this feature".

    Rules applied (all in place on ``df``):
      * ``MasVnrArea``: NaN -> 0.0.
      * ``MasVnrType``: set to ``'None'`` where ``MasVnrArea`` is 0.
      * ``Alley``, ``MiscFeature``, ``Fence``: NaN -> ``'NA'``.
      * basement / fireplace / garage / pool quality columns: set to ``'NA'``
        where the matching size or count column is 0.
      * ``GarageYrBlt``: where a garage exists (``GarageArea > 0``) but the year
        is missing, use ``YearBuilt``.
      * ``Electrical`` is only cast to object dtype; its NaNs are left as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns referenced above.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was passed in (it is modified in place).
    """
    #fill with a numeric 0 rather than the string '0', so the column is never
    #pushed through object dtype on its way to float
    df['MasVnrArea'] = df['MasVnrArea'].fillna(0).astype('float')

    #categorical columns touched below, plus Electrical, all forced to object dtype at the end
    cat_cols = [
        'MasVnrType', 'Alley', 'BsmtExposure', 'BsmtQual', 'BsmtCond',
        'BsmtFinType1', 'BsmtFinType2', 'FireplaceQu',
        'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
        'PoolQC', 'MiscFeature', 'Fence', 'Electrical'
    ]

    df.loc[df['MasVnrArea'] == 0, 'MasVnrType'] = 'None'
    df['Alley'] = df['Alley'].fillna('NA')
    df.loc[df['TotalBsmtSF'] == 0, ['BsmtExposure', 'BsmtQual', 'BsmtCond', 'BsmtFinType1', 'BsmtFinType2']] = 'NA'
    df.loc[df['Fireplaces'] == 0, 'FireplaceQu'] = 'NA'
    df.loc[df['GarageArea'] == 0, ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']] = 'NA'
    df.loc[df['PoolArea'] == 0, 'PoolQC'] = 'NA'
    df['MiscFeature'] = df['MiscFeature'].fillna('NA')
    df['Fence'] = df['Fence'].fillna('NA')

    #a garage exists but its year is unknown -> use the year the house was built
    #(the right-hand Series is aligned on the index, so each row gets its own YearBuilt)
    df.loc[(df['GarageArea'] > 0) & (df['GarageYrBlt'].isnull()), 'GarageYrBlt'] = df['YearBuilt']

    df[cat_cols] = df[cat_cols].astype(object)
    return df

#run the rule-based NA filling on the training features first, then on the test features
X, X_test = map(filling_zero_na_fn, (X, X_test))

#two lookups used by the preprocessing functions:
#  cat_in_number_col - categorical features that are stored as numbers
#  Ordinal_map       - categorical features with a natural order, listed from lowest to highest level
cat_in_number_col = ['MSSubClass', 'OverallQual', 'OverallCond']

#scales shared by several columns
_QUALITY = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
_FINISH_TYPE = ['NA', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ']
_ONE_TO_TEN = [str(score) for score in range(1, 11)]

#every column gets its own list object; key order is the same as before
Ordinal_map = {
    **{col: list(_QUALITY) for col in ('ExterQual', 'ExterCond', 'HeatingQC', 'KitchenQual')},
    **{col: ['NA'] + _QUALITY for col in ('BsmtQual', 'BsmtCond', 'FireplaceQu', 'GarageQual', 'GarageCond')},
    'BsmtExposure': ['NA', 'No', 'Mn', 'Av', 'Gd'],
    **{col: list(_FINISH_TYPE) for col in ('BsmtFinType1', 'BsmtFinType2')},
    'Functional': ['Sal', 'Sev', 'Maj2', 'Maj1', 'Mod', 'Min2', 'Min1', 'Typ'],
    'GarageFinish': ['NA', 'Unf', 'RFn', 'Fin'],
    'PavedDrive': ['N', 'P', 'Y'],
    'PoolQC': ['NA', 'Fa', 'TA', 'Gd', 'Ex'],
    'Fence': ['NA', 'MnWw', 'GdWo', 'MnPrv', 'GdPrv'],
    **{col: list(_ONE_TO_TEN) for col in ('OverallQual', 'OverallCond')},
}

# KNN-impute values that are genuinely missing (as opposed to the NaNs already resolved by filling_zero_na_fn)
def knn_impute_features(df, testdf, target, features):
    """Fill the NaNs of one column with a KNN imputer driven by a set of predictor columns.

    The encoders, the scaler and the imputer are fitted on ``df`` and then
    applied to ``testdf``. Only the rows in which ``target`` was NaN are
    overwritten; the predictor columns are returned with their original values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame the preprocessing and the imputer are fitted on.
    testdf : pandas.DataFrame
        Frame that is only transformed with the fitted pipeline.
    target : str
        Column whose NaNs are imputed.
    features : list of str
        Predictor columns (must be a list, it is concatenated with ``[target]``).

    Both frames must contain ``target``, every column in ``features`` and every
    column listed in the module-level ``cat_in_number_col``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Copies of ``df`` and ``testdf`` with the NaNs of ``target`` filled. The
        columns in ``cat_in_number_col`` are returned as int.
    """
    #work on private copies, so neither the caller's frames nor the notebook-level
    #X / X_test are modified by this function
    df = df.copy()
    testdf = testdf.copy()

    #number-coded categoricals are treated as strings while the columns are classified below;
    #the cast is applied to the frames actually being imputed, so every call classifies them the same way
    for col in cat_in_number_col:
        df[col] = df[col].astype(str)
        testdf[col] = testdf[col].astype(str)

    #separate features into 3: categorical with order, categorical without order and numerical
    ordered_cat_cols = [col for col in features if col in Ordinal_map]
    cat_cols = [col for col in features if df[col].dtype == 'object' and col not in ordered_cat_cols]
    num_cols = [col for col in features if df[col].dtype != 'object' and col not in ordered_cat_cols]

    #remember which rows need a value for the target
    missing_value_train = df[target].isnull()
    missing_value_test = testdf[target].isnull()

    #predictor columns only (not the target): replace NaN with the placeholder 'Missing' before encoding
    for col in cat_cols:
        df[col] = df[col].fillna('Missing')
        testdf[col] = testdf[col].fillna('Missing')

    ordered_categories = []

    #same placeholder for the ordered predictors; 'Missing' is not in their category list,
    #so the 'ord' encoder below encodes it as NaN (unknown_value=np.nan)
    for col in ordered_cat_cols:
        df[col] = df[col].fillna('Missing')
        testdf[col] = testdf[col].fillna('Missing')
        #collect the level order of each ordered predictor, in column order
        ordered_categories.append(Ordinal_map[col])

    #put the target into one of the 3 groups as well
    if target in Ordinal_map:
        ordered_cat_cols.append(target)
        ordered_categories.append(Ordinal_map[target])
    elif df[target].dtype == 'object'and target not in ordered_cat_cols:
        cat_cols.append(target)
    else:
        num_cols.append(target)

    #encoders for the categorical groups, scaler for the numerical group
    preprocessor = ColumnTransformer([
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), cat_cols),
        ('ord', OrdinalEncoder(categories = ordered_categories, handle_unknown='use_encoded_value', unknown_value=np.nan), ordered_cat_cols),
        ('num', StandardScaler(),num_cols)
    ])

    #preprocessing followed by the KNN imputer
    pipeline = Pipeline([
        ('preprocessing',preprocessor),
        ('imputer', KNNImputer(n_neighbors=5))
    ])

    #predictors and target together
    unarranged_train = df[features + [target]]
    unarranged_test = testdf[features + [target]]

    #one bucket per group and per frame
    train_cat = []
    train_ord_cat = []
    train_num = []
    test_cat = []
    test_ord_cat = []
    test_num = []

    #sort every column into its bucket
    for col in unarranged_train.columns:
        if col in cat_cols:
            train_cat.append(unarranged_train[col])
            test_cat.append(unarranged_test[col])
        elif col in ordered_cat_cols:
            train_ord_cat.append(unarranged_train[col])
            test_ord_cat.append(unarranged_test[col])
        elif col in num_cols:
            train_num.append(unarranged_train[col])
            test_num.append(unarranged_test[col])

    #column order cat -> ord -> num, i.e. the order in which the ColumnTransformer emits its output;
    #the slices used for the inverse transform below rely on it
    train = pd.concat(train_cat + train_ord_cat + train_num, axis = 1)
    test = pd.concat(test_cat + test_ord_cat + test_num, axis = 1)

    #number-coded categoricals go into the pipeline as int again
    for col in cat_in_number_col:
        if col in train.columns:
            train[col] = train[col].astype(int)
        if col in test.columns:
            test[col] = test[col].astype(int)

    #fit on train, apply to test
    train_transformed = pipeline.fit_transform(train)
    test_transformed = pipeline.transform(test)

    #fitted transformers, needed to map the imputed matrix back to original values
    scaler = pipeline.named_steps['preprocessing'].named_transformers_.get('num')
    ord_encoder = pipeline.named_steps['preprocessing'].named_transformers_.get('ord')
    encoder = pipeline.named_steps['preprocessing'].named_transformers_.get('cat')

    #widths of the three column groups inside the transformed matrix
    n_cat = len(cat_cols)
    n_ord = len(ordered_cat_cols)
    n_num = len(num_cols)

    #decode / unscale each group; imputed category codes are rounded to the nearest integer code first
    decoded = encoder.inverse_transform(np.rint(train_transformed[:,:n_cat]).astype(int)) if n_cat else None
    ord_decoded = ord_encoder.inverse_transform(np.rint(train_transformed[:,n_cat:n_cat + n_ord]).astype(int)) if n_ord else None
    unscaled = scaler.inverse_transform(train_transformed[:,n_cat + n_ord:]) if n_num else None
    decoded_test = encoder.inverse_transform(np.rint(test_transformed[:,:n_cat]).astype(int)) if n_cat else None
    ord_decoded_test = ord_encoder.inverse_transform(np.rint(test_transformed[:,n_cat:n_cat + n_ord]).astype(int)) if n_ord else None
    unscaled_test = scaler.inverse_transform(test_transformed[:,n_cat + n_ord:]) if n_num else None

    #the imputed values go into fresh frames, so the predictor columns of df / testdf are not overwritten
    imputed_train_df = pd.DataFrame(index = df.index)
    imputed_test_df = pd.DataFrame(index = testdf.index)

    #fill the fresh frames group by group
    if cat_cols:
        imputed_train_df[cat_cols] = decoded
        imputed_test_df[cat_cols] =  decoded_test

    if ordered_cat_cols:
        imputed_train_df[ordered_cat_cols] = ord_decoded
        imputed_test_df[ordered_cat_cols] = ord_decoded_test

    if num_cols:
        imputed_train_df[num_cols] = unscaled
        imputed_test_df[num_cols] = unscaled_test

    #copy the imputed value into the rows where the target was missing, and only there
    df.loc[missing_value_train, target] = imputed_train_df.loc[missing_value_train, target]
    testdf.loc[missing_value_test, target] = imputed_test_df.loc[missing_value_test, target]

    #turn the 'Missing' placeholder back into NaN, so those columns can be imputed when it is their turn as target
    for col in cat_cols:
        df[col] = df[col].replace('Missing', np.nan)
        testdf[col] = testdf[col].replace('Missing', np.nan)
    for col in ordered_cat_cols:
        df[col] = df[col].replace('Missing', np.nan)
        testdf[col] = testdf[col].replace('Missing', np.nan)
    #hand the number-coded categoricals back as int
    for col in cat_in_number_col:
        df[col] = df[col].astype(int)
        testdf[col] = testdf[col].astype(int)

    return df, testdf

def feature_engineering(df):
    """Derive age, area, ratio and presence features from the raw housing columns.

    Added columns: HouseAge, RenoAge, LotDepth, OverallSF, HighQualFinSFRatio,
    TotalBath, OtherRooms, BsmtFinRatio, PorchSF, GotPorch, OpenSpace, GotGarage,
    GarageAge, GarageBuiltAfterYr, ArearPerCar, GotMiscFeature, GotAlley.
    Modified columns: GarageYrBlt (NaN -> YearBuilt) and Utilities (recoded to
    1 / 0). Dropped column: MiscFeature.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the raw columns; ``Alley`` and ``MiscFeature`` are expected to
        use the string ``'NA'`` for "not present".

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is left unchanged.
    """
    #work on a copy so the caller's frame is not half-modified (new columns added, MiscFeature still present)
    df = df.copy()

    def _ratio(numerator, denominator):
        #a zero denominator (inf, or NaN for 0/0) and a missing operand (NaN) both mean "not applicable" -> 0
        return (numerator / denominator).where(denominator != 0).fillna(0)

    df['HouseAge'] = df['YrSold'] - df['YearBuilt'] #years between construction and sale
    df['RenoAge'] = df['YrSold'] - df['YearRemodAdd'] #years between the last remodel and sale
    df['LotDepth'] = _ratio(df['LotArea'], df['LotFrontage']) #area divided by frontage, assuming a rectangular lot

    df['OverallSF'] = df['1stFlrSF'] + df['2ndFlrSF'] + df['TotalBsmtSF'] #sum of all floor areas
    df['HighQualFinSFRatio'] = _ratio(df['OverallSF'] - (df['LowQualFinSF'] + df['BsmtUnfSF']), df['OverallSF'])
    df['TotalBath'] = df['BsmtFullBath'] + df['FullBath'] + 0.5 * (df['BsmtHalfBath'] + df['HalfBath']) #half baths count 0.5
    df['OtherRooms'] = df['TotRmsAbvGrd'] - df['BedroomAbvGr'] #rooms above ground that are not bedrooms
    df['BsmtFinRatio'] = _ratio(df['BsmtFinSF2'] + df['BsmtFinSF1'], df['TotalBsmtSF'])
    df['PorchSF'] = df['OpenPorchSF'] + df['EnclosedPorch'] + df['3SsnPorch'] +df['ScreenPorch'] #total porch area
    df['GotPorch'] = df['PorchSF'] != 0 #binary for existence of a porch
    df['OpenSpace'] = df['LotArea'] - df['1stFlrSF'] - df['PorchSF'] - df['WoodDeckSF'] - df['GarageArea'] #lot area not built on

    #GotGarage has to be computed before the missing garage years are filled in
    df['GotGarage'] = df['GarageYrBlt'].notnull() & (df['GarageYrBlt'] != 0) #existence of a garage
    df['GarageYrBlt'] = df['GarageYrBlt'].fillna(df['YearBuilt']) #no garage year -> same as the house
    df['GarageAge'] = df['YrSold'] - df['GarageYrBlt']
    df['GarageBuiltAfterYr'] = df['GarageYrBlt'] - df['YearBuilt']
    df['ArearPerCar'] = _ratio(df['GarageArea'], df['GarageCars'])

    #binary recode through a mapping, so no ints are written into a string column;
    #missing (and any unlisted) value is treated as the majority class AllPub
    df['Utilities'] = df['Utilities'].map({'AllPub': 1, 'NoSeWa': 0}).fillna(1).astype(int)

    df['GotMiscFeature'] = df['MiscFeature'] != 'NA'
    df['GotAlley'] = df['Alley'] != 'NA'

    return df.drop(['MiscFeature'], axis = 1)

def target_encoding_fn(df,testdf):
    """Turn every remaining categorical column into numbers.

    Three encodings are used, all fitted on ``df`` and applied to ``testdf``:
      * columns listed in ``Ordinal_map`` (and present in both frames) are
        replaced by the position of each level in the listed order;
      * other object columns with more than 2 distinct values are replaced by a
        new ``<col>_enc`` column holding the mean ``SalePrice`` of the level in
        ``df``; a level that does not occur in ``df`` becomes NaN in ``testdf``;
      * other object columns with at most 2 distinct values are one-hot encoded
        with the first level dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Training frame; must contain ``SalePrice``.
    testdf : pandas.DataFrame
        Frame that receives the encodings learned on ``df``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Encoded copies of ``df`` and ``testdf``; the frames passed in are left unchanged.
    """
    #work on copies, so the caller's frames are never left partly encoded
    df = df.copy()
    testdf = testdf.copy()

    #this is actually not target encoding, just mapping categorical features with orders listed in Ordinal_map
    for col, cat_order in Ordinal_map.items():
        if col in df.columns and col in testdf.columns:
            ordencoder = OrdinalEncoder(categories = [cat_order])
            df[col] = ordencoder.fit_transform(df[[col]])
            testdf[col] = ordencoder.transform(testdf[[col]])

    #iterate over a snapshot of the column names, because the loop body adds and removes columns
    for col in list(df.columns):
        if df[col].dtype != object or col in Ordinal_map:
            continue

        if df[col].nunique() > 2:
            #more than 2 levels: mean SalePrice per level, stored under the original name with _enc appended
            target_means = df.groupby(col)['SalePrice'].mean()
            df[f'{col}_enc'] = df[col].map(target_means)
            testdf[f'{col}_enc'] = testdf[col].map(target_means)
            df = df.drop(columns=[col])
            testdf = testdf.drop(columns=[col])
        else:
            #binary column: one-hot encode and drop the first level, leaving a single indicator
            OH = OneHotEncoder(sparse_output=False, handle_unknown='ignore',drop = 'first')
            df_cols = pd.DataFrame(OH.fit_transform(df[[col]]), columns = OH.get_feature_names_out([col]), index = df.index)
            testdf_cols = pd.DataFrame(OH.transform(testdf[[col]]), columns = OH.get_feature_names_out([col]), index = testdf.index)

            df = pd.concat([df.drop(columns=[col]),df_cols], axis = 1)
            testdf = pd.concat([testdf.drop(columns=[col]),testdf_cols], axis = 1)

    return df, testdf

def clustering(df, testdf):
    """Add six KMeans cluster-label columns to both frames.

    Each clustering is fitted on ``df`` and then used to label ``testdf``.
    Both frames are modified in place and returned.
    """
    #(new column, number of clusters, columns the clustering is based on)
    cluster_specs = [
        #area / location related features
        ("HousingAreaCluster", 5, ['MSSubClass','MSZoning_enc','Neighborhood_enc']),
        #landscaping related features
        ('LandscapeCluster', 5, ['LotShape_enc','LandContour_enc','LotConfig_enc','LandSlope_enc']),
        #exterior scores
        ("ExterCluster", 10, ['ExterQual','ExterCond']),
        #overall scores
        ("OverallCluster", 10, ['OverallQual','OverallCond']),
        #basement scores
        ("BsmtCluster", 10, ['BsmtQual','BsmtCond']),
        #garage scores
        ("GarageCluster", 10, ['GarageQual','GarageCond']),
    ]

    for cluster_name, cluster_count, source_cols in cluster_specs:
        kmeans = KMeans(n_clusters=cluster_count, random_state = 1)
        df[cluster_name] = kmeans.fit_predict(df[source_cols])
        testdf[cluster_name] = kmeans.predict(testdf[source_cols])

    return df, testdf

#simple correlation matrix over the whole training set; used later to pick the predictor columns for KNN imputing
X_corr_study = train_data.drop(columns=['Id'])

#object columns are ordinal-encoded so that they can take part in the correlation
s = [name for name, dtype in X_corr_study.dtypes.items() if dtype == 'object']
encoder = OrdinalEncoder()
X_corr_study[s] = encoder.fit_transform(X_corr_study[s])

corr_matrix = X_corr_study.corr()
#absolute correlation of every column with SalePrice, strongest first
top_corr = corr_matrix['SalePrice'].abs().sort_values(ascending=False)

importance_dfs = {}
rejected_features_list = {}

#manually split into 5 folds on train_data
#NOTE(review): y is passed as the stratification label; if it holds raw sale prices, nearly every value is
#its own class and the stratification presumably has little effect - confirm, or consider KFold / binned prices
StratifiedKF = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 1)

#a column is used as KNN predictor for an imputation target only if its |corr| with that target exceeds this
corr_threshold = 0.4

#first run, to find which features to eliminate using reverse-elimination method
for fold, (train_idx, val_idx) in enumerate(StratifiedKF.split(X, y)):

    X_train, X_valid =  X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[val_idx]
    #SalePrice travels with the features, because target_encoding_fn needs it
    X_train = pd.concat([X_train, y_train], axis = 1)
    X_valid = pd.concat([X_valid, y_valid], axis = 1)

    #columns with at least one NaN in either part of this fold
    missing_cols = X_train.columns[X_train.isnull().any()].union(X_valid.columns[X_valid.isnull().any()])
    impute_parameter = []

    for target in missing_cols:
        #ignore targets that are not in corr_matrix, and GarageYrBlt: its many NaNs are handled by feature_engineering later
        if target not in corr_matrix.columns or target == 'GarageYrBlt':
            continue
        #predictors: |corr| above the threshold and below 1, excluding SalePrice and Alley (Alley has many NaNs too)
        correlated = corr_matrix[target][
            (corr_matrix[target].abs() > corr_threshold) &
            (corr_matrix[target].abs() < 1.0) &
            (corr_matrix[target].index != 'SalePrice') &
            (corr_matrix[target].index != 'Alley')].index.tolist()
        #nothing above the threshold: fall back to the single most correlated column, same exclusions
        if not correlated:
            fallback = corr_matrix[target].drop(labels=[target, 'SalePrice','Alley']).abs().sort_values(ascending=False)
            if not fallback.empty:
                correlated = [fallback.index[0]]
        #remember the target together with its predictors
        if correlated:
            impute_parameter.append({
                'target': target,
                'features': correlated
            })

    #impute one target after the other; each call works on the result of the previous one
    for parameter in impute_parameter:
        X_train, X_valid = knn_impute_features(df=X_train, target = parameter['target'], features = parameter['features'], testdf = X_valid)

    #feature engineering, encoding and clustering
    X_train = feature_engineering(X_train)
    X_valid = feature_engineering(X_valid)
    X_train, X_valid = target_encoding_fn(X_train, X_valid)
    X_train, X_valid = clustering(X_train, X_valid)

    #drop SalePrice
    X_train = X_train.drop(['SalePrice'], axis = 1)
    X_valid = X_valid.drop(['SalePrice'], axis = 1)

    #one fit with all features serves two purposes: the feature importances and the benchmark score
    benchmark_pipeline = Pipeline([
        ('model', XGBRegressor(n_estimators=50, random_state=1))
    ])
    benchmark_pipeline.fit(X_train, y_train)
    model = benchmark_pipeline.named_steps['model']

    #put feature_importances into dataframe
    importance_df = pd.DataFrame({
        'Feature': X_train.columns,
        'Importance': model.feature_importances_
    }).sort_values(by='Importance', ascending=True)

    #record feature importance for each fold
    importance_dfs[fold] = importance_df

    #NOTE(review): sorting the already sorted frame again is kept on purpose - the default sort is
    #presumably not stable, so removing it could change the order of features with equal importance
    importance_df = importance_dfs[fold].sort_values(by='Importance', ascending=True)

    #candidate features, from least to most important
    features = list(importance_df['Feature'])

    rejected_features = []

    #benchmark: validation RMSE on log1p(price) with all features
    y_pred = benchmark_pipeline.predict(X_valid)
    benchmark = np.sqrt(mean_squared_error(np.log1p(y_valid), np.log1p(y_pred)))
    feature_selection_scores = []

    current_features = features.copy()

    for feature_to_test in features:
        #temporarily drop the candidate and score the model without it
        temp_features = [f for f in current_features if f != feature_to_test]

        pipeline = Pipeline([
            ('model', XGBRegressor(n_estimators=50, random_state=1))
        ])
        pipeline.fit(X_train[temp_features], y_train)
        y_pred = pipeline.predict(X_valid[temp_features])
        score = np.sqrt(mean_squared_error(np.log1p(y_valid), np.log1p(y_pred)))
        feature_selection_scores.append((feature_to_test, score))

        if score <= benchmark:
            #RMSE is better or the same: drop the feature for good and raise the bar
            current_features = temp_features
            rejected_features.append(feature_to_test)
            benchmark = score

    #features eliminated in this fold
    rejected_features_list[fold] = rejected_features

#count, over all folds, how often each feature was rejected
flat_rejected = sum(rejected_features_list.values(), [])
rejected_counts = pd.Series(Counter(flat_rejected), dtype='int64')

#rejected_N holds the features that were rejected in at least N folds:
#rejected_5 = rejected in all 5 folds, ..., rejected_1 = rejected in at least one fold
rejected_5 = rejected_counts[rejected_counts >= 5]
rejected_4 = rejected_counts[rejected_counts >= 4]
rejected_3 = rejected_counts[rejected_counts >= 3]
rejected_2 = rejected_counts[rejected_counts >= 2]
rejected_1 = rejected_counts[rejected_counts >= 1]
            


  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  df['Utilities'] = df['Utilities'].fillna(1).astype(int) #majority if AllPub, thus direct impute
  df['Utilities'] = df['Utilities'].fillna(1).astype(int) #majority if AllPub, thus direct impute
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  df['Utilities'] = df['Utilities'].fillna(1).astype(int) #majority if AllPub, thus direct impute
  df['Utilities'] = df['Utilities'].fillna(1).astype(int) #majority if AllPub, thus direct impute
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  df['Utilities'] = df['Utilities'].fillna(1).astype(int) #majority if AllPub, thus direct impute
  df['Utilities'] = df['Utilities'].fillna(1).astype(int) #majority if AllPub, thus direct impute
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  df['Utilities'] = df['Utilities'].fillna(1).astype(int) #majority if AllPub, thus direct impute
  df['Ut

In [4]:
# Features rejected in at least one fold of the first run, with the number of folds
# that rejected each of them, most frequently rejected first
rejected_1.sort_values(ascending = False)

GotAlley              5
GotGarage             4
Utilities             4
GotMiscFeature        3
RoofMatl_enc          3
3SsnPorch             3
BsmtFinSF2            3
FullBath              3
GotPorch              3
PoolQC                3
Condition2_enc        3
GarageCond            2
GarageQual            2
PoolArea              2
Street_Pave           2
MiscVal               2
Heating_enc           2
1stFlrSF              2
GarageCluster         1
PavedDrive            1
MasVnrType_enc        1
BsmtCluster           1
BsmtQual              1
OverallSF             1
TotalBsmtSF           1
LowQualFinSF          1
HighQualFinSFRatio    1
HouseStyle_enc        1
GarageBuiltAfterYr    1
BsmtFinType2          1
BsmtFinRatio          1
MasVnrArea            1
BsmtUnfSF             1
GarageCars            1
OpenPorchSF           1
YearBuilt             1
BsmtHalfBath          1
Alley_enc             1
Electrical_enc        1
MSZoning_enc          1
ExterQual             1
ScreenPorch     

In [5]:
score_5 = {}
score_4 = {}
score_3 = {}
score_2 = {}
score_1 = {}

#same KNN-predictor threshold as in the first run; restated here so that this cell
#does not depend on a variable left over from another cell's loop
corr_threshold = 0.4


def featuresrejection(rejected_list):
    """Score the current fold after dropping ``rejected_list`` from the features.

    Reads the fold state from the notebook-level variables ``X_train``,
    ``X_valid``, ``y_train``, ``y_valid`` and ``rejection_pipeline`` at call time,
    refits ``rejection_pipeline`` on the reduced training frame and returns the
    validation RMSE computed on log1p of the prices.
    """
    retaining_X_train = X_train.drop(rejected_list, axis = 1)
    retaining_X_valid = X_valid.drop(rejected_list, axis = 1)
    rejection_pipeline.fit(retaining_X_train, y_train)
    y_pred = rejection_pipeline.predict(retaining_X_valid)
    score = np.sqrt(mean_squared_error(np.log1p(y_valid), np.log1p(y_pred)))
    return score


#second run, to see which of the elimination lists from the first run actually helps the prediction
for fold, (train_idx, val_idx) in enumerate(StratifiedKF.split(X, y)):
    #same preprocessing as in the first run
    X_train, X_valid =  X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[val_idx]
    X_train = pd.concat([X_train, y_train], axis = 1)
    X_valid = pd.concat([X_valid, y_valid], axis = 1)

    #columns with at least one NaN in either part of this fold, computed per fold exactly as in the first run
    missing_cols = X_train.columns[X_train.isnull().any()].union(X_valid.columns[X_valid.isnull().any()])
    impute_parameter = []

    for target in missing_cols:
        if target not in corr_matrix.columns or target == 'GarageYrBlt':
            continue
        correlated = corr_matrix[target][
            (corr_matrix[target].abs() > corr_threshold) &
            (corr_matrix[target].abs() < 1.0) &
            (corr_matrix[target].index != 'SalePrice') &
            (corr_matrix[target].index != 'Alley')].index.tolist()
        if not correlated:
            fallback = corr_matrix[target].drop(labels=[target, 'SalePrice','Alley']).abs().sort_values(ascending=False)
            if not fallback.empty:
                correlated = [fallback.index[0]]
        if correlated:
            impute_parameter.append({
                'target': target,
                'features': correlated
            })

    for parameter in impute_parameter:
        X_train, X_valid = knn_impute_features(df=X_train, target = parameter['target'], features = parameter['features'], testdf = X_valid)

    X_train = feature_engineering(X_train)
    X_valid = feature_engineering(X_valid)
    X_train, X_valid = target_encoding_fn(X_train, X_valid)
    X_train, X_valid = clustering(X_train, X_valid)

    X_train = X_train.drop(['SalePrice'], axis = 1)
    X_valid = X_valid.drop(['SalePrice'], axis = 1)

    #fresh model for this fold; featuresrejection refits it once per elimination list
    rejection_pipeline = Pipeline([
        ('model', XGBRegressor(n_estimators=50, random_state=1))
    ])

    #score every elimination list on this fold, strictest list (rejected in all 5 folds) first
    for fold_scores, rejected in (
        (score_5, rejected_5),
        (score_4, rejected_4),
        (score_3, rejected_3),
        (score_2, rejected_2),
        (score_1, rejected_1),
    ):
        fold_scores[fold] = featuresrejection(rejected.index)

#one row per fold, one column per elimination list
scores_df = pd.DataFrame({
    'score_5': score_5,
    'score_4': score_4,
    'score_3': score_3,
    'score_2': score_2,
    'score_1': score_1
        })
print(scores_df.mean())


  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  df['Utilities'] = df['Utilities'].fillna(1).astype(int) #majority if AllPub, thus direct impute
  df['Utilities'] = df['Utilities'].fillna(1).astype(int) #majority if AllPub, thus direct impute
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  df['Utilities'] = df['Utilities'].fillna(1).astype(int) #majority if AllPub, thus direct impute
  df['Utilities'] = df['Utilities'].fillna(1).astype(int) #majority if AllPub, thus direct impute
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  df['Utilities'] = df['Utilities'].fillna(1).astype(int) #majority if AllPub, thus direct impute
  df['Utilities'] = df['Utilities'].fillna(1).astype(int) #majority if AllPub, thus direct impute
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  df['Utilities'] = df['Utilities'].fillna(1).astype(int) #majority if AllPub, thus direct impute
  df['Utilities'] = df['Ut

score_5    0.137956
score_4    0.137956
score_3    0.135733
score_2    0.134379
score_1    0.135185
dtype: float64


In [6]:
'''
the result as below:
score_5    0.137956
score_4    0.137956
score_3    0.135733
score_2    0.134379
score_1    0.135185

score_2 is the best, meaning we eliminate the features that were rejected in at least two folds during the first run
'''
# Third run: per-fold greedy hyper-parameter search for XGBRegressor.
# For every CV fold the preprocessing is repeated, the features listed in
# `rejected_2` are dropped, and then n_estimators, max_depth and learning_rate
# are tuned one after another, each stage keeping the winners of the previous
# stages fixed. The score is the RMSE between log1p(target) and
# log1p(prediction), so LOWER is better.
N_ESTIMATORS_GRID = list(range(5, 101, 5)) + [150, 200, 250, 300]
MAX_DEPTH_GRID = list(range(1, 11))
LEARNING_RATE_GRID = [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]


def xgb_log_rmse(params, X_train, y_train, X_valid, y_valid):
    """Fit an XGBRegressor and return its log-space RMSE on the validation data.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded to XGBRegressor (random_state is fixed to 1).
    X_train, y_train : training features and target.
    X_valid, y_valid : validation features and target.

    Returns
    -------
    float
        sqrt(mean_squared_error(log1p(y_valid), log1p(prediction))).
    """
    model = XGBRegressor(random_state=1, **params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    return np.sqrt(mean_squared_error(np.log1p(y_valid), np.log1p(y_pred)))


def pick_best_value(param_name, candidates, fixed_params, X_train, y_train, X_valid, y_valid):
    """Score every candidate value of one hyper-parameter and keep the best.

    Parameters
    ----------
    param_name : str
        Name of the XGBRegressor keyword argument being tuned.
    candidates : iterable
        Values to try, in order.
    fixed_params : dict
        Hyper-parameters already chosen; passed unchanged with every candidate.
    X_train, y_train, X_valid, y_valid : fold data handed to xgb_log_rmse.

    Returns
    -------
    tuple
        (best_value, best_score). On ties the earliest candidate wins.
        best_value is None only if no candidate produced a score below inf.
    """
    best_value, best_score = None, float('inf')
    for value in candidates:
        score = xgb_log_rmse({**fixed_params, param_name: value},
                             X_train, y_train, X_valid, y_valid)
        print("%s: %s  \t\t RMSE Score:  %.8f" % (param_name, value, score))
        # Strict comparison: a later candidate must be strictly better to win.
        if score < best_score:
            best_value, best_score = value, score
    print("best %s = %s at %s" % (param_name, best_value, best_score))
    return best_value, best_score


estimators_score = {}
for fold, (train_idx, val_idx) in enumerate(StratifiedKF.split(X, y)):
    # Rebuild the fold frames; the target is attached because the
    # preprocessing helpers below receive frames that still contain SalePrice.
    X_train, X_valid = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[val_idx]
    X_train = pd.concat([X_train, y_train], axis=1)
    X_valid = pd.concat([X_valid, y_valid], axis=1)

    # For each column with missing values pick the columns used to impute it:
    # those whose absolute correlation exceeds corr_threshold (excluding
    # perfect correlation, SalePrice and Alley), or, failing that, the single
    # most correlated remaining column.
    # NOTE(review): missing_cols, corr_matrix and corr_threshold are not
    # recomputed in this cell; they come from an earlier cell - confirm they
    # are the intended ones for every fold.
    impute_parameter = []

    for target in missing_cols:
        if target not in corr_matrix.columns or target == 'GarageYrBlt':
            continue
        correlated = corr_matrix[target][
            (corr_matrix[target].abs() > corr_threshold) &
            (corr_matrix[target].abs() < 1.0) &
            (corr_matrix[target].index != 'SalePrice') &
            (corr_matrix[target].index != 'Alley')].index.tolist()
        if not correlated:
            fallback = corr_matrix[target].drop(labels=[target, 'SalePrice', 'Alley']).abs().sort_values(ascending=False)
            if not fallback.empty:
                correlated = [fallback.index[0]]
        if correlated:
            impute_parameter.append({
                'target': target,
                'features': correlated
            })

    for parameter in impute_parameter:
        X_train, X_valid = knn_impute_features(df=X_train, target=parameter['target'], features=parameter['features'], testdf=X_valid)

    X_train = feature_engineering(X_train)
    X_valid = feature_engineering(X_valid)
    X_train, X_valid = target_encoding_fn(X_train, X_valid)
    X_train, X_valid = clustering(X_train, X_valid)

    # Remove the target and the features rejected by the previous run.
    X_train = X_train.drop(['SalePrice'] + list(rejected_2.index), axis=1)
    X_valid = X_valid.drop(['SalePrice'] + list(rejected_2.index), axis=1)

    fold_data = (X_train, y_train, X_valid, y_valid)

    # Stage 1: number of trees, everything else at the XGBoost defaults.
    best_n, best_n_score = pick_best_value('n_estimators', N_ESTIMATORS_GRID, {}, *fold_data)

    # Stage 2: tree depth, given the best number of trees.
    best_d, best_d_score = pick_best_value(
        'max_depth', MAX_DEPTH_GRID, {'n_estimators': best_n}, *fold_data)

    # Stage 3: learning rate, given the best number of trees and depth.
    best_l, best_l_score = pick_best_value(
        'learning_rate', LEARNING_RATE_GRID,
        {'n_estimators': best_n, 'max_depth': best_d}, *fold_data)

    # The stage-3 score is the score of the full (n, d, l) combination.
    estimators_score[fold] = {'best_n': best_n, 'best_d': best_d, 'best_l': best_l, 'RMSE': best_l_score}
    print(f"Best for Fold {fold}: n={best_n}, d={best_d}, l={best_l}, RMSE={best_l_score:.5f}")

  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  df['Utilities'] = df['Utilities'].fillna(1).astype(int) #majority if AllPub, thus direct impute
  df['Utilities'] = df['Utilities'].fillna(1).astype(int) #majority if AllPub, thus direct impute


N_estimators: 5  		 Accuracy Score:  0.17589635
N_estimators: 10  		 Accuracy Score:  0.14588175
N_estimators: 15  		 Accuracy Score:  0.13880692
N_estimators: 20  		 Accuracy Score:  0.13761303
N_estimators: 25  		 Accuracy Score:  0.13719280
N_estimators: 30  		 Accuracy Score:  0.13697784
N_estimators: 35  		 Accuracy Score:  0.13732349
N_estimators: 40  		 Accuracy Score:  0.13747047
N_estimators: 45  		 Accuracy Score:  0.13732242
N_estimators: 50  		 Accuracy Score:  0.13750899
N_estimators: 55  		 Accuracy Score:  0.13671920
N_estimators: 60  		 Accuracy Score:  0.13652178
N_estimators: 65  		 Accuracy Score:  0.13631418
N_estimators: 70  		 Accuracy Score:  0.13645717
N_estimators: 75  		 Accuracy Score:  0.13641689
N_estimators: 80  		 Accuracy Score:  0.13639196
N_estimators: 85  		 Accuracy Score:  0.13648331
N_estimators: 90  		 Accuracy Score:  0.13645822
N_estimators: 95  		 Accuracy Score:  0.13628021
N_estimators: 100  		 Accuracy Score:  0.13621809
N_estimators: 150  	

  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  df['Utilities'] = df['Utilities'].fillna(1).astype(int) #majority if AllPub, thus direct impute
  df['Utilities'] = df['Utilities'].fillna(1).astype(int) #majority if AllPub, thus direct impute


N_estimators: 5  		 Accuracy Score:  0.17806867
N_estimators: 10  		 Accuracy Score:  0.14373328
N_estimators: 15  		 Accuracy Score:  0.13660210
N_estimators: 20  		 Accuracy Score:  0.13377886
N_estimators: 25  		 Accuracy Score:  0.13253157
N_estimators: 30  		 Accuracy Score:  0.13269018
N_estimators: 35  		 Accuracy Score:  0.13334502
N_estimators: 40  		 Accuracy Score:  0.13374493
N_estimators: 45  		 Accuracy Score:  0.13346855
N_estimators: 50  		 Accuracy Score:  0.13395912
N_estimators: 55  		 Accuracy Score:  0.13414731
N_estimators: 60  		 Accuracy Score:  0.13388559
N_estimators: 65  		 Accuracy Score:  0.13394132
N_estimators: 70  		 Accuracy Score:  0.13377177
N_estimators: 75  		 Accuracy Score:  0.13352127
N_estimators: 80  		 Accuracy Score:  0.13366018
N_estimators: 85  		 Accuracy Score:  0.13358817
N_estimators: 90  		 Accuracy Score:  0.13348254
N_estimators: 95  		 Accuracy Score:  0.13365768
N_estimators: 100  		 Accuracy Score:  0.13361970
N_estimators: 150  	

  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  df['Utilities'] = df['Utilities'].fillna(1).astype(int) #majority if AllPub, thus direct impute
  df['Utilities'] = df['Utilities'].fillna(1).astype(int) #majority if AllPub, thus direct impute


N_estimators: 5  		 Accuracy Score:  0.17307785
N_estimators: 10  		 Accuracy Score:  0.14108230
N_estimators: 15  		 Accuracy Score:  0.13559693
N_estimators: 20  		 Accuracy Score:  0.13443680
N_estimators: 25  		 Accuracy Score:  0.13401235
N_estimators: 30  		 Accuracy Score:  0.13388219
N_estimators: 35  		 Accuracy Score:  0.13446208
N_estimators: 40  		 Accuracy Score:  0.13513005
N_estimators: 45  		 Accuracy Score:  0.13503619
N_estimators: 50  		 Accuracy Score:  0.13490969
N_estimators: 55  		 Accuracy Score:  0.13474166
N_estimators: 60  		 Accuracy Score:  0.13475737
N_estimators: 65  		 Accuracy Score:  0.13457749
N_estimators: 70  		 Accuracy Score:  0.13468853
N_estimators: 75  		 Accuracy Score:  0.13462234
N_estimators: 80  		 Accuracy Score:  0.13486257
N_estimators: 85  		 Accuracy Score:  0.13478348
N_estimators: 90  		 Accuracy Score:  0.13480574
N_estimators: 95  		 Accuracy Score:  0.13480699
N_estimators: 100  		 Accuracy Score:  0.13476352
N_estimators: 150  	

  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  df['Utilities'] = df['Utilities'].fillna(1).astype(int) #majority if AllPub, thus direct impute
  df['Utilities'] = df['Utilities'].fillna(1).astype(int) #majority if AllPub, thus direct impute


N_estimators: 5  		 Accuracy Score:  0.15778363
N_estimators: 10  		 Accuracy Score:  0.13359699
N_estimators: 15  		 Accuracy Score:  0.13080344
N_estimators: 20  		 Accuracy Score:  0.12920546
N_estimators: 25  		 Accuracy Score:  0.12888064
N_estimators: 30  		 Accuracy Score:  0.12879428
N_estimators: 35  		 Accuracy Score:  0.12872421
N_estimators: 40  		 Accuracy Score:  0.12888129
N_estimators: 45  		 Accuracy Score:  0.12842582
N_estimators: 50  		 Accuracy Score:  0.12875881
N_estimators: 55  		 Accuracy Score:  0.12882123
N_estimators: 60  		 Accuracy Score:  0.12868071
N_estimators: 65  		 Accuracy Score:  0.12863769
N_estimators: 70  		 Accuracy Score:  0.12869580
N_estimators: 75  		 Accuracy Score:  0.12878366
N_estimators: 80  		 Accuracy Score:  0.12897946
N_estimators: 85  		 Accuracy Score:  0.12887915
N_estimators: 90  		 Accuracy Score:  0.12898641
N_estimators: 95  		 Accuracy Score:  0.12917668
N_estimators: 100  		 Accuracy Score:  0.12915697
N_estimators: 150  	

  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  df['Utilities'] = df['Utilities'].fillna(1).astype(int) #majority if AllPub, thus direct impute
  df['Utilities'] = df['Utilities'].fillna(1).astype(int) #majority if AllPub, thus direct impute


N_estimators: 5  		 Accuracy Score:  0.18477534
N_estimators: 10  		 Accuracy Score:  0.14699798
N_estimators: 15  		 Accuracy Score:  0.13995398
N_estimators: 20  		 Accuracy Score:  0.13903503
N_estimators: 25  		 Accuracy Score:  0.13819782
N_estimators: 30  		 Accuracy Score:  0.13728321
N_estimators: 35  		 Accuracy Score:  0.13760806
N_estimators: 40  		 Accuracy Score:  0.13681108
N_estimators: 45  		 Accuracy Score:  0.13701764
N_estimators: 50  		 Accuracy Score:  0.13675991
N_estimators: 55  		 Accuracy Score:  0.13673682
N_estimators: 60  		 Accuracy Score:  0.13678338
N_estimators: 65  		 Accuracy Score:  0.13729645
N_estimators: 70  		 Accuracy Score:  0.13718762
N_estimators: 75  		 Accuracy Score:  0.13702186
N_estimators: 80  		 Accuracy Score:  0.13720411
N_estimators: 85  		 Accuracy Score:  0.13716439
N_estimators: 90  		 Accuracy Score:  0.13733162
N_estimators: 95  		 Accuracy Score:  0.13741608
N_estimators: 100  		 Accuracy Score:  0.13738327
N_estimators: 150  	

In [7]:
estimators_score

{0: {'best_n': 300, 'best_d': 4, 'best_l': 0.1, 'RMSE': 0.13161138061778774},
 1: {'best_n': 25, 'best_d': 6, 'best_l': 0.3, 'RMSE': 0.13253156782210657},
 2: {'best_n': 30, 'best_d': 4, 'best_l': 0.2, 'RMSE': 0.12604753960023604},
 3: {'best_n': 45, 'best_d': 5, 'best_l': 0.35, 'RMSE': 0.12117840881072417},
 4: {'best_n': 55, 'best_d': 3, 'best_l': 0.3, 'RMSE': 0.1224466101099657}}

In [8]:
'''
result shows:
{0: {'best_n': 300, 'best_d': 4, 'best_l': 0.1, 'RMSE': 0.13161138061778774},
 1: {'best_n': 25, 'best_d': 6, 'best_l': 0.3, 'RMSE': 0.13253156782210657},
 2: {'best_n': 30, 'best_d': 4, 'best_l': 0.2, 'RMSE': 0.12604753960023604},
 3: {'best_n': 45, 'best_d': 5, 'best_l': 0.35, 'RMSE': 0.12117840881072417},
 4: {'best_n': 55, 'best_d': 3, 'best_l': 0.3, 'RMSE': 0.1224466101099657}}

ignoring the outlier (fold 0, best_n = 300), we get average best_n = 30, best_d = 5, best_l = 0.3
'''

# Final fit: repeat the preprocessing on the complete training frame and the
# competition test frame, train a single model and write the submission file.
final_X = pd.concat([X, y], axis=1)
final_X_test = test_data.copy().drop('Id', axis=1)

# Columns holding at least one missing value in either frame.
train_cols_with_nan = final_X.columns[final_X.isnull().any()]
test_cols_with_nan = final_X_test.columns[final_X_test.isnull().any()]
final_missing_cols = train_cols_with_nan.union(test_cols_with_nan)

# For each such column choose the columns it is imputed from: those whose
# absolute correlation exceeds corr_threshold (excluding perfect correlation,
# SalePrice and Alley), otherwise the single most correlated remaining column.
impute_parameter = []
for target in final_missing_cols:
    if target == 'GarageYrBlt' or target not in corr_matrix.columns:
        continue

    target_corr = corr_matrix[target]
    strength = target_corr.abs()
    usable = (
        (strength > corr_threshold)
        & (strength < 1.0)
        & (target_corr.index != 'SalePrice')
        & (target_corr.index != 'Alley')
    )
    correlated = target_corr[usable].index.tolist()

    if not correlated:
        fallback = (
            target_corr
            .drop(labels=[target, 'SalePrice', 'Alley'])
            .abs()
            .sort_values(ascending=False)
        )
        if not fallback.empty:
            correlated = [fallback.index[0]]

    if correlated:
        impute_parameter.append({'target': target, 'features': correlated})

for parameter in impute_parameter:
    final_X, final_X_test = knn_impute_features(
        df=final_X,
        target=parameter['target'],
        features=parameter['features'],
        testdf=final_X_test,
    )

final_X = feature_engineering(final_X)
final_X_test = feature_engineering(final_X_test)
final_X, final_X_test = target_encoding_fn(final_X, final_X_test)

# Special-case imputation for encoded columns the KNN step does not cover
# (original note: the features it would rely on are too biased). Remaining
# NaNs in the test frame are filled with that column's own test-set mode.
for enc_col in ('MSZoning_enc', 'Exterior1st_enc', 'Exterior2nd_enc', 'SaleType_enc'):
    final_X_test.loc[final_X_test[enc_col].isnull(), enc_col] = final_X_test[enc_col].mode()[0]

final_X, final_X_test = clustering(final_X, final_X_test)

# Drop the target (training frame only) and the features rejected earlier.
rejected_columns = list(rejected_2.index)
final_X = final_X.drop(['SalePrice'] + rejected_columns, axis=1)
final_X_test = final_X_test.drop(rejected_columns, axis=1)

# NOTE(review): the tuning notes at the top of this cell quote best_n = 30 and
# best_d = 5, while the values used here are 50 and 4 - confirm which is intended.
final_regressor = XGBRegressor(n_estimators=50, max_depth=4, learning_rate=0.3, random_state=1)
submission_pipeline = Pipeline(steps=[('model', final_regressor)])
model = submission_pipeline
model.fit(final_X, y)

predictions = model.predict(final_X_test)

# One row per test Id with its predicted SalePrice.
output = pd.DataFrame({'Id': test_data['Id'], 'SalePrice': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")



  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  df['Utilities'] = df['Utilities'].fillna(1).astype(int) #majority if AllPub, thus direct impute
  df['Utilities'] = df['Utilities'].fillna(1).astype(int) #majority if AllPub, thus direct impute


Your submission was successfully saved!
