In [14]:
from pathlib import Path
import pandas as pd
import numpy as np

# Specify project paths
import os
import sys

# The project root is taken to be the parent of the current working directory.
PROJECT_ROOT_PATH = os.path.dirname(os.path.abspath(os.getcwd()))
print('PROJECT_ROOT_PATH',PROJECT_ROOT_PATH)

# Guard the append so re-running this cell does not pile up duplicate entries
# on sys.path.
if PROJECT_ROOT_PATH not in sys.path:
    sys.path.append(PROJECT_ROOT_PATH)

PROJECT_ROOT_PATH C:\Users\urcha\repos\ds_toolbox\ds_toolbox


Links:
- https://medium.com/@GouthamPeri/pipeline-with-tuning-scikit-learn-b2789dca9dc2
- https://scikit-learn.org/stable/modules/feature_selection.html

## Get data

In [15]:
# Names of the 13 predictor columns, in dataset order.
BOSTON_FEATURE_NAMES = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
                        'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']

try:
    from sklearn.datasets import load_boston
except ImportError:
    # scikit-learn 1.2 removed load_boston. Rebuild the same table from the
    # original StatLib file, following the recipe given in the removal notice.
    # Each record spans two physical lines in that file, hence the stride-2
    # slices below.
    raw_df = pd.read_csv('http://lib.stat.cmu.edu/datasets/boston',
                         sep=r'\s+', skiprows=22, header=None)
    boston_features = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
    boston_target = raw_df.values[1::2, 2]
    boston = pd.DataFrame(boston_features, columns=BOSTON_FEATURE_NAMES)
else:
    boston_dataset = load_boston()
    boston = pd.DataFrame(boston_dataset.data, columns=boston_dataset.feature_names)
    boston_target = boston_dataset.target

boston['target'] = boston_target #MEDV
boston.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [16]:
target_col = 'target'
df = boston.copy()

# Sort every non-target column into a numeric or a string bucket by dtype.
# Numeric columns may still hold ordinal data; string columns may be ordinal
# or nominal.
_numeric_dtypes = (np.int64, np.int32, np.float64)
_feature_cols = [name for name in df.columns if name != target_col]

numeric_cols = [
    name for name in _feature_cols
    if any(df.dtypes[name] == kind for kind in _numeric_dtypes)
]
string_cols = [name for name in _feature_cols if df.dtypes[name] == object]

print('\n> Number of numerical features',len(numeric_cols),numeric_cols)
print('\n> Number of string features',len(string_cols),string_cols)

# Keep only the numeric features plus the target, and drop incomplete rows.
df = df[numeric_cols + [target_col]].dropna().reset_index(drop=True)

print('\n> Split in train/test')
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(df, test_size=0.33, random_state=0)


def _split_features_target(frame, feature_names, target_name):
    """Return (X, y): the feature frame with a fresh index and a flat target array."""
    features = frame[feature_names].reset_index(drop=True)
    target = frame[target_name].to_numpy()
    return features, target


# break each partition into an X dataframe and a 1-D y array
X_train, y_train = _split_features_target(df_train, numeric_cols, target_col)
X_test, y_test = _split_features_target(df_test, numeric_cols, target_col)

print('\n X,y review (TRAIN):',X_train.shape,y_train.shape)
print('\n X,y review (TEST):',X_test.shape,y_test.shape)


> Number of numerical features 13 ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']

> Number of string features 0 []

> Split in train/test

 X,y review (TRAIN): (339, 13) (339,)

 X,y review (TEST): (167, 13) (167,)


## Iterate over feature-selection strategies

In [17]:
# importlib.reload replaces imp.reload: the `imp` module is deprecated and was
# removed in Python 3.12.
from importlib import reload
import ds_toolbox.feature_selection.feat_selector
# Re-import the module so edits to its source are picked up without restarting
# the kernel.
reload(ds_toolbox.feature_selection.feat_selector)
from ds_toolbox.feature_selection.feat_selector import RegFeatureSelector

In [18]:
# A default-constructed selector is used only to read the list of strategies.
fs = RegFeatureSelector()
available_strategies = fs._available_strategies

# Fit one selector per strategy on the training data and report what it keeps.
for strategy in available_strategies:
    print('\nStrategy=',strategy)
    fs = RegFeatureSelector(strategy=strategy)
    X_adj = fs.fit_transform(X_train, pd.Series(y_train))
    selected_cols = X_adj.columns.tolist()
    print('selected_cols=',len(selected_cols),sorted(selected_cols))


Strategy= variance
selected_cols= 9 ['AGE', 'B', 'CRIM', 'INDUS', 'LSTAT', 'PTRATIO', 'RAD', 'TAX', 'ZN']

Strategy= l1
selected_cols= 2 ['B', 'TAX']

Strategy= rf_feature_importance
selected_cols= 9 ['AGE', 'B', 'CRIM', 'DIS', 'LSTAT', 'NOX', 'PTRATIO', 'RM', 'TAX']

Strategy= rf_top_features
selected_cols= 10 ['AGE', 'B', 'CRIM', 'DIS', 'INDUS', 'LSTAT', 'NOX', 'PTRATIO', 'RM', 'TAX']

Strategy= stepwise
selected_cols= 6 ['CHAS', 'DIS', 'LSTAT', 'NOX', 'PTRATIO', 'RM']


## Evaluate baseline models

In [28]:
from sklearn.dummy import DummyRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error


def score_estimator(estimator, X_test,y_test):
    """Print the test-set MSE and MAE of a fitted estimator.

    Parameters
    ----------
    estimator : object exposing ``predict``
    X_test : features passed to ``estimator.predict``
    y_test : true targets the predictions are compared against

    Returns nothing; both metrics are printed with three decimals.
    """
    predictions = estimator.predict(X_test)

    mse = mean_squared_error(y_test, predictions)
    print("MSE: %.3f" % mse)

    mae = mean_absolute_error(y_test, predictions)
    print("MAE: %.3f" % mae)

    
# LinearRegression is used below; import it in this cell so it runs on a fresh
# kernel instead of relying on a later cell having been executed first.
from sklearn.linear_model import LinearRegression

# Baseline: always predict the mean of the training targets.
dummy = Pipeline([
    ("regressor", DummyRegressor(strategy='mean')),
]).fit(X_train, y_train)

print('Dummy model')
score_estimator(dummy, X_test,y_test)

# Reference model: ordinary least squares on all features.
print('\nLinear regression model')
lr = Pipeline([
    ("regressor", LinearRegression()),
]).fit(X_train, y_train)
score_estimator(lr, X_test,y_test)

Dummy model
MSE: 81.383
MAE: 6.595

Linear regression model
MSE: 26.559
MAE: 3.562


## Compare feature selection + evaluation

In [40]:
from sklearn.linear_model import LinearRegression

for strategy in ['variance', 'l1' ,'rf_feature_importance','rf_top_features','stepwise']:
    print('\nStrategy=',strategy)
    # Chain selection and regression so the selector is fitted on the training
    # data only.
    combined_pipeline=Pipeline([
        ("feat_selector", RegFeatureSelector(strategy=strategy)),
        ("regressor", LinearRegression()),
    ]).fit(X_train, pd.Series(y_train))
    # Re-apply the fitted selector to the test set to see which columns survive.
    X_adj=combined_pipeline['feat_selector'].transform(X_test)
    selected_cols=list(X_adj.columns)
    print('selected_cols=',len(selected_cols),sorted(selected_cols))
    print('check X_adj',X_adj.shape)
    # Built-in round() works whether score() returns a NumPy scalar or a plain
    # Python float (which has no .round method).
    print('test_score:',round(combined_pipeline.score(X_test,y_test), 3))
    score_estimator(combined_pipeline, X_test,y_test)



Strategy= variance
selected_cols= 9 ['AGE', 'B', 'CRIM', 'INDUS', 'LSTAT', 'PTRATIO', 'RAD', 'TAX', 'ZN']
check X_adj (167, 9)
test_score: 0.569
MSE: 34.727
MAE: 4.138

Strategy= l1
selected_cols= 2 ['B', 'TAX']
check X_adj (167, 2)
test_score: 0.203
MSE: 64.252
MAE: 5.839

Strategy= rf_feature_importance
selected_cols= 9 ['AGE', 'B', 'CRIM', 'DIS', 'LSTAT', 'NOX', 'PTRATIO', 'RM', 'TAX']
check X_adj (167, 9)
test_score: 0.639
MSE: 29.109
MAE: 3.728

Strategy= rf_top_features
selected_cols= 10 ['AGE', 'B', 'CRIM', 'DIS', 'INDUS', 'LSTAT', 'NOX', 'PTRATIO', 'RM', 'TAX']
check X_adj (167, 10)
test_score: 0.639
MSE: 29.114
MAE: 3.731

Strategy= stepwise
selected_cols= 6 ['CHAS', 'DIS', 'LSTAT', 'NOX', 'PTRATIO', 'RM']
check X_adj (167, 6)
test_score: 0.639
MSE: 29.074
MAE: 3.824


## GridSearchCV

In [47]:
from sklearn.model_selection import GridSearchCV

import time

# starting time (perf_counter is a monotonic clock intended for measuring
# elapsed intervals)
start = time.perf_counter()
print("Run GridsearchCV with strategies",available_strategies)

# The pipeline is passed to GridSearchCV unfitted: the search clones it and
# fits one copy per candidate/fold, so fitting it here first would only add
# wasted time to the measured runtime.
combined_pipeline=Pipeline([
        ("feat_selector", RegFeatureSelector()),
        ("regressor", LinearRegression()),
    ])

# Treat the selection strategy as the hyper-parameter to search over.
reg_CV = GridSearchCV(combined_pipeline, {'feat_selector__strategy' : available_strategies})
reg_CV.fit(X_train, pd.Series(y_train))

# end time
end = time.perf_counter()

# total time taken
tot_time=round((end - start))
print(f"Runtime: {tot_time} sec")

print('\nbest_strategy',reg_CV.best_params_)
print('best_score',reg_CV.best_score_.round(3))
print("\n gridsearch details:")
print(reg_CV.cv_results_)

Run GridsearchCV with strategies ['variance', 'l1', 'rf_feature_importance', 'rf_top_features', 'stepwise']
Runtime: 12 sec

best_strategy {'feat_selector__strategy': 'rf_feature_importance'}
best_score 0.723

 gridsearch details:
{'mean_fit_time': array([0.00478911, 0.00365772, 0.11243453, 2.0066288 , 0.2287447 ]), 'std_fit_time': array([0.00042136, 0.00203868, 0.02161333, 0.01097234, 0.01809523]), 'mean_score_time': array([0.00261359, 0.00099487, 0.00079956, 0.        , 0.00155759]), 'std_score_time': array([0.00044561, 0.00088918, 0.00159912, 0.        , 0.00080239]), 'param_feat_selector__strategy': masked_array(data=['variance', 'l1', 'rf_feature_importance',
                   'rf_top_features', 'stepwise'],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'feat_selector__strategy': 'variance'}, {'feat_selector__strategy': 'l1'}, {'feat_selector__strategy': 'rf_feature_importance'}, {'feat_selector__strategy': 'r

## Random tests

In [4]:
import numpy as np
# Draw 11 integers from the half-open range [0, 1000) using NumPy's global
# (unseeded) random state, then display them.
rand_list = np.random.randint(0, 1000, size=11)
rand_list

array([381, 392, 602, 289, 860, 648, 606,   1, 510, 166, 336])