# Feature Selection

In [None]:
# Standard library
from pprint import pprint

# Third-party
import numpy as np
import pandas as pd
import plotly

# Never truncate the column list when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

# Initialise plotly's offline notebook mode so figures can be shown inline.
plotly.offline.init_notebook_mode()

# Load the Boston housing data into `data` (keys: 'data', 'target',
# 'feature_names') and assemble one DataFrame `df` with the target as 'MEDV'.
try:
    # `load_boston` only exists in scikit-learn < 1.2; importing it from a
    # newer release raises ImportError.
    from sklearn.datasets import load_boston
    data = load_boston()
except ImportError:
    # Fallback taken from scikit-learn's own deprecation notice: read the raw
    # StatLib file (requires network access). Every record is spread over two
    # physical lines, hence the even/odd row slicing below.
    raw_df = pd.read_csv("http://lib.stat.cmu.edu/datasets/boston",
                         sep=r"\s+", skiprows=22, header=None)
    data = {
        'data': np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]]),
        'target': raw_df.values[1::2, 2],
        'feature_names': np.array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
                                   'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']),
    }

# Features first, target ('MEDV') as the last column.
df = pd.concat([pd.DataFrame(data['data'], columns=data['feature_names']),
                pd.DataFrame(data['target'], columns=['MEDV'])], axis=1)

In [None]:
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_regression
import plotly.express as px
# Target column is 'MEDV'; every other column of `df` is a predictor.
y = df['MEDV']
X = df.drop(columns=['MEDV'])

# Hold out 30% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)

In [None]:
###
# Code adapted from https://machinelearningmastery.com/feature-selection-for-regression-data/
###
from functools import partial

# mutual_info_regression adds small random noise to continuous variables, so
# its scores vary between runs unless the seed is pinned.
fs = SelectKBest(score_func=partial(mutual_info_regression, random_state=42), k='all')
# learn relationship from training data only (the test split stays unseen)
fs.fit(X_train, y_train)
# transform train input data (with k='all' every column is kept)
X_train_fs = fs.transform(X_train)
# transform test input data
X_test_fs = fs.transform(X_test)

# Pair each score with the column it was computed for. The names come from
# X_train (the frame the selector was fitted on), not from df, which also
# contains the target column.
_dict_features = {'Features': list(X_train.columns), 'Score': list(fs.scores_)}

_df = pd.DataFrame(_dict_features)

# plot the scores
fig = px.bar(_df, x='Features', y='Score', title="Visualization of the dependence of the target variable on features")
fig.update_layout(height=500, width=950)
fig.show()

# Getting Best Model

In [None]:
#Preprocessors
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
#Algorithms
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
#Metrics
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
#Additional
from sklearn.pipeline import make_pipeline
from joblib import Parallel, delayed

In [None]:
#Function for cross-validating a single pipeline (MAE and R2)
def evaluating_scores(pipeline, X, y, scoring_mae='neg_mean_absolute_error', scoring_r2='r2', cv=5):
    """Cross-validate one preprocessor + regressor pipeline.

    Parameters
    ----------
    pipeline : sklearn Pipeline
        First step is reported as the preprocessor, last step as the regressor.
    X, y : features and target, passed straight to ``cross_validate``.
    scoring_mae : str or callable
        Scorer for the error metric. The mean fold score is negated, so this
        is expected to be a ``neg_*`` scorer.
    scoring_r2 : str or callable
        Scorer for the goodness-of-fit metric. Its mean fold score is returned
        unchanged.
    cv : cross-validation strategy forwarded to ``cross_validate``.

    Returns
    -------
    dict
        ``preprocessor`` and ``regressor`` (the estimator objects themselves),
        ``score_mae`` and ``score_r2`` (means over the folds). The dict is
        meant to become one row of a DataFrame.
    """
    #Get the estimator objects from the first and the final pipeline step
    preprocessor = pipeline.steps[0][1]
    regressor = pipeline.steps[-1][1]
    #A single cross-validation pass produces both metrics
    cv_results = cross_validate(pipeline, X, y,
                                scoring={'mae': scoring_mae, 'r2': scoring_r2}, cv=cv)
    #The MAE scorer is negated by sklearn; flip the sign back
    score_mae = -cv_results['test_mae'].mean()
    score_r2 = cv_results['test_r2'].mean()
    #Returning a dict to transform in DataFrame
    return dict(preprocessor=preprocessor, regressor=regressor,
                score_mae=score_mae, score_r2=score_r2)


def find_optimum(X, y):
    """Score every scaler/regressor combination and rank them by MAE.

    Builds one two-step pipeline for each pairing of the three scalers with
    the four regressors, evaluates each with ``evaluating_scores`` in
    parallel, and returns the rows as a DataFrame sorted by ``score_mae``
    (ascending), re-indexed from 0 and rounded to 3 decimals.
    """
    #Candidate preprocessors
    scalers = (MinMaxScaler(), StandardScaler(), RobustScaler())
    #Candidate algorithms (seeded where the estimator is randomised)
    models = (RandomForestRegressor(random_state=42),
              KNeighborsRegressor(),
              LinearRegression(),
              XGBRegressor(random_state=42))

    #Exhaustive grid: every scaler paired with every model
    candidates = [make_pipeline(scaler, model)
                  for scaler in scalers
                  for model in models]

    #One job per pipeline, spread over all available cores
    jobs = (delayed(evaluating_scores)(candidate, X, y) for candidate in candidates)
    scored = Parallel(n_jobs=-1)(jobs)

    #Lowest MAE first, fresh 0..n-1 index
    ranking = pd.DataFrame(scored).sort_values('score_mae').reset_index(drop=True)
    return ranking.round(3)

In [None]:
# Rank all scaler/regressor pipelines by cross-validated MAE on the full X, y.
results = find_optimum(X, y)
results