# Assignment

Write a method called wrpr_regression which takes three arguments:

1. A range of features (e.g. 4-8)
2. A boolean option selecting a forward or backward step
3. A scoring methodology whose default is ‘mean_absolute_error’.
 
This method should implement the sequential feature selection approach for regression (hint: the mlxtend library) using a random forest model. It is expected to find the best subset of features within the given range that optimizes the provided scoring methodology, using either a forward or a backward step, and finally it should print the metric dictionary.

http://rasbt.github.io/mlxtend/user_guide/feature_selection/SequentialFeatureSelector/#example-1-a-simple-sequential-forward-selection-example

http://rasbt.github.io/mlxtend/user_guide/feature_selection/SequentialFeatureSelector/#example-9-selecting-the-best-feature-combination-in-a-k-range

https://github.com/rasbt/mlxtend/blob/master/mlxtend/feature_selection/sequential_feature_selector.py

 ## Data Importation
 

In [0]:
import pandas as pd

# Load the three pickled training objects, in the same order as they are named.
# NOTE(review): y_train is loaded but only y_train_time is used in the visible cells.
X_train, y_train, y_train_time = (
    pd.read_pickle(pickle_path)
    for pickle_path in ("../x_train.pkl", "../y_train.pkl", "../y_train_time.pkl")
)

# Feature set selected by the filter techniques
feature_set_six = [
    'v_Vel',
    'lateral_current_lane',
    'longit_pos_vehicle1',
    'longit_pos_vehicle2',
    'longit_pos_vehicle3',
    'lat_pos_vehicle1',
    'lat_pos_vehicle2',
    'iTTC_ref3',
    'v_Vel_follow1',
    'v_Vel_preced2',
    'longit_pos_preced1',
    'longit_pos_follow1',
    'longit_pos_preced2',
    'longit_pos_follow2',
    'iTTC_preced1',
    'iTTC_follow1',
    'iTTC_preced2',
    'iTTC_follow2',
]

# Keep only the selected columns of the training data
X_train_fs = X_train[feature_set_six]

In [0]:
def get_best_feature_dict_metrics(sfs):
    """Return the metric entry that belongs to the selector's chosen feature subset.

      Parameters
      ----------
      sfs : SequentialFeatureSelector object
          A selector exposing ``get_metric_dict()`` and ``k_feature_names_``
          (i.e. one that has already been fitted).

      Returns
      -------
      dict or None
          The first entry of ``sfs.get_metric_dict()`` whose 'feature_names'
          equal ``sfs.k_feature_names_``; None when no entry matches.
    """
    all_entries = sfs.get_metric_dict().values()

    # First match wins; fall back to None exactly like an exhausted search would
    return next(
        (entry for entry in all_entries
         if entry['feature_names'] == sfs.k_feature_names_),
        None,
    )

In [0]:
def pp_metric_dict(dic):
    """Print every entry of a feature set's metric dictionary, one per line.

      Parameters
      ----------
      dic : dict
          The metric dictionary of a single feature set, as extracted from
          the SFS metric dict. Each entry is printed as ``key: value``.
    """
    # Walk the keys in insertion order and look each value up on the way
    for metric_name in dic:
        print(f'{metric_name}: {dic[metric_name]}')

In [0]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, mean_absolute_error, mean_squared_error, median_absolute_error, r2_score
import numpy as np

def pp_SFS_results(step_forward, results):
    """Print a header and the four regression metrics of one feature selection run.

      Parameters
      ----------
      step_forward : bool
          Direction of the selection that produced ``results``.
          True prints an "SFFS" header, False an "SBFS" header.

      results : dict
          Metrics of the run. The keys 'range', 'mean_ae', 'mean_se',
          'median_ae' and 'r2_score' are read.
    """
    # The header differs only by the direction tag
    direction_tag = "SFFS" if step_forward else "SBFS"
    print(f"==================== {direction_tag}, range: {results['range']} ====================")

    # (printed caption, key in the results dict), in output order
    metric_rows = (
        ("Mean Absolute Error", 'mean_ae'),
        ("Mean Squared Error", 'mean_se'),
        ("Median Absolute Error", 'median_ae'),
        ("R2 Score", 'r2_score'),
    )
    for caption, metric_key in metric_rows:
        print(f"{caption}: {results[metric_key]}")

def wrpr_regression(X, y, feature_range, step_forward, scoring=None):
    """Run a sequential feature selection with a random forest regressor and report its metrics.

        The selection is performed forwards or backwards (step_forward) inside the
        given feature range. The selected feature set is then cross-validated
        (10 folds) with a fresh random forest regressor and the averaged metrics
        are printed together with the selector's own metric entry for that set.

      Parameters
      ----------
      X : pandas DataFrame, shape = [n_samples, n_features]
          Data on which the selection is based. It is indexed by column name
          with the selected feature names, so a DataFrame is required.

      y : pandas Series or DataFrame, shape = [n_samples]
          Target values. ``y.values.ravel()`` is used, so a pandas object is required.

      feature_range : tuple
          Range in which to perform the sequential feature selection,
          shaped (min, max). Passed to the selector as ``k_features``.

      step_forward : bool
          Whether the sequential feature selection runs forwards or backwards
          (True: Forward, False: Backward).

      scoring : str or callable, optional
          Scoring methodology the selector optimizes, forwarded unchanged to
          SequentialFeatureSelector. The default None keeps the selector's own
          default. Pass e.g. a scikit-learn scorer name such as
          'neg_mean_absolute_error' to select on the mean absolute error.

      Returns
      -------
      dict
          The cross-validated metrics of the selected feature set ('mean_ae',
          'mean_se', 'median_ae', 'r2_score'), plus 'range' and 'feature_names'.
    """

    # Regression score metrics used for the final cross-validation report
    scoring_regressor = {
        'mean_ae': make_scorer(mean_absolute_error),
        'mean_se': make_scorer(mean_squared_error),
        'median_ae': make_scorer(median_absolute_error),
        'r2_score': make_scorer(r2_score),
    }

    # Flatten the target once; both the selector and the cross-validation use it
    y_flat = y.values.ravel()

    # Sequential Feature Selection object creation + fitting
    sfs = SFS(
        RandomForestRegressor(n_jobs=-1),
        k_features=feature_range,
        forward=step_forward,
        scoring=scoring,
        n_jobs=-1,
    )
    sfs = sfs.fit(X, y_flat)

    selected_features = list(sfs.k_feature_names_)

    # Cross validate a fresh (unfitted) regressor on only the selected feature set
    cv_results = cross_validate(
        RandomForestRegressor(n_jobs=-1),
        X[selected_features],
        y_flat,
        cv=10,
        scoring=scoring_regressor,
        n_jobs=-1,
    )

    # Results storage: each metric is averaged over the folds
    results = {
        'mean_ae': np.mean(cv_results['test_mean_ae']),
        'mean_se': np.mean(cv_results['test_mean_se']),
        'median_ae': np.mean(cv_results['test_median_ae']),
        'r2_score': np.mean(cv_results['test_r2_score']),
        'range': feature_range,
        'feature_names': selected_features,
    }

    # Current Sequential Feature Selection results printing
    pp_SFS_results(step_forward, results)

    # Selector's own metric entry for the chosen feature set; the lookup
    # returns None when nothing matches, in which case there is nothing to print
    dict_metrics = get_best_feature_dict_metrics(sfs)
    if dict_metrics is not None:
        pp_metric_dict(dict_metrics)

    print('\n')

    return results

In [0]:
def is_new_best(current_best, results):
    """Tell whether ``results`` beat the current best feature set metrics.

        The three error metrics (lower is better) and the R2 score (higher is
        better) are each turned into a "how much worse is the new result"
        difference; the new result wins when the plain sum of those four
        differences is negative.

      Parameters
      ----------
      current_best : dict
          Dictionary containing the current best feature set metrics. A
          dictionary whose 'feature_names' entry is empty is treated as a
          "nothing selected yet" placeholder and is always replaced.

      results : dict
          Dictionary containing the new metrics to compare to the current best ones.

      Returns
      -------
      bool
          True: new metrics are better, False: current best is at least as good.
    """

    # An empty feature list marks the initial placeholder, not a real result:
    # its all-zero errors must not be compared against real metrics.
    placeholder_names = current_best.get('feature_names')
    if placeholder_names is not None and len(placeholder_names) == 0:
        return True

    # Each difference is positive when the new result is WORSE on that metric
    mean_ae_difference = results['mean_ae'] - current_best['mean_ae']
    mean_se_difference = results['mean_se'] - current_best['mean_se']
    median_ae_difference = results['median_ae'] - current_best['median_ae']
    r2_score_difference = current_best['r2_score'] - results['r2_score']

    total_difference = (
        mean_ae_difference
        + mean_se_difference
        + median_ae_difference
        + r2_score_difference
    )

    # Negative total: the new result is better overall
    return total_difference < 0

In [0]:
from random import seed
from random import randint

def range_creation(num_ranges, num_features, min_gap=5):
    """Create random (min, max) feature ranges for a given number of features.

        Every range satisfies ``1 <= min < max <= num_features`` and
        ``max - min > min_gap``.

      Parameters
      ----------
      num_ranges : int
          Number of ranges you wish to generate

      num_features : int
          Number of features present in your data set

      min_gap : int, optional
          The difference between max and min of a range must be strictly
          greater than this value (default 5).

      Returns
      -------
      list
          a list of (min, max) tuples containing the ranges.

      Raises
      ------
      ValueError
          If at least one range is requested but ``num_features`` is too small
          for any range to satisfy ``max - min > min_gap``.
    """

    # List which will contain the different generated ranges
    ranges = []

    if num_ranges <= 0:
        return ranges

    # The widest possible range is (1, num_features); if even that one is too
    # narrow, the rejection loop below could never terminate.
    if num_features - 1 <= min_gap:
        raise ValueError(
            f"num_features={num_features} is too small to build a range "
            f"whose bounds differ by more than {min_gap}"
        )

    for _ in range(num_ranges):
        # Rejection sampling: redraw until the range is wide enough
        while True:
            # First generate the upper bound
            upper = randint(1, num_features)

            # Generate the lower bound between 1 and the upper bound
            lower = randint(1, upper)

            if upper - lower > min_gap:
                break

        ranges.append((lower, upper))

    return ranges


def pp_best_fs_SFS(best_results):
    """Print a banner and the stored metrics of the best 'SFFS' and 'SBFS' feature sets.

      Parameters
      ----------
      best_results : dict
          Dictionary with the keys 'SFFS' and 'SBFS'. Each value is a dict
          from which 'range', 'feature_names', 'mean_ae', 'mean_se',
          'median_ae' and 'r2_score' are read.
    """
    frame_line = "============================================================================"

    # Forward results are reported first, then backward results
    for selector_tag in ('SFFS', 'SBFS'):
        print(frame_line)
        print(f'==================== Best {selector_tag} Feature Set ====================')
        print(frame_line)

        entry = best_results[selector_tag]
        print(f"Range: {entry['range']}")
        print(f"Feature names: {entry['feature_names']}\n")

        print(f"Mean Absolute Error: {entry['mean_ae']}")
        print(f"Mean Squared Error: {entry['mean_se']}")
        print(f"Median Absolute Error: {entry['median_ae']}")
        print(f"R2 Score: {entry['r2_score']}\n\n")



def find_best_fs_SFS_regressor(X_train, y_train, num_ranges):
    """Search randomly generated feature ranges for the best forward and backward feature sets.

        For every generated range, wrpr_regression is run once forwards and once
        backwards (it prints the results of each run). Each result replaces the
        stored best of its direction whenever is_new_best says so. The stored
        best feature sets are printed at the end.

      Parameters
      ----------
      X_train : pandas Dataframe, shape = [n_samples, n_features]
          Data on which the feature selection is based. Its ``columns``
          attribute is used to count the available features.

      y_train : {array-like, pandas Series}, shape = [n_samples]
          Target values, handed on to wrpr_regression.

      num_ranges : int
          Number of ranges to generate for the search.

      Returns
      -------
      dict
          A dictionary with the keys 'SFFS' (forward) and 'SBFS' (backward),
          each holding the stored feature set with its metrics and range.
    """

    def _empty_entry():
        # A fresh placeholder per direction so the two never share a list
        return {
            'mean_ae': 0,
            'mean_se': 0,
            'median_ae': 0,
            'r2_score': 0,
            'range': (0, 0),
            'feature_names': [],
        }

    # Best results dictionary initialisation
    best_results = {tag: _empty_entry() for tag in ('SFFS', 'SBFS')}

    separator = "============================================================="

    # One forward and one backward selection per generated range
    for curr_range in range_creation(num_ranges, len(X_train.columns)):
        print(separator)
        print(f"Range: {curr_range}")
        print(separator)

        forward_results = wrpr_regression(X_train, y_train, curr_range, True)
        backward_results = wrpr_regression(X_train, y_train, curr_range, False)

        # Compare each direction with its stored best and replace if needed
        candidates = (('SFFS', forward_results), ('SBFS', backward_results))
        for tag, candidate in candidates:
            if is_new_best(best_results[tag], candidate):
                best_results[tag] = candidate

    # Pretty print the stored best feature sets
    pp_best_fs_SFS(best_results)

    return best_results

In [0]:
# Run the search on the filtered training features with 10 generated ranges
best_fs = find_best_fs_SFS_regressor(
    X_train=X_train_fs,
    y_train=y_train_time,
    num_ranges=10,
)

Range: (1, 8)
Mean Absolute Error: 0.2672439393939394
Mean Squared Error: 0.27345536142191146
Median Absolute Error: 0.14165000000000003
R2 Score: -0.2394186559441301
feature_idx: (0, 1, 3, 5, 7, 10, 13)
cv_scores: [-3.40978297  0.81507702  0.90124975  0.93114421  0.88643373]
avg_score: 0.024824347329343244
feature_names: ('v_Vel', 'lateral_current_lane', 'longit_pos_vehicle2', 'lat_pos_vehicle1', 'iTTC_ref3', 'longit_pos_preced1', 'longit_pos_follow2')
ci_bound: 2.2077792414474002
std_dev: 1.7177272553632275
std_err: 0.8588636276816137


Mean Absolute Error: 0.2553207459207459
Mean Squared Error: 0.2504296328671328
Median Absolute Error: 0.1315
R2 Score: -0.23882867929999282
feature_idx: (0, 1, 3, 5, 7, 13, 14, 16)
cv_scores: [-3.51558135  0.8175857   0.92972648  0.93569426  0.89755193]
avg_score: 0.012995403608879652
feature_names: ('v_Vel', 'lateral_current_lane', 'longit_pos_vehicle2', 'lat_pos_vehicle1', 'iTTC_ref3', 'longit_pos_follow2', 'iTTC_preced1', 'iTTC_preced2')
ci_bound: 