# Assignment

Write a method called wrpr_exhaustive_regression which takes three arguments:
 1. A minimum number of features
 2. A maximum number of features
 3. Scoring methodology whose default is ‘mean_absolute_error’
 
The method implements exhaustive feature selection for regression using a random forest model and prints the best features.

## Data Importation

In [2]:
# Colab-only step: prompt for the pickled training files and upload them into
# the runtime (the saved file names are listed in this cell's output).
from google.colab import files
uploaded = files.upload()

Saving x_train.pkl to x_train.pkl
Saving y_train.pkl to y_train.pkl
Saving y_train_time.pkl to y_train_time.pkl


In [0]:
import pandas as pd
# Load the pickled training inputs and the two target objects uploaded earlier.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
X_train = pd.read_pickle("x_train.pkl")
y_train = pd.read_pickle("y_train.pkl")
y_train_time = pd.read_pickle("y_train_time.pkl")

# Candidate columns ("feature set six") handed to the exhaustive selector.
feature_set_six = [
    'v_Vel',
    'lateral_current_lane',
    'longit_pos_vehicle1',
    'longit_pos_vehicle2',
    'longit_pos_vehicle3',
    'lat_pos_vehicle1',
    'lat_pos_vehicle2',
    'iTTC_ref3',
    'v_Vel_follow1',
    'v_Vel_preced2',
    'longit_pos_preced1',
    'longit_pos_follow1',
    'longit_pos_preced2',
    'longit_pos_follow2',
    'iTTC_preced1',
    'iTTC_follow1',
    'iTTC_preced2',
    'iTTC_follow2',
]

# Restrict the training frame to the candidate columns only.
X_train_fs = X_train[feature_set_six]


In [4]:
from mlxtend.feature_selection import ExhaustiveFeatureSelector
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer,mean_absolute_error, r2_score, max_error, mean_squared_error,median_absolute_error
from sklearn.metrics import SCORERS
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_predict
import numpy as np



In [0]:
def wrpr_exhaustive_regression(features,target,min,max,scorer='neg_mean_absolute_error'):
  """Run exhaustive feature selection with a random forest regressor and print a report.

  Every feature subset whose size lies between ``min`` and ``max`` is scored
  with a 2-fold cross-validated ``RandomForestRegressor``. The best subset is
  printed, then a fresh random forest regressor is cross-validated (10 folds)
  on that subset and its averaged metrics are printed.

  Parameters
  ----------
  features : pandas DataFrame
      Samples to select features from; it is indexed by column name.

  target : pandas DataFrame or array-like
      Regression target values; flattened to 1-D before fitting.

  min, max : int
      Smallest and largest subset size to evaluate. These names shadow the
      Python builtins inside this function and are kept for compatibility
      with existing callers.

  scorer : str or callable
      Scoring passed to the exhaustive selector to rank the subsets.

  Returns
  -------
  None
      Results are printed only: the best feature names, the data restricted
      to those features, and the mean MAE, MSE, median absolute error and
      R2 score over the 10 folds.

  Notes
  -----
  The final report is computed on the same data that was used to pick the
  subset, so it is likely optimistic.
  """
  # Flatten the target once; the selector and the final cross-validation both need it.
  target_values = np.ravel(target, order='C')

  # Exhaustive selector wrapping a random forest regressor over the min-max size range.
  feature_selector = ExhaustiveFeatureSelector(RandomForestRegressor(n_jobs=-1),
           min_features=min,
           max_features=max,
           scoring=scorer,
           print_progress=True,
           cv=2)
  feature_selector = feature_selector.fit(features, target_values)

  # Report the winning subset and the data restricted to it.
  best_feature_names = feature_selector.best_feature_names_
  print('Best subset (corresponding names):', best_feature_names)
  best_features = features[list(best_feature_names)]
  print(best_features)

  # Metrics collected for the final report.
  scoring_regressor = {
        'mean_ae': make_scorer(mean_absolute_error),
        'mean_se': make_scorer(mean_squared_error),
        'median_ae': make_scorer(median_absolute_error),
        'r2_score': make_scorer(r2_score)
    }

  # Cross-validate a fresh random forest regressor on the selected features.
  # (The previous extra cross_val_predict pass was dropped: its predictions
  # were never used, so it only repeated ten model fits.)
  rfr = RandomForestRegressor(n_jobs=-1)
  rfr_cv_score = cross_validate(rfr, best_features, target_values, cv=10, scoring=scoring_regressor)

  # Print the parameters followed by the fold-averaged metrics.
  print('\n')
  print(f"\t\tMin:{min}")
  print(f"\t\tMax:{max}")
  print("=== Regression Report ===")
  print(f"\t\tMAE: {np.mean(rfr_cv_score['test_mean_ae'])}")
  print(f"\t\tMSE: {np.mean(rfr_cv_score['test_mean_se'])}")
  print(f"\t\tMedian Absolute Error: {np.mean(rfr_cv_score['test_median_ae'])}")
  print(f"\t\tR2_Score: {np.mean(rfr_cv_score['test_r2_score'])}")






In [6]:
# Evaluate every 5-feature subset of the candidate columns against the time target.
wrpr_exhaustive_regression(
    X_train_fs,
    y_train_time,
    min=5,
    max=5,
    scorer='neg_mean_absolute_error',
)

Features: 8568/8568

Best subset (corresponding names): ('lateral_current_lane', 'iTTC_ref3', 'longit_pos_preced1', 'iTTC_preced1', 'iTTC_preced2')
      lateral_current_lane  iTTC_ref3  ...  iTTC_preced1  iTTC_preced2
0                   -6.449  -0.935648  ...      0.000000     -0.679863
1                   -6.457  -0.942897  ...      0.000000     -0.685754
2                   -6.464  -0.927333  ...      0.000000     -0.690381
3                   -6.466  -0.856413  ...      0.000000     -0.675831
4                   -6.527  -0.719803  ...      0.000000     -0.619519
...                    ...        ...  ...           ...           ...
8575                -0.108  -0.493703  ...     -0.307886      0.000000
8576                -0.116  -0.396076  ...     -0.304984      0.000000
8577                -0.120  -0.431083  ...     -0.323830      0.000000
8578                -0.107  -0.595273  ...     -0.379599      0.000000
8579                -0.127  -0.724354  ...     -0.432520      0.000000

[8580 rows x 5 colum