# Correlated Features Removal

## Data Importation

In [0]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier

X_train = pd.read_pickle("../x_train.pkl")
y_train = pd.read_pickle("../y_train.pkl")
y_train_time = pd.read_pickle("../y_train_time.pkl")

In [0]:
def pp_CorrGroup(index, corrGroup, featureImportances):
  """Pretty print correlated groups with there feature importances.

    Parameters
    ----------
    index : int
        The correlated group index.
    corrGroup : array
        Array with the correlated features from the correlated group.
    featureImportances : array
        Array with the correlated features' feature importance from the correlated group.
    """
  num_features = len(corrGroup)

  print("Group%2d:%2d features" %(index+1, num_features))
  
  # Prints the features and it's feature importance side by side
  for i, feature in enumerate(corrGroup):
    print(f"\t {feature} \t {featureImportances[i]}")

In [0]:
def correlated_groups_creation(data, threshold):
  """Creates and returns correlated groups of features.

      Parameters
      ----------
      data : {array-like, sparse matrix, pandas Dataframe}, shape = [n_samples, n_features]
          Data from which the correlated groups will be created, where 
          n_sample is the number of samples and n_features is the number of features.
          
      threshold : float
          Correlation threshold used to create the correlation groups. 
          All features having a correlation coefficient bigger or equal to the threshold
          will be put together inside a group.
      
      Returns
      -------
      list
          a list of lists containing the different correlation groups.
    """


  print(f'==================== Threshold: {threshold} ====================')

  # Correlation Matrix Creation
  corrMatrix = data.corr()

  # Only keep the lower Triangle of the Matrix
  corrMatrix.loc[:,:] = np.tril(corrMatrix, k=-1)

  already_in = set()
  result = []

  for col in corrMatrix:
    # If correlation coefficiant bigger or equal to threshold then add to result array
    correlated = corrMatrix[col][corrMatrix[col] >= threshold].index.tolist()
    if correlated and col not in already_in:
      already_in.update(set(correlated))
      correlated.append(col)
      result.append(correlated)

  return result
  
def correlated_groups_most_important_feature_extraction(data, correlatedGroups, labels, model_type='classifier'):
  """Extracts the most important features from each correlated groups using a Random Forest model of your choice and returns the new feature set list

      Parameters
      ----------
      data : {array-like, sparse matrix, pandas Dataframe}, shape = [n_samples, n_features]
          Training Data from which the correlated groups were created, where 
          n_sample is the number of samples and n_features is the number of features.
          Used to calculate the feature importances.

      correlatedGroups : array-like
          Array of Array containing the different correlation groups
          
      labels : {array-like, pandas Series}, shape = [n_samples]
          Target values.
      
      model_type : str (default: 'classifier')
          The model type you wish to extract the feature importance from.
          {'classifier', 'regressor'}
      
      Returns
      -------
      list
          a list of the remaining features
    """

  # Selected with most important features
  mostImportantFeatures = []


  for index, group in enumerate(correlatedGroups):
    # Retrieve the group's feature from the main dataset
    currFeaturesGroup = data[group]

    if model_type == 'classifier':
      # Create RandomForestClassifier and fit with group's features and labels
      model = RandomForestClassifier()
      model.fit(currFeaturesGroup, labels.values.ravel())
    elif model_type == 'regressor':
      # Create RandomForestRegressor and fit with group's features and labels
      model = RandomForestRegressor()
      model.fit(currFeaturesGroup, labels.values.ravel())
    else:
      raise Exception(f"Wrong model type. {model_type} is not accepted. Please choose between 'classifier' and 'regressor'") 

    # Extract Feature importances
    importance = model.feature_importances_
    pp_CorrGroup(index, group, importance)

    # Retrieve Group's Most important feature
    mostImportantIndex = np.argmax(importance)

    # Add to most important features array
    mostImportantFeatures.append(group[mostImportantIndex])

  # Flatten groups of correlated features
  allCorellatedFeatures = [feature for group in correlatedGroups for feature in group]
  allCorellatedFeatures = list(set(allCorellatedFeatures))

  # Remove the selected most important features from the corellated features
  for feature in mostImportantFeatures:
    allCorellatedFeatures.remove(feature)

  print(f'Removed Features: {len(allCorellatedFeatures)}\n{allCorellatedFeatures}')

  # Remove unwanted features
  data_fs = data.drop(allCorellatedFeatures, axis=1)

  print(f'Remaining Features: {len(data_fs.columns)} \n{list(data_fs.columns)}\n')

  return list(data_fs.columns)

def rmv_correlated_features(data, labels, threshold):
  # Retrieve the correlated groups
  result = correlated_groups_creation(data, threshold)

  print(f"Number of Correlation Groups: {len(result)}")

  # Returns the remaining features after most important features extraction from groups 
  return correlated_groups_most_important_feature_extraction(data, result, labels)

In [0]:
# Define the different thresholds which are going to be used
thresholds = [0.25, 0.5, 0.75, 0.85, 0.9, 0.95]

# Defining the already selected features from the previous methods
# Feature Set 1: Original Feature Set
# Feature Set 2: Information Gain Classification, k = 5
# Feature Set 3: Information Gain Classification, k = 10
# Feature Set 4: Information Gain Regression, k = 5
# Feature Set 5: Information Gain Regression, k = 10
feature_sets = [
    ['v_Vel', 'lateral_current_lane', 'v_Vel_Ref1', 'v_Vel_Ref2', 'v_Vel_Ref3', 'longit_pos_vehicle1', 'longit_pos_vehicle2', 'longit_pos_vehicle3', 'lat_pos_vehicle1', 'lat_pos_vehicle2', 'iTTC_ref3', 'v_Vel_preced1', 'v_Vel_follow1', 'v_Vel_preced2', 'v_Vel_follow2', 'longit_pos_preced1', 'longit_pos_follow1', 'longit_pos_preced2', 'longit_pos_follow2', 'iTTC_preced1', 'iTTC_follow1', 'iTTC_preced2', 'iTTC_follow2'],
    ['lateral_current_lane', 'lat_pos_vehicle1', 'lat_pos_vehicle2', 'longit_pos_vehicle1', 'longit_pos_preced2'],
    ['lateral_current_lane', 'lat_pos_vehicle1', 'lat_pos_vehicle2', 'longit_pos_vehicle1', 'longit_pos_preced2', 'longit_pos_follow2', 'longit_pos_vehicle2', 'longit_pos_preced1', 'v_Vel', 'longit_pos_follow1'],
    ['lateral_current_lane', 'lat_pos_vehicle2', 'lat_pos_vehicle1', 'iTTC_ref3', 'longit_pos_vehicle3'],
    ['lateral_current_lane', 'lat_pos_vehicle2', 'lat_pos_vehicle1', 'iTTC_ref3', 'longit_pos_vehicle3', 'v_Vel', 'longit_pos_vehicle2', 'longit_pos_vehicle1', 'v_Vel_Ref2', 'v_Vel_Ref1']
]

# For each threshold calculate the corresponding feature set and add to the feature sets
for threshold in thresholds:
    feature_sets.append(rmv_correlated_features(X_train, y_train, threshold))

Number of Correlation Groups: 4
Group 1: 9 features
	 v_Vel_Ref1 	 0.0860771134650176
	 v_Vel_Ref2 	 0.0997076110564077
	 v_Vel_Ref3 	 0.11657333082929375
	 v_Vel_preced1 	 0.09913295603466174
	 v_Vel_follow1 	 0.10513018199623463
	 v_Vel_preced2 	 0.10063609310308445
	 v_Vel_follow2 	 0.09686390344175841
	 longit_pos_follow1 	 0.18824738058438584
	 v_Vel 	 0.10763142948915587
Group 2: 3 features
	 longit_pos_follow1 	 0.09196690548660236
	 longit_pos_preced2 	 0.10637037979824555
	 lateral_current_lane 	 0.801662714715152
Group 3: 3 features
	 longit_pos_preced1 	 0.3296806041989478
	 longit_pos_preced2 	 0.3593727953236461
	 longit_pos_vehicle3 	 0.3109466004774062
Group 4: 3 features
	 lat_pos_vehicle2 	 0.4036279896491245
	 longit_pos_follow2 	 0.19446890377155696
	 lat_pos_vehicle1 	 0.4019031065793186
Removed Features: 12
['lat_pos_vehicle1', 'longit_pos_preced1', 'longit_pos_vehicle3', 'v_Vel_Ref3', 'v_Vel_Ref1', 'v_Vel', 'v_Vel_Ref2', 'v_Vel_follow1', 'v_Vel_preced2', 'v_Vel_fo

In [0]:
def pp_Results(index, features, regression_results, classification_results):
    """Pretty Prints the classification & regression metrics results for a given feature set.

        Parameters
        ----------
        index : int
            The feature set's index

        features : array-like
            Array containing the different feature names
            
        regression_results : array
            Array containing the regression metrics' results.
            Here are the metrics order to respect [mean absolute error, R2 score, max error].

        classification_results : array
            Array containing the classification metrics' results.
            Here are the metrics order to respect [accuracy, precision, recall].
    """

    print('----------------------------------------')    
    print(f'Feature Set: {index + 1}')
    print(f'Features: {features}')
    print('----------------------------------------')

    print('\tRegression:')
    print(f"\t\tMean Absolute Error: {regression_results[0]}")
    print(f"\t\tR2 Score: {regression_results[1]}")
    print(f"\t\tMax Error: {regression_results[2]}\n")

    print('\tClassification:')
    print(f"\t\tAccuracy: {classification_results[0]}")
    print(f"\t\tPrecision: {classification_results[1]}")
    print(f"\t\tRecall: {classification_results[2]}\n")

In [0]:
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, mean_absolute_error, r2_score, max_error
import numpy as np


def metrics_classification_regression_calculation(feature_sets):
    """Calculates classification (Accuracy, Precision, Recall) and regression (Mean Absolute Error, R2 Score, Max Error) metrics for the given feature sets.
        Also pretty prints the results.

        Parameters
        ----------
        feature_sets : array
            Array containing all the feature sets 
        
        Returns
        -------
        tuple
            a tuple containing the lists of the different classification and regression metrics.
            Here is the order inside the tuple: (accuracy, precision, recall, mean absolute error, r2 score, max error).
    """

    
    # Classification Metrics Array
    accuracy_results = []
    precision_results = []
    recall_results = []

    # Regression Metrics Array
    mae_results = []
    r2_results = []
    max_error_results = []

    # Iterate over each individual set of features
    for i, features in enumerate(feature_sets):
        # Create Regression and Classification models
        regressor = RandomForestRegressor()
        classifier = RandomForestClassifier()

        # Classification Score Metrics Declaration
        scoring_classifier = {
            'accuracy': make_scorer(accuracy_score),
            'precision': make_scorer(precision_score, average='macro'),
            'recall': make_scorer(recall_score, average='macro') 
        }

        # Regression Score Metrics Declaration
        scoring_regression = {
            'MAE': make_scorer(mean_absolute_error),
            'r2': make_scorer(r2_score),
            'max_error': make_scorer(max_error)
        }

        # Cross Validate + Metrics Retrieval for Regression and classification on only the current feature set
        regression_cross_validation_results = cross_validate(regressor, X_train[features], y_train_time, cv=10, scoring=scoring_regression, n_jobs=-1)
        classification_cross_validation_results = cross_validate(classifier, X_train[features], y_train, cv=10, scoring=scoring_classifier, n_jobs=-1)

        # Adding the different metrics scores to their respective arrays
        ## Classification Metrics
        accuracy_results.append(np.mean(classification_cross_validation_results['test_accuracy']))
        precision_results.append(np.mean(classification_cross_validation_results['test_precision']))
        recall_results.append(np.mean(classification_cross_validation_results['test_recall']))

        ## Regression Metrics
        mae_results.append(np.mean(regression_cross_validation_results['test_MAE']))
        r2_results.append(np.mean(regression_cross_validation_results['test_r2']))
        max_error_results.append(np.mean(regression_cross_validation_results['test_max_error']))

        # Creating Regression & Classification Results array to pretty print it 
        regression_results = [mae_results[-1], r2_results[-1], max_error_results[-1]]
        classification_results = [accuracy_results[-1], precision_results[-1], recall_results[-1]]

        # Result Printing
        pp_Results(i, features, regression_results, classification_results)

    return accuracy_results, precision_results, recall_results, mae_results, r2_results, max_error_results

In [0]:
import itertools

def pp_metrics_comparisons(accuracy, precision, recall, mae, r2, max_error):
    """Pretty Prints and performs the metrics comparison between the different sets. It compares a feature set's metric with the other feature sets' same metric.
        The difference is only printed if the current feature set's metric performs better than the one comparing it to. If a certain feature set is printed under
        another feature set's metric, then the latter performs better. At the end of each metric, the numberof feature sets the current one surpasses is printed.

        Parameters
        ----------
        accuracy : array
            Array containing all the feature sets' accuracy metric.

        precision : array
            Array containing all the feature sets' precision metric.

        recall : array
            Array containing all the feature sets' recall metric.

        mae : array
            Array containing all the feature sets' mean absolute error metric.

        r2 : array
            Array containing all the feature sets' r2 score metric.

        max_error : array
            Array containing all the feature sets' max error metric.
    """

    # Retrieving the number of feature sets
    num_sets = len(accuracy)

    # Creating array with all metrics
    metrics = [accuracy, precision, recall, mae, r2, max_error]

    # Iterate over each feature set and printing the difference with other feature sets only if current feature set is better than the one comparing to
    for i in range(num_sets):
        print('----------------------------------------')  
        print(f'Feature Set {i+1}')
        print('----------------------------------------')

        for j in range(len(metrics)):
            if j == 0:
                print(f'\t- Accuracy:')
            elif j == 1:
                print(f'\t- Precision:')
            elif j == 2:
                print(f'\t- Recall:')
            elif j == 3:
                print(f'\t- Mean Absolute Error:')
            elif j == 4:
                print(f'\t- R2 Score:')
            elif j == 5:
                print(f'\t- Max Error:')

            # Intialising the number of feature sets that the current feature set is better than
            better_than = 0

            for k in range(num_sets):    
                if i == k:
                    continue

                # Condition to take into consideration the metrics needed to minimize
                if j == 3 or j == 5:
                    comparison_result = metrics[j][k] - metrics[j][i]
                else:
                    comparison_result = metrics[j][i] - metrics[j][k]
                
                if comparison_result > 0:
                    better_than += 1
                    print(f"\t\t- Feature Set {k+1}: +{comparison_result}")
            print(f"\t\t- BETTER THAN {better_than} SETS")

In [0]:
accuracy_results, precision_results, recall_results, mae_results, r2_results, max_error_results = metrics_classification_regression_calculation(feature_sets)

----------------------------------------
Feature Set: 1
Features: ['v_Vel', 'lateral_current_lane', 'v_Vel_Ref1', 'v_Vel_Ref2', 'v_Vel_Ref3', 'longit_pos_vehicle1', 'longit_pos_vehicle2', 'longit_pos_vehicle3', 'lat_pos_vehicle1', 'lat_pos_vehicle2', 'iTTC_ref3', 'v_Vel_preced1', 'v_Vel_follow1', 'v_Vel_preced2', 'v_Vel_follow2', 'longit_pos_preced1', 'longit_pos_follow1', 'longit_pos_preced2', 'longit_pos_follow2', 'iTTC_preced1', 'iTTC_follow1', 'iTTC_preced2', 'iTTC_follow2']
----------------------------------------
	Regression:
		Mean Absolute Error: 0.2750547785547786
		R2 Score: -0.3941608301340057
		Max Error: 3.1100000000000003

	Classification:
		Accuracy: 0.9687645687645687
		Precision: 0.9694202977922787
		Recall: 0.9687645687645687

----------------------------------------
Feature Set: 2
Features: ['lateral_current_lane', 'lat_pos_vehicle1', 'lat_pos_vehicle2', 'longit_pos_vehicle1', 'longit_pos_preced2']
----------------------------------------
	Regression:
		Mean Absolute

In [0]:
pp_metrics_comparisons(accuracy_results, precision_results, recall_results, mae_results, r2_results, max_error_results)

----------------------------------------
Feature Set 1
----------------------------------------
	- Accuracy:
		- Feature Set 2: +0.0019813519813520752
		- Feature Set 3: +0.0012820512820512775
		- BETTER THAN 2 SETS
	- Precision:
		- Feature Set 2: +0.0021361352168073333
		- Feature Set 3: +0.0015615652368999866
		- BETTER THAN 2 SETS
	- Recall:
		- Feature Set 2: +0.0019813519813520752
		- Feature Set 3: +0.0012820512820512775
		- BETTER THAN 2 SETS
	- Mean Absolute Error:
		- Feature Set 2: +0.04111305361305356
		- Feature Set 3: +0.03534149184149177
		- Feature Set 4: +0.013036130536130452
		- Feature Set 9: +0.006727272727272693
		- BETTER THAN 4 SETS
	- R2 Score:
		- Feature Set 2: +0.24433203670573478
		- Feature Set 3: +0.11790712138719328
		- Feature Set 4: +0.05979493092784982
		- Feature Set 11: +0.033703243855223164
		- BETTER THAN 4 SETS
	- Max Error:
		- Feature Set 2: +0.20900000000000007
		- Feature Set 3: +0.14900000000000002
		- Feature Set 4: +0.07499999999999973
		- 

The best classification feature set according to the result above is the feature set 8, which corresponds to the correlated feature feature selection with a threshold of 0.75.

The best regression feature set according to the result above is the feature set 10, , which corresponds to the correlated feature feature selection with a threshold of 0.9.

In [0]:
# Best Overall Feature Set for Classification
best_classif_set = ['v_Vel', 'lateral_current_lane', 'longit_pos_vehicle1', 'longit_pos_vehicle2', 'longit_pos_vehicle3', 'lat_pos_vehicle2', 'iTTC_ref3', 'longit_pos_preced1', 'longit_pos_follow1', 'longit_pos_preced2', 'longit_pos_follow2', 'iTTC_preced1', 'iTTC_follow1', 'iTTC_preced2', 'iTTC_follow2']

# Best Overall Feature Set for Regression
best_regress_set = ['v_Vel', 'lateral_current_lane', 'v_Vel_Ref3', 'longit_pos_vehicle1', 'longit_pos_vehicle2', 'longit_pos_vehicle3', 'lat_pos_vehicle1', 'lat_pos_vehicle2', 'iTTC_ref3', 'v_Vel_preced1', 'v_Vel_preced2', 'longit_pos_preced1', 'longit_pos_follow1', 'longit_pos_preced2', 'longit_pos_follow2', 'iTTC_preced1', 'iTTC_follow1', 'iTTC_preced2', 'iTTC_follow2']

# Merging the two best feature sets into one + removing duplicates
new_feature_set = list(set(best_classif_set + best_regress_set))

# Adding new feature set to the feature sets
feature_sets.append(new_feature_set)

# Apply Correlated Feature Selection on the new feature set with the different threshold defined earlier and adding them to feature sets
for threshold in thresholds:
    feature_sets.append(rmv_correlated_features(X_train[new_feature_set], y_train, threshold))

Number of Correlation Groups: 4
Group 1: 3 features
	 longit_pos_follow1 	 0.10683559510676185
	 longit_pos_preced2 	 0.12337138577500353
	 lateral_current_lane 	 0.7697930191182347
Group 2: 2 features
	 longit_pos_preced1 	 0.5117675894884953
	 longit_pos_vehicle3 	 0.48823241051150473
Group 3: 4 features
	 v_Vel_preced1 	 0.2513635631388544
	 v_Vel_Ref3 	 0.24644744854591422
	 v_Vel_preced2 	 0.23811201362624052
	 v_Vel 	 0.2640769746889909
Group 4: 3 features
	 lat_pos_vehicle2 	 0.4160867773272653
	 longit_pos_follow2 	 0.19176195475057178
	 lat_pos_vehicle1 	 0.3921512679221629
Removed Features: 8
['lat_pos_vehicle1', 'longit_pos_vehicle3', 'v_Vel_Ref3', 'longit_pos_follow1', 'v_Vel_preced2', 'longit_pos_follow2', 'longit_pos_preced2', 'v_Vel_preced1']
Remaining Features: 11 
['lateral_current_lane', 'iTTC_follow1', 'longit_pos_preced1', 'v_Vel', 'iTTC_preced1', 'iTTC_preced2', 'lat_pos_vehicle2', 'longit_pos_vehicle2', 'iTTC_follow2', 'longit_pos_vehicle1', 'iTTC_ref3']

Number o

In [0]:
# Retrieving metrics results with new feature sets added + pretty printing them
accuracy_results, precision_results, recall_results, mae_results, r2_results, max_error_results = metrics_classification_regression_calculation(feature_sets)

----------------------------------------
Feature Set: 1
Features: ['v_Vel', 'lateral_current_lane', 'v_Vel_Ref1', 'v_Vel_Ref2', 'v_Vel_Ref3', 'longit_pos_vehicle1', 'longit_pos_vehicle2', 'longit_pos_vehicle3', 'lat_pos_vehicle1', 'lat_pos_vehicle2', 'iTTC_ref3', 'v_Vel_preced1', 'v_Vel_follow1', 'v_Vel_preced2', 'v_Vel_follow2', 'longit_pos_preced1', 'longit_pos_follow1', 'longit_pos_preced2', 'longit_pos_follow2', 'iTTC_preced1', 'iTTC_follow1', 'iTTC_preced2', 'iTTC_follow2']
----------------------------------------
	Regression:
		Mean Absolute Error: 0.27426340326340326
		R2 Score: -0.37188707735332044
		Max Error: 3.1559999999999997

	Classification:
		Accuracy: 0.975874125874126
		Precision: 0.9762654956482886
		Recall: 0.975874125874126

----------------------------------------
Feature Set: 2
Features: ['lateral_current_lane', 'lat_pos_vehicle1', 'lat_pos_vehicle2', 'longit_pos_vehicle1', 'longit_pos_preced2']
----------------------------------------
	Regression:
		Mean Absolute

In [0]:
# Performing new feature sets metrics' comparison
pp_metrics_comparisons(accuracy_results, precision_results, recall_results, mae_results, r2_results, max_error_results)

----------------------------------------
Feature Set 1
----------------------------------------
	- Accuracy:
		- Feature Set 2: +0.010139860139860235
		- Feature Set 3: +0.009790209790210058
		- Feature Set 4: +0.0005827505827508128
		- Feature Set 5: +0.004428904428904645
		- Feature Set 6: +0.0006993006993009088
		- Feature Set 7: +0.006293706293706514
		- Feature Set 8: +0.0036130536130539737
		- Feature Set 9: +0.00011655011655020697
		- Feature Set 10: +0.0037296037296039586
		- Feature Set 11: +0.002447552447552681
		- Feature Set 13: +0.0010489510489514187
		- Feature Set 14: +0.00279720279720308
		- Feature Set 15: +0.0018648018648020903
		- Feature Set 16: +0.0010489510489511966
		- Feature Set 17: +0.004312354312354438
		- Feature Set 18: +0.003030303030303161
		- BETTER THAN 16 SETS
	- Precision:
		- Feature Set 2: +0.009716187713370683
		- Feature Set 3: +0.009637239930101793
		- Feature Set 4: +0.000624930681198621
		- Feature Set 5: +0.004479056749455257
		- Feature Set 6

After comparing the results we can conclude that these are the optimal feature sets:
- Classification: Feature Set 12 (New Feature Set)
- Regression: Feature Set 17 (New Feature Set + Correlated Features t = 0.9)
- Classification & Regression: Feature Set 9 (Correlated Features t = 0.85)

Here their improvements:

## Feature Set 12
Features: \
\['lateral_current_lane', 'iTTC_follow1', 'longit_pos_follow1', 'longit_pos_preced2', 'longit_pos_vehicle3', 'longit_pos_preced1', 'v_Vel', 'iTTC_preced1', 'iTTC_preced2', 'lat_pos_vehicle1', 'v_Vel_preced1', 'v_Vel_Ref3', 'lat_pos_vehicle2', 'longit_pos_vehicle2', 'iTTC_follow2', 'v_Vel_preced2', 'longit_pos_vehicle1', 'iTTC_ref3', 'longit_pos_follow2']

- Regression:
	- Mean Absolute Error: 0.27151981351981347
	- R2 Score: -0.3685475580466733
	- Max Error: 3.219

- Classification:
	- Accuracy: 0.9776223776223777
	- Precision: 0.9780706587639209
	- Recall: 0.9776223776223777

#### Comparisons

- Accuracy:
	- Feature Set 1: +0.0017482517482516613
	- Feature Set 2: +0.011888111888111896
	- Feature Set 3: +0.01153846153846172
	- Feature Set 4: +0.002331002331002474
	- Feature Set 5: +0.006177156177156307
	- Feature Set 6: +0.00244755244755257
	- Feature Set 7: +0.008041958041958175
	- Feature Set 8: +0.005361305361305635
	- Feature Set 9: +0.0018648018648018683
	- Feature Set 10: +0.00547785547785562
	- Feature Set 11: +0.004195804195804342
	- Feature Set 13: +0.00279720279720308
	- Feature Set 14: +0.004545454545454741
	- Feature Set 15: +0.0036130536130537516
	- Feature Set 16: +0.002797202797202858
	- Feature Set 17: +0.0060606060606061
	- Feature Set 18: +0.004778554778554822
	- BETTER THAN 17 SETS
- Precision:
	- Feature Set 1: +0.00180516311563228
	- Feature Set 2: +0.011521350829002963
	- Feature Set 3: +0.011442403045734073
	- Feature Set 4: +0.002430093796830901
	- Feature Set 5: +0.006284219865087537
	- Feature Set 6: +0.002378658225189567
	- Feature Set 7: +0.008168923551674756
	- Feature Set 8: +0.005388059396116751
	- Feature Set 9: +0.001857961750531656
	- Feature Set 10: +0.0056698675685200595
	- Feature Set 11: +0.004147567816648823
	- Feature Set 13: +0.0025940014308823978
	- Feature Set 14: +0.004226255027456993
	- Feature Set 15: +0.0036709810702276746
	- Feature Set 16: +0.00274155925222741
	- Feature Set 17: +0.005990387924042051
	- Feature Set 18: +0.004699595510907706
	- BETTER THAN 17 SETS
- Recall:
	- Feature Set 1: +0.0017482517482516613
	- Feature Set 2: +0.011888111888112007
	- Feature Set 3: +0.01153846153846172
	- Feature Set 4: +0.002331002331002474
	- Feature Set 5: +0.0061771561771560846
	- Feature Set 6: +0.00244755244755257
	- Feature Set 7: +0.008041958041958175
	- Feature Set 8: +0.005361305361305635
	- Feature Set 9: +0.0018648018648018683
	- Feature Set 10: +0.005477855477855509
	- Feature Set 11: +0.004195804195804342
	- Feature Set 13: +0.002797202797202858
	- Feature Set 14: +0.004545454545454741
	- Feature Set 15: +0.0036130536130537516
	- Feature Set 16: +0.002797202797202858
	- Feature Set 17: +0.0060606060606061
	- Feature Set 18: +0.004778554778554822
	- BETTER THAN 17 SETS


## Feature Set 17

Features: \
\['lateral_current_lane', 'iTTC_follow1', 'longit_pos_follow1', 'longit_pos_preced2', 'longit_pos_vehicle3', 'longit_pos_preced1', 'v_Vel', 'iTTC_preced1', 'iTTC_preced2', 'lat_pos_vehicle1', 'v_Vel_preced1', 'v_Vel_Ref3', 'lat_pos_vehicle2', 'longit_pos_vehicle2', 'iTTC_follow2', 'v_Vel_preced2', 'longit_pos_vehicle1', 'iTTC_ref3', 'longit_pos_follow2']

- Regression:
	- Mean Absolute Error: 0.26619230769230773
	- R2 Score: -0.30978914600319485
	- Max Error: 3.1260000000000003

- Classification:
	- Accuracy: 0.9715617715617716
	- Precision: 0.9720802708398788
	- Recall: 0.9715617715617716

#### Comparisons

- Mean Absolute Error:
	- Feature Set 1: +0.008071095571095532
	- Feature Set 2: +0.04834498834498824
	- Feature Set 3: +0.05427156177156167
	- Feature Set 4: +0.016483682983682968
	- Feature Set 5: +0.010803030303030259
	- Feature Set 6: +0.018508158508158457
	- Feature Set 7: +0.004692307692307662
	- Feature Set 8: +0.006304195804195778
	- Feature Set 9: +0.004921911421911396
	- Feature Set 10: +0.009343822843822791
	- Feature Set 11: +0.0028356643356642897
	- Feature Set 12: +0.005327505827505741
	- Feature Set 13: +0.011752913752913763
	- Feature Set 14: +0.016280885780885712
	- Feature Set 15: +0.010814685314685268
	- Feature Set 16: +0.002129370629370575
	- Feature Set 18: +0.006742424242424272
	- BETTER THAN 17 SETS
- R2 Score:
	- Feature Set 1: +0.06209793135012559
	- Feature Set 2: +0.29227846843810057
	- Feature Set 3: +0.18532128283423988
	- Feature Set 4: +0.0633023578793297
	- Feature Set 5: +0.07105124173672367
	- Feature Set 6: +0.054893512318487714
	- Feature Set 7: +0.06685870172709102
	- Feature Set 8: +0.03347334150966885
	- Feature Set 9: +0.012311960671999689
	- Feature Set 10: +0.11361137489301887
	- Feature Set 11: +0.07567374632396967
	- Feature Set 12: +0.058758412043478425
	- Feature Set 14: +0.05498844180869683
	- Feature Set 15: +0.05950465683824724
	- Feature Set 16: +0.06821995872160297
	- Feature Set 18: +0.07109495895627838
	- BETTER THAN 16 SETS
- Max Error:
	- Feature Set 1: +0.02999999999999936
	- Feature Set 2: +0.11899999999999977
	- Feature Set 3: +0.09899999999999975
	- Feature Set 4: +0.08099999999999952
	- Feature Set 6: +0.1819999999999995
	- Feature Set 7: +0.09499999999999975
	- Feature Set 8: +0.09199999999999964
	- Feature Set 9: +0.021999999999999797
	- Feature Set 11: +0.09099999999999975
	- Feature Set 12: +0.09299999999999953
	- Feature Set 13: +0.0649999999999995
	- Feature Set 14: +0.133
	- Feature Set 16: +0.01200000000000001
	- Feature Set 18: +0.11899999999999933
	- BETTER THAN 14 SETS

## Feature Set 9

Features: \
\['v_Vel', 'lateral_current_lane', 'longit_pos_vehicle1', 'longit_pos_vehicle2', 'longit_pos_vehicle3', 'lat_pos_vehicle1', 'lat_pos_vehicle2', 'iTTC_ref3', 'v_Vel_preced1', 'v_Vel_preced2', 'longit_pos_preced1', 'longit_pos_follow1', 'longit_pos_preced2', 'longit_pos_follow2', 'iTTC_preced1', 'iTTC_follow1', 'iTTC_preced2', 'iTTC_follow2']

- Regression:
	- Mean Absolute Error: 0.2711142191142191
	- R2 Score: -0.32210110667519454
	- Max Error: 3.148

- Classification:
	- Accuracy: 0.9757575757575758
	- Precision: 0.9762126970133892
	- Recall: 0.9757575757575758

#### Comparisons

- Accuracy:
	- Feature Set 2: +0.010023310023310028
	- Feature Set 3: +0.009673659673659851
	- Feature Set 4: +0.00046620046620060585
	- Feature Set 5: +0.004312354312354438
	- Feature Set 6: +0.0005827505827507018
	- Feature Set 7: +0.006177156177156307
	- Feature Set 8: +0.0034965034965037667
	- Feature Set 10: +0.0036130536130537516
	- Feature Set 11: +0.002331002331002474
	- Feature Set 13: +0.0009324009324012117
	- Feature Set 14: +0.002680652680652873
	- Feature Set 15: +0.0017482517482518833
	- Feature Set 16: +0.0009324009324009896
	- Feature Set 17: +0.004195804195804231
	- Feature Set 18: +0.002913752913752954
	- BETTER THAN 15 SETS
- Precision:
	- Feature Set 2: +0.009663389078471307
	- Feature Set 3: +0.009584441295202417
	- Feature Set 4: +0.0005721320462992452
	- Feature Set 5: +0.004426258114555881
	- Feature Set 6: +0.0005206964746579112
	- Feature Set 7: +0.0063109618011431
	- Feature Set 8: +0.003530097645585095
	- Feature Set 10: +0.0038119058179884036
	- Feature Set 11: +0.002289606066117167
	- Feature Set 13: +0.0007360396803507419
	- Feature Set 14: +0.0023682932769253373
	- Feature Set 15: +0.0018130193196960187
	- Feature Set 16: +0.0008835975016957542
	- Feature Set 17: +0.004132426173510395
	- Feature Set 18: +0.00284163376037605
	- BETTER THAN 15 SETS
- Recall:
	- Feature Set 2: +0.010023310023310139
	- Feature Set 3: +0.009673659673659851
	- Feature Set 4: +0.00046620046620060585
	- Feature Set 5: +0.004312354312354216
	- Feature Set 6: +0.0005827505827507018
	- Feature Set 7: +0.006177156177156307
	- Feature Set 8: +0.0034965034965037667
	- Feature Set 10: +0.0036130536130536406
	- Feature Set 11: +0.002331002331002474
	- Feature Set 13: +0.0009324009324009896
	- Feature Set 14: +0.002680652680652873
	- Feature Set 15: +0.0017482517482518833
	- Feature Set 16: +0.0009324009324009896
	- Feature Set 17: +0.004195804195804231
	- Feature Set 18: +0.002913752913752954
	- BETTER THAN 15 SETS
- Mean Absolute Error:
	- Feature Set 1: +0.0031491841491841366
	- Feature Set 2: +0.04342307692307684
	- Feature Set 3: +0.049349650349650276
	- Feature Set 4: +0.011561771561771572
	- Feature Set 5: +0.005881118881118863
	- Feature Set 6: +0.01358624708624706
	- Feature Set 8: +0.0013822843822843822
	- Feature Set 10: +0.004421911421911395
	- Feature Set 12: +0.000405594405594345
	- Feature Set 13: +0.006831002331002367
	- Feature Set 14: +0.011358974358974316
	- Feature Set 15: +0.005892773892773873
	- Feature Set 18: +0.0018205128205128762
	- BETTER THAN 13 SETS
- R2 Score:
	- Feature Set 1: +0.0497859706781259
	- Feature Set 2: +0.2799665077661009
	- Feature Set 3: +0.1730093221622402
	- Feature Set 4: +0.05099039720733001
	- Feature Set 5: +0.05873928106472398
	- Feature Set 6: +0.042581551646488025
	- Feature Set 7: +0.05454674105509133
	- Feature Set 8: +0.021161380837669164
	- Feature Set 10: +0.10129941422101918
	- Feature Set 11: +0.06336178565196998
	- Feature Set 12: +0.046446451371478736
	- Feature Set 14: +0.04267648113669714
	- Feature Set 15: +0.04719269616624755
	- Feature Set 16: +0.05590799804960328
	- Feature Set 18: +0.05878299828427869
	- BETTER THAN 15 SETS
- Max Error:
	- Feature Set 1: +0.007999999999999563
	- Feature Set 2: +0.09699999999999998
	- Feature Set 3: +0.07699999999999996
	- Feature Set 4: +0.05899999999999972
	- Feature Set 6: +0.1599999999999997
	- Feature Set 7: +0.07299999999999995
	- Feature Set 8: +0.06999999999999984
	- Feature Set 11: +0.06899999999999995
	- Feature Set 12: +0.07099999999999973
	- Feature Set 13: +0.042999999999999705
	- Feature Set 14: +0.11100000000000021
	- Feature Set 18: +0.09699999999999953
	- BETTER THAN 12 SETS



# Additional Functions

In [0]:
def feature_importance_extraction(x_train, y_train):
    rf = RandomForestRegressor()
    rf.fit(x_train, y_train.values.ravel())

    feature_importances = pd.DataFrame(rf.feature_importances_, index= x_train.columns, columns=['importance']).sort_values('importance', ascending=False)

    return feature_importances