In [31]:
from itertools import combinations

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score

from main.utils.data_manage_utils import load_processed_data_by_folder
from main.utils.time_utils import print_time

# Directory four levels above the current working directory (presumably the
# project root -- confirm against the repository layout). The spelling of the
# name is kept as-is because other cells may refer to it.
ROOT_FOLER = os.path.abspath(
    os.path.join(os.pardir, os.pardir, os.pardir, os.pardir)
)

# Folder handed to the data loader below; located relative to ROOT_FOLER.
INPUT_FOLDER = os.path.join(ROOT_FOLER, "data/preprocessing/base/class")

In [32]:
# Load the four data objects from INPUT_FOLDER via the project helper and
# unpack them as train/test features and labels.
# NOTE(review): the split semantics and return types come from
# load_processed_data_by_folder, which is not visible here -- confirm there.
X_train, y_train, X_test, y_test = load_processed_data_by_folder(INPUT_FOLDER)
# Bare last expression: lets the notebook render the training features.
X_train

Unnamed: 0,MEDIAN_CARR_DELAY(MINS),MEDIAN_ORIGIN_DELAY(MINS),DEP_DELAY(MINS),CRS_ELAPSED_TIME(MINS),DISTANCE(KM),NR_PREV_ARR_FLIGHTS(1HR),ARR_DAY_SIN,ARR_DAY_COS,ARR_MIN_OF_DAY_SIN,ARR_MIN_OF_DAY_COS,...,EVENT_HZ,EVENT_IC,EVENT_RA,EVENT_SN,EVENT_TS,08L/26R,08R/26L,09L/27R,09R/27L,10/28
5998106,-11.0,-11.0,107,89,548.78494,74,-0.433884,-0.900969,-0.985556,0.169350,...,0,0,0,0,0,0.0,0.0,3.0,0.0,3.0
9744846,-11.0,-11.0,-3,106,830.41944,83,0.433884,-0.900969,0.496217,-0.868199,...,0,0,0,0,0,3.0,3.0,0.0,3.0,3.0
8656393,-11.0,-9.0,64,98,653.39204,53,-0.781831,0.623490,-0.548293,-0.836286,...,0,0,3,0,2,0.0,3.0,0.0,0.0,3.0
7404998,-11.0,-8.0,-2,121,935.02654,88,-0.433884,-0.900969,-0.719340,0.694658,...,0,0,0,0,0,0.0,3.0,3.0,3.0,3.0
8855927,-11.0,-5.0,15,132,1120.10064,74,0.433884,-0.900969,-0.845728,0.533615,...,0,0,0,2,0,3.0,0.0,3.0,0.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6033952,-6.0,-9.0,-3,126,975.26004,95,-0.781831,0.623490,0.678801,-0.734323,...,0,0,0,0,0,0.0,0.0,3.0,0.0,0.0
7551519,-11.0,-15.0,-8,142,1200.56764,74,0.000000,1.000000,-0.649448,-0.760406,...,0,0,0,0,0,0.0,0.0,0.0,0.0,3.0
7375452,-11.0,-8.0,-4,122,935.02654,58,0.433884,-0.900969,-0.999914,-0.013090,...,0,2,2,2,2,0.0,0.0,3.0,0.0,0.0
7757994,-6.0,-8.0,180,92,584.19042,62,-0.781831,0.623490,-0.608761,0.793353,...,0,0,0,0,0,1.0,3.0,1.0,3.0,1.0


In [33]:
# Backward feature elimination driven by random-forest feature importances.
#
# Each round:
#   1. scores the current feature set with `n_folds`-fold cross-validation,
#   2. fits the forest on the full training data and times that fit,
#   3. records (n_columns, accuracy, column names, fit time) in `results`,
#   4. drops the column with the smallest importance, together with its
#      "buddy" when the column is one half of a sin/cos pair.
# The loop stops as soon as at most one column is left after a removal.

# Classifier and bookkeeping
clf = RandomForestClassifier(class_weight="balanced", random_state=42, max_depth=12)
results = []
n_folds = 5

remaining_columns = list(X_train.columns)
# Work on separately named arrays instead of overwriting X_train / y_train:
# the DataFrames stay intact, so this cell can be re-run (for example after an
# interrupt) without failing on `X_train.columns`. np.delete below always
# returns a new array, so the original data is never modified.
X_work = X_train.to_numpy()
y_work = y_train.to_numpy()

# Column pairs that only make sense together (sin/cos encodings of one cyclic
# quantity): removing one half removes the other as well.
buddies = [("ARR_MIN_OF_DAY_SIN", "ARR_MIN_OF_DAY_COS"), ("ARR_DAY_SIN", "ARR_DAY_COS")]
# Symmetric lookup built once, outside the loop: column -> its partner.
buddy_of = {}
for first, second in buddies:
    buddy_of[first] = second
    buddy_of[second] = first

print("Starting feature elimination: ")
start_col_n = X_work.shape[1]
while True:
    print(f"{X_work.shape[1]}/{start_col_n} columns left.")
    # Mean cross-validated score (default scorer of the classifier) over n_folds folds
    accuracy = np.mean(cross_val_score(clf, X_work, y_work, cv=n_folds))
    print(f"\tAccuracy of run: {accuracy:.2f}")

    # Fit on the full training data to obtain importances; time the fit
    start, _ = print_time()
    clf.fit(X_work, y_work)
    end, _ = print_time()
    fit_time = end - start
    print("\tFit time: ", fit_time)

    # Save results of this round (copy the list: it is mutated below)
    results.append((len(remaining_columns), accuracy, remaining_columns.copy(), fit_time))

    # Determine and remove the least important feature from data and names
    least_important_feature_index = np.argmin(clf.feature_importances_)
    X_work = np.delete(X_work, least_important_feature_index, axis=1)
    removed_column = remaining_columns.pop(least_important_feature_index)
    print(f"\tRemoving column: '{removed_column}'")

    # If one half of a buddy pair was removed, drop the other half too.
    # The membership check avoids a ValueError when the partner column is
    # not part of the dataset.
    buddy = buddy_of.get(removed_column)
    if buddy is not None and buddy in remaining_columns:
        buddy_index = remaining_columns.index(buddy)
        remaining_columns.pop(buddy_index)
        X_work = np.delete(X_work, buddy_index, axis=1)
        print(f"\tRemoving column: '{buddy}'")

    # Stop once at most one column is left
    if X_work.shape[1] <= 1:
        break

print(f"Elimination is done with accuracy array of:\n{[res[1] for res in results]}")

Starting feature elimination: 
44/44 columns left.


KeyboardInterrupt: 

In [None]:
# Collect the per-round tuples gathered in `results` into a DataFrame
# (one row per elimination round, default integer column labels) ...
result_df = pd.DataFrame(data=results)
# ... and persist it next to the notebook for later analysis.
result_df.to_pickle(path="result_df.pkl")