In [16]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os 
from main.utils.data_manage_utils import load_processed_data_by_folder, print_shapes
from main.utils.time_utils import print_time
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error

# Variables
ROOT_PATH = os.path.abspath("../../../../")
DATA_FOLDER = os.path.join(ROOT_PATH, "data/preprocessing/base/reg")

X_train, y_train, X_test, y_test = load_processed_data_by_folder(DATA_FOLDER)
print_shapes(X_train, y_train, X_test, y_test)

X_train

Shape of X_train: (263204, 45)
Shape of y_train: (263204,)
Shape of X_test: (65801, 45)
Shape of y_test: (65801,)


Unnamed: 0,MEDIAN_CARR_DELAY(MINS),MEDIAN_ORIGIN_DELAY(MINS),DEP_DELAY(MINS),CRS_ELAPSED_TIME(MINS),DISTANCE(KM),NR_PREV_ARR_FLIGHTS(1HR),ARR_DAY_SIN,ARR_DAY_COS,ARR_MIN_OF_DAY_SIN,ARR_MIN_OF_DAY_COS,...,EVENT_HZ,EVENT_IC,EVENT_RA,EVENT_SN,EVENT_TS,08L/26R,08R/26L,09L/27R,09R/27L,10/28
5998106,-11.0,-11.0,107,89,548.78494,74,-0.433884,-0.900969,-0.985556,0.169350,...,0,0,0,0,0,0.0,0.0,3.0,0.0,3.0
9744846,-11.0,-11.0,-3,106,830.41944,83,0.433884,-0.900969,0.496217,-0.868199,...,0,0,0,0,0,3.0,3.0,0.0,3.0,3.0
8656393,-11.0,-9.0,64,98,653.39204,53,-0.781831,0.623490,-0.548293,-0.836286,...,0,0,3,0,2,0.0,3.0,0.0,0.0,3.0
7404998,-11.0,-8.0,-2,121,935.02654,88,-0.433884,-0.900969,-0.719340,0.694658,...,0,0,0,0,0,0.0,3.0,3.0,3.0,3.0
8855927,-11.0,-5.0,15,132,1120.10064,74,0.433884,-0.900969,-0.845728,0.533615,...,0,0,0,2,0,3.0,0.0,3.0,0.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6033952,-6.0,-9.0,-3,126,975.26004,95,-0.781831,0.623490,0.678801,-0.734323,...,0,0,0,0,0,0.0,0.0,3.0,0.0,0.0
7551519,-11.0,-15.0,-8,142,1200.56764,74,0.000000,1.000000,-0.649448,-0.760406,...,0,0,0,0,0,0.0,0.0,0.0,0.0,3.0
7375452,-11.0,-8.0,-4,122,935.02654,58,0.433884,-0.900969,-0.999914,-0.013090,...,0,2,2,2,2,0.0,0.0,3.0,0.0,0.0
7757994,-6.0,-8.0,180,92,584.19042,62,-0.781831,0.623490,-0.608761,0.793353,...,0,0,0,0,0,1.0,3.0,1.0,3.0,1.0


In [17]:
def feature_selection(X, y , k=5, mse_gain_threshold = 0.001):
    features = X.columns.tolist()
    best_score = -np.inf
    best_features = None
    i = 1
    while len(features) > 2:
        scores = []
        print(f"Starting run {i} of {len(X.columns.tolist())}...")
        m = 1
        for feature in features:
            print(f"\tMidstep {m} of {len(features)}")
            selected_features = [f for f in features if f != feature]
            X_selected = X[selected_features]
            clf = RandomForestRegressor(max_depth=3, random_state=42) 
            cv = KFold(n_splits=k, shuffle=True, random_state=42)
            scores.append(np.mean(cross_val_score(clf, X_selected, y, cv=cv, scoring='neg_mean_squared_error', verbose=2)))
            m+=1
        
        min_score_idx = np.argmin(scores)
        print(f"Worst feature is {features[min_score_idx]} with score {scores[min_score_idx]:.4f}")
        i += 1
        if scores[min_score_idx] - best_score < mse_gain_threshold:
            break  # Stop if accuracy gain is less than threshold
        else:
            best_score = scores[min_score_idx]
            features.pop(min_score_idx)
    return features


selected_features = feature_selection(X_train, y_train)
print("Selected Features:", selected_features)

Starting run 1 of 45...
	Midstep 1 of 45


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=  47.0s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   47.1s remaining:    0.0s


[CV] END .................................................... total time=  50.6s
[CV] END .................................................... total time=  44.7s
[CV] END .................................................... total time=  44.8s
[CV] END .................................................... total time=  46.8s
	Midstep 2 of 45


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  3.9min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=  44.7s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   44.7s remaining:    0.0s


[CV] END .................................................... total time=  47.9s
[CV] END .................................................... total time=  46.9s
[CV] END .................................................... total time=  47.8s



KeyboardInterrupt

