In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

In [2]:
# Directory (relative to this notebook) that holds the pre-split
# linear-regression CSVs. Kept in one place so relocating the data is a
# one-line change instead of eight.
LR_DATA_DIR = '../../data/models/linear_regression'


def load_lr_split(level, role, split):
    """Read one pre-split linear-regression CSV into a DataFrame.

    Parameters
    ----------
    level : str
        Granularity prefix used in the file name ('district' or 'area').
    role : str
        'feature' or 'target'.
    split : str
        'training' or 'testing'.

    Returns
    -------
    pandas.DataFrame
        Contents of ``<LR_DATA_DIR>/lr_<level>_<role>_<split>_data.csv``.
    """
    return pd.read_csv(f'{LR_DATA_DIR}/lr_{level}_{role}_{split}_data.csv')


district_feature_training_data = load_lr_split('district', 'feature', 'training')
district_target_training_data = load_lr_split('district', 'target', 'training')
district_feature_testing_data = load_lr_split('district', 'feature', 'testing')
district_target_testing_data = load_lr_split('district', 'target', 'testing')

area_feature_training_data = load_lr_split('area', 'feature', 'training')
area_target_training_data = load_lr_split('area', 'target', 'training')
area_feature_testing_data = load_lr_split('area', 'feature', 'testing')
area_target_testing_data = load_lr_split('area', 'target', 'testing')

In [3]:
# Forward (non-floating) sequential feature selection around a plain linear
# regression, scored by negated MSE with 5-fold cross-validation.
# k_features='best' leaves the subset size to the selector (per the mlxtend
# docs, the size with the best CV score is kept).
# NOTE(review): the fit log shows n_jobs=1 and minutes per step; consider
# passing n_jobs=-1 if the machine has spare cores.
area_model = LinearRegression()
area_sfs = SFS(
    area_model,
    k_features='best',
    forward=True,
    floating=False,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=2,
)

In [4]:
# Run the selection on the area-level training split. With verbose=2 the
# selector logs one progress line per subset size, so the output is long.
area_sfs.fit(
    X=area_feature_training_data,
    y=area_target_training_data,
)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  68 out of  68 | elapsed:  2.0min finished

[2024-07-17 03:30:34] Features: 1/68 -- score: -6.268892406750622e-06[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  67 out of  67 | elapsed:  3.6min finished

[2024-07-17 03:34:11] Features: 2/68 -- score: -6.224501983399714e-06[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  66 out of  66 | elapsed:  4.2min finished

[2024-07-17 03:38:26] Features: 3/68 -- score: -6.214334206417533e-06[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs

SequentialFeatureSelector(estimator=LinearRegression(), k_features=(1, 68),
                          scoring='neg_mean_squared_error', verbose=2)

In [6]:
# Column indices (positions in area_feature_training_data) of the feature
# subset the fitted selector settled on.
selected_features = area_sfs.k_feature_idx_

In [7]:
# Display the selected column indices as the cell output.
selected_features

(0,
 1,
 2,
 3,
 4,
 5,
 6,
 10,
 11,
 12,
 13,
 14,
 15,
 19,
 20,
 21,
 22,
 24,
 25,
 26,
 27,
 29,
 30,
 31,
 32,
 33,
 35,
 37,
 38,
 39,
 40,
 42,
 43,
 44,
 45,
 46,
 47,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 61,
 62,
 63,
 64)

In [8]:
# Cross-validation score of the selected subset. The selector was configured
# with scoring='neg_mean_squared_error', so this is a negated MSE: values are
# non-positive and closer to zero is better.
selected_score = area_sfs.k_score_
selected_score

-6.1647950122796004e-06

In [9]:
# Walk through every subset size the selector evaluated and report which
# columns were chosen at that size, the per-fold CV scores, and their mean.
subsets = area_sfs.subsets_
for feature_count, subset in subsets.items():
    print(f"Features for {feature_count} feature(s): {subset['feature_idx']}")
    print(f"CV Score: {subset['cv_scores']}")
    print(f"Average CV Score: {subset['avg_score']}")
    print("\n")

Features for 1 feature(s): (62,)
CV Score: [-1.34000853e-05 -5.60483649e-06 -3.83085580e-06 -3.89043340e-06
 -4.61825106e-06]
Average CV Score: -6.268892406750622e-06


Features for 2 feature(s): (52, 62)
CV Score: [-1.32488310e-05 -5.53967981e-06 -3.79635117e-06 -3.85940938e-06
 -4.67823856e-06]
Average CV Score: -6.224501983399714e-06


Features for 3 feature(s): (12, 52, 62)
CV Score: [-1.31754871e-05 -5.56787207e-06 -3.72623602e-06 -3.82864550e-06
 -4.77343031e-06]
Average CV Score: -6.214334206417533e-06


Features for 4 feature(s): (12, 52, 53, 62)
CV Score: [-1.30655971e-05 -5.49496228e-06 -3.73915022e-06 -3.87049761e-06
 -4.82489477e-06]
Average CV Score: -6.199020387980767e-06


Features for 5 feature(s): (12, 22, 52, 53, 62)
CV Score: [-1.30207293e-05 -5.50182079e-06 -3.75265550e-06 -3.85124455e-06
 -4.81241763e-06]
Average CV Score: -6.187773563112009e-06


Features for 6 feature(s): (12, 22, 40, 52, 53, 62)
CV Score: [-1.29812576e-05 -5.46940268e-06 -3.76638866e-06 -3.86160

In [11]:
# Read the indices from the fitted selector rather than pasting the printed
# tuple back into the notebook: a hard-coded copy silently goes stale as soon
# as the selection is re-run with different data or settings.
selected_feature_indices = area_sfs.k_feature_idx_
feature_names = area_feature_training_data.columns
# Label each selected column as "<index>: <name>" so the indices reported by
# the selector can be cross-referenced with the actual column names.
selected_feature_names = [f'{i}: {feature_names[i]}' for i in selected_feature_indices]
print(selected_feature_names)

['0: area_id', '1: day', '2: hour', '3: year', '4: month', '5: day_of_week', '6: unemployment', '10: crowded_housing', '11: below_pov', '12: district', '13: police_stations_distance_0.1', '14: police_stations_distance_0.3', '15: police_stations_distance_0.5', '19: bike_stations_distance_0.1', '20: bike_stations_distance_0.3', '21: bike_stations_distance_0.5', '22: bike_stations_distance_1', '24: bike_stations_distance_5', '25: bus_stops_distance_0.1', '26: bus_stops_distance_0.3', '27: bus_stops_distance_0.5', '29: bus_stops_distance_3', '30: bus_stops_distance_5', '31: train_stations_distance_0.1', '32: train_stations_distance_0.3', '33: train_stations_distance_0.5', '35: train_stations_distance_3', '37: alleylights_distance_0.1', '38: alleylights_distance_0.3', '39: alleylights_distance_0.5', '40: alleylights_distance_1', '42: alleylights_distance_5', '43: streetlights_allout_distance_0.1', '44: streetlights_allout_distance_0.3', '45: streetlights_allout_distance_0.5', '46: streetlig