# Feature Selection

In [42]:
# !pip install catboost

In [43]:
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold

from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier

from sklearn.feature_selection import RFE

from sklearn.metrics import f1_score, classification_report
import warnings
warnings.filterwarnings('ignore')

from collections import Counter 
from utils import *

In [44]:
# Read the training table (with the previously engineered features) from disk.
TRAIN_FEATS_PATH = "data/train_new_feats.csv"
df = pd.read_csv(TRAIN_FEATS_PATH)

In [45]:
# Columns named target_1 ... target_8.
binary_target = [f"target_{i}" for i in range(1, 9)]

# Single-element list so that df[ordinal_target] yields a one-column DataFrame.
ordinal_target = ["Claim Injury Type_encoded"]

# Every label-related column; all of these are excluded from the feature matrix.
target = (
    binary_target
    + ["Claim Injury Type", "WCB Decision", "Agreement Reached"]
    + ordinal_target
)

# The label-related columns that are not one of the target_<i> columns.
original_target = [name for name in target if name not in binary_target]

In [46]:
# Candidate predictors: every column that is neither label-related nor
# of datetime64[ns] dtype.
features = [
    col
    for col in df.columns
    if col not in target and df[col].dtype != "datetime64[ns]"
]

# Split the predictors by dtype: object columns are treated as categorical,
# everything else as numeric.
cat_feats = [col for col in features if df[col].dtype == "object"]
num_feats = [col for col in features if df[col].dtype != "object"]

# Positions of the categorical columns inside `features`.
cat_feats_index = [pos for pos, col in enumerate(features) if df[col].dtype == "object"]



In [47]:
# Feature matrix restricted to the candidate predictors; show its column names.
X = df.loc[:, features]
X.columns

Index(['Age at Injury', 'Alternative Dispute Resolution',
       'Attorney/Representative', 'Average Weekly Wage', 'Carrier Name',
       'Carrier Type', 'County of Injury', 'COVID-19 Indicator',
       'District Name', 'Gender', 'IME-4 Count', 'Industry Code Description',
       'Medical Fee Region', 'WCIO Cause of Injury Description',
       'WCIO Nature of Injury Description', 'WCIO Part Of Body Description',
       'Zip Code', 'Number of Dependents', 'Accident Date_year',
       'Accident Date_missing', 'Accident Date_weekday', 'Accident_weekend',
       'Accident Date_month', 'Accident Date_month_cos',
       'Accident Date_month_sin', 'Accident Date_assembly_gap_days',
       'C3-C2_gap_days', 'C2_missing', 'C3_missing', 'C2_Accident_gap_weeks',
       'C3_Accident_gap_weeks', 'Hearing Date_missing',
       'Hearing_C3 gap_months', 'Hearing_C2 gap_months',
       'Hearing_assembly_gap_months', 'Days to Assembly',
       'Days to First Hearing', 'Days from COVID', 'Average Weekly 

In [48]:
# Label kept as a one-column DataFrame (ordinal_target is a list of one name).
y_ordinal = df.loc[:, ordinal_target]
y_ordinal

Unnamed: 0,Claim Injury Type_encoded
0,1.0
1,3.0
2,3.0
3,1.0
4,2.0
...,...
574048,1.0
574049,1.0
574050,3.0
574051,1.0


In [49]:
n_splits = 3
stratified_kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)


def _fit_and_score_rfe(n_features, step, X_tr, y_tr, X_va, y_va):
    """Run RFE down to `n_features` columns and score the resulting model.

    Parameters
    ----------
    n_features : int
        Number of features RFE should keep.
    step : int
        Number of features RFE removes per elimination round.
    X_tr, X_va : pandas.DataFrame
        Training / validation feature frames with identical columns.
    y_tr, y_va : 1-D array-like
        Training / validation labels.

    Returns
    -------
    tuple
        (train macro-F1, validation macro-F1, list of kept column names,
        validation predictions).
    """
    model = RandomForestClassifier(max_depth=12, random_state=42, n_jobs=-1, n_estimators=100, max_samples=0.9, class_weight='balanced')
    rfe = RFE(estimator=model, n_features_to_select=n_features, step=step, verbose=1)
    rfe.fit(X_tr, y_tr)

    # RFE already fits a final clone of the estimator (same hyper-parameters
    # and seed) on the kept columns, so predict through it instead of
    # refitting an identical forest on the transformed data.
    train_predictions = rfe.predict(X_tr)
    val_predictions = rfe.predict(X_va)

    train_score = f1_score(y_tr, train_predictions, average="macro")
    val_score = f1_score(y_va, val_predictions, average="macro")

    kept_features = X_tr.columns[rfe.support_].tolist()
    return train_score, val_score, kept_features, val_predictions


feat_sel_results = []

for i, (train_index, val_index) in enumerate(stratified_kf.split(X, y_ordinal), start=1):
    print(f"Initializing CV_{i}/{n_splits}...")
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y_ordinal.iloc[train_index], y_ordinal.iloc[val_index]

    # y_ordinal is a one-column DataFrame; sklearn expects 1-D labels.
    y_train_1d = y_train.to_numpy().ravel()
    y_val_1d = y_val.to_numpy().ravel()

    # Object-dtype columns of this fold, computed once and used by both
    # encoders (no reassignment of the notebook-level `cat_feats`).
    fold_cat_feats = [feat for feat in X_train.columns if X_train[feat].dtype == "object"]

    # --------------- Target Ordinal Encoding
    # NOTE(review): target_guided_ordinal_encoding comes from `utils`; its
    # exact contract is not visible here. The call is unchanged and receives
    # the training labels as a DataFrame, as before.
    print(f"Ordinal encoding...")
    X_train_encoded = X_train.copy()
    X_val_encoded = X_val.copy()
    for cat in fold_cat_feats:
        X_train_encoded, X_val_encoded, ordinal_mapping = target_guided_ordinal_encoding(X_train_encoded, X_val_encoded, cat, ordinal_target, y_train, 1)

    # --------------- Frequency Encoding
    print(f"Frequency encoding...")
    for cat in fold_cat_feats:
        X_train_encoded, X_val_encoded, freq_map = frequency_encoding(X_train_encoded, X_val_encoded, cat)

    # --------------- Imputing missing values
    print(f"Impuiting missing values...")
    X_train_imputed, X_val_imputed = num_imputing(X_train_encoded, X_val_encoded)

    # ------------ Correlation (dropping redundant features)
    # The list is derived from the training part only and applied to both parts.
    redundant_features_to_drop = cross_corr_mean(X_train_imputed, corr_coeff=0.90)
    print(f"Redundant features to drop: {redundant_features_to_drop}")
    X_train_imputed = X_train_imputed.drop(columns=redundant_features_to_drop)
    X_val_imputed = X_val_imputed.drop(columns=redundant_features_to_drop)

    n_available = X_train_imputed.shape[1]
    print(f"Number of features after dropping redundant features: {n_available}")

    # ------------ Recursive Feature Elimination: coarse search
    # Candidate sizes 20, 25, 30, ... up to the number of available columns.
    nof_list = np.arange(20, n_available + 1, 5)
    if len(nof_list) == 0:
        # Fewer than 20 columns left: evaluate the full set instead of
        # finishing the fold without any selection.
        nof_list = np.array([n_available])

    # Per-fold state, reset so nothing leaks over from the previous fold.
    high_score = 0
    val_score = 0
    nof = 0
    RFE_feats = []
    best_val_predictions = None

    for n in nof_list:
        # Early stop: already more than 10 features past the best size and
        # the latest candidate scored below the best.
        if n > nof + 10 and val_score < high_score:
            break

        print(f"RFE with {n} features testing...")
        train_score, val_score, candidate_feats, val_predictions = _fit_and_score_rfe(
            int(n), 5, X_train_imputed, y_train_1d, X_val_imputed, y_val_1d
        )

        # The fallback only fires when nothing has been selected yet, so it
        # can never overwrite a better earlier candidate.
        nothing_selected_yet = nof == 0 and n == nof_list[-1]
        if val_score > high_score or nothing_selected_yet:
            print(f"RFE with {n} features is the best so far...")
            print("Train Score: ", round(train_score, 3))
            print("Validation Score: ", round(val_score, 3))
            high_score = val_score
            nof = n
            RFE_feats = candidate_feats
            best_val_predictions = val_predictions

    # ------------ Recursive Feature Elimination: fine search around the best size
    # Sizes nof-5, nof-2, nof+1, nof+4, restricted to valid feature counts.
    fine_steps = [
        step
        for step in np.arange(nof - 5, nof + 5, 3, dtype=int)
        if step != nof and 1 <= step <= n_available
    ]

    for j in fine_steps:
        print(f"RFE with {j} features testing...")
        train_score, val_score, candidate_feats, val_predictions = _fit_and_score_rfe(
            int(j), 3, X_train_imputed, y_train_1d, X_val_imputed, y_val_1d
        )

        if val_score > high_score:
            print(f"RFE with {j} features is the best so far - Train Score: {round(train_score, 3)} / Val Score: {round(val_score, 3)}")
            high_score = val_score
            nof = j
            RFE_feats = candidate_feats
            best_val_predictions = val_predictions

    feat_sel_results.append({
        "CV": f"CV_{i}",
        "Target": "Ordinal_Encoded",
        "RFE Features": RFE_feats,
        "Validation Score": high_score
    })
    print(f"Best RFE Features: {RFE_feats}")
    print(f"Best Validation Score: {high_score} with {nof} features")
    # Report on the best candidate's predictions, not on whichever model
    # happened to be evaluated last.
    print(f"Classification Report: {classification_report(y_val_1d, best_val_predictions)}")
    print("--------------------------------------------------")

feat_sel_results

Initializing CV_1/3...
Ordinal encoding...
Frequency encoding...
Impuiting missing values...
Redundant features to drop: ['Average Weekly Wage', 'Days from COVID', 'C2_Accident_gap_weeks', 'Hearing Date_missing', 'Hearing_assembly_gap_months', 'Gender_encoded']
Number of features after dropping redundant features: 48
RFE with 20 features testing...
Fitting estimator with 48 features.
Fitting estimator with 43 features.
Fitting estimator with 38 features.
Fitting estimator with 33 features.
Fitting estimator with 28 features.
Fitting estimator with 23 features.
RFE with 20 features is the best so far...
Train Score:  0.492
Validation Score:  0.402
RFE with 25 features testing...
Fitting estimator with 48 features.
Fitting estimator with 43 features.
Fitting estimator with 38 features.
Fitting estimator with 33 features.
Fitting estimator with 28 features.
RFE with 30 features testing...
Fitting estimator with 48 features.
Fitting estimator with 43 features.
Fitting estimator with 38 fea

[{'CV': 'CV_1',
  'Target': 'Ordinal_Encoded',
  'RFE Features': ['Age at Injury',
   'Attorney/Representative',
   'COVID-19 Indicator',
   'IME-4 Count',
   'Number of Dependents',
   'Accident Date_year',
   'Accident Date_weekday',
   'Accident Date_month',
   'Accident Date_month_sin',
   'Accident Date_assembly_gap_days',
   'C3-C2_gap_days',
   'C2_missing',
   'C3_missing',
   'C3_Accident_gap_weeks',
   'Hearing_C3 gap_months',
   'Hearing_C2 gap_months',
   'Days to Assembly',
   'Days to First Hearing',
   'Average Weekly Wage_log',
   'Carrier Name_encoded',
   'Carrier Type_encoded',
   'County of Injury_encoded',
   'District Name_encoded',
   'Industry Code Description_encoded',
   'WCIO Cause of Injury Description_encoded',
   'WCIO Nature of Injury Description_encoded',
   'WCIO Part Of Body Description_encoded',
   'Zip Code_encoded',
   'County of Worker_encoded',
   'Carrier Name_freq',
   'County of Injury_freq',
   'District Name_freq',
   'Industry Code Descripti

In [50]:
# Pool the per-fold RFE selections into one flat list.
all_features = [
    feature
    for fold_result in feat_sel_results
    for feature in fold_result['RFE Features']
]

# Number of fold selections each feature appears in.
feature_counts = Counter(all_features)

# Keep the features that appear in at least two fold selections.
selected_features = [name for name in feature_counts if feature_counts[name] >= 2]

selected_features

['Age at Injury',
 'Attorney/Representative',
 'IME-4 Count',
 'Accident Date_year',
 'Accident Date_assembly_gap_days',
 'C3-C2_gap_days',
 'C2_missing',
 'C3_missing',
 'C3_Accident_gap_weeks',
 'Hearing_C3 gap_months',
 'Hearing_C2 gap_months',
 'Days to Assembly',
 'Days to First Hearing',
 'Average Weekly Wage_log',
 'Carrier Name_encoded',
 'County of Injury_encoded',
 'Industry Code Description_encoded',
 'WCIO Cause of Injury Description_encoded',
 'WCIO Nature of Injury Description_encoded',
 'WCIO Part Of Body Description_encoded',
 'Zip Code_encoded',
 'County of Worker_encoded',
 'Carrier Name_freq',
 'County of Injury_freq',
 'District Name_freq',
 'Industry Code Description_freq',
 'WCIO Cause of Injury Description_freq',
 'WCIO Nature of Injury Description_freq',
 'WCIO Part Of Body Description_freq',
 'Zip Code_freq',
 'County of Worker_freq']

In [51]:
# Display how many features are in `selected_features`.
len(selected_features)

31