# Feature Importance

In [2]:
import pandas as pd
import numpy as np

## Dataset

In [3]:
df = pd.read_feather('removed_duplicated_columns.feather')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 726288 entries, 0 to 726287
Columns: 767 entries, HKTLMYY to TLJYWBE
dtypes: float64(742), int64(21), object(4)
memory usage: 4.2+ GB


In [4]:
target = 'TLJYWBE'
df[target] = df[target].apply(lambda v: v >= 1e-5)
df[target].value_counts()

False    726224
True         64
Name: TLJYWBE, dtype: int64

In [5]:
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(sampling_strategy=0.0005)  
X_resampled, _ = rus.fit_resample(df, df[target])

In [6]:
X_resampled[target].value_counts()

False    128000
True         64
Name: TLJYWBE, dtype: int64

In [7]:
X, y = X_resampled[[c for c in X_resampled.columns if c != target]], X_resampled[target]
y.value_counts()

False    128000
True         64
Name: TLJYWBE, dtype: int64

In [8]:
X_resampled['YEZPLBM'].value_counts(dropna=False)

NaN       128063
9999.0         1
Name: YEZPLBM, dtype: int64

In [9]:
df[df[target]]['YEZPLBM'].value_counts(dropna=False)

NaN    64
Name: YEZPLBM, dtype: int64

## Modeling

In [10]:
dtypes = X_resampled.dtypes
dtypes.head()

HKTLMYY      int64
IJEXXXL    float64
KVNLYTZ    float64
ADOIOYN    float64
GPCKISJ    float64
dtype: object

In [11]:
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [12]:
int_features = dtypes[dtypes == 'int64'].index
float_features = dtypes[dtypes == 'float64'].index
cat_features = dtypes[dtypes == 'object'].index
len(int_features), len(float_features), len(cat_features)

(21, 741, 4)

In [13]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('int', numeric_transformer, int_features),
        ('float', numeric_transformer, float_features),
        ('cat', categorical_transformer, cat_features)
    ])

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', 'passthrough')  
                          ])

In [54]:
param_grid = [
    {
        'classifier': [RidgeClassifier()],
        'classifier__alpha': [100, 150, 200],  
        'classifier__class_weight': [{0: 1, 1: w} for w in [75, 100, 150, 200]]
    },
    {
        'classifier': [RandomForestClassifier()],
        'classifier__n_estimators': [50, 75, 100, 125],
        'classifier__min_samples_split': [8, 10, 15, 20],
        'classifier__min_samples_leaf': [4, 6, 8],
        'classifier__class_weight': [{0: 1, 1: w} for w in [50, 75, 100, 150]] 
    }
]

In [55]:
%%time
search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1', n_jobs=2)
search.fit(training_features, train_target)

CPU times: user 29.7 s, sys: 4.52 s, total: 34.2 s
Wall time: 1h 48min 11s


In [56]:
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

Best parameter (CV score=0.204):
{'classifier': RandomForestClassifier(class_weight={0: 1, 1: 75}, min_samples_leaf=4,
                       min_samples_split=10), 'classifier__class_weight': {0: 1, 1: 75}, 'classifier__max_depth': None, 'classifier__min_samples_leaf': 4, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 100}


In [64]:
import pandas as pd
pd.set_option('display.max_colwidth', 400)
results_df = pd.DataFrame(search.cv_results_)
top_results = results_df.sort_values(by='mean_test_score', ascending=False)
top_results[['params', 'mean_test_score', 'std_test_score']].head(10)

Unnamed: 0,params,mean_test_score,std_test_score
37,"{'classifier': RandomForestClassifier(class_weight={0: 1, 1: 75}, min_samples_leaf=4,  min_samples_split=10), 'classifier__class_weight': {0: 1, 1: 75}, 'classifier__max_depth': None, 'classifier__min_samples_leaf': 4, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 100}",0.203785,0.070294
90,"{'classifier': RandomForestClassifier(class_weight={0: 1, 1: 75}, min_samples_leaf=4,  min_samples_split=10), 'classifier__class_weight': {0: 1, 1: 150}, 'classifier__max_depth': None, 'classifier__min_samples_leaf': 4, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 50}",0.170452,0.108905
36,"{'classifier': RandomForestClassifier(class_weight={0: 1, 1: 75}, min_samples_leaf=4,  min_samples_split=10), 'classifier__class_weight': {0: 1, 1: 75}, 'classifier__max_depth': None, 'classifier__min_samples_leaf': 4, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 50}",0.168571,0.106972
8,"{'classifier': RidgeClassifier(), 'classifier__alpha': 200, 'classifier__class_weight': {0: 1, 1: 75}}",0.168087,0.06925
45,"{'classifier': RandomForestClassifier(class_weight={0: 1, 1: 75}, min_samples_leaf=4,  min_samples_split=10), 'classifier__class_weight': {0: 1, 1: 100}, 'classifier__max_depth': None, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 50}",0.159341,0.105847
33,"{'classifier': RandomForestClassifier(class_weight={0: 1, 1: 75}, min_samples_leaf=4,  min_samples_split=10), 'classifier__class_weight': {0: 1, 1: 75}, 'classifier__max_depth': None, 'classifier__min_samples_leaf': 4, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 50}",0.159341,0.105847
32,"{'classifier': RandomForestClassifier(class_weight={0: 1, 1: 75}, min_samples_leaf=4,  min_samples_split=10), 'classifier__class_weight': {0: 1, 1: 75}, 'classifier__max_depth': None, 'classifier__min_samples_leaf': 4, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 200}",0.159341,0.105847
27,"{'classifier': RandomForestClassifier(class_weight={0: 1, 1: 75}, min_samples_leaf=4,  min_samples_split=10), 'classifier__class_weight': {0: 1, 1: 75}, 'classifier__max_depth': None, 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 50}",0.157436,0.106212
4,"{'classifier': RidgeClassifier(), 'classifier__alpha': 150, 'classifier__class_weight': {0: 1, 1: 75}}",0.153891,0.06752
0,"{'classifier': RidgeClassifier(), 'classifier__alpha': 100, 'classifier__class_weight': {0: 1, 1: 75}}",0.151519,0.063387


## Feature importance

### By best RF clf

In [105]:
feature_names = training_features.columns.tolist()
feature_importance = best_estimator.named_steps['classifier'].feature_importances_
feature_importance = dict(zip(feature_names, feature_importance))
sorted_rf_features = sorted(feature_importance.items(), key=lambda x: abs(x[1]), reverse=True)

In [124]:
rf_fi = pd.DataFrame(sorted_rf_features, columns=['feature','rf_importance'])
rf_fi['rf_relative_importance'] = abs(rf_fi.rf_importance) / abs(rf_fi.rf_importance.max())
rf_fi.head()

Unnamed: 0,feature,rf_importance,rf_relative_importance
0,KGJACPV,0.045163,1.0
1,GUEUYTS,0.043535,0.963963
2,ZAIRSDG,0.026666,0.590436
3,USAETVU,0.023823,0.527492
4,OJSWUAF,0.022448,0.49705


### By LR classifier 

In [111]:
lr_clf = RidgeClassifier(alpha=200, class_weight={0: 1, 1: 75})
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', lr_clf)  
                          ])

In [113]:
pipeline.fit(training_features, train_target)

In [114]:
y_pred_train = pipeline.predict(training_features)
f1 = f1_score(train_target, y_pred_train)
precision = precision_score(train_target, y_pred_train)
print(f"F1 Score on Training Set: {f1:.3f}")
print(f"Precision on Training Set: {precision:.3f}")

F1 Score on Training Set: 0.552
Precision on Training Set: 0.381


In [115]:
test_predictions = pipeline.predict(test_features)
f1 = f1_score(test_target, test_predictions)
precision = precision_score(test_target, test_predictions)
print(f"F1 Score on Test Set: {f1:.3f}")
print(f"Precision on Test Set: {precision:.3f}")

F1 Score on Test Set: 0.077
Precision on Test Set: 0.056


In [117]:
feature_names = training_features.columns.tolist()
coefficients = pipeline.named_steps['classifier'].coef_[0]
feature_importance = dict(zip(feature_names, coefficients))
sorted_lr_features = sorted(feature_importance.items(), key=lambda x: abs(x[1]), reverse=True)

In [125]:
lr_fi = pd.DataFrame(sorted_lr_features, columns=['feature','lr_importance'])
lr_fi['lr_relative_importance'] = abs(lr_fi.lr_importance) / abs(lr_fi.lr_importance).max()
lr_fi.head()

Unnamed: 0,feature,lr_importance,lr_relative_importance
0,YEZPLBM,-0.223787,1.0
1,GYXMTQR,-0.172428,0.770499
2,RRMMRAJ,0.12746,0.569559
3,OMIIMXP,0.116486,0.520523
4,AEXRRBM,0.113696,0.508055


In [126]:
fi_df = rf_fi.merge(lr_fi, on='feature')
fi_df.head(20)

Unnamed: 0,feature,rf_importance,rf_relative_importance,lr_importance,lr_relative_importance
0,KGJACPV,0.045163,1.0,0.095134,0.425108
1,GUEUYTS,0.043535,0.963963,0.005939,0.026538
2,ZAIRSDG,0.026666,0.590436,0.007464,0.033354
3,USAETVU,0.023823,0.527492,0.011544,0.051586
4,OJSWUAF,0.022448,0.49705,-0.011266,0.05034
5,GYXMTQR,0.013477,0.298413,-0.172428,0.770499
6,NYWUAUO,0.011368,0.251719,-0.018634,0.083266
7,RZPQSGM,0.010406,0.230416,-0.024375,0.108922
8,BGDAMPC,0.010342,0.228992,-0.01903,0.085037
9,AKUNFFN,0.009881,0.21878,-0.055187,0.246603


In [127]:
fi_df.to_csv('feature_importance_1_to_1000_downsample_using_lr_and_rf.csv', index=False)

## Selecting top features

In [2]:
fi_df = pd.read_csv('feature_importance_1_to_1000_downsample_using_lr_and_rf.csv')
fi_df.head()

Unnamed: 0,feature,rf_importance,rf_relative_importance,lr_importance,lr_relative_importance
0,KGJACPV,0.045163,1.0,0.095134,0.425108
1,GUEUYTS,0.043535,0.963963,0.005939,0.026538
2,ZAIRSDG,0.026666,0.590436,0.007464,0.033354
3,USAETVU,0.023823,0.527492,0.011544,0.051586
4,OJSWUAF,0.022448,0.49705,-0.011266,0.05034


In [3]:
fi_df['total_relative_imp'] = fi_df.rf_relative_importance + fi_df.lr_relative_importance
fi_df.sort_values(by='total_relative_imp', ascending=False).head(20)

Unnamed: 0,feature,rf_importance,rf_relative_importance,lr_importance,lr_relative_importance,total_relative_imp
0,KGJACPV,0.045163,1.0,0.095134,0.425108,1.425108
5,GYXMTQR,0.013477,0.298413,-0.172428,0.770499,1.068913
623,YEZPLBM,8.8e-05,0.00195,-0.223787,1.0,1.00195
1,GUEUYTS,0.043535,0.963963,0.005939,0.026538,0.9905
2,ZAIRSDG,0.026666,0.590436,0.007464,0.033354,0.62379
331,RRMMRAJ,0.000823,0.018227,0.12746,0.569559,0.587786
3,USAETVU,0.023823,0.527492,0.011544,0.051586,0.579079
63,AEXRRBM,0.002922,0.064694,0.113696,0.508055,0.572749
218,OMIIMXP,0.001266,0.028022,0.116486,0.520523,0.548545
4,OJSWUAF,0.022448,0.49705,-0.011266,0.05034,0.547391


In [21]:
top_50_features = fi_df.sort_values(by='rf_importance', ascending=False).head(50).feature.values
top_50_features

array(['KGJACPV', 'GUEUYTS', 'ZAIRSDG', 'USAETVU', 'OJSWUAF', 'GYXMTQR',
       'NYWUAUO', 'RZPQSGM', 'BGDAMPC', 'AKUNFFN', 'KPQSPBC', 'AUSDIEL',
       'AIKOJYC', 'ULCBNOU', 'RCVCFXR', 'DKOVDCG', 'LPKKAWN', 'LGXZTHS',
       'DVSZBLN', 'OKIKPOJ', 'SHEGNUT', 'FJCYMJD', 'FFJOGRA', 'OAEOBHM',
       'OMPWYAA', 'TZNKMJP', 'FPGUGZC', 'HNJMSWN', 'HFHITLP', 'LADIFWP',
       'OKMWHQM', 'CFAIUJQ', 'GPSLGET', 'VFXXPOM', 'UIDFQDA', 'PIFCYQL',
       'GJFZDWF', 'TWXNMUD', 'FZHCPXN', 'XFLCFEY', 'ZZTPIJA', 'MUEKGHC',
       'CHWVXPB', 'ALDDXXN', 'EFOYHEQ', 'AUNDDNZ', 'AGTCLZR', 'PWRCZZP',
       'UGXVMKE', 'CVWPAJZ'], dtype=object)

In [22]:
dtypes.loc[top_50_features].value_counts()

float64    49
int64       1
dtype: int64

In [25]:
null_prop = df.isnull().sum() / len(df)
null_prop.loc[top_50_features].sort_values(ascending=False).head(10)

UIDFQDA    0.997081
FZHCPXN    0.986991
VFXXPOM    0.907871
OJSWUAF    0.811887
TWXNMUD    0.736149
GJFZDWF    0.736149
AUNDDNZ    0.736149
OAEOBHM    0.736149
FJCYMJD    0.736149
ZAIRSDG    0.711652
dtype: float64

In [26]:
df[top_50_features].to_feather('dataset_reduced_to_top_50_features_by_rf_importance.feather')

## Selection using mutual information

In [14]:
features = [c for c in df.columns if c != target]
len(features)

766

In [15]:
from sklearn.feature_selection import mutual_info_classif, SelectKBest
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=0.25, random_state=42, stratify=df[target])

In [16]:
X_train = preprocessor.fit_transform(X_train)

In [None]:
selector = SelectKBest(mutual_info_classif, k=50)
selector.fit(X_train, y_train)

selected_feature_names = X_train.columns[selector.get_support()]

In [None]:
feature_union =set(selected_feature_names).union(set(top_50_features))
len(feature_union)