In [1]:
import os

os.chdir('../.')

%pwd

'd:\\work\\loan-approval-prediction'

In [2]:
import pandas as pd

from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
from mlxtend.feature_selection import ExhaustiveFeatureSelector
from sklearn.ensemble import RandomForestClassifier


def input_feature_selection(X, y, cols, random_state=0):
    """Select input features as the union of three selection methods.

    The methods are:
      1. mlxtend ``ExhaustiveFeatureSelector`` wrapped around a random forest,
         searching subsets of 4 to 8 features scored by ROC AUC with 2-fold CV.
      2. Random-forest feature importances strictly greater than 0.04.
      3. ``SelectFromModel`` wrapped around a Lasso model (alpha=0.005).

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix. Its column labels are used as indices into ``cols``
        (``cols[label]``), so they must be integers valid for that list.
    y : array-like
        Target, passed unchanged to each estimator's ``fit``.
    cols : list
        Feature names; ``cols[label]`` is the name for the X column ``label``.
    random_state : int or None, default 0
        Seed given to both random forests so repeated calls on the same data
        select the same features. Pass ``None`` for unseeded forests.

    Returns
    -------
    list
        The entries of ``cols`` picked by at least one method, in the order
        they appear in ``cols``.
    """
    # Method-1: Exhaustive Feature Selector
    efs = ExhaustiveFeatureSelector(
        RandomForestClassifier(random_state=random_state),
        min_features=4,
        max_features=8,
        scoring='roc_auc',
        cv=2,
    )
    efs = efs.fit(X, y)
    # best_idx_ holds column positions; translate them to X's labels first,
    # because `cols` is indexed by label.
    selected_feat1 = [cols[i] for i in X.columns[list(efs.best_idx_)]]

    # Method-2: RandomForest importance
    model = RandomForestClassifier(n_estimators=340, random_state=random_state)
    model.fit(X, y)
    # Keep the columns whose importance exceeds the 0.04 threshold. Their
    # order is irrelevant here: the final result is re-ordered by `cols`.
    th_features = X.columns[model.feature_importances_ > 0.04]
    selected_feat2 = [cols[i] for i in th_features]

    # Method-3: Lasso
    feature_sel_model = SelectFromModel(Lasso(alpha=0.005, random_state=0))
    feature_sel_model.fit(X, y)
    # get_support() is a boolean mask over X's columns.
    lasso_feature = X.columns[feature_sel_model.get_support()]
    selected_feat3 = [cols[i] for i in lasso_feature]

    # Union of the three selections, reported in the original `cols` order.
    chosen = set(selected_feat1) | set(selected_feat2) | set(selected_feat3)
    return [feat for feat in cols if feat in chosen]


In [3]:
from src.utils import feature_engineering, input_dataframe

# Training data CSV, given relative to the current working directory.
in_file = "notebooks/data/input_training_data.csv"
df = input_dataframe(in_file)

# Feature engineering: pass every column name except the first to the
# project helper, then pull the 'X' and 'y' entries out of what it returns.
cols = df.columns[1:].tolist()
feateng = feature_engineering(df, colslist=cols, return_type='XY')
X, y = feateng['X'], feateng['y']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.interpolate(method='linear', limit_direction='backward', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.interpolate(method='linear', limit_direction='forward', inplace=True)


In [5]:
# Run all three selectors and keep the union of their picks. The exhaustive
# search dominates the work: the recorded run evaluated 1749 feature subsets.
final_features = input_feature_selection(X, y, cols)

Features: 1749/1749

In [6]:
# Display the selected feature names (listed in the same order as `cols`).
final_features

['Married',
 'Dependents',
 'Education',
 'Self_Employed',
 'ApplicantIncome',
 'CoapplicantIncome',
 'LoanAmount',
 'Loan_Amount_Term',
 'Credit_History',
 'Property_Area']