# Feature Engineering

1. Select low/high-variance variables
2. Dropping redundant variables
3. RFE

In [1]:
import pandas as pd
import dslabs_functions as dslabs
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from matplotlib.pyplot import subplots, show
from matplotlib.pyplot import savefig, figure

## tratar da primeira coluna nos notebooks anteriores
winning_alternative_scaling = 2
folder = "feat_eng_csvs"
filename = f'../data_preparation_csvs/scaling_csvs/data_scaled_minmax_alt2.csv'
data_scaling: pd.DataFrame = pd.read_csv(filename, sep=',', decimal='.', na_values='')
print(f"Dataset nr records={data_scaling.shape[0]}", f"nr variables={data_scaling.shape[1]}")
filename_prefix = "../data_preparation_csvs/outliers_treatment_csvs/feature_eng_"

train = pd.read_csv('../data_preparation_csvs/scaling_csvs/train_data_scaled_minmax_alt2.csv')
test = pd.read_csv('../data_preparation_csvs/scaling_csvs/test_data_scaled_minmax_alt2.csv')

Dataset nr records=100000 nr variables=32


In [2]:
# Save to file
def save(df, name):
    df.to_csv(f'{filename_prefix}{name}_data.csv', index=False)

    ## Separate into Train and Test and save in different files
    train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

    ## Save the training and testing sets to CSV files
    train_data.to_csv(f'../data_preparation_csvs/{folder}/train_data_mv_{name}.csv', index=False)
    test_data.to_csv(f'../data_preparation_csvs/{folder}/test_data_mv_{name}.csv', index=False)

    ## Evaluate Approach
    file_tag = "Credit_Score"
    target = "Credit_Score"

    figure()
    eval: dict[str, list] = dslabs.evaluate_approach(train_data, test_data, target=target, metric="recall")
    dslabs.plot_multibar_chart(
        ["NB", "KNN"], eval, title=f"{file_tag} evaluation", percentage=True
    )
    savefig(f"../data_preparation_images/{folder}_result/data_mv_{name}_eval.png")
    show()

In [3]:
target = "Credit_Score"
file_tag = "Credit_Score"
print("Original variables", train.columns.to_list())
vars2drop: list[str] = dslabs.select_low_variance_variables(train, 0.015, target=target)
print("Variables to drop", vars2drop)
print(len(vars2drop), len(train.columns))

Original variables ['Month', 'Occupation', 'Payment_of_Min_Amount', 'CreditMix', 'Payment_Behaviour', 'Payday Loan', 'Personal Loan', 'Debt Consolidation Loan', 'Auto Loan', 'Not Specified Loan', 'Student Loan', 'Credit-Builder Loan', 'Mortgage Loan', 'Home Equity Loan', 'Age', 'Annual_Income', 'Monthly_Inhand_Salary', 'Num_Bank_Accounts', 'Num_Credit_Card', 'Interest_Rate', 'NumofLoan', 'Delay_from_due_date', 'NumofDelayedPayment', 'ChangedCreditLimit', 'NumCreditInquiries', 'OutstandingDebt', 'CreditUtilizationRatio', 'Credit_History_Age', 'TotalEMIpermonth', 'Amountinvestedmonthly', 'MonthlyBalance', 'Credit_Score']
Variables to drop ['Annual_Income', 'Num_Bank_Accounts', 'Num_Credit_Card', 'Interest_Rate', 'NumofLoan', 'NumofDelayedPayment', 'NumCreditInquiries', 'TotalEMIpermonth', 'Amountinvestedmonthly']
9 32


In [None]:
target = "Credit_Score"
file_tag = "Credit_Score"

eval_metric = "recall"
figure(figsize=(2 * dslabs.HEIGHT, dslabs.HEIGHT))
print(dslabs.study_variance_for_feature_selection(
    train,
    test,
    target=target,
    max_threshold=0.015,
    lag=0.01,
    metric=eval_metric,
    file_tag=file_tag,
))
show()

In [None]:
vals = pd.DataFrame(data={'NB': [0.8305960639954694, 0.8717966869602152, 0.8874415970550757, 0.8874415970550757, 0.8873000141582896, 0.8873708056066827, 0.8908395865779414, 0.8908395865779414, 0.8905564207843693, 0.8905564207843693], 'KNN': [0.8912643352682996, 0.8824861956675634, 0.877530794280051, 0.877601585728444, 0.8785926660059464, 0.8948746991363443, 0.8945207418943791, 0.8945207418943791, 0.9318278351975081, 0.9318278351975081]}
)

print(vals["KNN"])

In [None]:
print("Original variables", train.columns.values)
vars2drop: list[str] = dslabs.select_redundant_variables(
    train, target=target, min_threshold=0.54
)
print("Variables to drop", vars2drop)

In [8]:
from math import ceil


options: list[float] = [
    round(0.5 + i * 0.01, 3)
    for i in range(ceil((1 - 0.5) / 0.01) + 1)
]
                                                # OPTIMAL VALUE: 0.54 (2 cols dropped)

vars2drop: list[str] = dslabs.select_redundant_variables(
    train, target=target, min_threshold=0.54
)
print("Variables to drop", vars2drop)
df = train.copy()
df.drop(columns=vars2drop)

Variables to drop ['Payment_of_Min_Amount', 'Delay_from_due_date']
0        0.0
1        0.0
2        0.0
3        0.0
4        0.5
        ... 
79995    0.0
79996    1.0
79997    0.5
79998    0.0
79999    0.5
Name: Payment_of_Min_Amount, Length: 80000, dtype: float64


In [12]:
eval_metric = "recall"

new_train = train.copy()
new_train.drop(columns=vars2drop)
new_test = train.copy()
new_test.drop(columns=vars2drop)

figure(figsize=(2 * dslabs.HEIGHT, dslabs.HEIGHT))
print(dslabs.study_redundancy_for_feature_selection(
    new_train,
    new_test,
    target=target,                      # RECALL DE 1 NO KNN WTF NNGAGNGNAGNANGANGA
    metric=eval_metric,
    file_tag=file_tag,
))
show()

{'NB': [0.8443983402489627], 'KNN': [1.0]}


<Figure size 1200x600 with 0 Axes>

In [5]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.feature_selection import RFE
import pandas as pd

# Assuming your data is loaded into a pandas DataFrame 'data'
# 'target_column' refers to the column you want to predict/classify

# Replace these with your actual data and target column
target_column = 'Credit_Score'  # Your target column name

X = data_scaling.copy()
X.drop(columns=["Credit_Score"])

y = data_scaling[target]

# List of models to test
models = [
    LogisticRegression(),
    RandomForestClassifier(),
    GradientBoostingClassifier()
]

# Iterate through each model and perform RFE
for model in models:
    model_name = model.__class__.__name__
    rfe = RFE(model)  # Adjust the number of features as needed
    rfe.fit(X, y)
    selected_features = X.columns[rfe.support_]
    print(f"Selected Features for {model_name}:")
    print(selected_features)
    print("\n")

Selected Features for LogisticRegression:
Index(['Payment_of_Min_Amount', 'CreditMix', 'Payment_Behaviour',
       'Payday Loan', 'Debt Consolidation Loan', 'Auto Loan',
       'Not Specified Loan', 'Student Loan', 'Mortgage Loan',
       'Monthly_Inhand_Salary', 'Delay_from_due_date', 'ChangedCreditLimit',
       'OutstandingDebt', 'Credit_History_Age', 'Amountinvestedmonthly',
       'Credit_Score'],
      dtype='object')


Selected Features for RandomForestClassifier:
Index(['CreditMix', 'Annual_Income', 'Monthly_Inhand_Salary',
       'Num_Bank_Accounts', 'Num_Credit_Card', 'Interest_Rate', 'NumofLoan',
       'Delay_from_due_date', 'NumofDelayedPayment', 'ChangedCreditLimit',
       'NumCreditInquiries', 'OutstandingDebt', 'Credit_History_Age',
       'TotalEMIpermonth', 'MonthlyBalance', 'Credit_Score'],
      dtype='object')


Selected Features for GradientBoostingClassifier:
Index(['Annual_Income', 'Monthly_Inhand_Salary', 'Num_Bank_Accounts',
       'Num_Credit_Card', 'Interes