## Imports

In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from scipy.stats import chi2_contingency
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,classification_report,precision_recall_fscore_support
import warnings
import os

## Data Loading and Initial Preprocessing

In [27]:
a1=pd.read_excel("case_study1.xlsx")
a2=pd.read_excel("case_study2.xlsx")

In [28]:
df1=a1.copy()
df2=a2.copy()

In [29]:
##EDA
#removing null
df1=df1.loc[df1['Age_Oldest_TL']!=-99999]

colums_to_be_removed=[]
for i in df2.columns:
    if df2.loc[df2[i]==-99999].shape[0]>10000:
        colums_to_be_removed.append(i)

df2=df2.drop(colums_to_be_removed,axis=1)

for i in df2.columns:
    df2=df2.loc[df2[i]!=-99999]

In [30]:
#merging data

df=pd.merge(df1, df2,how='inner',left_on=['PROSPECTID'],right_on=['PROSPECTID'])   


In [31]:
# Check categorical columns
for i in df.columns:
    if df[i].dtype == 'object':
        print(i)

MARITALSTATUS
EDUCATION
GENDER
last_prod_enq2
first_prod_enq2
Approved_Flag


## Chi-Square Test of Independence


### Objective:
The Chi-square test of independence evaluates whether there is a significant relationship between two categorical variables. In this case, you are testing the relationship between each categorical feature (MARITALSTATUS, EDUCATION, GENDER, last_prod_enq2, first_prod_enq2) and the target variable (Approved_Flag).

### Hypotheses:

* Null Hypothesis (𝐻_0): The two variables are independent (no association).
* Alternative Hypothesis (𝐻_𝑎): The two variables are not independent (there is an association).
Interpretation:

A low p-value (typically ≤ 0.05) indicates that you can reject the null hypothesis, suggesting a significant association between the variables.
A high p-value (> 0.05) indicates that you cannot reject the null hypothesis, suggesting no significant association between the variables.

### Results Interpretation
Here are the p-values you provided:

MARITALSTATUS: 
3.578
×
1
0
−
233
3.578×10 
−233
 
EDUCATION: 
2.694
×
1
0
−
30
2.694×10 
−30
 
GENDER: 
1.908
×
1
0
−
5
1.908×10 
−5
 
last_prod_enq2: 
0.0
0.0
first_prod_enq2: 
7.850
×
1
0
−
287
7.850×10 
−287
 
### Interpretation:

MARITALSTATUS: The p-value is extremely low, indicating a significant association between marital status and the approval flag.

EDUCATION: The p-value is very low, indicating a significant association between education and the approval flag.

GENDER: The p-value is low, indicating a significant association between gender and the approval flag.

last_prod_enq2: The p-value is effectively zero, indicating a very strong association between the last product enquiry type and the approval flag.

first_prod_enq2: The p-value is extremely low, indicating a significant association between the first product enquiry type and the approval flag.

In summary, all the tested categorical features (MARITALSTATUS, EDUCATION, GENDER, last_prod_enq2, first_prod_enq2) have a significant association with the approval flag (Approved_Flag). This means that these features can be considered important in predicting the target variable and should be included in the model.

In [32]:
# Chi-square test for categorical features
from scipy.stats import chi2_contingency
for i in ['MARITALSTATUS', 'EDUCATION', 'GENDER', 'last_prod_enq2', 'first_prod_enq2']:
    chi2, pval, _, _ = chi2_contingency(pd.crosstab(df[i], df['Approved_Flag']))
    print(i, '---', pval)

MARITALSTATUS --- 3.5781808610388605e-233
EDUCATION --- 2.6942265249737532e-30
GENDER --- 1.9079361001865664e-05
last_prod_enq2 --- 0.0
first_prod_enq2 --- 7.849976105554191e-287


In [33]:
# Select relevant numerical columns
numeric_columns = [col for col in df.columns if df[col].dtype != 'object' and col not in ['PROSPECTID', 'Approved_Flag']]

## Variance Inflation Factor Test 


The Variance Inflation Factor (VIF) measures the amount of multicollinearity (correlation between predictor variables) in a regression model. High multicollinearity can inflate the variance of coefficient estimates and make the model unstable.

### Interpretation:
VIF = 1: No correlation between the predictor and other variables.

1 < VIF < 5: Moderate correlation but not severe.

VIF > 5: High correlation that may be problematic.

VIF > 10: Very high correlation, indicating severe multicollinearity, and the variable should likely be removed.

In [37]:
# Variance Inflation Factor (VIF) to remove multicollinear features
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Load the numeric columns data
vif_data = df[numeric_columns].copy()
columns_to_be_kept = []

In [38]:
# Calculate VIF for each feature and remove those with high multicollinearity
while True:
    vif_values = [variance_inflation_factor(vif_data.values, i) for i in range(vif_data.shape[1])]
    max_vif = max(vif_values)
    max_vif_index = vif_values.index(max_vif)
    
    if max_vif > 10:
        print(f"Dropping {vif_data.columns[max_vif_index]} with VIF {max_vif}")
        vif_data.drop(vif_data.columns[max_vif_index], axis=1, inplace=True)
    else:
        break

# Update the columns to be kept based on VIF
columns_to_be_kept = vif_data.columns.tolist()
print("Columns to be kept based on VIF:", columns_to_be_kept)

  vif = 1. / (1. - r_squared_i)


Dropping Total_TL with VIF inf


  vif = 1. / (1. - r_squared_i)


Dropping Tot_Closed_TL with VIF inf


  vif = 1. / (1. - r_squared_i)


Dropping pct_active_tl with VIF inf


  vif = 1. / (1. - r_squared_i)


Dropping Auto_TL with VIF inf


  vif = 1. / (1. - r_squared_i)


Dropping num_deliq_6mts with VIF inf
Dropping pct_of_active_TLs_ever with VIF 2688.950269071678
Dropping Secured_TL with VIF 91.06825895268156
Dropping enq_L12m with VIF 36.973403461591815
Dropping Credit_Score with VIF 33.37615091051607
Dropping num_std_12mts with VIF 26.132728283549536
Dropping pct_PL_enq_L6m_of_L12m with VIF 24.111711369003796
Dropping Total_TL_opened_L12M with VIF 22.140448203659652
Dropping Unsecured_TL with VIF 19.94482644532673
Dropping pct_CC_enq_L6m_of_L12m with VIF 19.1598991176436
Dropping enq_L6m with VIF 16.71623271795243
Dropping num_times_30p_dpd with VIF 13.67574175637928
Dropping AGE with VIF 12.91749147385467
Dropping PL_enq_L12m with VIF 12.431183953953077
Dropping Tot_Active_TL with VIF 12.316359114662921
Columns to be kept based on VIF: ['Total_TL_opened_L6M', 'Tot_TL_closed_L6M', 'pct_tl_open_L6M', 'pct_tl_closed_L6M', 'pct_closed_tl', 'Tot_TL_closed_L12M', 'pct_tl_open_L12M', 'pct_tl_closed_L12M', 'Tot_Missed_Pmnt', 'CC_TL', 'Consumer_TL', 'Gold_

In [39]:
len(columns_to_be_kept)

53

In [40]:
len(df.columns)

79

## ANOVA Test


The Analysis of Variance (ANOVA) test is used to determine if there are statistically significant differences between the means of three or more independent (unrelated) groups. In the context of your code, it's used to determine whether each numerical feature significantly differs across the categories of the target variable Approved_Flag.

### Hypotheses:
Null Hypothesis (𝐻_0): The means of the different groups are equal (i.e., there is no significant difference between groups).

Alternative Hypothesis (𝐻_𝑎): At least one group mean is different from the others (i.e., there is a significant difference between groups).

### Steps in the ANOVA Test:
Partitioning the Data: The numerical feature data is partitioned into groups based on the levels of the categorical variable (here, Approved_Flag).

Calculate F-statistic: The F-statistic is calculated by comparing the variance between the groups to the variance within the groups.

Compute p-value: The p-value indicates the probability that the observed differences are due to chance.
Interpretation of p-value:

* p-value ≤ 0.05: Reject the null hypothesis. There is a statistically significant difference between the means of the groups.
* p-value > 0.05: Fail to reject the null hypothesis. There is no statistically significant difference between the means of the groups.



In [41]:
from scipy.stats import chi2_contingency, f_oneway

In [42]:
# ANOVA test for numerical features
columns_to_be_kept_numerical = []
for i in columns_to_be_kept:
    groups = [list(df[df['Approved_Flag'] == group][i]) for group in df['Approved_Flag'].unique()]
    f_statistic, p_value = f_oneway(*groups)
    if p_value <= 0.05:
        columns_to_be_kept_numerical.append(i)

In [44]:
len(columns_to_be_kept_numerical)

50

## Preprocessing

In [45]:
features = columns_to_be_kept_numerical + ['MARITALSTATUS', 'EDUCATION', 'GENDER', 'last_prod_enq2', 'first_prod_enq2']

In [46]:
df = df[features + ['Approved_Flag']]

In [48]:
# Label encoding for categorical features
df['EDUCATION'] = df['EDUCATION'].replace({
    'SSC': 1,
    '12TH': 2,
    'GRADUATE': 3,
    'UNDER GRADUATE': 3,
    'OTHERS': 1,
    'POST-GRADUATE': 4,
    'PROFESSIONAL': 3
})
df['EDUCATION'] = df['EDUCATION'].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42064 entries, 0 to 42063
Data columns (total 56 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Total_TL_opened_L6M         42064 non-null  int64  
 1   Tot_TL_closed_L6M           42064 non-null  int64  
 2   pct_tl_open_L6M             42064 non-null  float64
 3   pct_tl_closed_L6M           42064 non-null  float64
 4   pct_closed_tl               42064 non-null  float64
 5   Tot_TL_closed_L12M          42064 non-null  int64  
 6   pct_tl_open_L12M            42064 non-null  float64
 7   pct_tl_closed_L12M          42064 non-null  float64
 8   Tot_Missed_Pmnt             42064 non-null  int64  
 9   CC_TL                       42064 non-null  int64  
 10  Consumer_TL                 42064 non-null  int64  
 11  Gold_TL                     42064 non-null  int64  
 12  Home_TL                     42064 non-null  int64  
 13  PL_TL                       420

  df['EDUCATION'] = df['EDUCATION'].replace({


In [49]:
df_encoded = pd.get_dummies(df, columns=["MARITALSTATUS", 'GENDER', 'last_prod_enq2', 'first_prod_enq2'])

## Feature selection using RandomForestClassifier

In [50]:
y = df_encoded['Approved_Flag']
x = df_encoded.drop(['Approved_Flag'], axis=1)

# Split the data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [51]:
# Initialize and fit the model
rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42)
rf_classifier.fit(x_train, y_train)

In [52]:
# Get feature importances
importances = rf_classifier.feature_importances_
feature_names = x.columns

In [53]:
# Create a DataFrame for feature importances
feature_importances = pd.DataFrame({'Feature': feature_names, 'Importance': importances})

In [54]:
# Sort features by importance
feature_importances = feature_importances.sort_values(by='Importance', ascending=False)

In [61]:
feature_importances

Unnamed: 0,Feature,Importance
15,Age_Oldest_TL,0.126006
40,enq_L3m,0.086309
39,time_since_recent_enq,0.079656
23,num_std,0.047524
17,time_since_recent_payment,0.042522
...,...,...
27,num_sub_12mts,0.000346
31,num_lss,0.000194
26,num_sub_6mts,0.000148
30,num_dbt_12mts,0.000092


In [64]:
# Display the top features
print("Top features based on importance:")
print(feature_importances.head(50))

Top features based on importance:
                         Feature  Importance
15                 Age_Oldest_TL    0.126006
40                       enq_L3m    0.086309
39         time_since_recent_enq    0.079656
23                       num_std    0.047524
17     time_since_recent_payment    0.042522
42           Time_With_Curr_Empr    0.038820
41              NETMONTHLYINCOME    0.034316
16                 Age_Newest_TL    0.032184
24                  num_std_6mts    0.031987
33                       tot_enq    0.031887
46        pct_PL_enq_L6m_of_ever    0.029913
32         recent_level_of_deliq    0.025623
6               pct_tl_open_L12M    0.024062
19     max_recent_level_of_deliq    0.023846
4                  pct_closed_tl    0.023192
18          num_times_delinquent    0.019986
38                    PL_enq_L6m    0.019593
14                      Other_TL    0.016838
2                pct_tl_open_L6M    0.014646
10                   Consumer_TL    0.014484
37                   

In [65]:
import pandas as pd

# Assuming feature_importances is a DataFrame with Feature and Importance columns
# Calculate cumulative importance
feature_importances['Cumulative Importance'] = feature_importances['Importance'].cumsum()

# Set the threshold for cumulative importance
threshold = 0.95

# Select features contributing to the cumulative importance below the threshold
selected_features = feature_importances[feature_importances['Cumulative Importance'] <= threshold]['Feature']

print(f"Selected features based on cumulative importance up to {threshold * 100}%:")
print(selected_features)

# Print number of selected features
print(f"Number of selected features: {len(selected_features)}")

Selected features based on cumulative importance up to 95.0%:
15                   Age_Oldest_TL
40                         enq_L3m
39           time_since_recent_enq
23                         num_std
17       time_since_recent_payment
42             Time_With_Curr_Empr
41                NETMONTHLYINCOME
16                   Age_Newest_TL
24                    num_std_6mts
33                         tot_enq
46          pct_PL_enq_L6m_of_ever
32           recent_level_of_deliq
6                 pct_tl_open_L12M
19       max_recent_level_of_deliq
4                    pct_closed_tl
18            num_times_delinquent
38                      PL_enq_L6m
14                        Other_TL
2                  pct_tl_open_L6M
10                     Consumer_TL
37                          PL_enq
50                       EDUCATION
7               pct_tl_closed_L12M
11                         Gold_TL
20                 num_deliq_12mts
3                pct_tl_closed_L6M
8                  Tot_Misse

## Selecting number of features by testing on RF and XGB

In [77]:
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

In [76]:
def train_and_evaluate(x_train, x_test, y_train, y_test):
    results = {}

    # Random Forest Classifier
    rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42)
    rf_classifier.fit(x_train, y_train)
    y_pred_rf = rf_classifier.predict(x_test)
    accuracy_rf = accuracy_score(y_test, y_pred_rf)
    precision_rf, recall_rf, f1_score_rf, _ = precision_recall_fscore_support(y_test, y_pred_rf, average='weighted')

    results['Random Forest'] = {
        'Accuracy': accuracy_rf,
        'Precision': precision_rf,
        'Recall': recall_rf,
        'F1 Score': f1_score_rf
    }

    # XGBoost Classifier
    xgb_classifier = xgb.XGBClassifier(objective='multi:softmax', num_class=4)
    xgb_classifier.fit(x_train, y_train)
    y_pred_xgb = xgb_classifier.predict(x_test)
    accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
    precision_xgb, recall_xgb, f1_score_xgb, _ = precision_recall_fscore_support(y_test, y_pred_xgb, average='weighted')

    results['XGBoost'] = {
        'Accuracy': accuracy_xgb,
        'Precision': precision_xgb,
        'Recall': recall_xgb,
        'F1 Score': f1_score_xgb
    }

    return results

In [78]:
# Function to select top N features and evaluate classifiers
def evaluate_with_top_features(df_encoded, feature_importances, num_features):
    # Select top N features based on importance
    selected_features = feature_importances.head(num_features)['Feature']
    
    # Prepare data
    y = df_encoded['Approved_Flag']
    x = df_encoded[selected_features]

    # Encode target variable
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)

    # Split the data
    x_train, x_test, y_train, y_test = train_test_split(x, y_encoded, test_size=0.2, random_state=42)

    # Train and evaluate classifiers
    results = train_and_evaluate(x_train, x_test, y_train, y_test)
    return results

In [79]:
# Evaluate classifiers with different numbers of top features
num_features_list = [10, 20, 30, 40, 50]
evaluation_results = {}

In [80]:
for num_features in num_features_list:
    results = evaluate_with_top_features(df_encoded, feature_importances, num_features)
    evaluation_results[num_features] = results

In [81]:
# Print evaluation results
for num_features, results in evaluation_results.items():
    print(f"\nResults with top {num_features} features:")
    for classifier, metrics in results.items():
        print(f"{classifier}:")
        for metric, value in metrics.items():
            print(f"  {metric}: {value:.4f}")


Results with top 10 features:
Random Forest:
  Accuracy: 0.7285
  Precision: 0.6949
  Recall: 0.7285
  F1 Score: 0.7003
XGBoost:
  Accuracy: 0.7302
  Precision: 0.6994
  Recall: 0.7302
  F1 Score: 0.7032

Results with top 20 features:
Random Forest:
  Accuracy: 0.7706
  Precision: 0.7457
  Recall: 0.7706
  F1 Score: 0.7515
XGBoost:
  Accuracy: 0.7738
  Precision: 0.7527
  Recall: 0.7738
  F1 Score: 0.7590

Results with top 30 features:
Random Forest:
  Accuracy: 0.7713
  Precision: 0.7459
  Recall: 0.7713
  F1 Score: 0.7510
XGBoost:
  Accuracy: 0.7801
  Precision: 0.7603
  Recall: 0.7801
  F1 Score: 0.7663

Results with top 40 features:
Random Forest:
  Accuracy: 0.7742
  Precision: 0.7494
  Recall: 0.7742
  F1 Score: 0.7527
XGBoost:
  Accuracy: 0.7820
  Precision: 0.7633
  Recall: 0.7820
  F1 Score: 0.7690

Results with top 50 features:
Random Forest:
  Accuracy: 0.7701
  Precision: 0.7437
  Recall: 0.7701
  F1 Score: 0.7482
XGBoost:
  Accuracy: 0.7783
  Precision: 0.7587
  Recall: 0

In [82]:
# Evaluate classifiers with different numbers of top features
num_features_list = [12, 15, 18]
evaluation_results = {}

In [83]:
for num_features in num_features_list:
    results = evaluate_with_top_features(df_encoded, feature_importances, num_features)
    evaluation_results[num_features] = results

In [84]:
# Print evaluation results
for num_features, results in evaluation_results.items():
    print(f"\nResults with top {num_features} features:")
    for classifier, metrics in results.items():
        print(f"{classifier}:")
        for metric, value in metrics.items():
            print(f"  {metric}: {value:.4f}")


Results with top 12 features:
Random Forest:
  Accuracy: 0.7685
  Precision: 0.7443
  Recall: 0.7685
  F1 Score: 0.7507
XGBoost:
  Accuracy: 0.7730
  Precision: 0.7519
  Recall: 0.7730
  F1 Score: 0.7581

Results with top 15 features:
Random Forest:
  Accuracy: 0.7675
  Precision: 0.7430
  Recall: 0.7675
  F1 Score: 0.7490
XGBoost:
  Accuracy: 0.7723
  Precision: 0.7512
  Recall: 0.7723
  F1 Score: 0.7573

Results with top 18 features:
Random Forest:
  Accuracy: 0.7717
  Precision: 0.7486
  Recall: 0.7717
  F1 Score: 0.7544
XGBoost:
  Accuracy: 0.7726
  Precision: 0.7512
  Recall: 0.7726
  F1 Score: 0.7579


## Features Finally selected

In [85]:
selected_features = feature_importances.head(12)['Feature']

In [86]:
selected_features

15                Age_Oldest_TL
40                      enq_L3m
39        time_since_recent_enq
23                      num_std
17    time_since_recent_payment
42          Time_With_Curr_Empr
41             NETMONTHLYINCOME
16                Age_Newest_TL
24                 num_std_6mts
33                      tot_enq
46       pct_PL_enq_L6m_of_ever
32        recent_level_of_deliq
Name: Feature, dtype: object

## Testing Various Models

In [95]:
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


In [87]:
from sklearn.model_selection import GridSearchCV

y = df_encoded['Approved_Flag']
x = df_encoded[selected_features]

In [88]:
x

Unnamed: 0,Age_Oldest_TL,enq_L3m,time_since_recent_enq,num_std,time_since_recent_payment,Time_With_Curr_Empr,NETMONTHLYINCOME,Age_Newest_TL,num_std_6mts,tot_enq,pct_PL_enq_L6m_of_ever,recent_level_of_deliq
0,72,0,566,21,549,114,51000,18,5,6,0.000,29
1,7,0,209,0,47,50,19000,7,0,1,0.000,0
2,47,0,587,10,302,191,18,2,5,4,0.000,25
3,131,0,3951,53,583,75,15000,32,4,1,0.000,0
4,150,4,7,5,245,154,0,17,0,15,0.429,26
...,...,...,...,...,...,...,...,...,...,...,...,...
42059,24,1,0,0,15,249,18500,5,0,4,0.000,24
42060,74,0,203,6,57,186,25000,7,4,2,0.000,0
42061,9,2,1,0,32,66,18000,5,0,6,1.000,0
42062,15,0,242,0,58,54,12802,8,0,3,0.000,0


In [89]:
y

0        P2
1        P2
2        P2
3        P1
4        P3
         ..
42059    P4
42060    P1
42061    P3
42062    P2
42063    P2
Name: Approved_Flag, Length: 42064, dtype: object

In [99]:
# Manually encode target variable
df_encoded['Approved_Flag_Encoded'] = df_encoded['Approved_Flag'].replace({'P1': 0, 'P2': 1, 'P3': 2, 'P4': 3})

  df_encoded['Approved_Flag_Encoded'] = df_encoded['Approved_Flag'].replace({'P1': 0, 'P2': 1, 'P3': 2, 'P4': 3})


In [100]:
# Prepare the data with the top 12 features for testing
selected_features = feature_importances.head(12)['Feature']
y = df_encoded['Approved_Flag_Encoded']
x = df_encoded[selected_features]

In [101]:
# Split the data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [102]:
# Dictionary to store models and their names
models = {
    'Random Forest': RandomForestClassifier(n_estimators=200, random_state=42),
    'XGBoost': xgb.XGBClassifier(objective='multi:softmax', num_class=4, random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'Support Vector Machine': SVC(),
    'Naive Bayes': GaussianNB()
}

In [103]:
# Function to train and evaluate classifiers with a given set of features
def train_and_evaluate_model(model, x_train, x_test, y_train, y_test):
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')
    return accuracy, precision, recall, f1_score

In [104]:
# Evaluate all models
evaluation_results = {}

for model_name, model in models.items():
    accuracy, precision, recall, f1_score = train_and_evaluate_model(model, x_train, x_test, y_train, y_test)
    evaluation_results[model_name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1_score
    }

# Print evaluation results for each model
for model_name, metrics in evaluation_results.items():
    print(f"\n{model_name}:")
    for metric, value in metrics.items():
        print(f"  {metric}: {value:.4f}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Random Forest:
  Accuracy: 0.7685
  Precision: 0.7443
  Recall: 0.7685
  F1 Score: 0.7507

XGBoost:
  Accuracy: 0.7730
  Precision: 0.7519
  Recall: 0.7730
  F1 Score: 0.7581

Logistic Regression:
  Accuracy: 0.6351
  Precision: 0.5344
  Recall: 0.6351
  F1 Score: 0.5463

K-Nearest Neighbors:
  Accuracy: 0.6164
  Precision: 0.5737
  Recall: 0.6164
  F1 Score: 0.5770

Decision Tree:
  Accuracy: 0.6952
  Precision: 0.6990
  Recall: 0.6952
  F1 Score: 0.6970

Support Vector Machine:
  Accuracy: 0.5997
  Precision: 0.3596
  Recall: 0.5997
  F1 Score: 0.4496

Naive Bayes:
  Accuracy: 0.6729
  Precision: 0.6485
  Recall: 0.6729
  F1 Score: 0.6555


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Grid Search on Random Forest

In [105]:
# Prepare the data with the top 12 features
selected_features = feature_importances.head(12)['Feature']
y = df_encoded['Approved_Flag_Encoded']
x = df_encoded[selected_features]

# Split the data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Hyperparameter tuning for Random Forest
rf_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

rf_grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42), param_grid=rf_params, cv=3, scoring='accuracy', n_jobs=-1)
rf_grid_search.fit(x_train, y_train)
best_rf_model = rf_grid_search.best_estimator_

print("Best parameters for Random Forest:", rf_grid_search.best_params_)

Best parameters for Random Forest: {'max_depth': 20, 'min_samples_split': 10, 'n_estimators': 300}


## Grid Search on XGBoost

In [106]:
# Hyperparameter tuning for XGBoost
xgb_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2]
}

xgb_grid_search = GridSearchCV(estimator=xgb.XGBClassifier(objective='multi:softmax', num_class=4, random_state=42), param_grid=xgb_params, cv=3, scoring='accuracy', n_jobs=-1)
xgb_grid_search.fit(x_train, y_train)
best_xgb_model = xgb_grid_search.best_estimator_

print("Best parameters for XGBoost:", xgb_grid_search.best_params_)

Best parameters for XGBoost: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 300}


## Evaluating Best Model

In [107]:
# Evaluate the tuned models
def evaluate_model(model, x_test, y_test):
    y_pred = model.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')
    return accuracy, precision, recall, f1_score

rf_accuracy, rf_precision, rf_recall, rf_f1 = evaluate_model(best_rf_model, x_test, y_test)
xgb_accuracy, xgb_precision, xgb_recall, xgb_f1 = evaluate_model(best_xgb_model, x_test, y_test)


In [108]:
print(f"\nRandom Forest - Best Parameters: {rf_grid_search.best_params_}")
print(f"Accuracy: {rf_accuracy}")
print(f"Precision: {rf_precision}")
print(f"Recall: {rf_recall}")
print(f"F1 Score: {rf_f1}")

print(f"\nXGBoost - Best Parameters: {xgb_grid_search.best_params_}")
print(f"Accuracy: {xgb_accuracy}")
print(f"Precision: {xgb_precision}")
print(f"Recall: {xgb_recall}")
print(f"F1 Score: {xgb_f1}")


Random Forest - Best Parameters: {'max_depth': 20, 'min_samples_split': 10, 'n_estimators': 300}
Accuracy: 0.7702365386901224
Precision: 0.7456240438629397
Recall: 0.7702365386901224
F1 Score: 0.7518259672413596

XGBoost - Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 300}
Accuracy: 0.7773683584928087
Precision: 0.7527894378041347
Recall: 0.7773683584928087
F1 Score: 0.7577031986379033


## Cross Validation

In [110]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

In [111]:
# Cross-validation for the best models
rf_cv_scores = cross_val_score(best_rf_model, x, y, cv=5, scoring='accuracy')
print(f"\nRandom Forest CV Accuracy: {rf_cv_scores.mean()} (+/- {rf_cv_scores.std() * 2})")

xgb_cv_scores = cross_val_score(best_xgb_model, x, y, cv=5, scoring='accuracy')
print(f"XGBoost CV Accuracy: {xgb_cv_scores.mean()} (+/- {xgb_cv_scores.std() * 2})")


Random Forest CV Accuracy: 0.7734403524559138 (+/- 0.006725200706978069)
XGBoost CV Accuracy: 0.7761742817127605 (+/- 0.00533194829691741)


## Saving the models

In [112]:
import pickle

# Save the best Random Forest model
with open('best_rf_model.pkl', 'wb') as f:
    pickle.dump(best_rf_model, f)

# Save the best XGBoost model
with open('best_xgb_model.pkl', 'wb') as f:
    pickle.dump(best_xgb_model, f)