## Import necessary libraries

In [1]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from scipy.stats import chi2_contingency
from scipy.stats import f_oneway
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from statsmodels.stats.outliers_influence import variance_inflation_factor


## Load data

In [2]:
# Read the two raw case-study workbooks; the paths are relative to the
# notebook's working directory.
a1 = pd.read_excel("case_study1.xlsx")
a2 = pd.read_excel("case_study2.xlsx")

In [3]:
# Work on copies so the freshly loaded frames a1 / a2 stay untouched and the
# cleaning cells below can be re-run without re-reading the Excel files.
df1 = a1.copy()
df2 = a2.copy()

## Data Cleaning

In [4]:
# Summary statistics of df1's numeric columns; used to spot placeholder values.
df1.describe()

Unnamed: 0,PROSPECTID,Total_TL,Tot_Closed_TL,Tot_Active_TL,Total_TL_opened_L6M,Tot_TL_closed_L6M,pct_tl_open_L6M,pct_tl_closed_L6M,pct_active_tl,pct_closed_tl,...,CC_TL,Consumer_TL,Gold_TL,Home_TL,PL_TL,Secured_TL,Unsecured_TL,Other_TL,Age_Oldest_TL,Age_Newest_TL
count,51336.0,51336.0,51336.0,51336.0,51336.0,51336.0,51336.0,51336.0,51336.0,51336.0,...,51336.0,51336.0,51336.0,51336.0,51336.0,51336.0,51336.0,51336.0,51336.0,51336.0
mean,25668.5,4.858598,2.770415,2.088184,0.736754,0.428919,0.184574,0.089095,0.577542,0.422458,...,0.124981,1.136084,1.561847,0.070146,0.282511,2.844904,2.013694,1.089762,-32.575639,-62.149525
std,14819.571046,7.177116,5.94168,2.290774,1.296717,0.989972,0.297414,0.205635,0.379867,0.379867,...,0.505201,2.227997,5.376434,0.340861,0.858168,6.187177,3.198322,2.417496,2791.869609,2790.818622
min,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-99999.0,-99999.0
25%,12834.75,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.25,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,4.0
50%,25668.5,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.556,0.444,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,33.0,8.0
75%,38502.25,5.0,3.0,3.0,1.0,1.0,0.308,0.053,1.0,0.75,...,0.0,1.0,1.0,0.0,0.0,3.0,2.0,1.0,64.0,17.0
max,51336.0,235.0,216.0,47.0,27.0,19.0,1.0,1.0,1.0,1.0,...,27.0,41.0,235.0,10.0,29.0,235.0,55.0,80.0,392.0,392.0


As we can see, the value "-99999" appears in the "Age_Oldest_TL" and "Age_Newest_TL" columns. It is a placeholder for missing values, so we will drop the affected rows.

In [5]:
# Keep only the rows whose Age_Oldest_TL is not the -99999 missing-value placeholder.
has_oldest_tl_age = df1['Age_Oldest_TL'].ne(-99999)
df1 = df1[has_oldest_tl_age]

In [6]:
# Summary statistics of df2's numeric columns; used to spot placeholder values.
df2.describe()

Unnamed: 0,PROSPECTID,time_since_recent_payment,time_since_first_deliquency,time_since_recent_deliquency,num_times_delinquent,max_delinquency_level,max_recent_level_of_deliq,num_deliq_6mts,num_deliq_12mts,num_deliq_6_12mts,...,PL_utilization,PL_Flag,pct_PL_enq_L6m_of_L12m,pct_CC_enq_L6m_of_L12m,pct_PL_enq_L6m_of_ever,pct_CC_enq_L6m_of_ever,max_unsec_exposure_inPct,HL_Flag,GL_Flag,Credit_Score
count,51336.0,51336.0,51336.0,51336.0,51336.0,51336.0,51336.0,51336.0,51336.0,51336.0,...,51336.0,51336.0,51336.0,51336.0,51336.0,51336.0,51336.0,51336.0,51336.0,51336.0
mean,25668.5,-8129.961314,-70020.09132,-70022.375838,1.573749,-70003.987085,13.521953,0.184977,0.480053,0.295076,...,-86556.225194,0.167874,0.190414,0.065182,0.170492,0.056302,-45127.943635,0.271116,0.052887,679.859222
std,14819.571046,27749.328514,45823.312757,45819.820741,4.165012,45847.9761,53.336976,0.71024,1.52221,1.027471,...,34111.41475,0.373758,0.376218,0.235706,0.350209,0.213506,49795.784556,0.44454,0.22381,20.502764
min,1.0,-99999.0,-99999.0,-99999.0,0.0,-99999.0,0.0,0.0,0.0,0.0,...,-99999.0,0.0,0.0,0.0,0.0,0.0,-99999.0,0.0,0.0,469.0
25%,12834.75,46.0,-99999.0,-99999.0,0.0,-99999.0,0.0,0.0,0.0,0.0,...,-99999.0,0.0,0.0,0.0,0.0,0.0,-99999.0,0.0,0.0,669.0
50%,25668.5,70.0,-99999.0,-99999.0,0.0,-99999.0,0.0,0.0,0.0,0.0,...,-99999.0,0.0,0.0,0.0,0.0,0.0,0.333,0.0,0.0,680.0
75%,38502.25,161.0,8.0,3.0,1.0,15.0,10.0,0.0,0.0,0.0,...,-99999.0,0.0,0.0,0.0,0.0,0.0,2.16425,1.0,0.0,691.0
max,51336.0,6065.0,35.0,35.0,74.0,900.0,900.0,12.0,28.0,20.0,...,1.708,1.0,1.0,1.0,1.0,1.0,173800.0,1.0,1.0,811.0


As we can see, the value "-99999" is present in several columns, and in some of them it makes up more than 30% of the entries. Dropping every affected row would lose too much data, so we instead drop each column in which the placeholder occurs in more than 10,000 rows (roughly 20% of the 51,336 records).

In [7]:
# Collect every column in which the -99999 placeholder occurs in more than
# 10,000 rows, then drop all of those columns in one go.
columns_to_be_removed = [
    column_name
    for column_name in df2.columns
    if (df2[column_name] == -99999).sum() > 10000
]

df2 = df2.drop(columns=columns_to_be_removed)

Now we will drop the rows that still contain the "-99999" value in any of the remaining columns.

In [8]:
# A row survives only if none of its columns holds the -99999 placeholder.
placeholder_free_rows = (df2 != -99999).all(axis=1)
df2 = df2.loc[placeholder_free_rows]

## Data Merging

We will check whether the two datasets share a common column, so that we can merge them.

In [9]:
# Print every df1 column name that also exists in df2 (candidate join keys).
df2_column_names = set(df2.columns)
for column_name in df1.columns:
    if column_name in df2_column_names:
        print(column_name)

PROSPECTID


We will merge the two datasets on this "PROSPECTID" column.

In [10]:
# Inner join: keep only the prospects that are present in both cleaned frames.
df = df1.merge(df2, how='inner', on='PROSPECTID')

## Statistical Analysis

### 1. Analysis on categorical columns

Check which columns are categorical.

In [11]:
# List the columns stored with the generic 'object' dtype.
for column_name, column_dtype in df.dtypes.items():
    if column_dtype == 'object':
        print(column_name)

MARITALSTATUS
EDUCATION
GENDER
last_prod_enq2
first_prod_enq2
Approved_Flag


In [12]:
# Chi-square test of independence between each categorical feature and the
# target; only the p-value (second element of the result) is reported.
categorical_columns = ['MARITALSTATUS', 'EDUCATION', 'GENDER', 'last_prod_enq2', 'first_prod_enq2']
for column_name in categorical_columns:
    contingency_table = pd.crosstab(df[column_name], df['Approved_Flag'])
    pval = chi2_contingency(contingency_table)[1]
    print(column_name, '---', pval)

MARITALSTATUS --- 3.578180861038862e-233
EDUCATION --- 2.6942265249737532e-30
GENDER --- 1.907936100186563e-05
last_prod_enq2 --- 0.0
first_prod_enq2 --- 7.84997610555419e-287


### Interpretation

Null Hypothesis (H₀): The two variables are independent (not associated).

Alternative Hypothesis (H₁): The two variables are dependent (associated).

Since the p-value is less than 0.05 for all categorical variables, we reject the null hypothesis: each of these categorical variables is associated with "Approved_Flag".

### 2. Analysis on numerical columns

## i.VIF for numerical columns

In [13]:
# Numeric candidates: every non-object column except the ID and the target.
excluded_columns = ['PROSPECTID', 'Approved_Flag']
numeric_columns = [
    column_name
    for column_name in df.columns
    if df[column_name].dtype != 'object' and column_name not in excluded_columns
]

In [14]:
# Working state for the sequential VIF check in the next cell.
# Re-run this cell before re-running the loop: the loop mutates all of it.
columns_to_be_kept = []   # features that pass the VIF threshold
column_index = 0          # position inside vif_data of the feature being tested

vif_data = df[numeric_columns]
total_columns = vif_data.shape[1]


In [15]:
# Sequential VIF pruning: test the original numeric columns one by one.
# A feature with VIF <= 6 is kept and the cursor moves on; otherwise the
# feature is removed from vif_data, so the same cursor position now points at
# the next candidate.
for position in range(total_columns):
    candidate = numeric_columns[position]

    vif_value = variance_inflation_factor(vif_data, column_index)
    print (column_index,'---',vif_value)

    if vif_value <= 6:
        columns_to_be_kept.append(candidate)
        column_index += 1
    else:
        vif_data = vif_data.drop(columns=[candidate])




  vif = 1. / (1. - r_squared_i)


0 --- inf


  vif = 1. / (1. - r_squared_i)


0 --- inf
0 --- 11.320180023967996
0 --- 8.363698035000336
0 --- 6.520647877790928
0 --- 5.149501618212625
1 --- 2.611111040579735


  vif = 1. / (1. - r_squared_i)


2 --- inf
2 --- 1788.7926256209232
2 --- 8.601028256477228
2 --- 3.8328007921530785
3 --- 6.099653381646727
3 --- 5.5813520096427585
4 --- 1.985584353098778


  vif = 1. / (1. - r_squared_i)


5 --- inf
5 --- 4.809538302819343
6 --- 23.270628983464636
6 --- 30.595522588100053
6 --- 4.384346405965583
7 --- 3.064658415523423
8 --- 2.898639771299253
9 --- 4.377876915347324
10 --- 2.2078535836958433
11 --- 4.916914200506864
12 --- 5.214702030064725
13 --- 3.3861625024231476
14 --- 7.840583309478997
14 --- 5.255034641721438


  vif = 1. / (1. - r_squared_i)


15 --- inf
15 --- 7.380634506427232
15 --- 1.4210050015175733
16 --- 8.083255010190323
16 --- 1.624122752404011
17 --- 7.257811920140003
17 --- 15.59624383268298
17 --- 1.825857047132431
18 --- 1.5080839450032664
19 --- 2.172088834824577
20 --- 2.623397553527229
21 --- 2.2959970812106167
22 --- 7.360578319196439
22 --- 2.160238777310255
23 --- 2.8686288267891467
24 --- 6.458218003637277
24 --- 2.8474118865638265
25 --- 4.753198156284083
26 --- 16.22735475594825
26 --- 6.424377256363877
26 --- 8.887080381808687
26 --- 2.3804746142952653
27 --- 8.609513476514548
27 --- 13.06755093547673
27 --- 3.5000400566546555
28 --- 1.9087955874813773
29 --- 17.006562234161628
29 --- 10.730485153719197
29 --- 2.3538497522950275
30 --- 22.104855915136433
30 --- 2.7971639638512906
31 --- 3.424171203217696
32 --- 10.175021454450922
32 --- 6.408710354561301
32 --- 1.001151196262562
33 --- 3.069197305397274
34 --- 2.8091261600643724
35 --- 20.249538381980678
35 --- 15.864576541593774
35 --- 1.8331649740532

In [16]:
# Columns still present in vif_data after the pruning loop.
print(vif_data.columns)

Index(['pct_tl_open_L6M', 'pct_tl_closed_L6M', 'Tot_TL_closed_L12M',
       'pct_tl_closed_L12M', 'Tot_Missed_Pmnt', 'CC_TL', 'Home_TL', 'PL_TL',
       'Secured_TL', 'Unsecured_TL', 'Other_TL', 'Age_Oldest_TL',
       'Age_Newest_TL', 'time_since_recent_payment',
       'max_recent_level_of_deliq', 'num_deliq_6_12mts', 'num_times_60p_dpd',
       'num_std_12mts', 'num_sub', 'num_sub_6mts', 'num_sub_12mts', 'num_dbt',
       'num_dbt_12mts', 'num_lss', 'num_lss_12mts', 'recent_level_of_deliq',
       'CC_enq_L12m', 'PL_enq_L12m', 'time_since_recent_enq', 'enq_L3m',
       'NETMONTHLYINCOME', 'Time_With_Curr_Empr', 'pct_currentBal_all_TL',
       'CC_Flag', 'PL_Flag', 'pct_PL_enq_L6m_of_ever',
       'pct_CC_enq_L6m_of_ever', 'HL_Flag', 'GL_Flag'],
      dtype='object')


These are the columns that remain after sequentially removing every feature whose VIF exceeded 6. Multicollinearity among them is therefore low, so we can keep them for model building.

## ii. Anova for columns_to_be_kept

In [17]:
# One-way ANOVA per VIF-selected feature: keep the feature only if its values
# differ across the four Approved_Flag groups at the 5% level (p <= 0.05).
# f_oneway is imported in the import cell at the top of the notebook.

# The group membership does not depend on the feature, so build the four
# boolean masks once instead of re-scanning the labels for every column.
approval_flags = df['Approved_Flag']
group_masks = [approval_flags == group for group in ('P1', 'P2', 'P3', 'P4')]

columns_to_be_kept_numerical = []

for i in columns_to_be_kept:
    group_samples = [df.loc[mask, i].to_numpy() for mask in group_masks]

    f_statistic, p_value = f_oneway(*group_samples)

    if p_value <= 0.05:
        columns_to_be_kept_numerical.append(i)


In [18]:
# Numeric features that passed both the VIF check and the ANOVA test.
print(columns_to_be_kept_numerical)

['pct_tl_open_L6M', 'pct_tl_closed_L6M', 'Tot_TL_closed_L12M', 'pct_tl_closed_L12M', 'Tot_Missed_Pmnt', 'CC_TL', 'Home_TL', 'PL_TL', 'Secured_TL', 'Unsecured_TL', 'Other_TL', 'Age_Oldest_TL', 'Age_Newest_TL', 'time_since_recent_payment', 'max_recent_level_of_deliq', 'num_deliq_6_12mts', 'num_times_60p_dpd', 'num_std_12mts', 'num_sub', 'num_sub_6mts', 'num_sub_12mts', 'num_dbt', 'num_dbt_12mts', 'num_lss', 'recent_level_of_deliq', 'CC_enq_L12m', 'PL_enq_L12m', 'time_since_recent_enq', 'enq_L3m', 'NETMONTHLYINCOME', 'Time_With_Curr_Empr', 'CC_Flag', 'PL_Flag', 'pct_PL_enq_L6m_of_ever', 'pct_CC_enq_L6m_of_ever', 'HL_Flag', 'GL_Flag']


These are the numerical columns that we will keep for further analysis.

In [19]:
# Final feature set: the ANOVA-selected numeric columns plus the five
# categorical columns examined with the chi-square test.
features = columns_to_be_kept_numerical + ['MARITALSTATUS', 'EDUCATION', 'GENDER', 'last_prod_enq2', 'first_prod_enq2']

# .copy() makes df an independent frame rather than a column slice, so the
# in-place encoding done later does not raise SettingWithCopyWarning.
df = df[features + ['Approved_Flag']].copy()

So far, feature selection is done for both the categorical and the numerical features.

Now we will do the encoding of categorical features.

##  Encoding of categorical features

In [20]:
# Show the distinct labels of each categorical feature to decide how to encode it.
for column_name in ['MARITALSTATUS', 'EDUCATION', 'GENDER', 'last_prod_enq2', 'first_prod_enq2']:
    print(df[column_name].unique())

['Married' 'Single']
['12TH' 'GRADUATE' 'SSC' 'POST-GRADUATE' 'UNDER GRADUATE' 'OTHERS'
 'PROFESSIONAL']
['M' 'F']
['PL' 'ConsumerLoan' 'AL' 'CC' 'others' 'HL']
['PL' 'ConsumerLoan' 'others' 'AL' 'HL' 'CC']


As we can see, 'EDUCATION' is an ordinal variable, while the other variables are nominal or binary.

We have to handle these two kinds of variables differently.

In [21]:
# Ordinal encoding of EDUCATION. The ranks are the ones chosen for this
# analysis; several labels deliberately share a rank.
EDUCATION_RANK = {
    'SSC': 1,
    '12TH': 2,
    'GRADUATE': 3,
    'UNDER GRADUATE': 3,
    'POST-GRADUATE': 4,
    'OTHERS': 1,
    'PROFESSIONAL': 3,
}

# One mapping pass instead of seven in-place .loc writes:
#  * values that are not keys of EDUCATION_RANK (e.g. ranks written by an
#    earlier run of this cell) pass through unchanged, so the cell can be re-run;
#  * astype(int) gives the column a real integer dtype instead of leaving it
#    as 'object', and raises if an unknown text label or a missing value is
#    present rather than letting it reach the models silently;
#  * assign() builds a new frame, so no chained-assignment warning is raised.
df = df.assign(
    EDUCATION=df['EDUCATION']
    .map(lambda level: EDUCATION_RANK.get(level, level))
    .astype(int)
)

In [22]:
# One-hot encode the nominal / binary categorical columns; EDUCATION is not
# listed here and therefore stays a single column.
nominal_columns = ['MARITALSTATUS', 'GENDER', 'last_prod_enq2', 'first_prod_enq2']
df_encoded = pd.get_dummies(df, columns=nominal_columns)

In [23]:
# Inspect dtypes / non-null counts of the encoded frame and keep its summary
# statistics in k.
df_encoded.info()
k = df_encoded.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42064 entries, 0 to 42063
Data columns (total 55 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   pct_tl_open_L6M               42064 non-null  float64
 1   pct_tl_closed_L6M             42064 non-null  float64
 2   Tot_TL_closed_L12M            42064 non-null  int64  
 3   pct_tl_closed_L12M            42064 non-null  float64
 4   Tot_Missed_Pmnt               42064 non-null  int64  
 5   CC_TL                         42064 non-null  int64  
 6   Home_TL                       42064 non-null  int64  
 7   PL_TL                         42064 non-null  int64  
 8   Secured_TL                    42064 non-null  int64  
 9   Unsecured_TL                  42064 non-null  int64  
 10  Other_TL                      42064 non-null  int64  
 11  Age_Oldest_TL                 42064 non-null  int64  
 12  Age_Newest_TL                 42064 non-null  int64  
 13  t

Up to this step we have built the dataset for fitting machine learning algorithms. Now we will train different machine learning algorithms and use them for prediction.

## Machine Learning Algorithms

### 1. Random Forest

In [24]:
# Target is Approved_Flag; every other column of df_encoded is a feature.
y = df_encoded['Approved_Flag']
x = df_encoded.drop(columns=['Approved_Flag'])

In [38]:
from sklearn.model_selection import GridSearchCV

In [42]:
# Encode the string target classes as integers.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Hold out 20% of the rows for the final evaluation.
x_train, x_test, y_train, y_test = train_test_split(
    x, y_encoded, test_size=0.2, random_state=42
)

# 2 * 3 * 3 * 2 = 36 hyperparameter combinations.
param_grid = dict(
    n_estimators=[100, 200],          # number of trees
    max_depth=[10, 20, 30],           # tree depth
    min_samples_split=[2, 5, 10],     # minimum samples to split
    max_features=['sqrt', 'log2'],    # number of features to consider
)

# 5-fold cross-validated grid search over a seeded random forest, scored on accuracy.
rf_classifier = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    rf_classifier,
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(x_train, y_train)

# Winning configuration, its mean cross-validation score and the refitted model.
best_params = grid_search.best_params_
best_score = grid_search.best_score_
best_model = grid_search.best_estimator_

# Evaluate on the held-out rows; precision / recall / F1 are support-weighted averages.
y_pred = best_model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1_score, _ = precision_recall_fscore_support(
    y_test, y_pred, average='weighted'
)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [45]:
# Report the tuned hyperparameters followed by each score, rounded to two decimals.
print("\nBest Hyperparameters:", best_params)
for metric_name, metric_value in [
    ('Best Cross-Validation Accuracy', best_score),
    ('Test Accuracy', accuracy),
    ('Precision', precision),
    ('Recall', recall),
    ('F1 Score', f1_score),
]:
    print(f'{metric_name}: {metric_value:.2f}')


Best Hyperparameters: {'max_depth': 30, 'max_features': 'sqrt', 'min_samples_split': 10, 'n_estimators': 200}
Best Cross-Validation Accuracy: 0.77
Test Accuracy: 0.77
Precision: 0.74
Recall: 0.77
F1 Score: 0.74


In [47]:
# Per-class precision / recall / F1 of the tuned model, labelled with the
# original class names (classification_report is imported in the import cell).
report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)
print(report)

              precision    recall  f1-score   support

          P1       0.85      0.70      0.77      1014
          P2       0.80      0.93      0.86      5045
          P3       0.45      0.21      0.29      1325
          P4       0.73      0.72      0.72      1029

    accuracy                           0.77      8413
   macro avg       0.71      0.64      0.66      8413
weighted avg       0.74      0.77      0.74      8413



## 2. xgboost

In [28]:
# Multi-class XGBoost model: 'multi:softmax' objective with num_class=4
# (one class per Approved_Flag level P1..P4).
xgb_classifier = xgb.XGBClassifier(objective='multi:softmax',  num_class=4)

In [29]:
# Target is Approved_Flag; every other column of df_encoded is a feature.
y = df_encoded['Approved_Flag']
x = df_encoded.drop(columns=['Approved_Flag'])

In [30]:
# Make sure EDUCATION is an integer column before x is handed to XGBoost.
x['EDUCATION'] = x['EDUCATION'].astype('int')

In [31]:
# Integer-encode the target classes.
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)

# Same 80/20 seeded split as used for the other models.
x_train, x_test, y_train, y_test = train_test_split(
    x, y_encoded, test_size=0.2, random_state=42
)

# Fit on the training rows, then predict the held-out rows.
y_pred = xgb_classifier.fit(x_train, y_train).predict(x_test)



In [32]:
# Overall accuracy, printed with a blank line before and after.
accuracy = accuracy_score(y_test, y_pred)
print(f'\nAccuracy: {accuracy:.2f}\n')

# Per-class scores (no averaging): one array entry per class.
precision, recall, f1_score, _ = precision_recall_fscore_support(
    y_test, y_pred, average=None
)



Accuracy: 0.78



In [33]:
# Print the per-class scores. The class names come from the fitted label
# encoder, so each score is paired with the label the encoder actually
# assigned to that position instead of a hard-coded, lower-cased list.
for label, p, r, f1 in zip(label_encoder.classes_, precision, recall, f1_score):
    print(f"Class {label}:")
    print(f"Precision: {p}")
    print(f"Recall: {r}")
    print(f"F1 Score: {f1}")
    print()

Class p1:
Precision: 0.823906083244397
Recall: 0.7613412228796844
F1 Score: 0.7913890312660173

Class p2:
Precision: 0.8255418233924413
Recall: 0.913577799801784
F1 Score: 0.8673315769665036

Class p3:
Precision: 0.4756380510440835
Recall: 0.30943396226415093
F1 Score: 0.3749428440786465

Class p4:
Precision: 0.7342386032977691
Recall: 0.7356656948493683
F1 Score: 0.7349514563106796



In [50]:
# Tune XGBoost with a 5-fold grid search and evaluate the tuned model on a
# hold-out set. All names used here (xgb, train_test_split, GridSearchCV,
# LabelEncoder, metric helpers) are imported in the import cell at the top.

# Splitting features and target
x = df_encoded.drop(['Approved_Flag'], axis=1)
y = df_encoded['Approved_Flag']

# XGBoost needs a numeric dtype for EDUCATION
x['EDUCATION'] = x['EDUCATION'].astype('int')

# Encoding categorical target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Splitting the data into training and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y_encoded, test_size=0.2, random_state=42)

# Hyperparameter grid: 3 * 3 * 3 * 2 * 2 = 108 combinations
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8, 1]
}

# Initializing the XGBoost classifier
xgb_classifier = xgb.XGBClassifier(objective='multi:softmax', num_class=4)

# Using GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(
    estimator=xgb_classifier,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1
)

# Fitting the grid search to the training data
grid_search.fit(x_train, y_train)

# Getting the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# GridSearchCV refits the winning configuration on the whole training set by
# default (refit=True), so reuse that model instead of training it again.
best_xgb_classifier = grid_search.best_estimator_

# Making predictions on the test set
y_pred = best_xgb_classifier.predict(x_test)

# Evaluating the final model
accuracy = accuracy_score(y_test, y_pred)
print(f'\nAccuracy with Best Hyperparameters: {accuracy:.2f}\n')

# Calculating precision, recall, and F1-score for each class
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred, average=None)

# Printing metrics for each class
class_labels = label_encoder.classes_
for label, p, r, f1 in zip(class_labels, precision, recall, f1_score):
    print(f"Class {label}:")
    print(f"Precision: {p:.2f}")
    print(f"Recall: {r:.2f}")
    print(f"F1 Score: {f1:.2f}")
    print()


Best Hyperparameters: {'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.8}

Accuracy with Best Hyperparameters: 0.78

Class P1:
Precision: 0.84
Recall: 0.76
F1 Score: 0.80

Class P2:
Precision: 0.82
Recall: 0.93
F1 Score: 0.87

Class P3:
Precision: 0.47
Recall: 0.26
F1 Score: 0.34

Class P4:
Precision: 0.74
Recall: 0.74
F1 Score: 0.74



## 3. Decision Tree

In [34]:
# Target is Approved_Flag; every other column of df_encoded is a feature.
y = df_encoded['Approved_Flag']
x = df_encoded.drop(columns=['Approved_Flag'])

In [35]:
# 80/20 seeded split; the decision tree is trained on the string labels directly.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# random_state fixes the tree's internal feature shuffling, so the fitted tree
# and its scores are reproducible across runs (same seed as the other models).
dt_model = DecisionTreeClassifier(max_depth=20, min_samples_split=10, random_state=42)
dt_model.fit(x_train, y_train)
y_pred = dt_model.predict(x_test)

In [36]:
# Overall accuracy, printed with a blank line before and after.
accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy: {accuracy:.2f}\n")

# Per-class scores (no averaging): one array entry per class.
precision, recall, f1_score, _ = precision_recall_fscore_support(
    y_test, y_pred, average=None
)


Accuracy: 0.71



In [37]:
# Print the per-class scores. The class names are taken from the fitted tree
# (sorted training labels) instead of a hard-coded, lower-cased list.
for label, p, r, f1 in zip(dt_model.classes_, precision, recall, f1_score):
    print(f"Class {label}:")
    print(f"Precision: {p}")
    print(f"Recall: {r}")
    print(f"F1 Score: {f1}")
    print()

Class p1:
Precision: 0.7222767419038273
Recall: 0.7258382642998028
F1 Score: 0.7240531234628628

Class p2:
Precision: 0.8093847352024922
Recall: 0.82398414271556
F1 Score: 0.8166191926136921

Class p3:
Precision: 0.3430599369085173
Recall: 0.3283018867924528
F1 Score: 0.3355187042036251

Class p4:
Precision: 0.6474747474747474
Recall: 0.6229348882410107
F1 Score: 0.6349678058444775

