In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import xgboost as xgb
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA
%matplotlib inline


In [None]:
# Load the training data; the path is relative to the notebook's working directory.
df = pd.read_csv("dataset/train.csv")

In [None]:
# First 20 rows for a quick look at the raw records.
df.head(20)

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Pairwise correlations, restricted to the numeric columns.
df.corr(numeric_only=True)

In [None]:
# Summary statistics with pandas' default column selection.
df.describe()

In [None]:
# Summary statistics of the object-typed columns.
df.describe(include='object')

In [None]:
# Number of missing values per column.
df.isnull().sum()

In [None]:
# Flag fully duplicated rows, print how many are (not) duplicates, then show them.
duplicate_mask = df.duplicated()
print(duplicate_mask.value_counts())
df[duplicate_mask]

In [None]:
# from ydata_profiling import ProfileReport

# ProfileReport(df)

In [None]:
# Annotated heatmap of the numeric-column correlation matrix.
sns.heatmap(data=df.corr(numeric_only=True),cmap='coolwarm',annot=True)

In [None]:
# Scatter plot with a fitted regression line: applicant income vs. loan amount.
sns.regplot(data=df,x='ApplicantIncome',y='LoanAmount')

In [None]:
# Row counts per Gender value.
sns.countplot(data=df,x='Gender')

In [None]:
# Row counts per Education value.
sns.countplot(data=df,x='Education')

In [None]:
# Marital status counts, split by Gender.
sns.countplot(data=df,x='Married',hue='Gender')

In [None]:
# Loan_Status counts, split by Gender.
sns.countplot(data=df,x='Loan_Status',hue='Gender')

In [None]:
# Loan_Status counts, split by Credit_History.
sns.countplot(data=df,x='Loan_Status',hue='Credit_History')

In [None]:
# Loan_Status counts, split by Education.
sns.countplot(data=df,x='Loan_Status',hue='Education')

In [None]:
# Loan_Status counts, split by marital status.
sns.countplot(data=df,x='Loan_Status',hue='Married')

In [None]:
# NOTE(review): repeats the df.info() call made earlier in the notebook.
df.info()

In [None]:
# Loan_Status counts, split by Property_Area.
sns.countplot(data=df,x='Loan_Status',hue='Property_Area')

In [None]:
# Minimum applicant income per property area.
# The aggregate is built with as_index=False so Property_Area is a regular
# column, and the frame is handed to seaborn through the explicit `data=`
# keyword instead of positionally, so the call does not depend on seaborn
# resolving an index level by name or on its positional-argument order.
min_income_by_area = df.groupby('Property_Area', as_index=False)['ApplicantIncome'].min()
ax = sns.barplot(data=min_income_by_area, x='Property_Area', y='ApplicantIncome')
ax.set_title('Applicants Minimum Income In Different Properties Area')
ax.set_ylabel('Applicant Income')
ax.set_xlabel('Property Area')

In [None]:
# Class counts of the target, transposed into a single-row table.
df['Loan_Status'].value_counts().to_frame().T

As we can see, our data is highly imbalanced, so we will apply a resampling technique; for that we will use undersampling.

In [None]:
from sklearn import preprocessing
def encode_features(df_train, features, df_test=None):
    """Label-encode the given columns with one shared mapping for train and test.

    For every column in ``features`` a ``LabelEncoder`` is fitted on the values
    of ``df_train`` (plus those of ``df_test`` when it is supplied), so that a
    value maps to the same integer in both frames.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Frame containing every column listed in ``features``.
    features : list of str
        Names of the columns to encode.
    df_test : pandas.DataFrame, optional
        Second frame to encode with the same mappings.

    Returns
    -------
    tuple
        ``(encoded_train, encoded_test)``. ``encoded_test`` is ``None`` when
        ``df_test`` was not given.

    Notes
    -----
    The inputs are left untouched: the encoded columns are written to copies.
    Writing into the caller's frames would overwrite the raw values (making a
    second call operate on already-encoded data) and triggers pandas
    chained-assignment warnings when the frames are slices of another frame.
    """
    if df_test is not None:
        df_combined = pd.concat([df_train[features], df_test[features]])
    else:
        df_combined = df_train[features]

    encoded_train = df_train.copy()
    encoded_test = df_test.copy() if df_test is not None else None

    for feature in features:
        # Fit on the union of both frames so no value is unseen at transform time.
        le = preprocessing.LabelEncoder().fit(df_combined[feature])
        encoded_train[feature] = le.transform(df_train[feature])

        if encoded_test is not None:
            encoded_test[feature] = le.transform(df_test[feature])

    return encoded_train, encoded_test
    

In [None]:
def fill_missing_values(df):
    """Return a copy of ``df`` with missing values imputed column by column.

    Columns with a NumPy integer or floating dtype (any width) are filled with
    the column mean; every other column is filled with its most frequent value.
    A column that has no mode (for example one that is entirely missing) is
    left as it is instead of raising.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index as ``df``.

    Notes
    -----
    Numeric flag columns (e.g. 0/1 indicators) also receive the mean, which is
    generally not one of their valid values; cast such columns to ``object``
    before calling if the most frequent value should be used instead.
    """
    filled = df.copy()
    for column in filled.columns:
        series = filled[column]
        dtype = series.dtype
        # Plain NumPy int/uint/float dtypes only; bool and extension dtypes
        # take the mode branch, as they did before.
        if isinstance(dtype, np.dtype) and dtype.kind in 'iuf':
            fill_value = series.mean()
        else:
            modes = series.mode()
            if modes.empty:
                # Nothing to impute from (e.g. the column is entirely missing).
                continue
            fill_value = modes.iloc[0]
        filled[column] = series.fillna(fill_value)
    return filled

In [None]:
# Separate the feature matrix from the target; the Loan_ID column is dropped
# from the features together with the target itself.
X = df.drop(columns=['Loan_Status', 'Loan_ID'])
y = df['Loan_Status']

In [None]:
# Balance the classes by keeping as many 'Y' rows as there are 'N' rows.
# The 'Y' rows are drawn with a seeded random sample rather than taken from the
# top of the frame, so the retained subset does not depend on the row order of
# the CSV. min() keeps the request valid should 'Y' ever be the smaller class.
loan_not_given = df.loc[df['Loan_Status'] == 'N']
loan_given_all = df.loc[df['Loan_Status'] == 'Y']
loan_given = loan_given_all.sample(
    n=min(loan_given_all.shape[0], loan_not_given.shape[0]),
    random_state=42,
)
normal_distributed_df = pd.concat([loan_given, loan_not_given])
# Shuffle so the two classes are interleaved.
shuffled_undersampled_df = normal_distributed_df.sample(frac=1, random_state=42)
loan_given.shape[0]

In [None]:
# Target and features of the undersampled frame; Loan_ID is dropped with the target.
y_undersampled=shuffled_undersampled_df['Loan_Status']
X_undersampled=shuffled_undersampled_df.drop(columns=['Loan_Status','Loan_ID'])

In [None]:
# Class counts after undersampling.
sns.countplot(data=shuffled_undersampled_df,x='Loan_Status')

In [None]:
# Cast Credit_History to object before imputing so that it is filled with its
# most frequent value rather than a column mean, matching what the PCA cells
# further down do. A mean would add a value that is not an existing category,
# and the label encoder would then turn it into an extra class.
# NOTE(review): this presumes Credit_History is read as a numeric column — confirm.
X = fill_missing_values(X.astype({'Credit_History': 'object'}))
X_undersampled = fill_missing_values(X_undersampled.astype({'Credit_History': 'object'}))

In [None]:
# 80/20 split of the full data. The seed is fixed (same value the PCA cells use)
# so every re-run evaluates the models on the same rows.
# NOTE(review): consider stratify=y to keep the class ratio in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [None]:
# Columns that are label-encoded before modelling.
features_to_encode = [
    'Gender',
    'Married',
    'Dependents',
    'Education',
    'Self_Employed',
    'Credit_History',
    'Property_Area',
]

In [None]:
# Label-encode the split with one shared mapping per column, then oversample
# the training part only; the test labels are copied unchanged.
# NOTE(review): SMOTE interpolates between neighbouring rows, so label-encoded
# categorical columns can receive non-integer synthetic values; imblearn's
# SMOTENC is the variant intended for mixed numeric/categorical data.
X_train_encoded, X_test_res = encode_features(
    X_train,
    features_to_encode,
    df_test=X_test,
)
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train_encoded, y_train)
y_test_res = y_test.copy(deep=True)

In [None]:
# 80/20 split of the undersampled data, seeded so re-runs use the same rows.
X_train_undersampled, X_test_undersampled, y_train_undersampled, y_test_undersampled = (
    train_test_split(
        X_undersampled,
        y_undersampled,
        test_size=0.2,
        shuffle=True,
        random_state=42,
    )
)

In [None]:
df.columns

In [None]:
# The split was already label-encoded for the SMOTE cell above; reuse those
# frames instead of running the encoders a second time over the same split.
data_train, data_test = X_train_encoded, X_test_res

In [None]:




# Baseline: five scaled classifiers fitted on the encoded training split, with
# test accuracy printed for each. The tree-based models are seeded so that
# re-running the cell reproduces the same numbers.
dtree = make_pipeline(StandardScaler(), tree.DecisionTreeClassifier(random_state=42))
svc = make_pipeline(StandardScaler(), SVC())
logreg = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
nbayes = make_pipeline(StandardScaler(), GaussianNB())
forest = make_pipeline(StandardScaler(), RandomForestClassifier(n_estimators=150, random_state=42))

for baseline_model in (dtree, svc, logreg, nbayes, forest):
    baseline_model.fit(data_train, y_train)

y_pred1 = dtree.predict(data_test)
y_pred2 = svc.predict(data_test)
y_pred3 = logreg.predict(data_test)
y_pred4 = nbayes.predict(data_test)
y_pred5 = forest.predict(data_test)

print('Svm : ', accuracy_score(y_test, y_pred2))
print('D tree : ', accuracy_score(y_test, y_pred1))
print('Log Reg : ', accuracy_score(y_test, y_pred3))
print('Naive Bayes : ', accuracy_score(y_test, y_pred4))
print('Forest: ', accuracy_score(y_test, y_pred5))

In [None]:
y_test

In [None]:
# Encode the full feature matrix and map the target to numbers
# ('Y' -> 1, 'N' -> 0, any other value is passed through unchanged),
# then wrap both in an XGBoost DMatrix.
X_dmatrix, _ = encode_features(X, features_to_encode)
y_dmatrix = y.apply(lambda label: {'Y': 1, 'N': 0}.get(label, label))
data_dmatrix = xgb.DMatrix(data=X_dmatrix, label=y_dmatrix)


In [None]:
data_dmatrix

In [None]:
# Compare seven classifiers trained on the original (unresampled) training split.
# Tree and forest are seeded so the reported scores are reproducible.
models = {
    'Support Vector Machine': make_pipeline(StandardScaler(), SVC()),
    'Decision Tree': tree.DecisionTreeClassifier(random_state=42),
    'Logistic Regression': make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000)),
    'Naive Bayes': GaussianNB(),
    'Random Forest': RandomForestClassifier(n_estimators=150, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, learning_rate=0.01, max_depth=6, random_state=0),
    'Extreme Boosting': XGBClassifier(n_estimators=100, max_depth=6, learning_rate=0.01, objective='binary:logistic'),
}

metrics = {
    'Model': [],
    'Accuracy': [],
    'Precision': [],
    'Recall': [],
    'F1 Score': [],
}

# The XGBoost model is trained and scored on 0/1 labels, every other model on
# the original 'Y'/'N' strings. The numeric labels live in their own variables:
# overwriting y_train / y_test inside the loop made the cell fail when re-run
# and made the scores depend on XGBoost being the last entry of the dict.
y_train_numeric = y_train.apply(lambda x: 1 if x == 'Y' else (0 if x == 'N' else x))
y_test_numeric = y_test.apply(lambda x: 1 if x == 'Y' else (0 if x == 'N' else x))

for name, model in models.items():
    if name == 'Extreme Boosting':
        fit_labels, eval_labels, pos_label = y_train_numeric, y_test_numeric, 1
    else:
        fit_labels, eval_labels, pos_label = y_train, y_test, 'Y'
    model.fit(data_train, fit_labels)
    y_pred = model.predict(data_test)
    metrics['Model'].append(name)
    metrics['Accuracy'].append(accuracy_score(eval_labels, y_pred))
    metrics['Precision'].append(precision_score(eval_labels, y_pred, pos_label=pos_label))
    metrics['Recall'].append(recall_score(eval_labels, y_pred, pos_label=pos_label))
    metrics['F1 Score'].append(f1_score(eval_labels, y_pred, pos_label=pos_label))


metrics_df = pd.DataFrame(metrics)

# Long format (one row per model/metric pair) for the grouped bar chart.
metrics_melted = metrics_df.melt(id_vars='Model', value_vars=['Accuracy', 'Precision','Recall','F1 Score'], var_name='Metric', value_name='Value')

plt.figure(figsize=(13, 8))
bars = sns.barplot(data=metrics_melted, x='Metric', y='Value', hue='Model')
for container in bars.containers:
    plt.bar_label(container, fmt='%.2f', label_type='edge')
plt.title('Comparison of Classification Models')
plt.ylabel('Score')
plt.xticks(rotation=0)
plt.legend(title='Model')
plt.show()

In [None]:
# Label-encode the undersampled split with one shared mapping per column.
data_train_undersampled, data_test_undersampled = encode_features(
    X_train_undersampled,
    features_to_encode,
    df_test=X_test_undersampled,
)

In [None]:
# Same comparison as before, trained and evaluated on the undersampled split.
# Tree and forest are seeded so the reported scores are reproducible.
models_undersampled = {
    'Support Vector Machine': make_pipeline(StandardScaler(), SVC()),
    'Decision Tree': tree.DecisionTreeClassifier(random_state=42),
    'Logistic Regression': make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000)),
    'Naive Bayes': GaussianNB(),
    'Random Forest': RandomForestClassifier(n_estimators=150, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, learning_rate=0.01, max_depth=6, random_state=0),
    'Extreme Boosting': XGBClassifier(n_estimators=100, max_depth=6, learning_rate=0.01, objective='binary:logistic'),
}

metrics_undersampled = {
    'Model': [],
    'Accuracy': [],
    'Precision': [],
    'Recall': [],
    'F1 Score': [],
}

# The XGBoost model is trained and scored on 0/1 labels, every other model on
# the original 'Y'/'N' strings. The numeric labels live in their own variables:
# overwriting the y_*_undersampled series inside the loop made the cell fail
# when re-run and made the scores depend on XGBoost being the last dict entry.
y_train_undersampled_numeric = y_train_undersampled.apply(lambda x: 1 if x == 'Y' else (0 if x == 'N' else x))
y_test_undersampled_numeric = y_test_undersampled.apply(lambda x: 1 if x == 'Y' else (0 if x == 'N' else x))

for name, model in models_undersampled.items():
    if name == 'Extreme Boosting':
        fit_labels, eval_labels, pos_label = y_train_undersampled_numeric, y_test_undersampled_numeric, 1
    else:
        fit_labels, eval_labels, pos_label = y_train_undersampled, y_test_undersampled, 'Y'
    model.fit(data_train_undersampled, fit_labels)
    y_pred = model.predict(data_test_undersampled)
    metrics_undersampled['Model'].append(name)
    metrics_undersampled['Accuracy'].append(accuracy_score(eval_labels, y_pred))
    metrics_undersampled['Precision'].append(precision_score(eval_labels, y_pred, pos_label=pos_label))
    metrics_undersampled['Recall'].append(recall_score(eval_labels, y_pred, pos_label=pos_label))
    metrics_undersampled['F1 Score'].append(f1_score(eval_labels, y_pred, pos_label=pos_label))


metrics_df_undersampled = pd.DataFrame(metrics_undersampled)

# Long format (one row per model/metric pair) for the grouped bar chart.
metrics_melted_undersampled = metrics_df_undersampled.melt(id_vars='Model', value_vars=['Accuracy', 'Precision','Recall','F1 Score'], var_name='Metric', value_name='Value')

plt.figure(figsize=(13, 8))
bars = sns.barplot(data=metrics_melted_undersampled, x='Metric', y='Value', hue='Model')
for container in bars.containers:
    plt.bar_label(container, fmt='%.2f', label_type='edge')
plt.title('Comparison of Classification Models')
plt.ylabel('Score')
plt.xticks(rotation=0)
# The y-axis starts at 0.2, so bar heights are not proportional to the scores.
plt.ylim(bottom=0.2)
plt.legend(title='Model',loc='lower center')
plt.show()

In [None]:
# Tag each long-format metrics table with the data it was trained on
# ('N' = original split, 'Y' = undersampled split) before stacking them.
metrics_melted['Balanced']='N'
metrics_melted_undersampled['Balanced']='Y'

In [None]:
metrics_melted_undersampled

In [None]:
# Stack both tables; the 'Balanced' column tells the rows apart.
combined_metric_df=pd.concat([metrics_melted_undersampled,metrics_melted])

In [None]:
# One facet per metric. Each bar aggregates the values of all models for that
# 'Balanced' tag (seaborn's barplot uses the mean as its default estimator).
g = sns.FacetGrid(data=combined_metric_df, col='Metric', height=7, aspect=0.5)
g.map_dataframe(sns.barplot, x='Balanced', y='Value')
# NOTE(review): no hue variable is mapped here, so this may not draw a legend — confirm.
g.add_legend()
plt.show()

In [None]:
X

In [None]:

# Same comparison as before, trained on the SMOTE-oversampled training data and
# evaluated on the untouched test split. Two model names had been corrupted
# ('Logistic Regtrain_ression', 'Random Fotrain_rest'); they are restored so
# the legend and the 'Model' column match the other comparison cells.
# Tree and forest are seeded so the reported scores are reproducible.
models_over = {
    'Support Vector Machine': make_pipeline(StandardScaler(), SVC()),
    'Decision Tree': tree.DecisionTreeClassifier(random_state=42),
    'Logistic Regression': make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000)),
    'Naive Bayes': GaussianNB(),
    'Random Forest': RandomForestClassifier(n_estimators=150, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, learning_rate=0.01, max_depth=6, random_state=0),
    'Extreme Boosting': XGBClassifier(n_estimators=100, max_depth=6, learning_rate=0.01, objective='binary:logistic'),
}

metrics_over = {
    'Model': [],
    'Accuracy': [],
    'Precision': [],
    'Recall': [],
    'F1 Score': [],
}

# The XGBoost model is trained and scored on 0/1 labels, every other model on
# the original 'Y'/'N' strings. The numeric labels live in their own variables:
# overwriting y_train_res / y_test_res inside the loop made the cell fail when
# re-run and made the scores depend on XGBoost being the last dict entry.
y_train_res_numeric = y_train_res.apply(lambda x: 1 if x == 'Y' else (0 if x == 'N' else x))
y_test_res_numeric = y_test_res.apply(lambda x: 1 if x == 'Y' else (0 if x == 'N' else x))

for name, model in models_over.items():
    if name == 'Extreme Boosting':
        fit_labels, eval_labels, pos_label = y_train_res_numeric, y_test_res_numeric, 1
    else:
        fit_labels, eval_labels, pos_label = y_train_res, y_test_res, 'Y'
    model.fit(X_train_res, fit_labels)
    y_pred = model.predict(X_test_res)
    metrics_over['Model'].append(name)
    metrics_over['Accuracy'].append(accuracy_score(eval_labels, y_pred))
    metrics_over['Precision'].append(precision_score(eval_labels, y_pred, pos_label=pos_label))
    metrics_over['Recall'].append(recall_score(eval_labels, y_pred, pos_label=pos_label))
    metrics_over['F1 Score'].append(f1_score(eval_labels, y_pred, pos_label=pos_label))

metrics_df_over = pd.DataFrame(metrics_over)

# Long format (one row per model/metric pair) for the grouped bar chart.
metrics_melted_over = metrics_df_over.melt(id_vars='Model', value_vars=['Accuracy', 'Precision', 'Recall', 'F1 Score'], var_name='Metric', value_name='Value')

plt.figure(figsize=(13, 8))
bars = sns.barplot(data=metrics_melted_over, x='Metric', y='Value', hue='Model')
for container in bars.containers:
    plt.bar_label(container, fmt='%.2f', label_type='edge')
plt.title('Comparison of Classification Models')
plt.ylabel('Score')
plt.xticks(rotation=0)
# The y-axis starts at 0.2, so bar heights are not proportional to the scores.
plt.ylim(bottom=0.2)
plt.legend(title='Model', loc='lower center')
plt.show()


In [None]:
# Re-tag the three metrics tables by training data:
# 'N' = original split, 'U' = undersampled, 'O' = oversampled (SMOTE).
# This overwrites the 'Y' tag set on the undersampled table earlier.
metrics_melted['Balanced']='N'
metrics_melted_undersampled['Balanced']='U'
metrics_melted_over['Balanced']='O'

In [None]:
# Stack all three tables; the 'Balanced' column tells the rows apart.
full_combined_metric_df=pd.concat([metrics_melted_undersampled,metrics_melted,metrics_melted_over])

In [None]:
# One facet per metric. Each bar aggregates the values of all models for that
# 'Balanced' tag (seaborn's barplot uses the mean as its default estimator).
g = sns.FacetGrid(data=full_combined_metric_df, col='Metric', height=7,aspect=0.4)
g.map_dataframe(sns.barplot, x='Balanced', y='Value')
# NOTE(review): no hue variable is mapped here, so this may not draw a legend — confirm.
g.add_legend()
plt.show()

In [None]:
df

# Using First 3 Principal Components

In [268]:
# Start again from the raw frame: target column and features without Loan_ID.
y_pca=df['Loan_Status']
X_pca=df.drop(columns=['Loan_Status','Loan_ID'])

In [269]:
# Cast to object so fill_missing_values takes its non-numeric (mode) branch for this column.
X_pca=X_pca.astype({'Credit_History':'object'})

In [270]:
X_pca=fill_missing_values(X_pca)

  df[column]=df[column].fillna(mode_value)


In [271]:
# No test frame is passed, so the second return value is None and is discarded.
X_pca,_=encode_features(X_pca,features=features_to_encode)

In [272]:
# Binary target: 1 for 'Y', 0 for every other value.
y_pca=y_pca.apply(lambda x:1 if x=='Y' else 0)

In [273]:
X_pca

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1,0,0,0,0,5849,0.0,146.412162,360.0,1,2
1,1,1,1,0,0,4583,1508.0,128.000000,360.0,1,0
2,1,1,0,0,1,3000,0.0,66.000000,360.0,1,2
3,1,1,0,1,0,2583,2358.0,120.000000,360.0,1,2
4,1,0,0,0,0,6000,0.0,141.000000,360.0,1,2
...,...,...,...,...,...,...,...,...,...,...,...
609,0,0,0,0,0,2900,0.0,71.000000,360.0,1,0
610,1,1,3,0,0,4106,0.0,40.000000,180.0,1,0
611,1,1,1,0,0,8072,240.0,253.000000,360.0,1,2
612,1,1,2,0,0,7583,0.0,187.000000,360.0,1,2


In [275]:
X_pca=StandardScaler().fit_transform(X_pca)

In [276]:
X_pca_train,X_pca_test,y_pca_train,y_pca_test=train_test_split(X_pca,y_pca,random_state=42,shuffle=True)

In [277]:
pca=PCA(n_components=3)

In [278]:
pca.fit(X_pca)

In [279]:
train_data=pca.transform(X_pca_train)
test_data=pca.transform(X_pca_test)

In [280]:
pca.explained_variance_ratio_

array([0.17391006, 0.13371826, 0.10389253])

In [281]:
# NOTE(review): log_reg is not referenced by any later cell visible here.
log_reg=LogisticRegression()

In [282]:
def classification_model(model):
    """Fit ``model`` on the PCA-projected training data and print test metrics.

    Uses the notebook-level names ``train_data``, ``test_data``,
    ``y_pca_train`` and ``y_pca_test``. Prints accuracy, precision, recall and
    F1 (each line prefixed with the model's repr) and returns ``None``.
    """
    model.fit(train_data, y_pca_train)
    predictions = model.predict(test_data)
    scorers = (
        ('Accuracy', accuracy_score),
        ('precision', precision_score),
        ('Recall', recall_score),
        ('f1 Score', f1_score),
    )
    for label, scorer in scorers:
        print(f'{model} {label} : ', scorer(y_pca_test, predictions))

In [283]:
# Evaluate three default-configured classifiers on the PCA-projected split.
classification_model(LogisticRegression())
classification_model(tree.DecisionTreeClassifier())
classification_model(SVC())

LogisticRegression() Accuracy :  0.7012987012987013
LogisticRegression() precision :  0.6875
LogisticRegression() Recall :  0.99
LogisticRegression() f1 Score :  0.8114754098360656
DecisionTreeClassifier() Accuracy :  0.7012987012987013
DecisionTreeClassifier() precision :  0.7368421052631579
DecisionTreeClassifier() Recall :  0.84
DecisionTreeClassifier() f1 Score :  0.7850467289719626
SVC() Accuracy :  0.6818181818181818
SVC() precision :  0.6758620689655173
SVC() Recall :  0.98
SVC() f1 Score :  0.8


# Using All Principal Components

In [284]:
# Start again from the raw frame: target column and features without Loan_ID.
y_pca=df['Loan_Status']
X_pca=df.drop(columns=['Loan_Status','Loan_ID'])

In [285]:
# Cast to object so fill_missing_values takes its non-numeric (mode) branch for this column.
X_pca=X_pca.astype({'Credit_History':'object'})

In [286]:
X_pca=fill_missing_values(X_pca)

  df[column]=df[column].fillna(mode_value)


In [287]:
# No test frame is passed, so the second return value is None and is discarded.
X_pca,_=encode_features(X_pca,features=features_to_encode)

In [288]:
# Binary target: 1 for 'Y', 0 for every other value.
y_pca=y_pca.apply(lambda x:1 if x=='Y' else 0)

In [291]:
X_pca=StandardScaler().fit_transform(X_pca)

In [292]:
X_pca_train,X_pca_test,y_pca_train,y_pca_test=train_test_split(X_pca,y_pca,random_state=42,shuffle=True)

In [293]:
pca=PCA()

In [294]:
pca.fit(X_pca)

In [295]:
train_data=pca.transform(X_pca_train)
test_data=pca.transform(X_pca_test)

In [296]:
pca.explained_variance_ratio_

array([0.17391006, 0.13371826, 0.10389253, 0.09714503, 0.09120898,
       0.08967639, 0.08093971, 0.07330465, 0.07213718, 0.05251229,
       0.03155491])

In [297]:
def classification_model(model):
    """Train ``model`` on the projected training set and report test scores.

    Works on the notebook-level ``train_data`` / ``y_pca_train`` (fit) and
    ``test_data`` / ``y_pca_test`` (scoring). Four lines are printed, one each
    for accuracy, precision, recall and F1; nothing is returned.
    """
    model.fit(train_data, y_pca_train)
    predicted = model.predict(test_data)
    report = [
        ('Accuracy', accuracy_score),
        ('precision', precision_score),
        ('Recall', recall_score),
        ('f1 Score', f1_score),
    ]
    for metric_name, metric_fn in report:
        print(f'{model} {metric_name} : ', metric_fn(y_pca_test, predicted))

In [298]:
# Evaluate three default-configured classifiers on the PCA-projected split.
classification_model(LogisticRegression())
classification_model(tree.DecisionTreeClassifier())
classification_model(SVC())

LogisticRegression() Accuracy :  0.7727272727272727
LogisticRegression() precision :  0.7480916030534351
LogisticRegression() Recall :  0.98
LogisticRegression() f1 Score :  0.8484848484848485
DecisionTreeClassifier() Accuracy :  0.7272727272727273
DecisionTreeClassifier() precision :  0.7589285714285714
DecisionTreeClassifier() Recall :  0.85
DecisionTreeClassifier() f1 Score :  0.8018867924528302
SVC() Accuracy :  0.7857142857142857
SVC() precision :  0.7557251908396947
SVC() Recall :  0.99
SVC() f1 Score :  0.8571428571428571


# Using Selective Principal Components

In [299]:
# Start again from the raw frame: target column and features without Loan_ID.
y_pca=df['Loan_Status']
X_pca=df.drop(columns=['Loan_Status','Loan_ID'])

In [300]:
# Cast to object so fill_missing_values takes its non-numeric (mode) branch for this column.
X_pca=X_pca.astype({'Credit_History':'object'})

In [301]:
X_pca=fill_missing_values(X_pca)

  df[column]=df[column].fillna(mode_value)


In [302]:
# No test frame is passed, so the second return value is None and is discarded.
X_pca,_=encode_features(X_pca,features=features_to_encode)

In [303]:
# Binary target: 1 for 'Y', 0 for every other value.
y_pca=y_pca.apply(lambda x:1 if x=='Y' else 0)

In [304]:
X_pca=StandardScaler().fit_transform(X_pca)

In [305]:
X_pca_train,X_pca_test,y_pca_train,y_pca_test=train_test_split(X_pca,y_pca,random_state=42,shuffle=True)

In [306]:
pca=PCA()

In [307]:
pca.fit(X_pca)

In [309]:
pca.explained_variance_ratio_

array([0.17391006, 0.13371826, 0.10389253, 0.09714503, 0.09120898,
       0.08967639, 0.08093971, 0.07330465, 0.07213718, 0.05251229,
       0.03155491])

In [None]:
# NOTE(review): the PCA object fitted above is created without n_components,
# yet the output saved below lists only three ratios. The saved outputs look
# like they come from a different run; confirm which components this section
# is meant to keep.
train_data=pca.transform(X_pca_train)
test_data=pca.transform(X_pca_test)

In [None]:
pca.explained_variance_ratio_

array([0.17391006, 0.13371826, 0.10389253])

In [None]:
# NOTE(review): log_reg is not referenced by any later cell visible here.
log_reg=LogisticRegression()

In [None]:
def classification_model(model):
    """Fit ``model`` and print its accuracy, precision, recall and F1 on the test data.

    The data is taken from the notebook-level names ``train_data``,
    ``y_pca_train``, ``test_data`` and ``y_pca_test``. Returns ``None``.
    """
    model.fit(train_data, y_pca_train)
    y_hat = model.predict(test_data)
    for title, score in (
        ('Accuracy', accuracy_score),
        ('precision', precision_score),
        ('Recall', recall_score),
        ('f1 Score', f1_score),
    ):
        print(f'{model} {title} : ', score(y_pca_test, y_hat))

In [None]:
# Evaluate three default-configured classifiers on the PCA-projected split.
classification_model(LogisticRegression())
classification_model(tree.DecisionTreeClassifier())
classification_model(SVC())

LogisticRegression() Accuracy :  0.7012987012987013
LogisticRegression() precision :  0.6875
LogisticRegression() Recall :  0.99
LogisticRegression() f1 Score :  0.8114754098360656
DecisionTreeClassifier() Accuracy :  0.7012987012987013
DecisionTreeClassifier() precision :  0.7368421052631579
DecisionTreeClassifier() Recall :  0.84
DecisionTreeClassifier() f1 Score :  0.7850467289719626
SVC() Accuracy :  0.6818181818181818
SVC() precision :  0.6758620689655173
SVC() Recall :  0.98
SVC() f1 Score :  0.8
