In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

warnings.simplefilter("ignore")


In [None]:
# Load the raw dataset from the working directory and preview the first rows.
df = pd.read_csv('Default_Fin.csv')
df.head()

In [None]:
# Drop the trailing "?" from the target column name so it is easier to reference.
df = df.rename({'Defaulted?': 'Defaulted'}, axis='columns')

In [None]:
# Summarise column dtypes and non-null counts.
df.info()

## Null Values

In [None]:
# Heatmap of the boolean null mask: any cell flagged True marks a missing value.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(df.isnull(), cmap='CMRmap', ax=ax)
plt.show()

As the heatmap shows, there are no null values in the dataset.

In [None]:
# Remove the 'Index' column (presumably just a row identifier — confirm against the data).
# errors='ignore' keeps this cell re-runnable once the column is already gone;
# without it a second execution raises KeyError.
df = df.drop(columns='Index', errors='ignore')
df.columns

In [None]:
# Preview the first rows of the current frame.
df.head()

In [None]:
# Plot the distribution of every column, one figure per column.
# sns.distplot is deprecated; histplot on a density scale with a KDE overlay
# is the documented replacement and draws the same histogram + curve.
for column in df.columns:
    sns.histplot(df[column], kde=True, stat='density')
    plt.show()
    
    

1) As we can see, the bank balance and annual salary columns have approximately Gaussian distributions.

2) Defaulted and Employed are binary columns.

In [None]:
# Summary statistics for each column; the observations that follow are read from this table.
df.describe()

1) Approximately 70.56 percent of the customers are employed.

2) The average bank balance of the customers is Rs.10024, with a minimum of Rs.0 and a maximum of Rs.31851.

3) The average annual salary of the customers is Rs.402203, with a minimum of Rs.9263.

In [None]:
# Pairwise relationships between all columns, coloured by default status.
# pairplot creates its own figure, so a preceding plt.figure() call only
# produces an extra empty figure and has no effect on the grid's size.
sns.pairplot(df, hue='Defaulted')
plt.show()

In [None]:
# Partition the customers by the binary employment flag.
employed = df.loc[df['Employed'] == 1]
unemployed = df.loc[df['Employed'] == 0]

In [None]:
# Share of defaulters among employed customers.
# value_counts() orders by frequency, not by class value, so the class order
# is pinned to [1, 0] here; otherwise the hard-coded names are attached to
# the wrong slices whenever non-defaulters are the majority.
employed_counts = employed['Defaulted'].value_counts().reindex([1, 0], fill_value=0)
fig = px.pie(
    values=employed_counts.values,
    names=['Defaulters', 'Non-Defaulters'],
    title='Distribution of defaulters who are Employed',
)
# One pull value per slice: only the "Defaulters" slice is offset.
fig.update_traces(pull=[0.2, 0])
fig.show()

In [None]:
# Share of defaulters among unemployed customers.
# value_counts() orders by frequency, not by class value, so the class order
# is pinned to [1, 0] here; otherwise the hard-coded names are attached to
# the wrong slices whenever non-defaulters are the majority.
unemployed_counts = unemployed['Defaulted'].value_counts().reindex([1, 0], fill_value=0)
fig = px.pie(
    values=unemployed_counts.values,
    names=['Defaulters', 'Non-Defaulters'],
    title='Distribution of defaulters who are Unemployed',
)
# One pull value per slice: only the "Defaulters" slice is offset.
fig.update_traces(pull=[0.2, 0])
fig.show()

In [None]:
# Density of bank balance for defaulters vs. non-defaulters on one axis.
fig = plt.figure(figsize=(20, 9))
sns.set_style("dark")
# Draw class 1 first, then class 0: the legend labels below rely on this order.
for default_flag in (1, 0):
    sns.kdeplot(df.loc[df['Defaulted'] == default_flag, 'Bank Balance'])
plt.title('Default x Bank Balance', fontsize=25)
plt.legend(labels=['Defaulted', 'Did Not Default'])
plt.show()

In [None]:
# Density of annual salary for defaulters vs. non-defaulters on one axis.
fig = plt.figure(figsize=(20, 9))
sns.set_style("dark")
# Draw class 1 first, then class 0: the legend labels below rely on this order.
for default_flag in (1, 0):
    sns.kdeplot(df.loc[df['Defaulted'] == default_flag, 'Annual Salary'])
plt.title('Default x Annual Salaries')
plt.legend(labels=['Defaulted', 'Did Not Default'])
plt.show()

In [None]:
# Correlation matrix of the numeric columns, annotated with the coefficients.
# The frame is bound to `df` in this notebook; the name `data` is not defined
# in any visible cell, so the previous call failed with a NameError.
# select_dtypes guards against non-numeric columns if the cell is re-run after
# the categorical bucket columns have been added.
plt.figure(figsize=(10, 8))
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True, cmap='Blues')
plt.show()

In [None]:
# Default counts split by employment status.
fig, ax = plt.subplots(figsize=(10, 8))
sns.countplot(data=df, x='Employed', hue='Defaulted', palette='nipy_spectral', ax=ax)
ax.set_xlabel('Employed', fontsize=17)
plt.show()

In [None]:
# NOTE: plain assignment binds a second name to the same DataFrame object;
# no copy is made, so columns assigned through df1 also appear on df.
df1=df
df1.head()

In [None]:
# Bucket the two continuous features so default counts can be compared per range.
# pd.cut intervals are right-inclusive and, by default, exclude the first edge,
# so a value equal to 0 or above the last edge becomes NaN and silently drops
# out of the count plots. include_lowest=True keeps zero balances, and the
# open-ended last balance bin keeps balances above 20k (the describe() notes
# report a minimum of 0 and a maximum of about 31.8k).
bank_balance_bins = [0, 5000, 10000, 15000, 20000, np.inf]
bank_balance_labels = ['<5k', '5k-10k', '10k-15k', '15k-20k', '>20k']
annual_salary_bins = [0, 100000, 300000, 500000, 1000000]
annual_salary_labels = ['<100k', '100k-300k', '300k-500k', '500k-1M']
# NOTE(review): salaries above 1M, if any, still fall outside the bins — confirm the data range.

# Assigned in place on purpose: later cells expect these columns on the frame.
df1['Bank Balance Bucket'] = pd.cut(
    df1['Bank Balance'],
    bins=bank_balance_bins,
    labels=bank_balance_labels,
    include_lowest=True,
)
df1['Annual Salary Bucket'] = pd.cut(
    df1['Annual Salary'],
    bins=annual_salary_bins,
    labels=annual_salary_labels,
    include_lowest=True,
)

# Preview the frame with the new bucket columns.
df1.head()

In [None]:
# Default counts per bank-balance bucket.
fig, ax = plt.subplots(figsize=(10, 8))
sns.countplot(data=df1, x='Bank Balance Bucket', hue='Defaulted', palette='twilight_shifted_r', ax=ax)
ax.set_xlabel('Bank Balance', fontsize=17)
plt.show()

In [None]:
# Default counts per annual-salary bucket.
plt.figure(figsize=(10, 8))
sns.countplot(data=df1, x='Annual Salary Bucket', hue='Defaulted', palette='twilight_shifted_r')
# The x axis shows salary buckets, so label it accordingly (it previously read 'Bank Balance').
plt.xlabel('Annual Salary', fontsize=17)
plt.show()

In [None]:
# Remove the plotting-only bucket columns before modelling.
# errors='ignore' keeps the cell re-runnable: a second execution would
# otherwise raise KeyError because the columns are already gone.
df = df.drop(columns=['Bank Balance Bucket', 'Annual Salary Bucket'], errors='ignore')

### Scaling the data

In [None]:
# Separate the features from the target and standardise the features.
# StandardScaler is imported in the notebook's top import cell.
features = df.drop('Defaulted', axis=1)
y = df['Defaulted']

# fit_transform returns a bare ndarray, so rebuild the frame with the feature
# frame's own labels and index. Taking the labels from df.iloc[:, :3] was only
# correct while the target happened to be the last of exactly four columns.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set statistics leak into training; consider fitting it on
# the training fold only.
x = pd.DataFrame(
    StandardScaler().fit_transform(features),
    columns=features.columns,
    index=features.index,
)

### Splitting the data 

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
# train_test_split lives in sklearn.model_selection and is imported in the
# notebook's top import cell (it was previously used without being imported).
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.3, random_state=100)

# Report the shape of each split.
for split_name, split in (('Xtrain', xtrain), ('Xtest', xtest), ('Ytrain', ytrain), ('Ytest', ytest)):
    print(f'{split_name} shape', split.shape)


### Model Building

In [None]:
# Train a set of baseline classifiers and compare them on the held-out split.
# The stochastic estimators get a fixed random_state so that reported metrics
# can be reproduced on a fresh run.
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(random_state=100),
    'Random Forest': RandomForestClassifier(random_state=100),
    'Gradient Boosting': GradientBoostingClassifier(random_state=100),
    'Gaussian NB': GaussianNB(),
}

# model name -> {metric name -> value}
results = {}

for model_name, model in models.items():
    model.fit(xtrain, ytrain)
    ypred = model.predict(xtest)
    # ROC AUC measures ranking quality, so score it on the positive-class
    # probability rather than on the thresholded 0/1 predictions.
    # Column 1 is the positive class, assuming the target is encoded as 0/1.
    yscore = model.predict_proba(xtest)[:, 1]

    results[model_name] = {
        'Accuracy': accuracy_score(ytest, ypred),
        'f1_score': f1_score(ytest, ypred),
        'Recall': recall_score(ytest, ypred),
        'Precision': precision_score(ytest, ypred),
        'roc_auc_score': roc_auc_score(ytest, yscore),
    }

# Print the metrics of every model, one block per model.
for model_name, metrics in results.items():
    print(f"Metrics for {model_name}:")
    for metric_name, value in metrics.items():
        print(f"{metric_name}:{value}")
    print()

Among the models above, Logistic Regression and GaussianNB achieved the highest accuracy. However, because the data is imbalanced, accuracy is not a suitable criterion for model performance, so we use the F1 score instead. The model with the highest F1 score is Gradient Boosting, with an f1_score of 0.37241379310344824.

### ROC_AUC CURVE

In [None]:
# ROC curve for the gradient boosting model.
# roc_curve and roc_auc_score are imported in the notebook's top import cell.
gbc = GradientBoostingClassifier(random_state=100)
model = gbc.fit(xtrain, ytrain)

# Use the positive-class probability so roc_curve can sweep every threshold;
# hard 0/1 predictions collapse the curve to a single operating point and
# understate the AUC. Column 1 is the positive class, assuming a 0/1 target.
yscore = model.predict_proba(xtest)[:, 1]

fpr, tpr, thresholds = roc_curve(ytest, yscore)
roc_auc = roc_auc_score(ytest, yscore)

plt.figure(figsize=(17, 8))
plt.plot(fpr, tpr, label='ROC curve (AUC = {:.2f})'.format(roc_auc))
# Diagonal reference line: the expected curve of a random classifier.
plt.plot([0, 1], [0, 1], 'k--', label='Random')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize=17)
plt.ylabel('True Positive Rate', fontsize=17)
plt.title('Receiver Operating Characteristic (ROC) Curve', fontsize=25)
plt.legend(loc='lower right', fontsize=14)
plt.show()





