# IMPORT LIBRARIES

In [None]:
# Importing all libraries to be used
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
from ydata_profiling import ProfileReport
import plotly.express as px
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.metrics import roc_curve

# DATA LOADING

In [None]:
# Read the Kaggle loan-approval CSV into the working DataFrame
DATA_PATH = '/kaggle/input/loan-approval-prediction-dataset/loan_approval_dataset.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# print all column
pd.set_option('display.max_columns', None)
# print first 5 rows
df.head()

# EXPLORE THE DATASET

In [None]:
df.shape

In [None]:
# Display dataset information
df.info()

In [None]:
# Report how many missing entries each column contains
missing_per_column = df.isna().sum()
print(missing_per_column)

# Simplest strategy: keep only the rows that are fully populated
df = df.dropna(axis=0, how='any')

In [None]:
# Check data types
print(df.dtypes)

In [None]:
df.nunique()

In [None]:
df.describe().T

In [None]:
# Visual check for gaps: every cell is coloured by whether it is null
null_mask = df.isna()
sns.heatmap(null_mask)

In [None]:
# Histogram
df.hist(figsize=(10, 10))
plt.show()

# ADD OTHER COLUMNS

In [None]:
df.columns

In [None]:
a=[' no_of_dependents', ' education', ' self_employed', ' income_annum',
       ' loan_amount', ' loan_term', ' cibil_score',
       ' residential_assets_value', ' commercial_assets_value',
       ' luxury_assets_value', ' bank_asset_value', ' loan_status']

In [None]:
b=['no_of_dependents', 'education', 'self_employed', 'income_annum',
       'loan_amount', 'loan_term', 'cibil_score',
       'residential_assets_value', 'commercial_assets_value',
       'luxury_assets_value', 'bank_asset_value', 'loan_status']

In [None]:
dic = dict(zip(a,b))
dic

In [None]:
df=df.rename(columns=dic)
df.columns

In [None]:
df.head()

# DATA CLEANING

In [None]:
df.duplicated().sum()

In [None]:
# Check the null values
df.isnull().sum()

In [None]:
df.columns

In [None]:
# Remove leading/trailing whitespace from every column label
df.columns = df.columns.map(str.strip)
df.columns

In [None]:
# Trim surrounding whitespace from the values of the three text columns
for text_column in ('education', 'self_employed', 'loan_status'):
    df[text_column] = df[text_column].str.strip()

In [None]:
print(df.isnull().sum())

#  OUTLIER DETECTION

In [None]:
# One box plot per numeric column on a 4x4 grid, each with its own x-axis
df.plot(
    kind='box',
    subplots=True,
    layout=(4, 4),
    sharex=False,
    figsize=(12, 12),
)
plt.tight_layout()

### Detecting Outliers using IQR and Z-score

In [None]:
def detect_outliers_iqr(df, column):
    """Return the rows of ``df`` whose ``column`` value lies outside the IQR fences.

    The fences sit 1.5 interquartile ranges below the first quartile and
    1.5 interquartile ranges above the third quartile. Values exactly on a
    fence are not reported.
    """
    values = df[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile
    fence_low = first_quartile - 1.5 * spread
    fence_high = third_quartile + 1.5 * spread
    is_outlier = values.lt(fence_low) | values.gt(fence_high)
    return df[is_outlier]

In [None]:
def detect_outliers_zscore(df, column, threshold=3):
    """Return the rows of ``df`` whose ``column`` z-score magnitude exceeds ``threshold``.

    Args:
        df: DataFrame to inspect.
        column: Name of a numeric column in ``df``.
        threshold: Absolute z-score above which a row is reported (default 3).

    Returns:
        The subset of ``df`` whose value in ``column`` is reported as an
        outlier. Rows with a missing value in ``column`` are never reported.
    """
    values = df[column].to_numpy(dtype=float)
    # nan_policy='omit' stops a single missing value from turning every
    # z-score into NaN, which would silently hide all outliers.
    z_scores = np.abs(stats.zscore(values, nan_policy='omit'))
    # NaN > threshold evaluates to False, so missing rows are excluded.
    outliers = df[z_scores > threshold]
    return outliers

In [None]:
# Compare, column by column, how many rows each detection method reports
numeric_cols = df.select_dtypes(include=[np.number]).columns
for column in numeric_cols:
    n_iqr = len(detect_outliers_iqr(df, column))
    n_zscore = len(detect_outliers_zscore(df, column))

    print(f"\nColumn: {column}")
    print(f"IQR method outliers: {n_iqr}")
    print(f"Z-score method outliers: {n_zscore}")

In [None]:
# Three diagnostic views of bank_asset_value: box plot, histogram with KDE, normal Q-Q plot
fig, (ax_box, ax_hist, ax_qq) = plt.subplots(1, 3, figsize=(15, 5))

sns.boxplot(df['bank_asset_value'], ax=ax_box)
ax_box.set_title('Boxplot')

sns.histplot(df['bank_asset_value'], kde=True, ax=ax_hist)
ax_hist.set_title('Histogram with KDE')

# probplot draws onto the given axes and sets its own title, so ours goes last
stats.probplot(df['bank_asset_value'], dist="norm", plot=ax_qq)
ax_qq.set_title('Q-Q Plot')

fig.tight_layout()
plt.show()

In [None]:
# Three diagnostic views of commercial_assets_value: box plot, histogram with KDE, normal Q-Q plot
fig, (ax_box, ax_hist, ax_qq) = plt.subplots(1, 3, figsize=(15, 5))

sns.boxplot(df['commercial_assets_value'], ax=ax_box)
ax_box.set_title('Boxplot')

sns.histplot(df['commercial_assets_value'], kde=True, ax=ax_hist)
ax_hist.set_title('Histogram with KDE')

# probplot draws onto the given axes and sets its own title, so ours goes last
stats.probplot(df['commercial_assets_value'], dist="norm", plot=ax_qq)
ax_qq.set_title('Q-Q Plot')

fig.tight_layout()
plt.show()

In [None]:
# Three diagnostic views of luxury_assets_value: box plot, histogram with KDE, normal Q-Q plot
fig, (ax_box, ax_hist, ax_qq) = plt.subplots(1, 3, figsize=(15, 5))

sns.boxplot(df['luxury_assets_value'], ax=ax_box)
ax_box.set_title('Boxplot')

sns.histplot(df['luxury_assets_value'], kde=True, ax=ax_hist)
ax_hist.set_title('Histogram with KDE')

# probplot draws onto the given axes and sets its own title, so ours goes last
stats.probplot(df['luxury_assets_value'], dist="norm", plot=ax_qq)
ax_qq.set_title('Q-Q Plot')

fig.tight_layout()
plt.show()

### Treating Outliers

Capping outliers: in the loan approval dataset, extreme values in the commercial, luxury and bank asset columns are plausible, since some applicants hold large bank balances and own high-value properties. Because these values can be crucial for our analysis and modelling, we cap them rather than remove them.

In [None]:
def cap_outliers(df, column, method='IQR'):
    """Clip the values of ``column`` to outlier bounds and return ``df``.

    Args:
        df: DataFrame to modify; ``column`` is overwritten on this object.
        column: Name of the numeric column to cap.
        method: ``'IQR'`` clips to Q1 - 1.5 * IQR and Q3 + 1.5 * IQR;
            ``'zscore'`` clips to the mean -/+ 3 standard deviations.

    Returns:
        The same ``df`` object, with ``column`` clipped to the bounds.

    Raises:
        ValueError: If ``method`` is neither ``'IQR'`` nor ``'zscore'``.
    """
    if method == 'IQR':
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
    elif method == 'zscore':
        mean = df[column].mean()
        std = df[column].std()
        lower_bound = mean - 3 * std
        upper_bound = mean + 3 * std
    else:
        # Without this branch an unknown method reached the clip call with
        # undefined bounds and failed with an opaque UnboundLocalError.
        raise ValueError(f"Unknown method {method!r}; expected 'IQR' or 'zscore'")

    df[column] = np.clip(df[column], lower_bound, upper_bound)
    return df

# Cap every numeric column in turn (switch method to 'zscore' for mean -/+ 3 std bounds)
numeric_cols = df.select_dtypes(include=[np.number]).columns
for column in numeric_cols:
    df = cap_outliers(df, column, method='IQR')

In [None]:
df.shape

In [None]:
# Same three diagnostic views of commercial_assets_value, drawn again after capping
fig, (ax_box, ax_hist, ax_qq) = plt.subplots(1, 3, figsize=(15, 5))

sns.boxplot(df['commercial_assets_value'], ax=ax_box)
ax_box.set_title('Boxplot')

sns.histplot(df['commercial_assets_value'], kde=True, ax=ax_hist)
ax_hist.set_title('Histogram with KDE')

# probplot draws onto the given axes and sets its own title, so ours goes last
stats.probplot(df['commercial_assets_value'], dist="norm", plot=ax_qq)
ax_qq.set_title('Q-Q Plot')

fig.tight_layout()
plt.show()

### Correlation Between Features 

In [None]:
# Work on a copy so the encoded columns never leak into the main DataFrame
df_corr = df.copy()

# Turn the two-valued text columns into 0/1 so they can enter the correlation matrix
label_codes = {'No': 0, 'Yes': 1, 'Not Graduate': 0, 'Graduate': 1, 'Rejected': 0, 'Approved': 1}
binary_cats = ['education', 'self_employed', 'loan_status']
for name in binary_cats:
    df_corr[name] = df_corr[name].map(label_codes)

# Pairwise correlations of all columns, rendered as an annotated heatmap
plt.figure(figsize=(12, 10))
correlation_matrix = df_corr.corr()
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    linewidths=0.5,
)
plt.title('Correlation Heatmap')
plt.show()

# DISTRIBUTION OF THE DATASET EACH FEATURE

In [None]:
# Histogram (with KDE overlay) of every numeric column on one 4x4 canvas
numeric_columns = df.select_dtypes(include=[np.number]).columns

plt.figure(figsize=(20, 15))
for position, column in enumerate(numeric_columns, start=1):
    ax = plt.subplot(4, 4, position)
    sns.histplot(data=df, x=column, kde=True, ax=ax)
    ax.set_title(column)
    # The title already names the column, so the x-label is blanked
    ax.set_xlabel('')

plt.tight_layout()
plt.show()

# VISUALIZATION NUMERIC FEATURES RELATIONSHIP(CORRELATION)

In [None]:
# Loan amount against annual income, coloured by approval outcome
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='income_annum', y='loan_amount', hue='loan_status', ax=ax)
ax.set_xlabel('Annual Income')
ax.set_ylabel('Loan Amount')
ax.set_title('Loan Amount vs Income by Loan Status')
plt.show()

In [None]:
# Bank balance against annual income, coloured by approval outcome
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='income_annum', y='bank_asset_value', hue='loan_status')
plt.xlabel('Annual Income')
plt.ylabel('Bank Balance')
# The title now names the plotted variables (loan amount is not on either axis)
plt.title('Bank Balance vs Income by Loan Status')
plt.show()

In [None]:
# Luxury asset value against bank balance, coloured by approval outcome
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='bank_asset_value', y='luxury_assets_value' , hue='loan_status')
plt.xlabel('Bank Balance')
plt.ylabel('Luxuries ')
# The title now names the plotted variables (loan amount is not on either axis)
plt.title('Luxury Assets vs Bank Balance by Loan Status')
plt.show()

### Applicants with more balance in their accounts tend to buy high value luxury items

In [None]:
# Pairwise relationships among the key numeric variables, split by loan status
pair_columns = ['income_annum', 'loan_amount', 'cibil_score', 'loan_status']
sns.pairplot(df[pair_columns], hue='loan_status')
plt.show()

### No clear relationship between CIBIL score and either annual income or loan amount

In [None]:
# Annual income distribution for each loan status
plt.figure(figsize=(10, 6))
sns.boxplot(x='loan_status', y='income_annum', data=df)
# The y-axis is income_annum, so the title names income rather than CIBIL score
plt.title('Annual Income Distribution by Loan Status')
plt.show()

## ANALYZING THE FEATURE HAVING THE HIGH CHANCE OF LOAN APPROVAL

In [None]:
# Histogram (with KDE overlay) of every numeric column, split by loan status
numeric_columns = df.select_dtypes(include=[np.number]).columns

plt.figure(figsize=(20, 15))
for position, column in enumerate(numeric_columns, start=1):
    ax = plt.subplot(4, 4, position)
    sns.histplot(data=df, x=column, kde=True, hue='loan_status', ax=ax)
    ax.set_title(column)
    # The title already names the column, so the x-label is blanked
    ax.set_xlabel('')

plt.tight_layout()
plt.show()

### As cibil_score increases, loan approvals become more frequent
This indicates that applicants with a good credit history and loan repayment record tend to have a higher chance of loan approval.

In [None]:
# Spread of CIBIL scores within each loan status
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='loan_status', y='cibil_score', ax=ax)
ax.set_title('CIBIL Score Distribution by Loan Status')
plt.show()

### Applicants with a good credit history and loan repayment record tend to have a higher chance of loan approval: most rejections fall below a CIBIL score of 500, whereas applicants scoring above 600 tend to be approved, with only a few exceptions

### Analyzing Cause of the Outlier

In [None]:
# Summary statistics of the CIBIL score, to help choose a sensible threshold
print("CIBIL Score range:")
print(df['cibil_score'].describe())

# Rejections above this score are the unusual cases worth inspecting
high_cibil_threshold = 700  # Adjust this value based on the CIBIL score range in your dataset
is_high_score = df['cibil_score'] > high_cibil_threshold
is_rejected = df['loan_status'] == 'Rejected'
high_cibil_rejected = df[is_high_score & is_rejected]
print(f"\nNumber of high CIBIL score (>{high_cibil_threshold}) rejections: {len(high_cibil_rejected)}")

plt.figure(figsize=(12, 6))
sns.scatterplot(x='cibil_score', y='loan_status', data=df)
plt.title('CIBIL Score vs Loan Status')
# loan_status still holds the text labels here, so the axis shows
# "Approved"/"Rejected"; the label must not promise a 0/1 encoding.
plt.ylabel('Loan Status')
plt.show()

# MODELLING

## FEATURE SELECTION

In [None]:
# Drop identifier columns that carry no predictive information.
# errors='ignore' keeps the cell re-runnable: on a second run 'loan_id' is
# already gone and a plain drop would raise KeyError.
columns_to_drop = ['loan_id']
df = df.drop(columns=columns_to_drop, errors='ignore')

# Separate the predictors from the target
X = df.drop('loan_status', axis=1)
y = df['loan_status']

# One-hot encode the text predictors (dropping the first level of each to
# avoid redundant columns) and encode the target as 0 = Rejected, 1 = Approved
X = pd.get_dummies(X, drop_first=True)
y = y.map({'Rejected': 0, 'Approved': 1})

### Manual Data Splitting:

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Feature Scaling:

In [None]:
# Learn the scaling statistics from the training split only, then apply them to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Model Training (KNN):

In [None]:
# Number of neighbours consulted per prediction; adjust as needed
N_NEIGHBORS = 16
knn = KNeighborsClassifier(n_neighbors=N_NEIGHBORS)
knn.fit(X_train_scaled, y_train)

### Model Evaluation:

In [None]:
# Hard class labels drive the threshold-based metrics
y_pred = knn.predict(X_test_scaled)
# Probability of class 1 ("Approved" after the target encoding); ROC-AUC is a
# ranking metric and needs continuous scores rather than 0/1 labels
y_score = knn.predict_proba(X_test_scaled)[:, 1]

# Evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
# Scoring hard labels would collapse the ROC curve to a single operating
# point and misreport the area under it.
roc_auc = roc_auc_score(y_test, y_score)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")
print(f"ROC-AUC: {roc_auc:.4f}")

# Confusion Matrix (rows: actual class, columns: predicted class)
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

### Cross-validation:

In [None]:
from sklearn.base import clone
from sklearn.pipeline import make_pipeline

# Scale inside the pipeline so the scaler is refitted on each fold's training
# part only. Cross-validating on data scaled beforehand lets the validation
# rows influence the scaling statistics (data leakage).
# clone() gives an unfitted copy, leaving the already-trained knn untouched.
cv_model = make_pipeline(StandardScaler(), clone(knn))
cv_scores = cross_val_score(cv_model, X_train, y_train, cv=5)
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean CV score: {cv_scores.mean():.4f}")

### ROC Curve:

In [None]:
from sklearn.metrics import roc_curve

# Positive-class probabilities define the ROC curve
y_pred_proba = knn.predict_proba(X_test_scaled)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
# Compute the AUC from the same scores that define the curve, so the legend
# always matches the plotted line
curve_auc = roc_auc_score(y_test, y_pred_proba)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {curve_auc:.2f})')
# Diagonal = performance of random guessing
plt.plot([0, 1], [0, 1], linestyle='--', label='Random Classifier')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend()
plt.show()

In [None]:
# Normalise any status labels that still carry the raw file's leading space
status_fix = {' Approved': 'Approved', ' Rejected': 'Rejected'}
df['loan_status'] = df['loan_status'].replace(status_fix)

In [None]:
import scipy.stats as stats

In [None]:
# Test every feature against loan_status, in DataFrame column order:
# chi-square test of independence for text columns, two-sample t-test otherwise
pvalue = []
approved_rows = df['loan_status'] == 'Approved'
rejected_rows = df['loan_status'] == 'Rejected'
for feature in df.drop('loan_status', axis=1).columns:
    if df[feature].dtype == 'object':
        contingency = pd.crosstab(df[feature], df['loan_status'])
        _, pval, _, _ = stats.chi2_contingency(contingency)
    else:
        approved_values = df[approved_rows][feature]
        rejected_values = df[rejected_rows][feature]
        _, pval = stats.ttest_ind(approved_values, rejected_values)
    pvalue.append(pval)

In [None]:
pvalue

In [None]:
df.columns

In [None]:
# Feature names must follow exactly the order in which the p-values were
# computed: the DataFrame's column order, minus the target. The earlier
# hand-typed list started with education and self_employed, while the
# DataFrame starts with no_of_dependents, so p-values were paired with the
# wrong features.
col = df.drop('loan_status', axis=1).columns.tolist()

In [None]:
sig = pd.DataFrame({'Features':col,'p-value':pvalue})
sig

In [None]:
sig[sig['p-value']>0.05]['Features'].values

In [None]:
# NOTE(review): this subset leaves out 'loan_term' and 'cibil_score' - confirm
# against the p-value table that dropping these two is what is intended.
selected_columns = [
    'education', 'self_employed', 'no_of_dependents', 'income_annum',
    'loan_amount', 'residential_assets_value',
    'commercial_assets_value', 'luxury_assets_value',
    'bank_asset_value', 'loan_status',
]
# .copy() makes the subset an independent DataFrame, so the column
# assignments further down do not act on a slice of the old frame
# (SettingWithCopyWarning / ambiguous writes).
df = df[selected_columns].copy()
df.head()

In [None]:
df['education'].unique()

In [None]:
df['self_employed'].unique()

In [None]:
df.columns

In [None]:
# Encode the two-valued text columns as 0/1.
# The raw CSV values carry a leading space, but the data-cleaning step strips
# them, so keys with only the leading-space spelling never match and the
# columns stay as text. Both spellings are mapped so the encoding works
# whether or not the values were stripped.
education_codes = {'Graduate': 1, 'Not Graduate': 0, ' Graduate': 1, ' Not Graduate': 0}
self_employed_codes = {'No': 0, 'Yes': 1, ' No': 0, ' Yes': 1}
df['education'] = df['education'].replace(education_codes)
df['self_employed'] = df['self_employed'].replace(self_employed_codes)

In [None]:
df.head()

In [None]:
df['no_of_dependents'].unique()

In [None]:
# Loan amount against annual income, with grid lines for easier reading
ax = sns.scatterplot(data=df, x='income_annum', y='loan_amount')
ax.grid()
plt.show()

In [None]:
depend = ['income_annum','residential_assets_value', 'commercial_assets_value',
       'luxury_assets_value', 'bank_asset_value']
depend

In [None]:
# Loan amount against each income/asset column on a 2x3 grid, coloured by loan status
n_rows, n_cols = 2, 3
for position, feature in enumerate(depend, start=1):
    plt.subplot(n_rows, n_cols, position)
    sns.scatterplot(x=feature, y='loan_amount', data=df, hue='loan_status')
    plt.grid()
plt.tight_layout()
plt.show()

# Project 01 Conclusion

This project provided comprehensive hands-on experience in managing and analyzing messy datasets. It covered several critical stages, including data cleaning, exploratory data analysis (EDA), outlier detection and capping, manual data splitting, and model training. Using a K-Nearest Neighbors (KNN) model, we evaluated performance through both a manual train/test split and cross-validation. The comparison of these evaluation methods demonstrated the model's robustness and reliability. This project significantly enhanced our understanding and skills in data preprocessing, EDA, and machine learning model evaluation, showcasing the importance of thorough data preparation and analysis in achieving accurate predictive modeling results. Detailed results and visualizations are included in the attached Word file for further reference.