In [73]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV


In [74]:
#loading the data set
train_data = pd .read_csv("C:/Users/GVJai/Desktop/Project/Loan_Prediction/Data/Loan Status Prediction/Loan_Status_train.csv")

In [75]:
train_data.shape

(614, 13)

In [76]:
print(train_data.head())
print(train_data.info())
print(train_data.describe())


    Loan_ID Gender Married Dependents     Education Self_Employed  \
0  LP001002   Male      No          0      Graduate            No   
1  LP001003   Male     Yes          1      Graduate            No   
2  LP001005   Male     Yes          0      Graduate           Yes   
3  LP001006   Male     Yes          0  Not Graduate            No   
4  LP001008   Male      No          0      Graduate            No   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0             5849                0.0         NaN             360.0   
1             4583             1508.0       128.0             360.0   
2             3000                0.0        66.0             360.0   
3             2583             2358.0       120.0             360.0   
4             6000                0.0       141.0             360.0   

   Credit_History Property_Area Loan_Status  
0             1.0         Urban           Y  
1             1.0         Rural           N  
2             1.0   

In [77]:
train_data.isna().sum()


Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [78]:

# Dropping Loan_ID column 
train_data.drop(['Loan_ID'],axis=1,inplace=True)

In [79]:
#handling missing values for numerical variables
imputer = SimpleImputer(strategy='mean')
train_data['LoanAmount'] = imputer.fit_transform(train_data[['LoanAmount']])
train_data['Loan_Amount_Term'] = imputer.fit_transform(train_data[['Loan_Amount_Term']])
train_data['Credit_History'] = imputer.fit_transform(train_data[['Credit_History']])


In [80]:
#For categorical variables, use the most frequent strategy
categorical_columns = ['Gender', 'Married', 'Dependents', 'Self_Employed']
imputer_cat = SimpleImputer(strategy='most_frequent')
train_data[categorical_columns] = imputer_cat.fit_transform(train_data[categorical_columns])


In [81]:
train_data.isnull().sum()

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [118]:
# Encode categorical variables using OneHotEncoder
categorical_columns = ['Gender', 'Married', 'Dependents', 'Self_Employed', 'Education', 'Property_Area']
numerical_columns = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History']

# Define the column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns),
        ('cat', OneHotEncoder(), categorical_columns)
    ])

# Apply the transformations
X = train_data.drop(columns=['Loan_Status'])
y = train_data['Loan_Status']
X = preprocessor.fit_transform(X)


In [119]:
#encoding the target variable
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)


In [84]:
#split the data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [85]:
import matplotlib.pyplot as plt
import seaborn as sns

In [86]:
#training the model in different algorithms
#1. Logistic regression
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
y_pred_log_reg = log_reg.predict(X_val)


In [87]:
#2. Decision Tree
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_val)


In [88]:
#3. Random forest
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_val)


In [89]:
#4. XGBoost
xgb = XGBClassifier()
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_val)


In [90]:
#model evaluation
def evaluate_model(y_true, y_pred):
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
    print("Classification Report:\n", classification_report(y_true, y_pred))

print("Logistic Regression Performance")
evaluate_model(y_val, y_pred_log_reg)

print("Decision Tree Performance")
evaluate_model(y_val, y_pred_dt)

print("Random Forest Performance")
evaluate_model(y_val, y_pred_rf)

print("XGBoost Performance")
evaluate_model(y_val, y_pred_xgb)


Logistic Regression Performance
Accuracy: 0.7886178861788617
Confusion Matrix:
 [[18 25]
 [ 1 79]]
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.42      0.58        43
           1       0.76      0.99      0.86        80

    accuracy                           0.79       123
   macro avg       0.85      0.70      0.72       123
weighted avg       0.83      0.79      0.76       123

Decision Tree Performance
Accuracy: 0.6829268292682927
Confusion Matrix:
 [[21 22]
 [17 63]]
Classification Report:
               precision    recall  f1-score   support

           0       0.55      0.49      0.52        43
           1       0.74      0.79      0.76        80

    accuracy                           0.68       123
   macro avg       0.65      0.64      0.64       123
weighted avg       0.68      0.68      0.68       123

Random Forest Performance
Accuracy: 0.7723577235772358
Confusion Matrix:
 [[18 25]
 [ 3 77]]
Classificatio

In [91]:
train_data.isnull().sum()

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [92]:
df = pd.DataFrame(train_data)

In [93]:
# Create new features
df['TotalIncome'] = df['ApplicantIncome'] + df['CoapplicantIncome']
df['LoanAmount_to_Income_Ratio'] = df['LoanAmount'] / df['TotalIncome']
df['LoanAmount_per_Term'] = df['LoanAmount'] / df['Loan_Amount_Term']
df['Income_per_Dependents'] = df['TotalIncome'] / (df['Dependents'].replace('3+', 3).astype(int) + 1)
df['Is_Graduate'] = df['Education'].apply(lambda x: 1 if x == 'Graduate' else 0)
df['Is_Self_Employed'] = df['Self_Employed'].apply(lambda x: 1 if x == 'Yes' else 0)
df['Is_Gender'] = df['Gender'].apply(lambda x: 1 if x == 'Male' else 0)

In [94]:
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,TotalIncome,LoanAmount_to_Income_Ratio,LoanAmount_per_Term,Income_per_Dependents,Is_Graduate,Is_Self_Employed,Is_Gender
0,Male,No,0,Graduate,No,5849,0.0,146.412162,360.0,1.0,Urban,Y,5849.0,0.025032,0.4067,5849.0,1,0,1
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N,6091.0,0.021015,0.355556,3045.5,1,0,1
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y,3000.0,0.022,0.183333,3000.0,1,1,1
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y,4941.0,0.024287,0.333333,4941.0,0,0,1
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y,6000.0,0.0235,0.391667,6000.0,1,0,1


In [95]:
# Encode target variable
df['Loan_Status'] = df['Loan_Status'].apply(lambda x: 1 if x == 'Y' else 0)

In [96]:
# Define features and target variable
X = df.drop(columns=['Loan_Status'])
y = df['Loan_Status']

In [97]:
# Define categorical and numerical columns
categorical_features = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area']
numerical_features = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History',
                      'TotalIncome', 'LoanAmount_to_Income_Ratio', 'LoanAmount_per_Term', 'Income_per_Dependents', 
                      'Is_Graduate', 'Is_Self_Employed', 'Is_Gender']

In [98]:
# Preprocessing pipelines for numerical and categorical data
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [99]:
# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [100]:
# Define the model
model = RandomForestClassifier(random_state=42)


In [101]:
# Create and evaluate the pipeline
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', model)
])

In [102]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [103]:
# Define parameter grid for GridSearchCV
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [10, 20]
}

In [104]:
# GridSearchCV to find the best model
grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

In [105]:
# Evaluate the best model on the test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Improved accuracy: {accuracy:.2f}")

Best parameters: {'classifier__max_depth': 10, 'classifier__n_estimators': 200}
Improved accuracy: 0.80


In [106]:
#load the test data
test_data = pd.read_csv("C:/Users/GVJai/Desktop/Project/Loan_Prediction/Data/Loan Status Prediction/Loan_Status_test.csv")

In [107]:
test_data.isnull().sum()

Loan_ID               0
Gender               11
Married               0
Dependents           10
Education             0
Self_Employed        23
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            5
Loan_Amount_Term      6
Credit_History       29
Property_Area         0
dtype: int64

In [108]:
# Handle missing values in the training data
test_data['Gender'].fillna(test_data['Gender'].mode()[0], inplace=True)
test_data['Dependents'].fillna(test_data['Dependents'].mode()[0], inplace=True)
test_data['Self_Employed'].fillna(test_data['Self_Employed'].mode()[0], inplace=True)
test_data['LoanAmount'].fillna(test_data['LoanAmount'].median(), inplace=True)
test_data['Loan_Amount_Term'].fillna(test_data['Loan_Amount_Term'].median(), inplace=True)
test_data['Credit_History'].fillna(test_data['Credit_History'].mode()[0], inplace=True)

In [109]:
test_data.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
dtype: int64

In [110]:
df1 = pd.DataFrame(test_data)

In [111]:
# Create new features
df1['TotalIncome'] = df1['ApplicantIncome'] + df1['CoapplicantIncome']
df1['LoanAmount_to_Income_Ratio'] = df1['LoanAmount'] / df1['TotalIncome']
df1['LoanAmount_per_Term'] = df1['LoanAmount'] / df1['Loan_Amount_Term']
df1['Income_per_Dependents'] = df1['TotalIncome'] / (df1['Dependents'].replace('3+', 3).astype(int) + 1)
df1['Is_Graduate'] = df1['Education'].apply(lambda x: 1 if x == 'Graduate' else 0)
df1['Is_Self_Employed'] = df1['Self_Employed'].apply(lambda x: 1 if x == 'Yes' else 0)
df1['Is_Gender'] = df1['Gender'].apply(lambda x: 1 if x == 'Male' else 0)

In [112]:
df1.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,TotalIncome,LoanAmount_to_Income_Ratio,LoanAmount_per_Term,Income_per_Dependents,Is_Graduate,Is_Self_Employed,Is_Gender
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban,5720,0.019231,0.305556,5720.0,1,0,1
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban,4576,0.027535,0.35,2288.0,1,0,1
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban,6800,0.030588,0.577778,2266.666667,1,0,1
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,1.0,Urban,4886,0.020467,0.277778,1628.666667,1,0,1
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban,3276,0.02381,0.216667,3276.0,0,0,1


In [113]:
# Drop any columns that are not features
X_test_new = df1.drop(columns=['Loan_ID'])

In [114]:
# Make predictions on the test data
test_predictions = best_model.predict(X_test_new)

In [115]:
# Add predictions to the test DataFrame
df1['Predicted_Loan_Status'] = test_predictions

In [117]:
# Save the updated test DataFrame with predictions to a new CSV file
df1.to_csv("C:/Users/GVJai/Desktop/Project/Loan_Prediction/Test_data_with_predictions.csv", index=False)
print("Test data with predictions saved successfully.")

Test data with predictions saved successfully.
