Training
 3 different machine learning models on your loan approval dataset and compare their performance.

In [1]:
###Import required Libraries
# Install if not already
!pip install scikit-learn

# Imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score



Prepare the data


In [10]:
import pandas as pd
# Load your cleaned dataset (replace with actual file name if different)
df = pd.read_csv("loan_data_cleaned_with_status.csv")

# Confirm it's loaded
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban,Y
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban,Y
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban,N
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,1.0,Urban,N
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban,Y


In [12]:
# One-hot encode categorical variables
df_model = pd.get_dummies(df, drop_first=True)

# Define features and target
X = df_model.drop('Loan_Status_Y', axis=1)
y = df_model['Loan_Status_Y']

#  Train-test split (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#  Feature Scaling (for Logistic Regression and KNN)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

Training Models

In [14]:
##Logistic REgresssion

lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train_scaled, y_train)
lr_pred = lr_model.predict(X_test_scaled)

In [15]:
##Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)  # No need to scale for tree models
rf_pred = rf_model.predict(X_test)

In [16]:
##KNN
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train_scaled, y_train)
knn_pred = knn_model.predict(X_test_scaled)

Evaluate All Models

In [17]:
def evaluate_model(name, y_test, y_pred):
    print(f"\n {name}")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))

# Evaluate each model
evaluate_model("Logistic Regression", y_test, lr_pred)
evaluate_model("Random Forest", y_test, rf_pred)
evaluate_model("K-Nearest Neighbors", y_test, knn_pred)


 Logistic Regression
Accuracy: 0.7162162162162162
Confusion Matrix:
 [[ 0 21]
 [ 0 53]]
Classification Report:
               precision    recall  f1-score   support

       False       0.00      0.00      0.00        21
        True       0.72      1.00      0.83        53

    accuracy                           0.72        74
   macro avg       0.36      0.50      0.42        74
weighted avg       0.51      0.72      0.60        74


 Random Forest
Accuracy: 0.7027027027027027
Confusion Matrix:
 [[ 0 21]
 [ 1 52]]
Classification Report:
               precision    recall  f1-score   support

       False       0.00      0.00      0.00        21
        True       0.71      0.98      0.83        53

    accuracy                           0.70        74
   macro avg       0.36      0.49      0.41        74
weighted avg       0.51      0.70      0.59        74


 K-Nearest Neighbors
Accuracy: 0.581081081081081
Confusion Matrix:
 [[ 4 17]
 [14 39]]
Classification Report:
               

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Compare in a Table

In [18]:
# Create a performance summary
model_scores = {
    "Model": ["Logistic Regression", "Random Forest", "K-Nearest Neighbors"],
    "Accuracy": [
        accuracy_score(y_test, lr_pred),
        accuracy_score(y_test, rf_pred),
        accuracy_score(y_test, knn_pred)
    ]
}

pd.DataFrame(model_scores)

Unnamed: 0,Model,Accuracy
0,Logistic Regression,0.716216
1,Random Forest,0.702703
2,K-Nearest Neighbors,0.581081
