In [1]:
# Import libraries
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier

# ----------------------------
# 1. Load Dataset
# ----------------------------
df = pd.read_csv("/content/loan_approval_dataset.csv")

# Remove surrounding whitespace from every column header so that later
# lookups by name (e.g. 'loan_status') match exactly.
df = df.rename(columns=str.strip)

# Quick sanity check of what was loaded
print("Dataset shape:", df.shape)
print(df.head())

# ----------------------------
# 2. Handle Missing Values
# ----------------------------
# Forward fill copies the previous row's value into each gap, but it cannot
# fill a gap in the very first row(s) because nothing precedes them. A
# backward fill afterwards covers those leading gaps, so no NaN reaches
# SMOTE or the estimators (which reject NaN input).
df = df.ffill().bfill()

# ----------------------------
# 3. Encode Categorical Features
# ----------------------------
# Fit one LabelEncoder per categorical feature and keep it. Re-fitting a
# single shared encoder would overwrite the mapping of every earlier column,
# making it impossible to decode values or to encode new data consistently.
feature_encoders = {}
for col in df.select_dtypes(include=['object']).columns:
    if col == 'loan_status':   # the target is encoded separately below
        continue
    feature_encoders[col] = LabelEncoder()
    df[col] = feature_encoders[col].fit_transform(df[col])

# Encode target column separately. `encoder.classes_` holds the original
# labels, indexed by the integer code assigned to each of them.
encoder = LabelEncoder()
df['loan_status'] = encoder.fit_transform(df['loan_status'])

print("Encoded Data:")
print(df.head())

# ----------------------------
# 4. Features & Target
# ----------------------------
# Target vector
y = df["loan_status"]
# Feature matrix: everything except the target and loan_id (just an ID)
X = df.drop(columns=["loan_id", "loan_status"])

# ----------------------------
# 5. Train/Test Split
# ----------------------------
# Hold out 20% of the rows for testing; stratify on the target and fix the
# seed so the split is reproducible.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# ----------------------------
# 6. Handle Imbalanced Data with SMOTE
# ----------------------------
# Only the training split is resampled; the test split is left untouched.
# NOTE(review): SMOTE synthesises rows by interpolation, so label-encoded
# categorical columns may receive non-integer values - consider SMOTENC; confirm.
sm = SMOTE(random_state=42)
resampled = sm.fit_resample(X_train, y_train)
X_train_res, y_train_res = resampled

# Class counts of the training target before and after resampling
counts_before = y_train.value_counts().to_dict()
counts_after = y_train_res.value_counts().to_dict()
print("Before SMOTE:", counts_before)
print("After SMOTE:", counts_after)

# ----------------------------
# 7. Train Logistic Regression
# ----------------------------
# The features live on very different scales (asset/income values in the
# millions next to scores in the hundreds and single-digit counts). Logistic
# regression is sensitive to that: the solver converges poorly and the L2
# penalty weighs coefficients unevenly. Standardising inside a pipeline
# fixes both, and guarantees the same scaling (learned from the training
# data only) is applied whenever `log_reg.predict` is called.
# The fitted LogisticRegression itself is available as `log_reg[-1]`.
log_reg = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
log_reg.fit(X_train_res, y_train_res)
y_pred_lr = log_reg.predict(X_test)

print("\n🔹 Logistic Regression Results:")
print(classification_report(y_test, y_pred_lr))

# ----------------------------
# 8. Train Decision Tree
# ----------------------------
# Depth-limited tree, fitted on the resampled training data and evaluated
# on the untouched test split.
dt = DecisionTreeClassifier(max_depth=5, random_state=42).fit(X_train_res, y_train_res)
y_pred_dt = dt.predict(X_test)

dt_report = classification_report(y_test, y_pred_dt)
print("\n🔹 Decision Tree Results:")
print(dt_report)

# ----------------------------
# 9. Confusion Matrices
# ----------------------------
# One matrix per model, both computed against the same test labels.
# In each matrix, rows are the true classes and columns the predicted ones.
for heading, predictions in (
    ("\nLogistic Regression Confusion Matrix:\n", y_pred_lr),
    ("\nDecision Tree Confusion Matrix:\n", y_pred_dt),
):
    print(heading, confusion_matrix(y_test, predictions))


Dataset shape: (4269, 13)
   loan_id  no_of_dependents      education self_employed  income_annum  \
0        1                 2       Graduate            No       9600000   
1        2                 0   Not Graduate           Yes       4100000   
2        3                 3       Graduate            No       9100000   
3        4                 3       Graduate            No       8200000   
4        5                 5   Not Graduate           Yes       9800000   

   loan_amount  loan_term  cibil_score  residential_assets_value  \
0     29900000         12          778                   2400000   
1     12200000          8          417                   2700000   
2     29700000         20          506                   7100000   
3     30700000          8          467                  18200000   
4     24200000         20          382                  12400000   

   commercial_assets_value  luxury_assets_value  bank_asset_value loan_status  
0                 17600000        