In [4]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("architsharma01/loan-approval-prediction-dataset")

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'loan-approval-prediction-dataset' dataset.
Path to dataset files: /kaggle/input/loan-approval-prediction-dataset


In [5]:
 # Changing the path of Dataset
import shutil

# Source path
src_path = "/kaggle/input/loan-approval-prediction-dataset"

# Destination path in /content
dst_path = "/content/students-performance-dataset"

# Copy the entire folder to /content
shutil.copytree(src_path, dst_path, dirs_exist_ok=True)

print("Dataset copied to:", dst_path)


Dataset copied to: /content/students-performance-dataset


In [6]:
import pandas as pd

# Load dataset
df = pd.read_csv("/content/students-performance-dataset/loan_approval_dataset.csv")

# Show first rows
print(df.head())


   loan_id   no_of_dependents      education  self_employed   income_annum  \
0        1                  2       Graduate             No        9600000   
1        2                  0   Not Graduate            Yes        4100000   
2        3                  3       Graduate             No        9100000   
3        4                  3       Graduate             No        8200000   
4        5                  5   Not Graduate            Yes        9800000   

    loan_amount   loan_term   cibil_score   residential_assets_value  \
0      29900000          12           778                    2400000   
1      12200000           8           417                    2700000   
2      29700000          20           506                    7100000   
3      30700000           8           467                   18200000   
4      24200000          20           382                   12400000   

    commercial_assets_value   luxury_assets_value   bank_asset_value  \
0                  1760000

In [9]:
 # Checking Datatype
print("Data Types:\n", df.dtypes)

Data Types:
 loan_id                       int64
 no_of_dependents             int64
 education                   object
 self_employed               object
 income_annum                 int64
 loan_amount                  int64
 loan_term                    int64
 cibil_score                  int64
 residential_assets_value     int64
 commercial_assets_value      int64
 luxury_assets_value          int64
 bank_asset_value             int64
 loan_status                 object
dtype: object


In [10]:
 # Checkin Missing Values
print("Missing Values:\n", df.isnull().sum())

Missing Values:
 loan_id                      0
 no_of_dependents            0
 education                   0
 self_employed               0
 income_annum                0
 loan_amount                 0
 loan_term                   0
 cibil_score                 0
 residential_assets_value    0
 commercial_assets_value     0
 luxury_assets_value         0
 bank_asset_value            0
 loan_status                 0
dtype: int64


In [12]:
# Show all column names with repr to reveal hidden characters
for col in df.columns:
    print(repr(col))


'loan_id'
' no_of_dependents'
' education'
' self_employed'
' income_annum'
' loan_amount'
' loan_term'
' cibil_score'
' residential_assets_value'
' commercial_assets_value'
' luxury_assets_value'
' bank_asset_value'
' loan_status'


In [13]:
# Strip leading/trailing spaces from all column names
df.columns = df.columns.str.strip()

# Verify columns
print("Cleaned Columns:\n", df.columns)

# Now check target class distribution
print("\nTarget Class Distribution:\n", df['loan_status'].value_counts())


Cleaned Columns:
 Index(['loan_id', 'no_of_dependents', 'education', 'self_employed',
       'income_annum', 'loan_amount', 'loan_term', 'cibil_score',
       'residential_assets_value', 'commercial_assets_value',
       'luxury_assets_value', 'bank_asset_value', 'loan_status'],
      dtype='object')

Target Class Distribution:
 loan_status
Approved    2656
Rejected    1613
Name: count, dtype: int64


In [14]:
# One-hot encode categorical variables
df_encoded = pd.get_dummies(df, columns=['education', 'self_employed'], drop_first=True)

# Verify new columns
print("Columns after encoding:\n", df_encoded.columns)

# Check first few rows
df_encoded.head()


Columns after encoding:
 Index(['loan_id', 'no_of_dependents', 'income_annum', 'loan_amount',
       'loan_term', 'cibil_score', 'residential_assets_value',
       'commercial_assets_value', 'luxury_assets_value', 'bank_asset_value',
       'loan_status', 'education_ Not Graduate', 'self_employed_ Yes'],
      dtype='object')


Unnamed: 0,loan_id,no_of_dependents,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status,education_ Not Graduate,self_employed_ Yes
0,1,2,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved,False,False
1,2,0,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected,True,True
2,3,3,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected,False,False
3,4,3,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected,False,False
4,5,5,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected,True,True


In [15]:
# Define features and target
X = df_encoded.drop('loan_status', axis=1)
y = df_encoded['loan_status'].map({'Approved': 1, 'Rejected': 0})

# Verify shapes and first few rows
print("Features shape:", X.shape)
print("Target shape:", y.shape)
y.value_counts()


Features shape: (4269, 12)
Target shape: (4269,)


Unnamed: 0_level_0,count
loan_status,Unnamed: 1_level_1


In [17]:
# Check unique values in target column
print(df_encoded['loan_status'].unique())

# Also check if there are any nulls
print("Null values in loan_status:", df_encoded['loan_status'].isnull().sum())


[' Approved' ' Rejected']
Null values in loan_status: 0


In [18]:
# Strip spaces from target column
df_encoded['loan_status'] = df_encoded['loan_status'].str.strip()

# Map target to numeric
y = df_encoded['loan_status'].map({'Approved': 1, 'Rejected': 0})

# Features remain the same
X = df_encoded.drop('loan_status', axis=1)

# Verify mapping
print("Target value counts:\n", y.value_counts())


Target value counts:
 loan_status
1    2656
0    1613
Name: count, dtype: int64


In [19]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Train-test split (80% train, 20% test) with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize numeric features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Verify shapes
print("X_train_scaled shape:", X_train_scaled.shape)
print("X_test_scaled shape:", X_test_scaled.shape)


X_train_scaled shape: (3415, 12)
X_test_scaled shape: (854, 12)


In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Initialize and train the model
lr_model = LogisticRegression(random_state=42, max_iter=1000)
lr_model.fit(X_train_scaled, y_train)

# Make predictions
y_pred = lr_model.predict(X_test_scaled)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9133489461358314

Confusion Matrix:
 [[281  42]
 [ 32 499]]

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.87      0.88       323
           1       0.92      0.94      0.93       531

    accuracy                           0.91       854
   macro avg       0.91      0.90      0.91       854
weighted avg       0.91      0.91      0.91       854



In [21]:
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Apply SMOTE to balance the classes in training data
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_scaled, y_train)

print("Original training set class distribution:\n", y_train.value_counts())
print("Resampled training set class distribution:\n", y_train_res.value_counts())

# Retrain Random Forest on balanced data
rf_model_res = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
rf_model_res.fit(X_train_res, y_train_res)

# Predictions on test set
y_pred_res = rf_model_res.predict(X_test_scaled)

# Evaluate
print("Random Forest (SMOTE) Accuracy:", accuracy_score(y_test, y_pred_res))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_res))
print("\nClassification Report:\n", classification_report(y_test, y_pred_res))


Original training set class distribution:
 loan_status
1    2125
0    1290
Name: count, dtype: int64
Resampled training set class distribution:
 loan_status
1    2125
0    2125
Name: count, dtype: int64
Random Forest (SMOTE) Accuracy: 0.9824355971896955

Confusion Matrix:
 [[315   8]
 [  7 524]]

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.98      0.98       323
           1       0.98      0.99      0.99       531

    accuracy                           0.98       854
   macro avg       0.98      0.98      0.98       854
weighted avg       0.98      0.98      0.98       854



In [22]:
from sklearn.model_selection import RandomizedSearchCV

# Define parameter grid for Random Forest
param_grid = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

# Initialize Random Forest
rf = RandomForestClassifier(random_state=42, n_jobs=-1)

# Randomized Search with 5-fold CV, scoring based on f1
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    n_iter=20,
    scoring='f1',
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Fit on resampled training set
random_search.fit(X_train_res, y_train_res)

# Best parameters
print("Best Parameters:", random_search.best_params_)

# Evaluate best model on test set
best_rf = random_search.best_estimator_
y_pred_best = best_rf.predict(X_test_scaled)

from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
print("Optimized Random Forest Accuracy:", accuracy_score(y_test, y_pred_best))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_best))
print("\nClassification Report:\n", classification_report(y_test, y_pred_best))


Fitting 5 folds for each of 20 candidates, totalling 100 fits


15 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.12/dist-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.12/dist-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.12/dist-packages/sklearn/utils/_param_validation.py", line 98, in validate_parameter_constraints
    raise InvalidParameterError(
skle

Best Parameters: {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': None}
Optimized Random Forest Accuracy: 0.977751756440281

Confusion Matrix:
 [[315   8]
 [ 11 520]]

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.98      0.97       323
           1       0.98      0.98      0.98       531

    accuracy                           0.98       854
   macro avg       0.98      0.98      0.98       854
weighted avg       0.98      0.98      0.98       854

