In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/loan-prediction-problem-dataset/test_Y3wMUE5_7gLdaTN.csv
/kaggle/input/loan-prediction-problem-dataset/train_u6lujuX_CVtuZ9i.csv


In [2]:
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
import warnings
warnings.filterwarnings("ignore")

In [3]:
df=pd.read_csv("/kaggle/input/loan-prediction-problem-dataset/train_u6lujuX_CVtuZ9i.csv")
df = df.dropna(subset=['Loan_Status'])


In [4]:
# Clean Loan_Status before mapping
df['Loan_Status'] = df['Loan_Status'].astype(str).str.strip().str.upper()

# Only keep rows where Loan_Status is valid
df = df[df['Loan_Status'].isin(['Y', 'N'])].copy()

# Now map safely
y = df['Loan_Status'].map({'N': 0, 'Y': 1})

# Confirm no NaNs remain
assert y.isna().sum() == 0, "Still contains NaNs in y!"


In [5]:
df['TotalIncome'] = df['ApplicantIncome'] + df['CoapplicantIncome']
df['LoanAmount'] = df['LoanAmount'].fillna(df['LoanAmount'].median())
df = df[df['LoanAmount'] > 0]

df['LoanAmountLog'] = np.log1p(df['LoanAmount'])
df['DebtRatio'] = df['LoanAmount'] / (df['TotalIncome'] + 1)

In [6]:
df.drop(['Loan_ID', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount'], axis=1, inplace=True)


In [7]:
X=df.drop('Loan_Status',axis=1)
y=df['Loan_Status'].map({'n':0,'y':1})

In [8]:
numeric_features = ['Loan_Amount_Term', 'Credit_History', 'TotalIncome', 'LoanAmountLog', 'DebtRatio']
categorical_features = [col for col in X.columns if col not in numeric_features]


In [9]:
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', numeric_pipeline, numeric_features),
    ('cat', categorical_pipeline, categorical_features)
])

In [10]:
rf_pipeline=Pipeline([
    ('preprocessing',preprocessor),
    ('classifier',RandomForestClassifier(random_state=43))
])

In [11]:
param_grid = {
    'classifier__n_estimators': [50, 100, 150],
    'classifier__max_depth': [4, 6, 8],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}

In [12]:
# Clean and map target
df['Loan_Status'] = df['Loan_Status'].astype(str).str.strip().str.upper()
df = df[df['Loan_Status'].isin(['Y', 'N'])].copy()
df.reset_index(drop=True, inplace=True)

# Convert to 0/1
df['Loan_Status'] = df['Loan_Status'].map({'N': 0, 'Y': 1})

# Drop any NaNs again (precaution)
df = df.dropna(subset=['Loan_Status'])

# Split
X = df.drop('Loan_Status', axis=1)
y = df['Loan_Status']

print("✅ Clean target ready. Unique values:", y.unique())


✅ Clean target ready. Unique values: [1 0]


In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [14]:
grid_search = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)



In [15]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


In [16]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

In [17]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


In [18]:
print(" Best Hyperparameter:", grid_search.best_params_)
print(" Test Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

 Best Hyperparameter: {'classifier__max_depth': 8, 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 50}
 Test Accuracy: 0.7886178861788617

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.47      0.61        43
           1       0.77      0.96      0.86        80

    accuracy                           0.79       123
   macro avg       0.82      0.71      0.73       123
weighted avg       0.80      0.79      0.77       123

