1. Import the Necessary Libraries

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

2. Load the Datasets

In [2]:
# Read the training and test CSV files into DataFrames
csv_paths = ('./dataset/train.csv', './dataset/test.csv')
train_data, test_data = map(pd.read_csv, csv_paths)

# Preview the first rows of each dataset under its own heading
for heading, frame in (
    ("Training Data Sample:", train_data),
    ("\nTest Data Sample:", test_data),
):
    print(heading)
    print(frame.head())


Training Data Sample:
    Loan_ID Gender Married Dependents     Education Self_Employed  \
0  LP001002   Male      No          0      Graduate            No   
1  LP001003   Male     Yes          1      Graduate            No   
2  LP001005   Male     Yes          0      Graduate           Yes   
3  LP001006   Male     Yes          0  Not Graduate            No   
4  LP001008   Male      No          0      Graduate            No   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0             5849                0.0         NaN             360.0   
1             4583             1508.0       128.0             360.0   
2             3000                0.0        66.0             360.0   
3             2583             2358.0       120.0             360.0   
4             6000                0.0       141.0             360.0   

   Credit_History Property_Area Loan_Status  
0             1.0         Urban           Y  
1             1.0         Rural           N 

3. Handle Missing Values

In [3]:
# Report how many values are missing in each column of both datasets
print("\nMissing values in training data:")
print(train_data.isnull().sum())

print("\nMissing values in test data:")
print(test_data.isnull().sum())

# Fill missing LoanAmount values with the mean learned from the training data;
# the same fitted imputer is reused for the test data.
# SimpleImputer returns a 2-D (n_rows, 1) array, so flatten it before
# assigning it back to a single column.
imputer = SimpleImputer(strategy='mean')
train_data['LoanAmount'] = imputer.fit_transform(train_data[['LoanAmount']]).ravel()
test_data['LoanAmount'] = imputer.transform(test_data[['LoanAmount']]).ravel()

# Fill missing values in categorical / discrete columns with the most frequent value
cat_imputer = SimpleImputer(strategy='most_frequent')
categorical_cols = ['Gender', 'Married', 'Dependents', 'Self_Employed', 'Loan_Amount_Term', 'Credit_History']

# Fit on the training data, then apply the same fill values to the test data
train_data[categorical_cols] = cat_imputer.fit_transform(train_data[categorical_cols])
test_data[categorical_cols] = cat_imputer.transform(test_data[categorical_cols])

# The imputer receives text and numeric columns together, so it hands back an
# object-dtype array and the numeric columns lose their float dtype on
# assignment. Cast them back so later steps see real numbers.
numeric_coded_cols = ['Loan_Amount_Term', 'Credit_History']
for frame in (train_data, test_data):
    for col in numeric_coded_cols:
        frame[col] = frame[col].astype(float)



Missing values in training data:
Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

Missing values in test data:
Loan_ID               0
Gender               11
Married               0
Dependents           10
Education             0
Self_Employed        23
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            5
Loan_Amount_Term      6
Credit_History       29
Property_Area         0
dtype: int64


4. Encoding Categorical Data

In [4]:
# Convert text-valued categorical columns into integer codes with LabelEncoder.
# Each column gets its own encoder, fitted on the training data and reused for
# the test data so both share the same label -> code mapping.
# NOTE: encoder.transform raises ValueError if the test data contains a label
# that never appeared in the training column.

# List of categorical columns
cat_columns = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area']

# Fitted encoders keyed by column name, kept so the mappings can be inspected
# (encoders[col].classes_) or reversed (encoders[col].inverse_transform) later
encoders = {}

for col in cat_columns:
    encoder = LabelEncoder()
    train_data[col] = encoder.fit_transform(train_data[col])
    test_data[col] = encoder.transform(test_data[col])
    encoders[col] = encoder

print("Categorical Variables Successfully Converted. ")


Categorical Variables Successfully Converted. 


5. Feature Selection

In [5]:
# Features are every training column except the row identifier and the target
X = train_data.drop(columns=['Loan_ID', 'Loan_Status'])

# Target: 'Loan_Status' converted from categorical to numeric (Y = 1, N = 0)
y = train_data['Loan_Status'].map({'Y': 1, 'N': 0})

# Take exactly the training feature columns, in the same order, from the test
# set. Selecting by name (rather than dropping 'Loan_ID') keeps this cell safe
# to re-run after extra columns such as predictions are added to test_data.
X_test = test_data[X.columns]


6. Train-Validation Split

In [6]:
# Hold out 20% of the labelled rows as a validation set; the fixed
# random_state keeps the split identical across runs
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = split_parts


7. Train the RandomForest Classifier Model

In [7]:
# Configure a 100-tree random forest with a fixed seed for repeatable results
forest_params = {"n_estimators": 100, "random_state": 42}
model = RandomForestClassifier(**forest_params)

# Fit the forest on the training split
model.fit(X_train, y_train)


8. Evaluate the Model

In [8]:
# Score the held-out validation rows with the fitted model
y_pred_val = model.predict(X_val)

# Fraction of validation rows whose predicted label matches the true label
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred_val)
print(f"\nValidation Accuracy: {accuracy * 100:.2f}%")



Validation Accuracy: 76.42%


9. Make Predictions on the Test Data


In [9]:
# Run the fitted model over the test features
test_predictions = model.predict(X_test)

# Translate the numeric predictions back to 'Y' / 'N' and store them
# alongside the test rows in a single step
status_labels = {1: 'Y', 0: 'N'}
test_data['Loan_Status_Predictions'] = pd.Series(
    test_predictions, index=test_data.index
).map(status_labels)


10. Save the Predictions


In [10]:
# Write only the loan identifier and its predicted status to a CSV file,
# leaving out the DataFrame index
submission = test_data[['Loan_ID', 'Loan_Status_Predictions']]
submission.to_csv('loan_predictions.csv', index=False)

print("\nPredictions saved to 'loan_predictions.csv'.")



Predictions saved to 'loan_predictions.csv'.
