# In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
import numpy as np
import joblib

#  Load the dataset
data = pd.read_csv('loan_prediction.csv')

#  Check the last 5 rows of the dataset
print(data.tail())

#  Find the shape of the dataset
print("Number of Rows:", data.shape[0])
print("Number of Columns:", data.shape[1])


#  Get information about the dataset.
#  DataFrame.info() writes its summary to stdout on its own and returns None,
#  so it is called directly; wrapping it in print() would emit an extra
#  "None" line after the summary.
data.info()


#  Check for null values: absolute count per column, then the same counts
#  expressed as a percentage of all rows.
null_counts = data.isnull().sum()
print(null_counts)
print(null_counts * 100 / len(data))

#  Handling missing values
#  Loan_ID is removed first so it never reaches the feature matrix.
data = data.drop('Loan_ID', axis=1)

#  Rows with a gap in any of these columns are discarded outright.
columns = ['Gender', 'Dependents', 'LoanAmount', 'Loan_Amount_Term']

#  These columns are label-encoded later with .map(...).astype('int') and have
#  no imputation of their own. A NaN surviving in any of them would make that
#  integer cast raise, so incomplete rows are dropped here as well. When the
#  columns are already complete this is a no-op.
encoded_only = ['Married', 'Education', 'Property_Area', 'Loan_Status']
data = data.dropna(subset=columns + encoded_only)

#  Remaining gaps are filled with the most frequent value of the column.
for col in ('Self_Employed', 'Credit_History'):
    data[col] = data[col].fillna(data[col].mode()[0])

#  EDA
#  Class balance of the target column.
plt.figure(figsize=(12, 6))
sns.countplot(data=data, x='Loan_Status')
plt.title('Loan Status Count')
plt.show()

#  One 50-bin histogram per income column, each on its own figure.
for income_col, heading in (
    ('ApplicantIncome', 'Applicant Income Distribution'),
    ('CoapplicantIncome', 'Coapplicant Income Distribution'),
):
    plt.figure(figsize=(12, 6))
    sns.histplot(data[income_col], bins=50)
    plt.title(heading)
    plt.show()

#  Handling categorical columns
#  The open-ended "3+" bucket of Dependents is recoded as 4, and the column is
#  then cast to int explicitly. Without the cast the column stays a string
#  column and only works through implicit coercion downstream, while the
#  prediction step at the end of the script supplies Dependents as an int.
data['Dependents'] = (
    data['Dependents'].replace(to_replace="3+", value='4').astype('int')
)

#  Label encodings for the remaining categorical columns. Property_Area uses
#  the codes 0 / 2 / 3 (1 is unused); the prediction prompt asks for the same
#  codes, so they are kept unchanged.
label_codes = {
    'Gender': {'Male': 1, 'Female': 0},
    'Married': {'Yes': 1, 'No': 0},
    'Education': {'Graduate': 1, 'Not Graduate': 0},
    'Self_Employed': {'Yes': 1, 'No': 0},
    'Property_Area': {'Rural': 0, 'Semiurban': 2, 'Urban': 3},
    'Loan_Status': {'Y': 1, 'N': 0},
}
for column_name, codes in label_codes.items():
    #  .map() yields NaN for any value missing from the mapping, in which case
    #  the integer cast raises instead of silently producing bad codes.
    data[column_name] = data[column_name].map(codes).astype('int')


#  Separate the response vector Y (Loan_Status) from the feature matrix X
#  (every other column).
Y = data['Loan_Status']
X = data.drop(columns='Loan_Status')


#  Feature scaling: standardize the continuous columns only.
#  NOTE: the scaler is fitted on all rows, i.e. before model_val() performs its
#  train/test split, and the same fitted `scaler` object stays in scope for
#  later use.
cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']
scaler = StandardScaler()
scaler.fit(X[cols])
X[cols] = scaler.transform(X[cols])

#  Hold-out evaluation plus 5-fold cross validation for a single estimator.
#  model_df collects the mean cross-validation score (in percent) per model.
model_df = {}


def model_val(model, X, Y):
    """Evaluate `model` on a hold-out split and with 5-fold cross validation.

    The estimator is fitted on 80% of the rows (fixed ``random_state=42``) and
    its accuracy on the remaining 20% is printed. ``cross_val_score`` is then
    run with ``cv=5`` on the full ``X``/``Y``; the mean score is printed and
    stored, as a percentage rounded to two decimals, in the module-level
    ``model_df`` dict under the estimator object itself as key.

    Args:
        model: estimator exposing ``fit`` and ``predict``.
        X: feature matrix.
        Y: target vector.
    """
    X_tr, X_te, Y_tr, Y_te = train_test_split(
        X, Y, test_size=0.20, random_state=42
    )
    model.fit(X_tr, Y_tr)
    holdout_accuracy = accuracy_score(Y_te, model.predict(X_te))
    print(f"{model} accuracy is {holdout_accuracy}")

    fold_scores = cross_val_score(model, X, Y, cv=5)
    mean_cv_score = np.mean(fold_scores)
    print(f"{model} Avg cross val score is {mean_cv_score}")
    model_df[model] = round(mean_cv_score * 100, 2)

#  Logistic Regression
model = LogisticRegression()
model_val(model, X, Y)

#  SVC: randomized search over C with a linear kernel; n_iter=4 matches the
#  four combinations in the grid.
svc_grid = dict(C=[0.25, 0.50, 0.75, 1], kernel=["linear"])
rs_svc = RandomizedSearchCV(
    SVC(),
    param_distributions=svc_grid,
    n_iter=4,
    cv=5,
    verbose=True,
)
rs_svc.fit(X, Y)
print("Best score for SVC:", rs_svc.best_score_)
print("Best parameters for SVC:", rs_svc.best_params_)

#  Decision Tree Classifier
model = DecisionTreeClassifier()
model_val(model, X, Y)

#  Random Forest Classifier: 20 random draws from the grid below, each scored
#  with 5-fold cross validation.
rf_grid = dict(
    n_estimators=np.arange(10, 1000, 10),
    max_features=['sqrt', 'log2', 1, 0.5],
    max_depth=[None, 3, 5, 10, 20, 30],
    min_samples_split=[2, 5, 20, 50, 100],
    min_samples_leaf=[1, 2, 5, 10],
)
rs_rf = RandomizedSearchCV(
    RandomForestClassifier(),
    param_distributions=rf_grid,
    n_iter=20,
    cv=5,
    verbose=True,
)
rs_rf.fit(X, Y)
print("Best score for Random Forest Classifier:", rs_rf.best_score_)
print("Best parameters for Random Forest Classifier:", rs_rf.best_params_)


#  Gradient Boosting Classifier
model = GradientBoostingClassifier()
model_val(model, X, Y)
#  Fit the final random forest on the full data set and persist it.
#  The hyperparameters are fixed values written out here; they are not read
#  from the randomized search results.
rf_params = dict(
    n_estimators=660,
    min_samples_split=50,
    min_samples_leaf=1,
    max_features='sqrt',
    max_depth=30,
)
rf = RandomForestClassifier(**rf_params)
rf.fit(X, Y)
joblib.dump(rf, 'loan_status_predict')

# Load the model that was persisted above
model = joblib.load('loan_status_predict')

# Get user input for loan prediction. The categorical answers use the same
# integer codes as the encoding applied to the training data.
gender = int(input("Enter Gender (1 for Male, 0 for Female): "))
married = int(input("Are you Married? (1 for Yes, 0 for No): "))
dependents = int(input("Enter Number of Dependents: "))
education = int(input("Are you Graduate? (1 for Yes, 0 for No): "))
self_employed = int(input("Are you Self Employed? (1 for Yes, 0 for No): "))
applicant_income = float(input("Enter Applicant's Income: "))
coapplicant_income = float(input("Enter Coapplicant's Income: "))
loan_amount = float(input("Enter Loan Amount: "))
loan_amount_term = float(input("Enter Loan Amount Term: "))
credit_history = int(input("Enter Credit History (1 for Yes, 0 for No): "))
property_area = int(input("Enter Property Area (0 for Rural, 2 for Semiurban, 3 for Urban): "))

# Build a single-row frame whose columns follow the order of the training
# feature matrix.
df = pd.DataFrame({
    'Gender': gender,
    'Married': married,
    'Dependents': dependents,
    'Education': education,
    'Self_Employed': self_employed,
    'ApplicantIncome': applicant_income,
    'CoapplicantIncome': coapplicant_income,
    'LoanAmount': loan_amount,
    'Loan_Amount_Term': loan_amount_term,
    'Credit_History': credit_history,
    'Property_Area': property_area
}, index=[0])

# The model was trained on standardized values for the columns in `cols`, so
# the raw figures typed by the user must go through the same fitted `scaler`
# before prediction; otherwise they lie on a completely different scale than
# the training data.
df[cols] = scaler.transform(df[cols])

# Predict using the model
result = model.predict(df)

# Output prediction. predict() returns an array with one entry per input row,
# so the single prediction is read from position 0.
if result[0] == 1:
    print("Congratulations! Your loan application is likely to be approved.")
else:
    print("Sorry, your loan application is likely to be rejected.")



