In [41]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score


In [42]:
df = pd.read_csv('loans.csv')


In [43]:
# Ensure that non-numeric columns are excluded from interpolation
df_numeric = df.select_dtypes(include=[np.number])

# Interpolate only numeric columns
df_numeric.interpolate(method='linear', inplace=True)

# Now replace missing values in other categorical columns with mode
df['Gender'].fillna(df['Gender'].mode()[0], inplace=True)
df['Married'].fillna(df['Married'].mode()[0], inplace=True)
df['Dependents'].fillna(df['Dependents'].mode()[0], inplace=True)
df['Self_Employed'].fillna(df['Self_Employed'].mode()[0], inplace=True)
df.dropna(inplace=True)


In [44]:
df.replace({"Loan_Status": {'N': 0, 'Y': 1}}, inplace=True)


In [45]:
df.replace({'Married': {'No': 0, 'Yes': 1},
            'Gender': {'Male': 1, 'Female': 0},
            'Self_Employed': {'No': 0, 'Yes': 1},
            'Property_Area': {'Rural': 0, 'Semiurban': 1, 'Urban': 2},
            'Education': {'Graduate': 1, 'Not Graduate': 0},
            'Dependents': {'3+': 4}}, inplace=True)


In [46]:
X = df.drop(columns=['Loan_ID', 'Loan_Status'], axis=1)
y= df['Loan_Status']


In [47]:
X

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
1,1,1,1,1,0,4583,1508.0,128.0,360.0,1.0,0
2,1,1,0,1,1,3000,0.0,66.0,360.0,1.0,2
3,1,1,0,0,0,2583,2358.0,120.0,360.0,1.0,2
4,1,0,0,1,0,6000,0.0,141.0,360.0,1.0,2
5,1,1,2,1,1,5417,4196.0,267.0,360.0,1.0,2
...,...,...,...,...,...,...,...,...,...,...,...
609,0,0,0,1,0,2900,0.0,71.0,360.0,1.0,0
610,1,1,4,1,0,4106,0.0,40.0,180.0,1.0,0
611,1,1,1,1,0,8072,240.0,253.0,360.0,1.0,2
612,1,1,2,1,0,7583,0.0,187.0,360.0,1.0,2


In [48]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV


In [49]:
# Assuming you have your data in X (features) and y (target)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [50]:
import warnings
warnings.filterwarnings("ignore")

In [51]:
# List of models to evaluate
models = {
    'SVM': SVC(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'KNN': KNeighborsClassifier()
}

# Compare models
model_scores = {}
for name, model in models.items():
    # Perform cross-validation to evaluate the model's accuracy
    scores = cross_val_score(model, X_train, y_train, cv=5)
    model_scores[name] = scores.mean()
    print(f"{name} Accuracy: {scores.mean():.4f}")

# Select the best model
best_model_name = max(model_scores, key=model_scores.get)
print(f"\nBest Model: {best_model_name} with Accuracy: {model_scores[best_model_name]:.4f}")

# Assign the best model to 'best_model'
best_model = models[best_model_name]


SVM Accuracy: 0.6903
Random Forest Accuracy: 0.7802
Gradient Boosting Accuracy: 0.7541
Logistic Regression Accuracy: 0.8062
KNN Accuracy: 0.6384

Best Model: Logistic Regression with Accuracy: 0.8062


In [52]:
best_model.fit(X_train, y_train)


In [53]:
y_pred = best_model.predict(X_test)
print(f"\nTest Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))



Test Accuracy: 0.8302

Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.48      0.64        33
           1       0.81      0.99      0.89        73

    accuracy                           0.83       106
   macro avg       0.88      0.74      0.76       106
weighted avg       0.85      0.83      0.81       106


Confusion Matrix:
[[16 17]
 [ 1 72]]


In [57]:
# Save the trained model using pickle
with open('modelss.pkl', 'wb') as model_file:
     pickle.dump(best_model, model_file)

print("Model saved successfully as 'modelss.pkl'.")

Model saved successfully as 'modelss.pkl'.


In [58]:
model_path = r'C:\Users\User\loan_pred\models.pkl'  # Use raw string to avoid escape sequences
# Save the trained model using pickle
with open(model_path, 'wb') as model_file:
    pickle.dump(best_model, model_file)

print("Model saved successfully as 'models.pkl'.")


Model saved successfully as 'models.pkl'.
