In [None]:
import pandas as pd

# Read the training and test portions of the loan dataset from CSV.
train_data = pd.read_csv('/content/loan-train.csv')
test_data = pd.read_csv('/content/loan-test.csv')

# Preview the first rows of each frame.
for heading, frame in (("Training Data:", train_data), ("\nTest Data:", test_data)):
    print(heading)
    print(frame.head())

# Report the number of missing values per column in each frame.
for heading, frame in (
    ("\nMissing values in Training Data:", train_data),
    ("\nMissing values in Test Data:", test_data),
):
    print(heading)
    print(frame.isnull().sum())


Training Data:
    Loan_ID Gender Married Dependents     Education Self_Employed  \
0  LP001002   Male      No          0      Graduate            No   
1  LP001003   Male     Yes          1      Graduate            No   
2  LP001005   Male     Yes          0      Graduate           Yes   
3  LP001006   Male     Yes          0  Not Graduate            No   
4  LP001008   Male      No          0      Graduate            No   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0             5849                0.0         NaN             360.0   
1             4583             1508.0       128.0             360.0   
2             3000                0.0        66.0             360.0   
3             2583             2358.0       120.0             360.0   
4             6000                0.0       141.0             360.0   

   Credit_History Property_Area Loan_Status  
0             1.0         Urban           Y  
1             1.0         Rural           N  
2    

data cleaning and feature engineering

In [None]:
# Columns whose missing entries are replaced by the column's most frequent value.
MODE_IMPUTED_COLUMNS = [
    'Gender',
    'Married',
    'Dependents',
    'Self_Employed',
    'Loan_Amount_Term',
    'Credit_History',
]

# Label -> integer code used for each categorical column.
CATEGORY_CODES = {
    'Gender': {'Male': 1, 'Female': 0},
    'Married': {'Yes': 1, 'No': 0},
    'Education': {'Graduate': 1, 'Not Graduate': 0},
    'Self_Employed': {'Yes': 1, 'No': 0},
    'Property_Area': {'Urban': 2, 'Semiurban': 1, 'Rural': 0},
}


def clean_loan_frame(frame):
    """Return an imputed, numerically encoded copy of a loan DataFrame.

    Steps, all computed from ``frame`` itself:
      * fill the columns in MODE_IMPUTED_COLUMNS with their own mode,
      * fill 'LoanAmount' with its own median,
      * map the categorical labels in CATEGORY_CODES to integer codes,
      * turn the 'Dependents' label '3+' into 3 and cast the column to int.

    Parameters
    ----------
    frame : pandas.DataFrame
        Raw (not yet encoded) loan data containing the columns named above.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input object is not modified.

    Notes
    -----
    The function expects the raw string labels. Calling it on a frame that
    was already encoded would map the integer codes to NaN.
    """
    cleaned = frame.copy()

    # Assign the filled column back instead of calling
    # `frame[col].fillna(..., inplace=True)`: that chained in-place form acts
    # on a temporary object under pandas Copy-on-Write and leaves the frame
    # unchanged (pandas 2.x already warns about it).
    for column in MODE_IMPUTED_COLUMNS:
        cleaned[column] = cleaned[column].fillna(cleaned[column].mode()[0])
    cleaned['LoanAmount'] = cleaned['LoanAmount'].fillna(cleaned['LoanAmount'].median())

    # Convert categorical variables to numeric
    for column, codes in CATEGORY_CODES.items():
        cleaned[column] = cleaned[column].map(codes)
    cleaned['Dependents'] = cleaned['Dependents'].replace('3+', 3).astype(int)

    return cleaned


# Fill missing values and encode categoricals; each frame uses its own statistics.
train_data = clean_loan_frame(train_data)
test_data = clean_loan_frame(test_data)

# Feature Scaling
from sklearn.preprocessing import StandardScaler

# The scaler is fitted on the training 'LoanAmount' only and then reused,
# unchanged, to transform the test 'LoanAmount'.
scaler = StandardScaler()
train_data['LoanAmount'] = scaler.fit_transform(train_data[['LoanAmount']])
test_data['LoanAmount'] = scaler.transform(test_data[['LoanAmount']])

print("Data preprocessing completed.")


Data preprocessing completed.


In [None]:
# Show the column labels of the preprocessed training frame, followed by a
# preview of its first rows. `sep="\n"` puts each heading on its own line.
print("Columns in the dataset:", train_data.columns, sep="\n")
print("\nFirst few rows of the dataset:", train_data.head(), sep="\n")

Columns in the dataset:
Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

First few rows of the dataset:
    Loan_ID  Gender  Married  Dependents  Education  Self_Employed  \
0  LP001002       1        0           0          1              0   
1  LP001003       1        1           1          1              0   
2  LP001005       1        1           0          1              1   
3  LP001006       1        1           0          0              0   
4  LP001008       1        0           0          1              0   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0             5849                0.0   -0.211241             360.0   
1             4583             1508.0   -0.211241             360.0   
2             3000                0.0   -0.948996             360.

split data

In [None]:
from sklearn.model_selection import train_test_split

# Target: the 'Loan_Status' labels 'Y'/'N' encoded as 1/0.
y = train_data['Loan_Status'].map({'Y': 1, 'N': 0})

# Features: every column except the identifier and the target.
X = train_data.drop(columns=['Loan_ID', 'Loan_Status'])

# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print("Data splitting completed.")



Data splitting completed.


data modeling

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

# Create the three classifiers, each seeded with random_state=42.
log_reg = LogisticRegression(random_state=42)
tree = DecisionTreeClassifier(random_state=42)
forest = RandomForestClassifier(random_state=42)

# Fit them on the training split: logistic regression, decision tree, random forest.
for estimator in (log_reg, tree, forest):
    estimator.fit(X_train, y_train)

# NOTE(review): this cell had an "XGBoost" section header with no code under it;
# no XGBoost model is created or trained here.

print("Model training completed.")


Model training completed.


Evaluation

In [None]:
# The metric functions are imported here so the cell runs on a fresh kernel;
# no other visible cell brings them into scope.
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Support Vector Machine (SVM)
# probability=True makes predict_proba available, which the ROC AUC below needs.
svm = SVC(probability=True, random_state=42)
svm.fit(X_train, y_train)

# K-Nearest Neighbors (KNN)
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

# Adding the models to the dictionary for evaluation
models = {
    'Logistic Regression': log_reg,
    'Decision Tree': tree,
    'Random Forest': forest,
    'SVM': svm,
    'KNN': knn
}

# Evaluate every model on the validation split with the same set of metrics.
for name, model in models.items():
    # Hard 0/1 predictions feed the threshold-based metrics.
    y_pred = model.predict(X_val)
    # ROC AUC is a ranking metric: it needs a continuous score per row, not
    # the thresholded labels. Column 1 of predict_proba is the probability of
    # class 1 (columns follow the sorted class labels 0, 1).
    y_score = model.predict_proba(X_val)[:, 1]

    print(f"--- {name} ---")
    print(f"Accuracy: {accuracy_score(y_val, y_pred)}")
    print(f"Precision: {precision_score(y_val, y_pred)}")
    print(f"Recall: {recall_score(y_val, y_pred)}")
    print(f"F1 Score: {f1_score(y_val, y_pred)}")
    print(f"ROC AUC Score: {roc_auc_score(y_val, y_score)}")
    print(f"Confusion Matrix:\n{confusion_matrix(y_val, y_pred)}\n")


--- Logistic Regression ---
Accuracy: 0.7886178861788617
Precision: 0.7596153846153846
Recall: 0.9875
F1 Score: 0.8586956521739131
ROC AUC Score: 0.7030523255813954
Confusion Matrix:
[[18 25]
 [ 1 79]]

--- Decision Tree ---
Accuracy: 0.6829268292682927
Precision: 0.7469879518072289
Recall: 0.775
F1 Score: 0.7607361963190183
ROC AUC Score: 0.6433139534883721
Confusion Matrix:
[[22 21]
 [18 62]]

--- Random Forest ---
Accuracy: 0.7642276422764228
Precision: 0.7524752475247525
Recall: 0.95
F1 Score: 0.8397790055248618
ROC AUC Score: 0.6843023255813954
Confusion Matrix:
[[18 25]
 [ 4 76]]

--- SVM ---
Accuracy: 0.6504065040650406
Precision: 0.6504065040650406
Recall: 1.0
F1 Score: 0.7881773399014779
ROC AUC Score: 0.5
Confusion Matrix:
[[ 0 43]
 [ 0 80]]

--- KNN ---
Accuracy: 0.5772357723577236
Precision: 0.6320754716981132
Recall: 0.8375
F1 Score: 0.7204301075268816
ROC AUC Score: 0.4652616279069768
Confusion Matrix:
[[ 4 39]
 [13 67]]



Prediction on the test data and submission

In [None]:
# Using the best model, which is Logistic Regression, for predictions
best_model = log_reg  # Assuming Logistic Regression is the best based on F1 Score

# Destination of the submission CSV, relative to the working directory.
# NOTE(review): the filename is a choice made here; adjust if a specific name is required.
SUBMISSION_PATH = 'submission.csv'

# Prepare the test data (assuming the same preprocessing steps have been applied).
# Only the identifier is dropped; the remaining columns are passed to the model as features.
X_test = test_data.drop(columns=['Loan_ID'])

# Predict loan status on the test data
y_test_pred = best_model.predict(X_test)

# Prepare the submission file: one row per Loan_ID, with the 1/0 predictions
# turned back into the 'Y'/'N' labels.
submission = pd.DataFrame({
    'Loan_ID': test_data['Loan_ID'],
    'Loan_Status': ['Y' if pred == 1 else 'N' for pred in y_test_pred]
})

# Save the submission file (previously the table was only printed, never written),
# then display it. index=False keeps the row index out of the CSV.
submission.to_csv(SUBMISSION_PATH, index=False)
print(submission)


      Loan_ID Loan_Status
0    LP001015           Y
1    LP001022           Y
2    LP001031           Y
3    LP001035           Y
4    LP001051           Y
..        ...         ...
362  LP002971           Y
363  LP002975           Y
364  LP002980           Y
365  LP002986           Y
366  LP002989           Y

[367 rows x 2 columns]


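best_model = log_reg: Logistic Regression is used as the final model because it achieved the highest F1 score (about 0.859) on the validation split.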