In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score

# Path of the CSV file holding the loan-applicant records.
csv_location = "exrc06p05_loan.csv"

# Read the file into a DataFrame and immediately discard `Loan_ID`, which is
# not used as a feature.
df = pd.read_csv(csv_location).drop(columns=["Loan_ID"])

# Uncomment for a quick look at the structure and the first rows:
# print(df.info())
# print(df.head())
# print(df.info())  # prints concise summary about DataFrame's structure


# Names of all columns that currently have a numeric dtype.
numeric_cols = df.select_dtypes(include="number").columns

# Fill missing numeric entries with the median of their own column.
# The medians are taken over the whole DataFrame, i.e. before the train/test
# split that happens further down.
column_medians = df[numeric_cols].median()
df[numeric_cols] = df[numeric_cols].fillna(column_medians)

# Uncomment to verify that no numeric gaps remain:
# print(df[numeric_cols].isna().sum())

# Show sample data
# print(df.head())
# print("\nCheck missing values on numeric columns after imputation:\n", df[numeric_cols].isna().sum())

# Amount-like columns screened for extreme values.
outlier_cols = ["ApplicantIncome", "CoapplicantIncome", "LoanAmount"]

# Per-column centre and spread used to define the acceptance band.
means = df[outlier_cols].mean()
stds = df[outlier_cols].std()

# A cell is acceptable when it lies no further than 3 standard deviations from
# its column mean; a row survives only if every screened cell is acceptable.
within_band = df[outlier_cols].sub(means).abs().le(3 * stds)
df = df.loc[within_band.all(axis=1)].reset_index(drop=True)

# print("Shape after removing outliers", df.shape)


# Names of all columns whose dtype is not numeric.
non_numeric_cols = df.select_dtypes(exclude="number").columns

# Fill missing entries in each of those columns with the column's most frequent
# value (the first one returned by mode() when several values tie).
# NOTE(review): the target `Loan_Status` is still "Y"/"N" text at this point,
# so it is presumably part of this selection and missing labels would be
# imputed as well -- confirm that this is intended.
most_frequent = {name: df[name].mode()[0] for name in non_numeric_cols}
df[non_numeric_cols] = df[non_numeric_cols].fillna(most_frequent)

#print("\nCheck missing values on non-numeric columns post imputation:\n", df[non_numeric_cols].isna().sum())


# for col in non_numeric_cols:
#     print(f"\n--- {col} ---")
#     print(df[col].value_counts())

# Encode the two-level categorical columns (and the target) as 0/1 integers.
mappings = {
    "Loan_Status": {"Y": 1, "N": 0},
    "Married": {"Yes": 1, "No": 0},
    "Gender": {"Male": 1, "Female": 0},
    "Self_Employed": {"Yes": 1, "No": 0},
    "Education": {"Graduate": 1, "Not Graduate": 0},
}

for col, mp in mappings.items():
    # Series.map() converts every value that is not a key of the dict into NaN
    # without warning. Because imputation has already run, such NaNs would only
    # show up later as a hard-to-trace failure when the model is fitted (or
    # after the block is run twice on the same DataFrame). Fail right here
    # instead, naming the column and the categories that have no encoding.
    unexpected = set(df[col].dropna().unique()) - set(mp)
    if unexpected:
        raise ValueError(
            f"Column {col!r} has values without a 0/1 encoding: "
            f"{sorted(unexpected, key=str)}"
        )
    df[col] = df[col].map(mp)

# One-hot encode `Property_Area`: the column is replaced by one integer 0/1
# indicator column per category (prefixed `Property_Area_`). No level is
# dropped, so exactly one indicator is set in each row.
df = pd.get_dummies(df, columns=["Property_Area"], drop_first=False, dtype=int)


# # Let see sample and information post preprocessing and feature engineering
# print(df.head())
# print(df.info())

# Target (y) is the `Loan_Status` column; every remaining column is a feature (X).
y = df["Loan_Status"]
X = df.drop(columns=y.name)

# Uncomment to check the class distribution of the target:
# print(y.value_counts())

# Hold out 30% of the rows for testing:
#   train_size=0.7  -> 70% of the rows are used for training
#   stratify=y      -> both parts keep the class ratio of `Loan_Status`
#   random_state=42 -> the same rows land in each part on every run
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, stratify=y, random_state=42
)

# Configure and train the classifier in one step (fit() returns the estimator):
#   solver="liblinear"      -> solver suited to binary targets and small datasets
#   class_weight="balanced" -> re-weights the classes to offset class imbalance
#   max_iter=2000           -> generous iteration budget so the solver can converge
model = LogisticRegression(
    solver="liblinear",
    class_weight="balanced",
    max_iter=2000,
).fit(X_train, y_train)

# Predict the held-out rows and compare the predictions with the true labels.
y_pred = model.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
acc_score = accuracy_score(y_test, y_pred)

print(f"Accuracy Score with Logistic Regression Model: {acc_score:.4f}")
print(f"Confusion Matrix with Logistic Regression Model:\n{conf_matrix}")

# Score a single hypothetical applicant. Categorical fields are supplied in
# their encoded form (e.g. Gender 1 = "Male", Married 0 = "No",
# Education 1 = "Graduate", Self_Employed 0 = "No"), and the property area is
# given through its three indicator columns.
applicant_record = {
    "Gender": 1,
    "Married": 0,
    "Dependents": 0,
    "Education": 1,
    "Self_Employed": 0,
    "ApplicantIncome": 2400,
    "CoapplicantIncome": 2000,
    "LoanAmount": 36,
    "Loan_Amount_Term": 360,
    "Credit_History": 1,
    "Property_Area_Rural": 0,
    "Property_Area_Semiurban": 0,
    "Property_Area_Urban": 1,
}
new_applicant = pd.DataFrame([applicant_record])

# Put the columns in the same order as the training features.
new_applicant = new_applicant.loc[:, X.columns]

# Predicted class and the probability assigned to class 1 (approval).
prediction = model.predict(new_applicant)[0]
prob_yes = model.predict_proba(new_applicant)[0, 1]

print(f"\nProbability of Loan Approval (Yes): {prob_yes:.4f}")
if prediction == 1:
    decision = "Yes"
else:
    decision = "No"
print("Predicted Loan_Status:", decision)

Accuracy Score with Logistic Regression Model: 0.8233
Confusion Matrix with Logistic Regression Model:
[[ 48  29]
 [ 21 185]]

Probability of Loan Approval (Yes): 0.7402
Predicted Loan_Status: Yes
