In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

import joblib

In [42]:
df = pd.read_csv('../data/train.csv')
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [43]:
df.info() # Tells column types and how many non-null values (for missing values)
df.describe() # Gives statistics on numerical columns (mean, std, min, etc.)
df.isnull().sum() # Shows which columns have missing values and how many

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [44]:
catergorical_cols = ['Gender', 'Married', 'Dependents', 'Self_Employed'] # For columns like Gender, Married, Dependents, Self_Employed, we use the mode (most frequent value), assuming missingness is random
for col in catergorical_cols:
    df[col].fillna(df[col].mode()[0], inplace = True)

In [45]:
df['LoanAmount'].fillna(df['LoanAmount'].median(), inplace = True) #LoanAmount: Fill using the median, which is less affected by outliers

In [46]:
df['Loan_Amount_Term'].fillna(df['Loan_Amount_Term'].mode()[0], inplace = True) # Loan_Amount_Term: Mode makes sense here (it's typically 360 months for most)

In [47]:
df['Credit_History'].fillna(df['Credit_History'].mode()[0], inplace = True) # Credit_History: Crucial for predictions, and mode is usually appropriate

In [48]:
print(df.isnull().sum())

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64


In [49]:
from sklearn.preprocessing import LabelEncoder

df_encoded = df.copy() #A copy to preserve the original data

catergorical_cols = ['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area', 'Loan_Status', 'Dependents']

le = LabelEncoder() # Initialize the LabelEncoder

for col in catergorical_cols: #Encode for each categorical column
    df_encoded[col] = le.fit_transform(df_encoded[col])

In [50]:
df_encoded = df_encoded.drop('Loan_ID', axis=1) #Drop Loan_ID as it is not needed for modeling

y = df_encoded['Loan_Status'] #Target column

X = df_encoded.drop('Loan_Status', axis=1) #Features for the model

In [51]:
import pandas as pd
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report
import joblib

# 1) Load fresh (avoid leakage from earlier cells)
df = pd.read_csv('../data/train.csv')

# 2) Split features/target
target_col = 'Loan_Status'
X = df.drop(columns=['Loan_ID', target_col])
y = df[target_col]

# 3) Train/Val split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 4) Identify column types
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numeric_cols = X.select_dtypes(exclude=['object']).columns.tolist()

# 5) Preprocessing
categorical_preprocess = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])
numeric_preprocess = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))
])
preprocess = ColumnTransformer(transformers=[
    ('cat', categorical_preprocess, categorical_cols),
    ('num', numeric_preprocess, numeric_cols)
])

# 6) Model
model = Pipeline(steps=[
    ('prep', preprocess),
    ('clf', LogisticRegression(max_iter=200, n_jobs=None))
])

# 7) Train
model.fit(X_train, y_train)

# 8) Evaluate
y_pred = model.predict(X_val)
acc = accuracy_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred, pos_label='Y')
print('Accuracy:', round(acc, 4))
print('F1 (Y class):', round(f1, 4))
print(classification_report(y_val, y_pred))

# 9) Save trained pipeline
Path('../models').mkdir(parents=True, exist_ok=True)
joblib.dump(model, '../models/model.pkl')
print('Saved to ../models/model.pkl')


Accuracy: 0.8618
F1 (Y class): 0.9081
              precision    recall  f1-score   support

           N       0.96      0.58      0.72        38
           Y       0.84      0.99      0.91        85

    accuracy                           0.86       123
   macro avg       0.90      0.78      0.81       123
weighted avg       0.88      0.86      0.85       123

Saved to ../models/model.pkl


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
