# Decision Tree - Loan Approval (Synthetic dataset)

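This notebook loads a synthetic loan-approval dataset and walks through exploratory data analysis (EDA), preprocessing, training a Decision Tree classifier, evaluating it, visualizing and saving the trained model, and predicting the outcome for an example applicant.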

Files generated alongside this notebook:

- `/mnt/data/loan_approval_synthetic.csv` (the dataset)

Run each cell in order.

In [None]:
# Imports
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
import matplotlib.pyplot as plt

# Make plots render in notebook
%matplotlib inline


In [None]:
# Load dataset
# Read the synthetic loan-approval CSV into a DataFrame and preview the first rows.
csv_path = '/mnt/data/loan_approval_synthetic.csv'
df = pd.read_csv(filepath_or_buffer=csv_path)
df.head(n=5)

In [None]:
# Basic EDA: dataset size, class balance of the target, and column dtypes.
frame_shape = df.shape
print('Rows, cols:', frame_shape)

# normalize=True reports each class as a fraction of all rows rather than a count
target_share = df['approved'].value_counts(normalize=True)
print('\nTarget distribution:')
print(target_share)

column_dtypes = df.dtypes
print('\nData types:')
print(column_dtypes)

In [None]:
# Preprocessing: separate features and target
# `approved` is the label column; every other column is treated as a feature.
X = df.drop(columns=['approved'])
y = df['approved']

# Identify numeric and categorical columns.
# 'number' selects integer/float columns of any width (int32, float32, ...),
# not only int64/float64, so numeric_cols keeps describing the numeric
# features even if the frame's dtypes are downcast or come from another source.
numeric_cols = X.select_dtypes(include='number').columns.tolist()
categorical_cols = X.select_dtypes(include=['object','category']).columns.tolist()
# Show both lists as the cell output
numeric_cols, categorical_cols

In [None]:
# Build a preprocessing + model pipeline.
#
# Categorical columns are one-hot encoded; every other column is passed through
# unchanged (remainder='passthrough') and is placed AFTER the encoded columns
# in the transformed matrix. handle_unknown='ignore' lets prediction proceed
# when a category was not seen during fit instead of raising.
#
# scikit-learn renamed OneHotEncoder's `sparse` argument to `sparse_output`
# in 1.2 and removed the old name in 1.4. Try the current keyword first and
# fall back to the legacy one so the cell runs on both old and new releases.
try:
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:  # scikit-learn < 1.2 does not accept `sparse_output`
    encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

preprocessor = ColumnTransformer(transformers=[
    ('cat', encoder, categorical_cols)
], remainder='passthrough')

# random_state pins tie-breaking inside the tree; max_depth caps tree depth
clf = DecisionTreeClassifier(random_state=42, max_depth=6)

pipe = Pipeline(steps=[
    ('pre', preprocessor),
    ('clf', clf)
])

# Train-test split: 25% held out, stratified on the target so both splits keep
# the same class proportions; random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.25, random_state=42)

# Fit (the encoder is fitted on the training split only, via the pipeline)
pipe.fit(X_train, y_train)


In [None]:
# Evaluation on the held-out test split
y_pred = pipe.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', test_accuracy)

report_text = classification_report(y_test, y_pred)
print('\nClassification report:\n', report_text)

cm = confusion_matrix(y_test, y_pred)
print('\nConfusion matrix:\n', cm)

# Simple confusion matrix plot, drawn through explicit figure/axes handles
fig, ax = plt.subplots(figsize=(5, 4))
heatmap = ax.imshow(cm, interpolation='nearest')
ax.set_title('Confusion matrix')
ax.set_ylabel('True label')
ax.set_xlabel('Predicted label')
fig.colorbar(heatmap, ax=ax)

# Write each count into its cell: the x coordinate is the column index
# (predicted label axis) and the y coordinate is the row index (true label axis).
for row_idx, col_idx in np.ndindex(cm.shape):
    ax.text(col_idx, row_idx, cm[row_idx, col_idx], ha='center', va='center')

fig.tight_layout()
plt.show()

In [None]:
# Visualize the trained Decision Tree (requires matplotlib)
# Extract the fitted tree and the fitted one-hot encoder from the pipeline
tree = pipe.named_steps['clf']
ohe = pipe.named_steps['pre'].named_transformers_['cat']

# Rebuild the feature names in the order the tree saw them: the one-hot
# columns come first, followed by every passed-through column in its original
# order. The passthrough list is derived from the training frame rather than
# from numeric_cols, because remainder='passthrough' forwards ALL
# non-categorical columns, including any whose dtype was not picked up as
# numeric; using numeric_cols could yield a short or misaligned name list.
ohe_feature_names = list(ohe.get_feature_names_out(categorical_cols))
passthrough_cols = [c for c in X_train.columns if c not in categorical_cols]
feature_names = ohe_feature_names + passthrough_cols

plt.figure(figsize=(20,8))
# class_names are applied to the classes in ascending order; this labelling
# assumes the target is encoded as 0 = not approved, 1 = approved (TODO confirm).
plot_tree(tree, feature_names=feature_names, class_names=['not_approved','approved'], filled=True, rounded=True)
plt.show()

In [None]:
# Save model to disk
# The whole fitted pipeline (preprocessing + classifier) is serialized, so the
# saved artifact can take raw feature frames directly after loading.
model_path = '/mnt/data/decision_tree_loan_model.joblib'
joblib.dump(value=pipe, filename=model_path)
print('Saved model to', model_path)

In [None]:
# Example: predict a new applicant
# One-row input: each key is a feature column, each value a single-element list.
sample = {
    'age': [35],
    'income': [48000],
    'loan_amount': [8000],
    'credit_score': [660],
    'employment_years': [4],
    'existing_debt': [2000],
    'has_coapplicant': [0],
    'previous_defaults': [0],
    'marital_status': ['married'],
    'education_level': ['bachelor'],
}
sample_df = pd.DataFrame(data=sample)

# Hard class prediction for the single row
pred = pipe.predict(sample_df)

# predict_proba returns one column per class; column index 1 is read here as
# the "approved" probability (assumes classes are ordered 0, 1 — TODO confirm).
class_probabilities = pipe.predict_proba(sample_df)
prob = class_probabilities[:, 1]

print('Prediction (1=approved):', pred[0], ' - probability approved:', round(prob[0],3))

## Notes

- The dataset is synthetic but realistic for practice.
- Change `max_depth` or other DecisionTree hyperparameters to experiment.
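- If you want a RandomForest instead, import `RandomForestClassifier` from `sklearn.ensemble`, use it in place of `DecisionTreeClassifier`, and tune `n_estimators`. The tree-visualization cell plots a single tree, so for a forest pass one member (e.g. `pipe.named_steps['clf'].estimators_[0]`) to `plot_tree`.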