In [3]:
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier


# Load the credit dataset; the relative path is resolved against the
# current working directory.
df = pd.read_csv(filepath_or_buffer="credit.csv")


# -----------------------------
# 1. Define target and features
# -----------------------------
# "default" is the label to predict; every other column of df stays in X.
target_column = "default"
X = df.drop(columns=[target_column])
y = df[target_column]

# Separate numeric and categorical features
numeric_features = [
    "months_loan_duration",
    "amount",
    "percent_of_income",
    "years_at_residence",
    "age",
    "existing_loans_count",
    "dependents",
]

categorical_features = [
    "checking_balance",
    "credit_history",
    "purpose",
    "savings_balance",
    "employment_duration",
    "other_credit",
    "housing",
    "job",
    "phone",
]

# Preprocessing: scale numerics, one-hot encode categoricals.
# Columns of X listed in neither group are discarded, because
# ColumnTransformer defaults to remainder="drop".
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        # drop="first" removes one dummy column per feature to avoid perfectly
        # collinear inputs for the linear model.
        # handle_unknown="ignore" encodes a category that never appeared in
        # the training data as all zeros (with a warning) instead of raising
        # at predict time. Combining it with drop= needs scikit-learn >= 1.0.
        (
            "cat",
            OneHotEncoder(drop="first", handle_unknown="ignore"),
            categorical_features,
        ),
    ]
)

# -----------------------------
# 2. Define Models
# -----------------------------
# Pipeline does not copy the steps it is given and fits them in place, so
# handing the same `preprocessor` object to both pipelines would make them
# share one ColumnTransformer whose fitted state is overwritten by whichever
# pipeline is fit last. clone() gives each pipeline its own unfitted copy
# with identical parameters.
log_reg = Pipeline(steps=[
    ("preprocessor", clone(preprocessor)),
    ("classifier", LogisticRegression(max_iter=1000)),
])

decision_tree = Pipeline(steps=[
    ("preprocessor", clone(preprocessor)),
    # limit depth to prevent overfitting; fixed seed for reproducible trees
    ("classifier", DecisionTreeClassifier(max_depth=5, random_state=42)),
])

# -----------------------------
# 3. Train-Test Split
# -----------------------------
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the seed makes the split repeatable.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# -----------------------------
# 4. Fit Models
# -----------------------------
# Train each pipeline (preprocessing + classifier) on the training partition,
# logistic regression first, then the decision tree.
for model in (log_reg, decision_tree):
    model.fit(X_train, y_train)

# -----------------------------
# 5. Predictions
# -----------------------------
# Predict labels for the held-out rows with each fitted pipeline, in the same
# order as the names on the left-hand side.
y_pred_log, y_pred_tree = [
    fitted.predict(X_test) for fitted in (log_reg, decision_tree)
]

# -----------------------------
# 6. Evaluation
# -----------------------------
# For each model print a banner, the test-set accuracy, the confusion matrix
# and the per-class precision/recall/F1 report.
reports = (
    ("=== Logistic Regression ===", y_pred_log),
    ("\n=== Decision Tree ===", y_pred_tree),
)
for banner, predictions in reports:
    print(banner)
    print("Accuracy:", accuracy_score(y_test, predictions))
    print(confusion_matrix(y_test, predictions))
    print(classification_report(y_test, predictions))

=== Logistic Regression ===
Accuracy: 0.745
[[122  18]
 [ 33  27]]
              precision    recall  f1-score   support

          no       0.79      0.87      0.83       140
         yes       0.60      0.45      0.51        60

    accuracy                           0.74       200
   macro avg       0.69      0.66      0.67       200
weighted avg       0.73      0.74      0.73       200


=== Decision Tree ===
Accuracy: 0.655
[[99 41]
 [28 32]]
              precision    recall  f1-score   support

          no       0.78      0.71      0.74       140
         yes       0.44      0.53      0.48        60

    accuracy                           0.66       200
   macro avg       0.61      0.62      0.61       200
weighted avg       0.68      0.66      0.66       200

