# Baseline Model: Logistic Regression
## 1. Load Data

In [None]:
import pandas as pd
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import json

# Load the pre-split training and validation sets from the model-ready folder.
# Feature files are kept as DataFrames; each target file is flattened into a
# 1-D array after loading.
X_train = pd.read_csv('../data/processed/model_ready/X_train.csv')
y_train = (
    pd.read_csv('../data/processed/model_ready/y_train.csv')
    .to_numpy()
    .ravel()
)
X_val = pd.read_csv('../data/processed/model_ready/X_val.csv')
y_val = (
    pd.read_csv('../data/processed/model_ready/y_val.csv')
    .to_numpy()
    .ravel()
)

## 2. Train Baseline Model

In [None]:
# Fit the baseline classifier; `fit` returns the estimator itself, so the
# construction and training can be chained.
model = LogisticRegression(random_state=42, max_iter=1000).fit(X_train, y_train)

# Validation-set outputs: hard labels, plus the second probability column
# (presumably the positive class for a binary target -- confirm via
# `model.classes_`).
y_pred = model.predict(X_val)
val_probabilities = model.predict_proba(X_val)
y_prob = val_probabilities[:, 1]

# Scorers that compare true labels against the hard predictions; AUC is added
# separately because it is computed from the probability scores instead.
label_scorers = {
    'accuracy': accuracy_score,
    'precision': precision_score,
    'recall': recall_score,
    'f1': f1_score,
}
metrics = {
    metric_name: scorer(y_val, y_pred)
    for metric_name, scorer in label_scorers.items()
}
metrics['auc'] = roc_auc_score(y_val, y_prob)

# Emit the metrics as indented JSON.
metrics_report = json.dumps(metrics, indent=4)
print(metrics_report)

## 3. Confusion Matrix

In [None]:
# Tabulate validation outcomes from the true labels and the hard predictions.
cm = confusion_matrix(y_val, y_pred)

# Render the counts as an annotated integer heatmap, then title and label it
# through the Axes object that seaborn hands back.
heatmap_ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
heatmap_ax.set_title('Baseline Confusion Matrix')
heatmap_ax.set_ylabel('Actual')
heatmap_ax.set_xlabel('Predicted')
plt.show()