In [16]:
import numpy as np
import pandas as pd

from pathlib import Path

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression

In [17]:
# Opening file

root = Path.cwd().parent

path = root / "data" / "interim" / "application_train.csv"

df = pd.read_csv(path)

In [18]:
# Determining the aggregated missingness of columns in the dataframe
missing = df.isna().mean()

In [19]:
# Creating an index of which columns to drop from the data frame (those with more than 50% missing data)
cols_to_drop = missing > 0.5

In [20]:
# Dropping the columns using indexing
df = df.loc[:, ~cols_to_drop].copy()

In [21]:
# Replacing sentinel values ('365243') in DAYS_EMPLOYED with NaN values
df["DAYS_EMPLOYED"] = df["DAYS_EMPLOYED"].replace(365243, np.nan)

In [22]:
# Don't drop TARGET even if threshold ever changes
cols_to_drop = [c for c in cols_to_drop if c != "TARGET"]

In [23]:
# Separate target
y = df["TARGET"].astype(int).values
X = df.drop(columns=["TARGET"])

In [24]:
# Identify numeric/categorical
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

In [25]:
print("Numeric:", len(numeric_cols), "Categorical:", len(categorical_cols))

Numeric: 67 Categorical: 13


In [26]:
# Numeric pipeline: median impute + standardize
num_imputer = SimpleImputer(strategy="median")
scaler = StandardScaler()


X_num = num_imputer.fit_transform(X[numeric_cols])
X_num_scaled = scaler.fit_transform(X_num)

In [27]:
# Categorical pipeline: one-hot encode
ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
X_cat = ohe.fit_transform(X[categorical_cols]) if categorical_cols else np.empty((len(X), 0))

In [28]:
# Combine
X_model = np.hstack([X_num_scaled, X_cat])

In [29]:
# Train logistic regression
lr = LogisticRegression(max_iter=1000, class_weight="balanced")
lr.fit(X_model, y)

# Predicted probabilities for default
proba = lr.predict_proba(X_model)[:, 1]

print("Proba summary:", np.min(proba), np.median(proba), np.max(proba))

Proba summary: 1.59503296804577e-10 0.4023607369775381 0.9990641275586989


In [30]:
# Selecting columns for tableau reporting
df_tab = df[[
    "SK_ID_CURR",
    "TARGET",
    "AMT_CREDIT",
    "AMT_GOODS_PRICE",
    "AMT_INCOME_TOTAL",
    "AMT_ANNUITY",
    "DAYS_BIRTH",
    "DAYS_EMPLOYED",
    "NAME_INCOME_TYPE",
    "NAME_EDUCATION_TYPE",
    "ORGANIZATION_TYPE",
    "EXT_SOURCE_2",
    "EXT_SOURCE_3",
]].copy()


# If any of these are missing in your current df, drop them safely
df_tab = df_tab[[c for c in df_tab.columns if c in df.columns]].copy()

In [31]:
# Re-apply sentinel fix for reporting columns
if "DAYS_EMPLOYED" in df_tab.columns:
    df_tab["DAYS_EMPLOYED"] = df_tab["DAYS_EMPLOYED"].replace(365243, np.nan)

In [32]:
# Add model outputs
df_tab["predicted_pd"] = proba
df_tab["predicted_default_05"] = (df_tab["predicted_pd"] >= 0.5).astype(int)

In [33]:
# Engineer LTV
if "AMT_CREDIT" in df_tab.columns and "AMT_GOODS_PRICE" in df_tab.columns:
    df_tab["LTV"] = df_tab["AMT_CREDIT"] / df_tab["AMT_GOODS_PRICE"]
    df_tab["LTV"] = df_tab["LTV"].replace([np.inf, -np.inf], np.nan)

In [34]:
# Human-friendly age/employment years
if "DAYS_BIRTH" in df_tab.columns:
    df_tab["AGE_YEARS"] = (-df_tab["DAYS_BIRTH"] / 365.25).round(1)


if "DAYS_EMPLOYED" in df_tab.columns:
    df_tab["EMPLOYED_YEARS"] = (-df_tab["DAYS_EMPLOYED"] / 365.25).round(1)

In [35]:
OUT_PATH = Path("reports/tableau_dataset.csv")
OUT_PATH.parent.mkdir(parents=True, exist_ok=True)

df_tab.to_csv(OUT_PATH, index=False)
print("Saved:", OUT_PATH, "Shape:", df_tab.shape)

Saved: reports/tableau_dataset.csv Shape: (307511, 18)
