In [None]:
# Core libraries
import numpy as np
import pandas as pd
from collections import defaultdict

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Scikit-learn: model selection, modeling, preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.calibration import CalibratedClassifierCV

# Scikit-learn: metrics
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    confusion_matrix,
    classification_report,
    make_scorer,
    precision_recall_curve
)

# Oversampling Method (Like SMOTE)
from imblearn.over_sampling import RandomOverSampler

# AIF360: fairness-aware tools
from aif360.datasets import BinaryLabelDataset
from aif360.metrics import ClassificationMetric

# Outlier detection
from scipy import stats

# Display settings
# Passing None removes the column cap, so wide frames print every column instead of eliding the middle ones.
pd.set_option('display.max_columns', None)

# Data Exploration

In [None]:
# Both files are whitespace-delimited and carry no header row, so they share the same reader options.
read_options = dict(sep=r'\s+', header=None)
df = pd.read_csv("german.data", **read_options)            # raw file
df2 = pd.read_csv("german.data-numeric", **read_options)   # numeric variant of the same data

# First rows and dimensions of the raw frame
print(df.head())
print(df.shape)

# Distinct values found in column 8 of each frame, raw first and numeric second
for frame in (df, df2):
    for val in sorted(frame[8].unique()):
        print(f"Value: {val}")

# First rows and dimensions of the numeric frame
print(df2.head())
print(df2.shape)

In [None]:
df.isna().sum()  # Double-check for missing values: count of NaN cells per column

In [None]:
df.dtypes # Checking data types: shows the dtype pandas inferred for every column of the raw frame

In [None]:
# Cast every object-typed column of the raw frame to pandas' categorical dtype
# (optional; convenient for the modeling steps further down).
categorical_cols = df.select_dtypes(include='object').columns
df = df.astype({name: 'category' for name in categorical_cols})

In [None]:
# Descriptive statistics of the numeric columns: raw frame first, then the numeric-file frame.
# (A categorical summary via df[categorical_cols].describe() was left disabled by the author.)
for frame_to_summarise in (df, df2):
    print(frame_to_summarise.describe())

# Data Treatment German Credit

In [None]:
# Rename the raw frame's 21 positional columns (20 features + target) for readability
column_naming = [
    'checking_account', 'duration', 'credit_history', 'purpose', 'credit_amount',
    'savings_account', 'employment', 'installment_rate', 'personal_status_sex',
    'other_debtors', 'residence_since', 'property', 'age', 'other_installment_plans',
    'housing', 'existing_credits', 'job', 'num_dependents', 'telephone',
    'foreign_worker', 'target'
]
df.columns = column_naming
# Take the first 20 columns of the numeric-file frame (it has extra columns) and give them
# the 20 feature names with an "_encoded" suffix.
# NOTE(review): this assumes column i of the numeric file encodes feature i of the raw file.
# The one-hot names dropped later (e.g. 'purpose_encoded_184', 'other_debtors_encoded_68')
# suggest the two files may not be aligned position-for-position — TODO confirm against the
# dataset documentation before trusting any "<name>_encoded" label.
df2_main_encodings = df2.iloc[:, :20].copy()
df2_main_encodings.columns = [f"{col}_encoded" for col in column_naming[:-1]] 

# Add the target column back (assumed to be column 24 of the numeric file, by similarity to df's 'target' column)
df2_main_encodings['target_encoded'] = df2[24] # 1 = "good" , 2 = "bad" for target

# Merge side-by-side: raw (renamed) columns followed by the "_encoded" columns, aligned on the row index
df_merged = pd.concat([df, df2_main_encodings], axis=1)


In [None]:
# print(df2_main_encodings.head())

In [None]:
print(df_merged.head())
for val in sorted(df_merged['personal_status_sex'].unique()):
    print(f"Value: {val}")

# Integer label for each personal-status/sex code: 'A91' -> 2, 'A92' -> 3, ..., 'A95' -> 6
label_dict = {f"A9{digit}": digit + 1 for digit in range(1, 6)}

# Attach the integer label next to the raw code
df_merged['personal_status_sex_label'] = df_merged['personal_status_sex'].map(label_dict)

# Re-encode the labels as consecutive integers; this overwrites the
# 'personal_status_sex_encoded' column that came from the numeric file.
le = LabelEncoder()
df_merged['personal_status_sex_encoded'] = le.fit_transform(df_merged['personal_status_sex_label'])

# One row per distinct (raw code, encoded value) pair, ordered by the encoded value
unique_pairs = (
    df_merged[['personal_status_sex', 'personal_status_sex_encoded']]
    .drop_duplicates()
    .sort_values(by='personal_status_sex_encoded')
    .reset_index(drop=True)
)

# Display the result
print(unique_pairs)

In [None]:
# Lookup tables translating the dataset's categorical codes (A11, A30, ...) into
# human-readable descriptions. Outer keys are the renamed column names of the raw
# frame; inner keys are the codes that appear in that column.
attribute_mapping_german_credit = {
    'checking_account': {
        'A11': '... < 0 DM',
        'A12': '0 <= ... < 200 DM',
        'A13': '... >= 200 DM / salary assignments for at least 1 year',
        'A14': 'no checking account'
    },
    'credit_history': {
        'A30': 'no credits taken/ all credits paid back duly',
        'A31': 'all credits at this bank paid back duly',
        'A32': 'existing credits paid back duly till now',
        'A33': 'delay in paying off in the past',
        'A34': 'critical account/ other credits existing (not at this bank)'
    },
    'purpose': {
        'A40': 'car (new)', 'A41': 'car (used)', 'A42': 'furniture/equipment',
        'A43': 'radio/television', 'A44': 'domestic appliances', 'A45': 'repairs',
        'A46': 'education', 'A47': '(vacation - does not exist?)',
        'A48': 'retraining', 'A49': 'business', 'A410': 'others'
    },
    'savings_account': {
        'A61': '... < 100 DM', 'A62': '100 <= ... < 500 DM',
        'A63': '500 <= ... < 1000 DM', 'A64': '... >= 1000 DM',
        'A65': 'unknown/ no savings account'
    },
    'employment': {
        'A71': 'unemployed', 'A72': '... < 1 year',
        'A73': '1 <= ... < 4 years', 'A74': '4 <= ... < 7 years',
        'A75': '... >= 7 years'
    },
    # Sex and marital status are combined into a single code in this column
    'personal_status_sex': {
        'A91': 'male : divorced/separated',
        'A92': 'female : divorced/separated/married',
        'A93': 'male : single',
        'A94': 'male : married/widowed',
        'A95': 'female : single'
    },
    'other_debtors': {
        'A101': 'none', 'A102': 'co-applicant', 'A103': 'guarantor'
    },
    'property': {
        'A121': 'real estate',
        'A122': 'building society savings agreement/ life insurance',
        'A123': 'car or other, not in attribute 6',
        'A124': 'unknown / no property'
    },
    'other_installment_plans': {
        'A141': 'bank', 'A142': 'stores', 'A143': 'none'
    },
    'housing': {
        'A151': 'rent', 'A152': 'own', 'A153': 'for free'
    },
    'job': {
        'A171': 'unemployed/ unskilled - non-resident',
        'A172': 'unskilled - resident',
        'A173': 'skilled employee / official',
        'A174': 'management/ self-employed/ highly qualified employee/ officer'
    },
    'telephone': {
        'A191': 'none', 'A192': 'yes, registered under the customers name'
    },
    'foreign_worker': {
        'A201': 'yes', 'A202': 'no'
    }
}

In [None]:
# --- Label-encode the raw categorical columns ------------------------------------------
# The module-level `categorical_cols` was captured while the raw frame still had integer
# column labels (before the rename), so `col + '_encoded'` fails with int + str on a fresh
# run — or silently does nothing if that cell was re-run after the cast. Recompute the
# list here from the renamed raw frame so this cell does not depend on execution order.
raw_categorical_cols = df.select_dtypes(include=['object', 'category']).columns

label_encoders = {}  # column name -> fitted LabelEncoder, kept for reverse lookups later

for col in raw_categorical_cols:
    le = LabelEncoder()
    # Writes (or overwrites) "<col>_encoded" in the merged frame with 0-based integer codes
    df_merged[col + '_encoded'] = le.fit_transform(df_merged[col])
    label_encoders[col] = le

# --- Build encoded value -> {original values} for every "<name>_encoded" column ---------
encoded_suffix = '_encoded'
encoded_to_original_mapping = {}

for col in df_merged.columns:
    if not col.endswith(encoded_suffix):
        continue

    original_col = col[:-len(encoded_suffix)]
    if original_col not in df_merged.columns:
        continue  # no raw counterpart to map back to

    # Each distinct (original, encoded) pair contributes one member to the encoded value's set
    distinct_pairs = df_merged[[original_col, col]].drop_duplicates()
    originals_by_code = defaultdict(set)
    for original_value, encoded_value in zip(distinct_pairs[original_col], distinct_pairs[col]):
        originals_by_code[encoded_value].add(original_value)

    # Store as a plain dict so that missing keys do not get auto-created on lookup
    encoded_to_original_mapping[col] = dict(originals_by_code)

In [None]:
# Encoded columns treated as protected attributes. They are removed from the model's
# inputs and used only for the subgroup / fairness breakdowns.
protected_attributes = [
    f"{attribute}_encoded"
    for attribute in (
        'age',                  # Age
        'personal_status_sex',  # Gender/Marital status
        'foreign_worker',       # Nationality/Immigration status
        'telephone',            # Proxy for access/stability
        'job',                  # Employment status
    )
]

In [None]:
# Target column and the encoded feature columns (every "_encoded" column except the target)
target_column_good_bad = 'target_encoded'
encoded_features_columns = [
    col
    for col in df2_main_encodings.columns
    if col != target_column_good_bad and col.endswith('_encoded')
]

# Feature columns that get one-hot encoded: object, category or int typed ones
categorical_cols = (
    df2_main_encodings[encoded_features_columns]
    .select_dtypes(include=['object', 'category', 'int'])
    .columns
    .tolist()
)

# One indicator column per (feature, value), named "<feature>_<value>"
df2_one_hot = pd.get_dummies(
    df2_main_encodings,
    columns=categorical_cols,
    prefix=categorical_cols,
)

# Separate the target from the features
y = df2_one_hot[target_column_good_bad]
X = df2_one_hot.drop(columns=[target_column_good_bad])

# Stratified 50% / 50% train/test split <--per original instructions
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=42, stratify=y
)

# Keep a copy of the test features that still contains the protected columns
X_test_copy = X_test.copy()

# Remove every indicator column derived from a protected attribute from both splits
protected_prefixes = tuple(protected_attributes)
X_train = X_train.drop(
    columns=[col for col in X_train.columns if col.startswith(protected_prefixes)]
)
X_test = X_test.drop(
    columns=[col for col in X_test.columns if col.startswith(protected_prefixes)]
)

In [None]:
# Summary statistics of the full one-hot feature matrix (X still includes the protected columns)
feature_summary = X.describe()
print(feature_summary)

In [None]:
# Drop indicator features that showed zero correlation with the logistic model.
# That analysis was done later in the notebook; the columns are removed here so the
# model does not have to be re-run afterwards.
# Suffixes are the one-hot values, grouped by the column prefix they belong to.
zero_correlation_suffixes = {
    'duration_encoded': [5, 13, 16, 40, 54, 72],
    'purpose_encoded': [
        2, 54, 56, 82, 85, 91, 94,
        100, 101, 102, 105, 106, 109, 110, 113, 118, 119,
        124, 126, 127, 130, 138, 140, 143, 144, 146, 148, 149,
        157, 159, 184,
    ],
    'other_debtors_encoded': [68],
}

# Expand to full column names, e.g. 'duration_encoded_5'
drop_features_zero_correlation = [
    f"{prefix}_{suffix}"
    for prefix, suffixes in zero_correlation_suffixes.items()
    for suffix in suffixes
]

# Apply the same removal to both splits so their columns stay identical
X_train = X_train.drop(columns=drop_features_zero_correlation)
X_test = X_test.drop(columns=drop_features_zero_correlation)

# Logistic Regression Model

In [None]:
# Logistic regression step: liblinear solver, with errors on class 2 weighted 10x those on class 1
logreg_step = LogisticRegression(
    solver='liblinear',
    random_state=42,
    max_iter=1000,
    class_weight={1: 1.0, 2: 10.0},
)

# Pipeline that standardises the features before they reach the logistic regression
clf = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('logreg', logreg_step),
])

# Random oversampling of the training split (seeded for reproducibility)
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

# Fit on the resampled data rather than on the raw X_train / y_train
clf.fit(X_resampled, y_resampled)

In [None]:
logreg_model = clf.named_steps['logreg']  # the fitted LogisticRegression inside the pipeline

# predict_proba returns one column per class, in the order of `classes_`;
# column 1 therefore belongs to classes_[1], not to class 1.
probs = clf.predict_proba(X_test)  # probabilities on a 0.00-1.00 scale
scored_class = logreg_model.classes_[1]
creditworthiness_score = 100 * probs[:, 1]  # same probability rescaled to 0-100

# Label the printed probability with the class it actually refers to
for prob, score in zip(probs[:5, 1], creditworthiness_score[:5]):
    print(f"Predicted P(y={scored_class}): {prob:.2f}, Score: {score:.0f}")

# Coefficients of the (single) decision function, largest magnitude first
coefs = logreg_model.coef_[0]

for feature, weight in sorted(zip(X_train.columns, coefs), key=lambda x: abs(x[1]), reverse=True):
    print(f"{feature}: {weight:.4f}")

# Detour: Analysis of impact of model, refining, and revamping

In [None]:
# INITIAL THRESHOLD ANALYSIS DONT REVISE FOR REPORT GENERATION
# Precision/recall/F1 across decision thresholds, treating label 2 as the positive class.

# Column 1 of predict_proba: probability of the second entry of the model's classes_
y_probs = clf.predict_proba(X_test)[:, 1]
precisions, recalls, thresholds = precision_recall_curve(y_test, y_probs, pos_label=2)

# Calculate F1 scores for each threshold
f1s = 2 * (precisions * recalls) / (precisions + recalls + 1e-10)  # add small value to avoid divide-by-zero

# Plot Precision-Recall Curve (left panel of a 1x2 figure)
plt.figure(figsize=(10, 5))

plt.subplot(1, 2, 1)
plt.plot(recalls, precisions, label='PR Curve', color='blue')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.grid(True)
plt.legend()

# Plot F1 Score vs Threshold (right panel)
plt.subplot(1, 2, 2)
plt.plot(thresholds, f1s[:-1], color='green')  # thresholds is 1 less than precision/recall length
plt.xlabel('Threshold')
plt.ylabel('F1 Score')
plt.title('F1 Score vs Threshold')
plt.grid(True)

# Show the plots
plt.tight_layout()
plt.show()


# Model Translation into scorecard

In [None]:
def convert_logistic_regression_to_scorecard(
    trained_logistic_regression_model,
    feature_column_names,
    minimum_credit_score=0,
    maximum_credit_score=100
):
    """
    Turn a fitted logistic regression into a list of (name, points) scorecard rows.

    Every coefficient is multiplied by one common factor chosen so that the
    largest-magnitude coefficient is worth half of the score range.

    Parameters
    ----------
    trained_logistic_regression_model : object exposing ``coef_`` (2-D, first row used)
        and ``intercept_`` (first element used).
    feature_column_names : iterable of names, paired positionally with the coefficients.
    minimum_credit_score, maximum_credit_score : bounds of the score range; the
        minimum is also added to the intercept's points.

    Returns
    -------
    list of (str, float) tuples: the "Intercept (bias)" row first, then one row per
    feature in the order given.
    """
    weights = trained_logistic_regression_model.coef_[0]
    bias = trained_logistic_regression_model.intercept_[0]

    # Floor the largest magnitude at 1e-6 so an all-zero model cannot divide by zero
    largest_weight = max(abs(weights).max(), 1e-6)
    points_per_unit = (maximum_credit_score - minimum_credit_score) / (2 * largest_weight)

    # The intercept is scaled like a coefficient and shifted by the minimum score
    intercept_row = ("Intercept (bias)", bias * points_per_unit + minimum_credit_score)

    feature_rows = [
        (name, weight * points_per_unit)
        for name, weight in zip(feature_column_names, weights)
    ]

    return [intercept_row, *feature_rows]

In [None]:
# Fitted LogisticRegression step of the pipeline
logreg_model = clf.named_steps['logreg']

# Keyword names must match the signature of convert_logistic_regression_to_scorecard;
# the previous names (trained_logistic_model, input_feature_names, base_credit_score,
# high_risk_credit_score) are not parameters of that function and raised a TypeError.
scorecard_rules = convert_logistic_regression_to_scorecard(
    trained_logistic_regression_model=logreg_model,
    feature_column_names=X_train.columns,
    minimum_credit_score=0,
    maximum_credit_score=100
)

# One line per scorecard row: the intercept first, then each feature
for rule_description, assigned_score in scorecard_rules:
    print(f"If {rule_description} => Assign Score: {assigned_score:.2f}")

# Model Testing

In [None]:
# Custom decision threshold (the default would be 0.5; a sweep from 0.06 to 0.10 is an alternative)
optimal_threshold = 0.3

# Probability of class 2 ("bad credit") for every test row
y_prob = clf.predict_proba(X_test)[:, 1]

# Label 2 when the bad-credit probability exceeds the threshold, otherwise label 1
y_pred = np.where(y_prob > optimal_threshold, 2, 1)

# The same probabilities on a 0-100 scale
test_scores = 100 * y_prob

# Overall accuracy and macro-averaged F1 (use average='binary' for a single-class F1)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='macro')

# Confusion matrix laid out as [[TN, FP], [FN, TP]]
conf_matrix = confusion_matrix(y_test, y_pred)

# Unpack the four cells row by row
(tn, fp), (fn, tp) = conf_matrix

# Per-class precision / recall / F1 table
report = classification_report(y_test, y_pred)

In [None]:
# Headline metrics, then the raw confusion matrix
headline_lines = [
    f"Accuracy Score: {accuracy:.4f}",
    f"F1 Score: {f1:.4f}",
    "Confusion Matrix Results:",
]
print("\n".join(headline_lines))
print(conf_matrix)

# The two error types, spelled out in terms of good/bad credit
error_lines = (
    f"False Positives (defined as bad credit when they are actually good credit): {fp}",
    f"False Negatives (defined as good credit when they're actually bad credit): {fn}",
)
for error_line in error_lines:
    print(error_line)

print("\nFull Classification Report Details:\n", report)

In [None]:
print("Unique predictions:", np.unique(y_pred))  # Display the unique predicted class labels

# Re-attach the protected attributes (dropped before training) to the test features.
# They are looked up by the test rows' index; join() returns a new frame, so X_test is untouched.
protected_attributes_subset = df2_main_encodings.loc[X_test.index, protected_attributes]
X_test_with_protected_attributes = X_test.join(protected_attributes_subset)

# Predictions as a Series sharing X_test's index, so boolean masks align with y_test
y_pred_series_indexed = pd.Series(y_pred, index=X_test.index)

# Per-attribute, per-group performance breakdown
for protected_attribute in protected_attributes:
    print(f"\n=== Breakdown for {protected_attribute} ===")

    attribute_values = X_test_with_protected_attributes[protected_attribute]

    for group_value in sorted(attribute_values.unique()):
        group_filter_mask = attribute_values == group_value  # rows belonging to this group

        true_labels_for_group = y_test[group_filter_mask]
        predicted_labels_for_group = y_pred_series_indexed[group_filter_mask]

        # Classes that occur at all in this group, whether as truth or as prediction
        unique_class_labels = set(true_labels_for_group) | set(predicted_labels_for_group)

        if len(unique_class_labels) < 2:
            # Only one class present: an F1 for "bad credit" is not meaningful
            f1_bad_credit = float('nan')
        elif len(unique_class_labels) == 2:
            f1_bad_credit = f1_score(
                true_labels_for_group, predicted_labels_for_group, pos_label=2, average='binary'
            )
        else:
            # Multiclass fallback; pos_label is ignored by macro averaging, so it is not passed
            f1_bad_credit = f1_score(true_labels_for_group, predicted_labels_for_group, average='macro')

        accuracy_for_group = accuracy_score(true_labels_for_group, predicted_labels_for_group)

        # labels=[1, 2] forces a 2x2 matrix even when a class is absent from the group,
        # so no padding step is needed before unpacking.
        confusion_matrix_for_group = confusion_matrix(
            true_labels_for_group, predicted_labels_for_group, labels=[1, 2]
        )
        true_negatives, false_positives, false_negatives, true_positives = confusion_matrix_for_group.ravel()

        # Translate the encoded group value back to the original value(s), if a mapping exists.
        # NOTE(review): group values come from df2_main_encodings while the mapping was built
        # from df_merged's "_encoded" columns — confirm both use the same encoding, otherwise
        # the readable label may describe a different group.
        original_label_lookup = encoded_to_original_mapping.get(protected_attribute, {}).get(group_value)
        if original_label_lookup is None:
            readable_group_label = str(group_value)  # fall back to the raw encoded value
        elif isinstance(original_label_lookup, set):
            readable_group_label = ', '.join(sorted(str(val) for val in original_label_lookup))
        else:
            readable_group_label = attribute_mapping_german_credit.get(
                protected_attribute.replace("_encoded", ""), {}
            ).get(original_label_lookup, str(original_label_lookup))

        # Report for this group
        print(f"\nGroup {group_value} ({readable_group_label}):")
        print(f"  Accuracy: {accuracy_for_group:.4f}")
        print(f"  F1 Score (Bad Credit): {f1_bad_credit:.4f}")
        print(f"  Confusion Matrix:\n{confusion_matrix_for_group}")
        print(f"    False Positives: {false_positives}")
        print(f"    False Negatives: {false_negatives}")

In [None]:
# One record per (protected attribute, group) with that group's performance metrics
fairness_metrics_by_group = []

# Uses the frame and Series built in the subgroup-breakdown cell:
# X_test_with_protected_attributes (test features + protected columns) and
# y_pred_series_indexed (predictions indexed like X_test). The previous names
# X_test_with_protected / y_pred_series are not defined anywhere above.
for protected_attribute in protected_attributes:

    attribute_values = X_test_with_protected_attributes[protected_attribute]

    for group_value in attribute_values.unique():

        # Index labels of the test rows that belong to the current group
        group_sample_indices = X_test_with_protected_attributes[attribute_values == group_value].index

        true_labels_for_group = y_test.loc[group_sample_indices]
        predicted_labels_for_group = y_pred_series_indexed.loc[group_sample_indices]

        group_accuracy = accuracy_score(true_labels_for_group, predicted_labels_for_group)

        # F1 with "bad credit" (label 2) as the positive class
        group_f1_bad_credit = f1_score(true_labels_for_group, predicted_labels_for_group, pos_label=2)

        # labels=[1, 2] guarantees a 2x2 matrix, so the four-way unpack is always valid
        true_negatives, false_positives, false_negatives, true_positives = confusion_matrix(
            true_labels_for_group, predicted_labels_for_group, labels=[1, 2]
        ).ravel()

        # Translate the encoded group value back to the original value(s), if a mapping exists
        original_group_label = encoded_to_original_mapping.get(protected_attribute, {}).get(group_value)

        if original_group_label is None:
            readable_group_label = str(group_value)  # fall back to the encoded value
        elif isinstance(original_group_label, set):
            readable_group_label = ', '.join(sorted(str(val) for val in original_group_label))
        else:
            readable_group_label = attribute_mapping_german_credit.get(
                protected_attribute.replace("_encoded", ""), {}
            ).get(original_group_label, str(original_group_label))

        fairness_metrics_by_group.append({
            'protected_attribute': protected_attribute,
            'group_value': group_value,
            'group_label': readable_group_label,
            'num_samples': len(group_sample_indices),
            'accuracy': group_accuracy,
            'f1_score': group_f1_bad_credit,
            'false_positives': false_positives,
            'false_negatives': false_negatives,
            'true_positives': true_positives,
            'true_negatives': true_negatives
        })

# Build the table once from the collected records
fairness_metrics_df = pd.DataFrame(fairness_metrics_by_group)
print(fairness_metrics_df)  # Display the fairness metrics for each group


# ML model

In [None]:
# Share of each label in the training targets
train_label_share = y_train.value_counts(normalize=True)
print(train_label_share)

# Number of predictions per label value; position i of the output counts label i
predicted_label_counts = np.bincount(y_pred)
print(predicted_label_counts)

# Profit Model

In [None]:
def calculate_profit(
    y_true,
    predicted_probs,
    approval_threshold,
    gain_good_approved=10,
    loss_good_denied=5,
    loss_bad_approved=3,
    gain_bad_denied=0,
):
    """
    Total profit of approving every applicant whose score reaches the threshold.

    Parameters
    ----------
    y_true : iterable of true labels; 1 = good credit, 2 = bad credit. Any other
        value contributes nothing.
    predicted_probs : iterable of scores, paired positionally with ``y_true``.
        An applicant is approved when the score is >= ``approval_threshold``, so
        the scores must be on the same scale as the threshold and higher must
        mean "more approvable".
    approval_threshold : cut-off applied to ``predicted_probs``.
    gain_good_approved : profit for approving a good-credit applicant (default 10).
    loss_good_denied : amount lost for denying a good-credit applicant (default 5).
    loss_bad_approved : amount lost for approving a bad-credit applicant (default 3).
    gain_bad_denied : profit for denying a bad-credit applicant (default 0).

    Returns
    -------
    The summed profit; 0 when the inputs are empty.
    """
    profit = 0
    for actual, prob in zip(y_true, predicted_probs):
        approved = prob >= approval_threshold
        if actual == 1:
            # Good credit: approving earns money, denying forfeits it
            profit += gain_good_approved if approved else -loss_good_denied
        elif actual == 2:
            # Bad credit: approving costs money, denying is (by default) neutral
            profit += -loss_bad_approved if approved else gain_bad_denied
    return profit

In [None]:
def train_logistic_and_score(
    X_train, y_train, X_test, y_test,
    base_credit_score=0,
    high_risk_credit_score=100,
    **logistic_kwargs
):
    """
    Fit a LogisticRegression and score the test set on a linear score scale.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_test : features to predict on.
    y_test : accepted for signature symmetry but not used; evaluation is left to the caller.
    base_credit_score : score assigned to a probability of 0.
    high_risk_credit_score : score assigned to a probability of 1.
    **logistic_kwargs : forwarded to LogisticRegression. ``random_state`` defaults
        to 42 and ``max_iter`` to 1000, and either may be overridden here.

    Returns
    -------
    (clf, y_pred, y_prob, test_scores) : the fitted model, its predicted labels, the
    probability of the model's second class (``clf.classes_[1]``; label 2 / bad
    credit when the labels are 1 and 2), and that probability mapped linearly
    onto [base_credit_score, high_risk_credit_score].
    """
    # Defaults first so caller-supplied values win instead of raising
    # "got multiple values for keyword argument".
    model_params = {'random_state': 42, 'max_iter': 1000, **logistic_kwargs}
    clf = LogisticRegression(**model_params)
    clf.fit(X_train, y_train)

    # Predicted labels and second-class probabilities on the test set
    y_pred = clf.predict(X_test)
    y_prob = clf.predict_proba(X_test)[:, 1]

    # Map probabilities directly onto the score range instead of using manual scorecard rules
    score_span = high_risk_credit_score - base_credit_score
    test_scores = base_credit_score + score_span * y_prob

    return clf, y_pred, y_prob, test_scores

In [None]:
# y_prob is the probability of class 2 (bad credit), while calculate_profit approves an
# applicant when the value it receives is >= the threshold. Feed it the probability of
# good credit instead, so that a higher value really means "more approvable".
approval_probs = 1.0 - y_prob

best_profit = float('-inf')
best_threshold = None

# Thresholds are swept as whole percentages (0-100) and converted to the 0-1 probability
# scale before the comparison; comparing raw integers against probabilities would deny
# every applicant from threshold 2 upwards.
for threshold in range(0, 101):
    profit = calculate_profit(y_test, approval_probs, threshold / 100)
    if profit > best_profit:
        best_profit = profit
        best_threshold = threshold  # kept in percent, as printed below

print(f"Best Profit: {best_profit} at Threshold: {best_threshold}")

In [None]:
# Sweep approval thresholds between 0 and 1 to find the most profitable cut-off.
# y_prob is the probability of class 2 (bad credit); calculate_profit approves when its
# input is >= the threshold, so it must be given the probability of good credit.
approval_probs = 1.0 - y_prob

best_profit = float('-inf')
best_threshold = None

for threshold in np.linspace(0, 1, 101):
    profit = calculate_profit(y_test, approval_probs, threshold)
    if profit > best_profit:  # strict '>' keeps the lowest threshold on ties
        best_profit = profit
        best_threshold = threshold

print(f"Best Profit: {best_profit} at Threshold: {best_threshold:.2f}")

# Debug Mode

In [None]:
def calculate_profit_v2(y_true, predicted_probs, approval_threshold):
    """
    Profit of a threshold-based approval rule with symmetric payoffs.

    An applicant is approved when their score is >= ``approval_threshold``.
    Payoffs per applicant (label 1 = good credit, label 2 = bad credit):

    * good, approved: +10
    * good, denied:    -5
    * bad, approved:  -10
    * bad, denied:     +5

    Labels other than 1 or 2 contribute nothing. Returns the summed profit.
    """
    total = 0
    for label, score in zip(y_true, predicted_probs):
        approve = score >= approval_threshold
        if label == 1:
            total += 10 if approve else -5
        elif label == 2:
            total += -10 if approve else 5
    return total

In [None]:
# y_prob is the probability of class 2 (bad credit); calculate_profit_v2 approves when its
# input is >= the threshold, so pass the probability of good credit instead.
approval_probs = 1.0 - y_prob

best_profit = float('-inf')
best_threshold = None

# Finer granularity sweep: thresholds 0.05, 0.06, ..., 0.20
for threshold in [x/100 for x in range(5, 21)]:
    profit = calculate_profit_v2(y_test, approval_probs, threshold)
    if profit > best_profit:
        best_profit = profit
        best_threshold = threshold

print(f"Best Profit: {best_profit} at Threshold: {best_threshold:.2f}")

In [None]:
# y_prob holds the probability of class 2 (bad credit), so the axis is labelled accordingly
plt.hist(y_prob, bins=20)
plt.xlabel("Predicted Probability (Bad Credit)")
plt.ylabel("Number of Samples")
plt.title("Distribution of Predicted Probabilities")
plt.show()

In [None]:

# Report for the pipeline's own predict() output, i.e. without the custom threshold
default_predictions = clf.predict(X_test)
print(classification_report(y_test, default_predictions))

In [None]:
# test_scores is 100 * P(bad credit); calculate_profit_v2 approves when its input is >= the
# threshold, so convert to a 0-100 score where higher means better credit.
approval_scores = 100 - test_scores

# Profit at every 5-point threshold on the 0-100 score scale
for threshold in range(0, 101, 5):
    profit = calculate_profit_v2(y_test, approval_scores, threshold)
    print(f"Threshold: {threshold}, Profit: {profit}")

In [None]:
# y_prob is the probability of class 2 (bad credit); calculate_profit_v2 approves when its
# input is >= the threshold, so evaluate it on the probability of good credit.
approval_probs = 1.0 - y_prob

# Thresholds 0.05, 0.06, ..., 0.20 and the profit obtained at each one
thresholds = [x/100 for x in range(5, 21)]
profits = [calculate_profit_v2(y_test, approval_probs, t) for t in thresholds]

plt.plot(thresholds, profits, marker='o')
plt.title("Profit vs. Threshold (0.05–0.20 Range)")
plt.xlabel("Approval Threshold")
plt.ylabel("Profit")
plt.grid(True)
plt.show()

In [None]:
# Weighted F1 at each threshold in `thresholds`.
# Predictions are built in the same label space as y_test (1 = good, 2 = bad): a row is
# labelled 2 when its bad-credit probability reaches the threshold. A local name is used
# so the module-level y_pred (consumed by the later bias-mitigation cells) is not overwritten.
f1_scores = []
for t in thresholds:
    predicted_labels_at_t = np.where(y_prob >= t, 2, 1)
    f1_scores.append(f1_score(y_test, predicted_labels_at_t, average='weighted'))

plt.plot(thresholds, f1_scores, marker='x', color='orange')
plt.title("F1 Score vs. Threshold")
plt.xlabel("Threshold")
plt.ylabel("F1 Score")
plt.grid(True)
plt.show()

In [None]:
# precision_recall_curve expects a 0/1 target here, so mark class "2" (bad credit) as 1
y_test_bin = (y_test == 2).astype(int)

# Precision and recall at every distinct score threshold
precision, recall, thresholds = precision_recall_curve(y_test_bin, y_prob)

# F1 at each point; the 1e-8 keeps the denominator away from zero
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-8)

# All three curves against the threshold. The curve arrays have one more element
# than `thresholds`, hence the [:-1] slices.
fig, ax = plt.subplots(figsize=(10, 6))
curves = (
    (precision, "Precision", "green", 'o'),
    (recall, "Recall", "blue", 's'),
    (f1_scores, "F1 Score", "orange", 'x'),
)
for values, curve_label, curve_color, curve_marker in curves:
    ax.plot(thresholds, values[:-1], label=curve_label, color=curve_color, marker=curve_marker)

ax.set_title("Precision, Recall, and F1 Score vs. Threshold")
ax.set_xlabel("Threshold")
ax.set_ylabel("Score")
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:

# Distribution of the 0-100 test scores across 50 bins
fig, ax = plt.subplots()
ax.hist(test_scores, bins=50)
ax.set_title("Histogram of Predicted Creditworthiness Scores")
ax.set_xlabel("Score (0-100)")
ax.set_ylabel("Number of test samples")
plt.show()

In [None]:
# Split the predicted probabilities by true label, then stack the two histograms
probs_for_good_credit = y_prob[y_test == 1]
probs_for_bad_credit = y_prob[y_test == 2]

plt.hist(
    [probs_for_good_credit, probs_for_bad_credit],
    bins=20,
    label=["Good Credit", "Bad Credit"],
    stacked=True,
)
plt.legend()
plt.title("Score Distributions by Class")
plt.show()

In [None]:
# Coefficients of the fitted logistic regression, printed from largest to smallest magnitude
coefs = clf.named_steps['logreg'].coef_[0]
ranked_weights = sorted(
    zip(X_train.columns, coefs),
    key=lambda pair: abs(pair[1]),
    reverse=True,
)
for feature, weight in ranked_weights:
    print(f"{feature}: {weight:.4f}")

# Bias Mitigation

In [None]:
# Test-set rows of the merged frame, with true labels, predictions and probabilities attached
results_df = df_merged.loc[X_test_copy.index].copy()
results_df['y_true'] = y_test   # aligned on the index
results_df['y_pred'] = y_pred   # positional: y_pred is in X_test row order
results_df['prob'] = y_prob

print(results_df)

# Map the integer personal-status label to a binary sex flag (1 = male, 0 = female).
# The labels come from label_dict, which assigns 'A91'..'A95' the values 2..6, so the
# keys here must be 2..6 as well. Keys 1..5 shifted every entry by one and assigned
# several groups the wrong sex.
label_sex_dict_cleaned = {
    2: 1,  # A91 - male   : divorced/separated
    3: 0,  # A92 - female : divorced/separated/married
    4: 1,  # A93 - male   : single
    5: 1,  # A94 - male   : married/widowed
    6: 0   # A95 - female : single
}

results_df['sex_binary'] = results_df['personal_status_sex_label'].map(label_sex_dict_cleaned)

# An unmapped label would silently become NaN and break the fairness dataset later on
if results_df['sex_binary'].isna().any():
    raise ValueError("personal_status_sex_label contains values missing from label_sex_dict_cleaned")

# Numeric feature columns of the test set. They live in X_test (results_df only carries
# the merged raw/encoded columns), so take them from there and join the extra columns.
numerical_cols = X_test.select_dtypes(include=[np.number]).columns.tolist()
aif_df = X_test[numerical_cols].join(results_df[['y_true', 'sex_binary']])

# Keep the column list in step with the frame's columns
numerical_cols += ['y_true', 'sex_binary']


In [None]:
# Numeric feature columns of the test set. They exist in X_test, not in results_df, so
# they are selected from X_test and the label / protected columns are joined on the index.
numeric_feature_columns = X_test.select_dtypes(include=[np.number]).columns.tolist()
aif360_input_df = X_test[numeric_feature_columns].join(results_df[['y_true', 'sex_binary']])

# Keep the column list in step with the frame's columns
numeric_feature_columns += ['y_true', 'sex_binary']

# Dataset holding the TRUE labels, with sex as the protected attribute
aif360_true_label_dataset = BinaryLabelDataset(
    df=aif360_input_df,  # DataFrame with features and protected attribute
    label_names=['y_true'],  # Column containing true labels
    protected_attribute_names=['sex_binary'],  # Column containing protected attribute
    favorable_label=1,  # Label for favorable outcome (e.g., good credit)
    unfavorable_label=2  # Label for unfavorable outcome (e.g., bad credit)
)

# Same dataset with the labels replaced by the model's predictions and scores
aif360_predicted_dataset = aif360_true_label_dataset.copy()
aif360_predicted_dataset.labels = results_df['y_pred'].values.reshape(-1, 1)
aif360_predicted_dataset.scores = results_df['prob'].values.reshape(-1, 1)

# Group definitions: sex_binary == 1 (male) is privileged, sex_binary == 0 (female) unprivileged
privileged_group_definition = [{'sex_binary': 1}]
unprivileged_group_definition = [{'sex_binary': 0}]

# Compares the true-label dataset with the predicted one across the two groups
fairness_metric_evaluator = ClassificationMetric(
    aif360_true_label_dataset,  # Dataset with true labels
    aif360_predicted_dataset,   # Dataset with predicted labels and scores
    privileged_groups=privileged_group_definition,  # Privileged group definition
    unprivileged_groups=unprivileged_group_definition  # Unprivileged group definition
)

# Print key fairness metrics computed by AIF360
print("Disparate Impact:", fairness_metric_evaluator.disparate_impact())  # Ratio of favorable outcomes
print("Statistical Parity Difference:", fairness_metric_evaluator.statistical_parity_difference())  # Difference in approval rates
print("Equal Opportunity Difference:", fairness_metric_evaluator.equal_opportunity_difference())  # Difference in true positive rates
print("Average Odds Difference:", fairness_metric_evaluator.average_odds_difference())  # Difference in both TPR and FPR
print("Equalized Odds Difference (TPR - FPR):", 
      fairness_metric_evaluator.equal_opportunity_difference() - fairness_metric_evaluator.false_positive_rate_difference())  # Net difference in TPR and FPR

In [None]:
# Compute the two fairness metrics from the ClassificationMetric built in the previous cell
# (its name is fairness_metric_evaluator; no variable called `metric` is defined above).
disparate_impact = fairness_metric_evaluator.disparate_impact()
stat_parity_diff = fairness_metric_evaluator.statistical_parity_difference()

# Put metrics into a DataFrame for plotting
fairness_df = pd.DataFrame({
    'Metric': ['Disparate Impact', 'Statistical Parity Difference'],
    'Value': [disparate_impact, stat_parity_diff]
})

# Reference bands (low, high) inside which each metric is treated as acceptable
thresholds = {
    'Disparate Impact': (0.8, 1.25),
    'Statistical Parity Difference': (-0.1, 0.1)
}

# Create the plot
plt.figure(figsize=(8, 5))
sns.barplot(data=fairness_df, x='Metric', y='Value', palette='coolwarm')
plt.axhline(0, color='gray', linewidth=1)

# Draw each band as a pair of dashed lines. Only the lower line carries a label, and
# the label names its metric so the two legend entries can be told apart.
band_colors = {'Disparate Impact': 'green'}
for metric_name, (low, high) in thresholds.items():
    band_color = band_colors.get(metric_name, 'purple')
    plt.axhline(low, color=band_color, linestyle='--', label=f'Acceptable Range ({metric_name})')
    plt.axhline(high, color=band_color, linestyle='--')

plt.title('Fairness Metrics: Disparate Impact & Statistical Parity Difference')
plt.ylim(-1, 2)  # Adjust range as needed
plt.legend()
plt.tight_layout()
plt.show()