# <span style="font-family: Arial, sans-serif; color:#97f788">Fisher Scoring</span>
## <span style="font-family: Arial, sans-serif; color:navy">Example with WOE Logistic Regression</span>

<span style="font-family: Arial, sans-serif; color:navy">Repo: <a href="https://github.com/xRiskLab/fisher-scoring" title="GitHub link">https://github.com/xRiskLab/fisher-scoring</a></span>

```python
%%capture
!pip install pandas scikit-learn optbinning fisher-scoring statsmodels
````

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Fetch blended credit data
# Start from the Google Drive share link of the CSV file.
url = (
    "https://drive.google.com/file/d/1Is8UZnPRExI-SLJMle8GRbFGpha5IvYZ/view?usp=sharing"
)
# The file id is the second-to-last "/"-separated segment of the share link
# (".../d/<id>/view?..."); rebuild it into a "uc?id=<id>" URL, presumably the
# direct-download form that pandas can read as raw CSV.
url = "https://drive.google.com/uc?id=" + url.split("/")[-2]
dataset = pd.read_csv(url, index_col=False)

# Columns used as model inputs.
features = [
    "revolving_utilization_of_unsecured_lines",
    "account_never_delinq_percent",
    "net_fraction_revolving_burden",
    "external_risk_estimate",
    "num_total_cc_accounts",
    "average_months_in_file",
]

# Name of the label column.
target = 'is_bad'

X, y = dataset[features], dataset[target]

# Split the row labels (not the rows themselves) 70/30, stratified on the target
# and seeded for reproducibility; later cells select rows via these labels.
ix_train, ix_test = train_test_split(
    X.index, stratify=y, test_size=0.3, random_state=62
)

In [2]:
import time
from typing import Callable, Any

from optbinning import BinningProcess
from fisher_scoring import FisherScoringLogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import roc_auc_score

# Define the timing decorator
def timing(func: Callable) -> Callable:
    """Decorator that reports how long each call to ``func`` takes.

    After every successful call the wrapper prints
    ``"<func name> took <elapsed> seconds"`` (two decimals) and then returns
    whatever ``func`` returned. If ``func`` raises, the exception propagates
    and nothing is printed.

    Args:
        func: The callable to time.

    Returns:
        A wrapper with the same call signature as ``func`` that also keeps its
        ``__name__``, ``__doc__`` and other metadata.
    """
    # Local import so the decorator works regardless of which imports the
    # surrounding notebook cells have already executed.
    from functools import wraps

    @wraps(func)  # without this the decorated function would be named "wrapper"
    def wrapper(*args, **kwargs) -> Any:
        # perf_counter is a monotonic, high-resolution clock meant for measuring
        # durations; time.time() can jump when the system clock is adjusted.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        end_time = time.perf_counter()
        print(f"{func.__name__} took {end_time - start_time:.2f} seconds")
        return result
    return wrapper

# Create the binning process and logistic model pipeline
# Bin every column listed in `features`; no column is declared categorical.
binning_process = BinningProcess(variable_names=features, categorical_variables=[])
# Fisher scoring logistic regression with an intercept term; verbose=True prints
# the per-iteration log (see the cell output). information='expected' presumably
# selects the expected Fisher information matrix - confirm in the library docs.
# NOTE: the name `model` is rebound to a statsmodels Logit object in a later
# cell, so later code reaches this estimator through woe_logistic_model[-1].
model = FisherScoringLogisticRegression(use_bias=True, information='expected', verbose=True)
# Chain the two steps: binning transform first, then the estimator.
woe_logistic_model = make_pipeline(binning_process, model)

@timing
def fit_model(pipeline, X_train, y_train) -> None:
    """Fit ``pipeline`` on the given training data.

    The pipeline object is fitted via its own ``fit`` method and nothing is
    returned; the ``@timing`` decorator prints the elapsed time of the call.

    Args:
        pipeline: Object exposing ``fit(X, y)``.
        X_train: Training features passed to ``pipeline.fit``.
        y_train: Training target passed to ``pipeline.fit``.
    """
    pipeline.fit(X_train, y_train)

# Fit the model and time it
# Training rows are selected by the index labels produced by the train/test split.
fit_model(woe_logistic_model, X.loc[ix_train, :], y[ix_train])

# Extract model weights and calculate Gini coefficient
# woe_logistic_model[-1] is the last pipeline step, i.e. the Fisher scoring
# estimator; its `beta` attribute is transposed here for printing.
model_weights = woe_logistic_model[-1].beta.T
print(f"Coefficients: {model_weights}")

# Column 1 of predict_proba is taken as the score - presumably the probability
# of the positive class (is_bad = 1); confirm against the estimator's docs.
predictions = woe_logistic_model.predict_proba(X.loc[ix_test, :])[:, 1]
# Gini = 2 * AUC - 1, evaluated on the held-out test rows.
gini = 2 * roc_auc_score(y[ix_test], predictions) - 1
print(f"Gini (test): {gini:.2%}")

# Display the summary of the model
woe_logistic_model[-1].display_summary(style='cyan3')

Starting Fisher Scoring Iterations...
Iteration: 1, Log Loss: 0.6931
Iteration: 2, Log Loss: 0.2724
Iteration: 3, Log Loss: 0.2002
Iteration: 4, Log Loss: 0.1740
Iteration: 5, Log Loss: 0.1665
Iteration: 6, Log Loss: 0.1654
Iteration: 7, Log Loss: 0.1653
Iteration: 8, Log Loss: 0.1653
Iteration: 9, Log Loss: 0.1653
Iteration: 10, Log Loss: 0.1653
Convergence reached after 10 iterations.
fit_model took 0.85 seconds
Coefficients: [[-2.25705117 -1.01248996 -1.02589516 -0.39071063 -0.77812967 -1.25793343
  -0.45646565]]
Gini (test): 88.30%


In [3]:
import statsmodels.api as sm

# Fit the model using statsmodels
# woe_logistic_model[:-1] is the pipeline without its final estimator, so
# transform() applies only the fitted binning step; add_constant then adds the
# intercept column (shown as "const" in the summary).
X_train = sm.add_constant(woe_logistic_model[:-1].transform(X.loc[ix_train, :]))
y_train = y[ix_train].values

# NOTE: this rebinds `model` (previously the Fisher scoring estimator) to the
# statsmodels Logit object; the fitted pipeline itself is unaffected.
model = sm.Logit(y_train, X_train)
result = model.fit()
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.165340
         Iterations 9
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                 7000
Model:                          Logit   Df Residuals:                     6993
Method:                           MLE   Df Model:                            6
Date:                Wed, 31 Jul 2024   Pseudo R-squ.:                  0.4914
Time:                        14:33:20   Log-Likelihood:                -1157.4
converged:                       True   LL-Null:                       -2275.6
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                               coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------------------------
const                                       -2.2

In [4]:
import numpy as np
from scipy.stats import chi2

def likelihood_ratio_test(model, y, ix_train, df=None):
    """Likelihood-ratio test of a fitted model against an intercept-only model.

    The statistic is ``2 * (ll_model - ll_null)`` and is compared with a
    chi-square distribution whose degrees of freedom equal the number of
    parameters the fitted model has beyond the null model.

    Assumes ``model.loss_history[-1]`` and ``model.compute_loss`` are total
    log-likelihoods (higher is better); the statistic printed in this notebook
    matches the statsmodels log-likelihoods under that reading.

    Args:
        model: Fitted estimator exposing ``loss_history``, ``compute_loss(y, p)``
            and ``beta`` (the coefficient array).
        y: Target values, indexable by ``ix_train``.
        ix_train: Index of the training observations within ``y``.
        df: Degrees of freedom of the chi-square reference distribution.
            Defaults to the number of entries in ``model.beta`` minus one,
            which assumes ``beta`` includes an intercept - pass ``df``
            explicitly for a model fitted without one.

    Returns:
        Tuple ``(statistic, p_value)``.
    """
    y_train = y[ix_train]

    # Last entry of the fitting history = value reached by the fitted model.
    ll_model = model.loss_history[-1]

    # Null model: every observation is assigned the training event rate.
    null_probabilities = np.ones_like(y_train) * np.mean(y_train)
    ll_null = model.compute_loss(y_train, null_probabilities)

    lr_statistic = 2 * (ll_model - ll_null)

    if df is None:
        # One degree of freedom per coefficient other than the intercept.
        df = max(int(np.size(model.beta)) - 1, 1)

    # sf() evaluates the upper tail directly; 1 - cdf() rounds to exactly 0
    # once the cdf is within machine epsilon of 1.
    p_value = chi2.sf(lr_statistic, df)
    return lr_statistic, p_value


# Run the test on the fitted Fisher scoring estimator (the last pipeline step),
# using the training rows of the target.
ll_ratio, p_value = likelihood_ratio_test(woe_logistic_model[-1], y, ix_train)
print(f"Likelihood ratio: {ll_ratio:.2f}, p-value: {p_value:.4f}")

Likelihood ratio: 2236.40, p-value: 0.0000
