In [None]:
import pandas as pd

# Load the credit report CSV; the first row of the file supplies the column names.
dm_inputdf = pd.read_csv(
    "/workspaces/myfolder/demo/credit_report_woe.csv",
    header=0,
)
# Print the dtype of every column as a quick check on what was loaded.
print(dm_inputdf.dtypes)

In [None]:
from sklearn.utils import shuffle

### model configuration ("macro variables")
dm_dec_target = 'customer_event'          # name of the target column
dm_partitionvar = 'analytic_partition'    # name of the column holding the partition code
create_new_partition = 'yes'              # 'yes' or 'no'
dm_key = 'account_id'                     # presumably the row identifier column - confirm
dm_classtarget_level = ['0', '1']

# codes that mark each partition in the partition column
dm_partition_validate_val = 0
dm_partition_train_val = 1
dm_partition_test_val = 2

# fraction of rows assigned to each partition
dm_partition_validate_perc = 0.3
dm_partition_train_perc = 0.6
dm_partition_test_perc = 0.1

### regressors to keep as model inputs (every other column is rejected)
keep_predictors = [
    'woe_num_credit_accounts_open',
    'woe_age',
    'woe_debt_to_income',
    'woe_credit_utilization_ratio',
    'woe_length_of_last_job_mos',
    'woe_credit_history_mos',
    'woe_credit_score',
    'woe_scheduled_payments_per_month',
]

### create partition column, if not already in dataset
# The rows are shuffled and then labelled by position: the first `validate_rows`
# rows become the validation partition, the rows up to `train_rows` become the
# training partition, and whatever remains becomes the test partition.
if create_new_partition == 'yes':
    # Fixed seed so that Restart & Run All reproduces the same split every time.
    dm_partition_seed = 12345
    dm_inputdf = shuffle(dm_inputdf, random_state=dm_partition_seed)
    dm_inputdf = dm_inputdf.reset_index(drop=True)

    total_rows = len(dm_inputdf)
    # `validate_rows` and `train_rows` are cumulative end positions; both are
    # clamped so that rounding can never point past the end of the data.
    validate_rows = min(round(total_rows * dm_partition_validate_perc), total_rows)
    train_rows = min(round(total_rows * dm_partition_train_perc) + validate_rows, total_rows)
    test_rows = total_rows - train_rows

    # Build exactly one code per row. The three runs do not overlap, so the
    # result no longer depends on the order in which the labels are written
    # (label-based .loc slices include their end row), and the column holds
    # integer codes instead of floats.
    dm_inputdf[dm_partitionvar] = (
        [dm_partition_validate_val] * validate_rows
        + [dm_partition_train_val] * (train_rows - validate_rows)
        + [dm_partition_test_val] * test_rows
    )

### create list of model variables
# Names of the target, partition and key columns, split on whitespace.
macro_vars = ' '.join([dm_dec_target, dm_partitionvar, dm_key]).split()
# Every dataset column that is not an explicitly kept predictor is rejected.
rejected_predictors = [col for col in dm_inputdf.columns.values if col not in keep_predictors]
# macro_vars would only need to be appended here if rejected_predictors were
# listed by hand instead of being derived from keep_predictors.
rejected_vars = rejected_predictors
# Model inputs are the columns that survive rejection, in dataset column order.
dm_input = [col for col in dm_inputdf.columns.values if col in keep_predictors]
print(dm_input)

### create train, test, validate datasets using existing partition column
# Each partition is the subset of rows whose partition code matches; X_* holds
# the model input columns and y_* holds the target column for those rows.

# training partition
dm_traindf = dm_inputdf.loc[dm_inputdf[dm_partitionvar] == dm_partition_train_val]
X_train, y_train = dm_traindf[dm_input], dm_traindf[dm_dec_target]

# test partition
dm_testdf = dm_inputdf[dm_inputdf[dm_partitionvar] == dm_partition_test_val]
X_test, y_test = dm_testdf[dm_input], dm_testdf[dm_dec_target]

# validation partition
dm_validdf = dm_inputdf[dm_inputdf[dm_partitionvar] == dm_partition_validate_val]
X_valid, y_valid = dm_validdf[dm_input], dm_validdf[dm_dec_target]

In [None]:
from sasviya.ml.linear_model import LogisticRegression

### estimate & fit model
# Estimator settings are collected in one place and unpacked into the constructor.
dm_model_params = {
    "tol": 1e-8,
    "fit_intercept": True,
    "solver": "newrap",
    "selection": None,
    "verbose": 0,
    "max_iter": None,
    "max_time": None,
}
dm_model = LogisticRegression(**dm_model_params)
# Fit on the training partition only; test and validation rows are held out.
dm_model.fit(X_train, y_train)

In [None]:
# Show whatever describe() reports for the fitted model
# (presumably a model summary - confirm against the sasviya documentation).
dm_model.describe()

In [None]:
# Evaluate the fitted model on the held-out test partition.
# NOTE(review): the metric is whatever sasviya's score() returns
# (presumably mean accuracy, as in scikit-learn) - confirm.
dm_model.score(X_test, y_test)

In [None]:
# Predicted probabilities for the rows of the test partition
# (presumably one column per target level - confirm).
dm_model.predict_proba(X_test)

In [None]:
# Score the complete dataset (all partitions), not only the training rows.
fullX = dm_inputdf[dm_input]                  # model input columns for every row
fully = dm_inputdf.loc[:, dm_dec_target]      # target column for every row
_PRED_ = dm_model.predict_proba(fullX)        # predicted probabilities for every row

In [None]:
### print logit odds ratios
import numpy as np

predictions = dm_model.predict(fullX)
cols = X_train.columns
predictors = np.array(cols)
# The odds ratio of a predictor is the exponential of its coefficient.
orat = np.exp(dm_model.coef_)
# Build the table from two separate columns so that odds_ratio stays a float
# column. Stacking the names and the numbers into one ndarray forces object
# dtype, which blocks sorting, rounding and numeric formatting.
# Assumes coef_ holds exactly one coefficient per predictor (binary target).
c = pd.DataFrame({
    'predictors': predictors,
    'odds_ratio': np.ravel(orat),
})
print('intercept:')
print(dm_model.intercept_)
print('odds_ratios:')
print(c)

In [None]:
# Coefficient vector on the log-odds scale, kept as a plain float array.
# Taking it back out of a stacked (names, values) array would hand the
# downstream arithmetic an object-dtype vector.
# Assumes coef_ holds exactly one coefficient per predictor (binary target).
logodds = np.asarray(dm_model.coef_, dtype=float).ravel()
# Two-row lookup table: predictor names on row 0, coefficients on row 1.
coeff = np.vstack([predictors, logodds])
logodds

In [None]:
# Per-row dot product of the model inputs with the coefficient vector.
# Both operands are cast to float so the product runs as numeric linear
# algebra and returns a float array even if `logodds` arrives as object dtype.
# Note: this is X·beta only; the model intercept is not added here.
_XBETA_ = np.dot(np.asarray(fullX, dtype=float), np.asarray(logodds, dtype=float))
print(_XBETA_)

In [None]:
# Scorecard scaling: score = offset + factor * ln(odds).
# With these constants a row whose odds equal `target_odds` scores exactly
# `target_score`, and each doubling of the odds adds
# `points_to_double_the_odds` points.
target_score = 600
target_odds = 30
points_to_double_the_odds = 20
factor = points_to_double_the_odds / np.log(2)
offset = target_score - factor * np.log(target_odds)

# The model is fitted with an intercept, so the log-odds are
# intercept + X·beta. `_XBETA_` holds only X·beta; without the intercept every
# score would be shifted by the constant factor * intercept.
dm_intercept = float(np.ravel(dm_model.intercept_)[0])
log_odds = np.asarray(_XBETA_, dtype=float) + dm_intercept
# NOTE(review): _PRED_ is the raw predict_proba output; confirm which column
# (if any) is the probability of good credit before relying on this name.
prob_good_credit = _PRED_
# NOTE(review): log_odds are the odds of the level the model predicts; confirm
# that this is the "good" outcome, otherwise higher scores mean higher risk.
score = offset + (factor * log_odds)

In [None]:
import matplotlib.pyplot as plt
# Import from the public numpy namespace: numpy.lib.histograms is a private
# module path that is no longer importable on NumPy 2.x.
from numpy import histogram

# Weight every observation by 100/N so the bar heights sum to 100 and the
# y-axis really shows the percentage of rows, as its label says.
n_scores = len(score)
percent_weights = [100.0 / n_scores] * n_scores if n_scores else None
plt.hist(score, bins=50, weights=percent_weights, label="Distribution of Score")
plt.title('Distribution of Scores')
plt.xlabel('score')
plt.ylabel('Percent')
plt.show()