# Weighted Logistic Regression

In [1]:
%run -m ipy_startup
%matplotlib inline
from py_utils import math

In [64]:
from scipy.stats import binom

# Simulate grouped binomial data: each row gets a trial count `w` and a
# success count drawn from Binomial(w, p), with p produced by a logistic model.
np.random.seed(1)  # fixed seed so every draw below is reproducible
n = 1000
n_var = 25

# Design matrix of standard-normal features
X = np.random.randn(n, n_var)

# Parameters of the generating model: random slopes and a fixed intercept
b_act = np.random.randn(n_var)
b_int = -1.5
# `math` here is the project's py_utils.math, not the standard-library module
p = math.sigmoid(b_int + X.dot(b_act))

# Number of trials per row (integers drawn from [1, 100))
w = np.random.randint(1, 100, size=n)

# Successes out of `w` trials at probability `p`
y_success = binom.rvs(w, p)
# Two-column layout: successes in column 0, failures in column 1
y_sm = np.column_stack((y_success, w - y_success))
# Observed success proportion per row
y = y_success / w

X.shape, y.shape

((1000, 25), (1000,))

In [65]:
# Peek at the first five (successes, failures) rows alongside their trial counts;
# each row of y_sm should sum to the matching entry of w.
y_sm[:5], w[:5]

(array([[ 2, 56],
        [ 1, 90],
        [ 0, 68],
        [34,  6],
        [ 0, 76]]), array([58, 91, 68, 40, 76]))

In [66]:
# Summary statistics of the per-row trial counts `w`
pd.Series(w).describe()

count    1000.000000
mean       47.685000
std        28.382977
min         1.000000
25%        23.000000
50%        46.000000
75%        73.000000
max        99.000000
dtype: float64

In [67]:
# Summary statistics of the observed success proportions `y` (= y_success / w)
pd.Series(y).describe()

count    1000.000000
mean        0.366408
std         0.395953
min         0.000000
25%         0.000000
50%         0.166687
75%         0.801087
max         1.000000
dtype: float64

In [68]:
from ml.scipy import optimize, models
# Feature labels 'X0', 'X1', ... - one per column of the design matrix
param_names = list(map('X{}'.format, range(n_var)))
# NOTE(review): `constraints` is not referenced by any cell visible here - confirm it is still needed
constraints = optimize.ScipyConstraints()

# Assemble the model through the builder: register the feature names as
# linear params, add an intercept, then build.
builder = models.ScipyLogisticRegressionModelBuilder()
model = builder.add_linear_params(param_names).add_intercept().build()

In [76]:
%%time

import statsmodels.api as sm
from statsmodels.tools import add_constant

# Raw weights are the per-row values in `w`
fit_w = w

# Weight scaling doesn't seem to matter except when it produces very small / large values
# - dividing by the mean seems to be a good practical method
fit_w_scaled = fit_w / fit_w.mean()

# Every ScipyRegressor below is configured identically; only the weights given to fit() differ
regressor_opts = dict(analytical_gradients=True, monitor_gradient=True, raise_on_failure=False)

# Raw weights
est_wt = optimize.ScipyRegressor(model, **regressor_opts)
est_wt.fit(X, y, sample_weight=fit_w)

# Mean-normalised weights
est_wtscaled = optimize.ScipyRegressor(model, **regressor_opts)
est_wtscaled.fit(X, y, sample_weight=fit_w_scaled)

# Explicit unit weights
est_wt1 = optimize.ScipyRegressor(model, **regressor_opts)
est_wt1.fit(X, y, sample_weight=np.ones(y.shape[0]))

# No sample_weight argument at all
est_nowt = optimize.ScipyRegressor(model, **regressor_opts)
est_nowt.fit(X, y)

# statsmodels reference fits on the same features with a constant column prepended:
# - est_sm   : proportions `y` as the response
# - est_smwt : (successes, failures) counts `y_sm` as the response
X_const = add_constant(X, prepend=True)
est_sm = sm.GLM(y, X_const, family=sm.families.Binomial()).fit()
est_smwt = sm.GLM(y_sm, X_const, family=sm.families.Binomial()).fit()

CPU times: user 1.05 s, sys: 61.5 ms, total: 1.12 s
Wall time: 296 ms


In [77]:
# Print the fit summary reported by the raw-weight estimator (est_wt)
print(est_wt.get_fit_summary())

Optimization converged successfully:

    Success: True
    Status Code: 0
    Message: Optimization terminated successfully.
    Number of iterations: 14
    Number of function evaluations: 26
    Objective Function Value: 12.356062052577487
    


In [79]:
from collections import OrderedDict

# Gather coefficient and intercept estimates from every fit, one column per estimator.
# Insertion order: est_sm, est_smwt, est_wt, est_wtscaled, est_ones, est_nowt.
coef_columns, int_columns = {}, {}

# statsmodels fits: the constant was prepended, so params[0] is the intercept
# and params[1:] are the feature coefficients
for label, glm_fit in [('est_sm', est_sm), ('est_smwt', est_smwt)]:
    coef_columns[label] = glm_fit.params[1:]
    int_columns[label] = glm_fit.params[0]

# ScipyRegressor fits: read the 'linear' and 'intercept' entries of inference()
for label, regressor in [
    ('est_wt', est_wt),
    ('est_wtscaled', est_wtscaled),
    ('est_ones', est_wt1),
    ('est_nowt', est_nowt),
]:
    fit_inference = regressor.inference()
    coef_columns[label] = fit_inference['linear'].values
    int_columns[label] = fit_inference['intercept'].values

# Row labels are taken from the fit made without sample weights
nowt_inference = est_nowt.inference()
df_coef = pd.DataFrame(coef_columns, index=nowt_inference['linear'].index)
df_int = pd.DataFrame(int_columns, index=nowt_inference['intercept'].index)

# Coefficient rows first, intercept row(s) last; display with columns in
# alphabetical order and a bar drawn across each row
df = pd.concat([df_coef, df_int])
df.sort_index(axis=1).style.bar(axis=1)

Unnamed: 0,est_nowt,est_ones,est_sm,est_smwt,est_wt,est_wtscaled
X0,0.753731,0.753731,0.755407,0.714508,0.714655,0.712997
X1,-0.790022,-0.790022,-0.790008,-0.792727,-0.792784,-0.792845
X2,0.654608,0.654608,0.656697,0.638195,0.63831,0.638301
X3,0.410251,0.410251,0.41038,0.390479,0.390447,0.389749
X4,0.820153,0.820153,0.821583,0.811171,0.811201,0.812198
X5,0.544219,0.544219,0.543571,0.552852,0.552924,0.552258
X6,-0.220916,-0.220916,-0.219316,-0.252062,-0.251986,-0.253943
X7,-0.881521,-0.881521,-0.881324,-0.846651,-0.846731,-0.845685
X8,-0.642165,-0.642165,-0.639359,-0.594353,-0.594388,-0.594678
X9,1.04616,1.04616,1.0455,1.00422,1.00427,1.00492
