# Weighted Linear Regression

In [1]:
%run -m ipy_startup
%matplotlib inline
from scipy.stats import norm

In [80]:
# Build a synthetic regression data set with heteroscedastic Gaussian noise.
# The global NumPy seed is fixed and the draws below happen in a fixed order,
# so re-running this cell reproduces the same arrays.
np.random.seed(1)
n = 100000
n_var = 10
positive_only = False

# Per-sample noise variance: 100 * U[0, 1) shifted up by 0.05.
y_variance = 100 * np.random.rand(n) + .05

# Design matrix of standard-normal features, one row per sample.
X = np.random.randn(n, n_var)

# "True" linear coefficients and intercept used to generate the response.
b_act = np.random.randn(n_var)
b_int = -.3

# Noise-free mean of the response.
ym = X.dot(b_act) + b_int

# Draw each response around its mean with its own standard deviation.
y = norm.rvs(loc=ym, scale=np.sqrt(y_variance))
if positive_only:
    # Optional transform that maps the response onto the positive reals.
    y = np.exp(y)

X.shape, y.shape

((100000, 10), (100000,))

In [81]:
# Summary statistics of the per-sample noise variances.
pd.Series(data=y_variance).describe()

count    100000.000000
mean         49.971911
std          28.936320
min           0.051037
25%          24.758765
50%          50.034988
75%          75.027099
max         100.049028
dtype: float64

In [82]:
# Summary statistics of the generated response values.
pd.Series(data=y).describe()

count    100000.000000
mean         -0.363414
std           7.746547
min         -42.206046
25%          -5.119224
50%          -0.359733
75%           4.366119
max          37.984447
dtype: float64

In [83]:
from ml.scipy import optimize, models
from ml.sklearn.preprocessing import MeanScaler

# One parameter name per feature: 'X0', 'X1', ..., up to n_var names.
param_names = ['X{}'.format(col) for col in range(n_var)]

# Assemble the model step by step: linear terms, an intercept, then constraints.
# NOTE(review): `constraints` is not assigned in any cell visible in this
# notebook, so on a fresh kernel this cell would raise NameError. Confirm
# where it is supposed to come from (or whether the call is still needed).
model_builder = models.ScipyLinearRegressionModelBuilder()
model_builder = model_builder.add_linear_params(param_names)
model_builder = model_builder.add_intercept()
model_builder = model_builder.add_constraints(constraints)
model = model_builder.build()


In [104]:
%%time
from sklearn.linear_model import LinearRegression

fit_w = 1/y_variance

# Weight scaling doesn't seem to matter except when it produces very small / large values
# - dividing by the mean seems to be a good practical method
fit_w_scaled = (fit_w / fit_w.mean())

est_wt = optimize.ScipyRegressor(model, analytical_gradients=True, monitor_gradient=True, raise_on_failure=False)
est_wt.fit(X, y, sample_weight=fit_w)

est_wtscaled = optimize.ScipyRegressor(model, analytical_gradients=True, monitor_gradient=True, raise_on_failure=False)
est_wtscaled.fit(X, y, sample_weight=fit_w_scaled)

est_wt1 = optimize.ScipyRegressor(model, analytical_gradients=True, monitor_gradient=True, raise_on_failure=False)
est_wt1.fit(X, y, sample_weight=np.ones(len(y)))

est_nowt = optimize.ScipyRegressor(model, analytical_gradients=True, monitor_gradient=True, raise_on_failure=False)
est_nowt.fit(X, y)

est_skl = LinearRegression().fit(X, y)
est_wtskl = LinearRegression().fit(X, y, 1/y_variance)

CPU times: user 1.67 s, sys: 169 ms, total: 1.84 s
Wall time: 498 ms


In [106]:
from sklearn.linear_model import LinearRegression
from collections import OrderedDict

# Scipy-based fits, keyed by the column label each gets in the comparison table.
# The order here determines the column insertion order after the two sklearn columns.
_scipy_fits = [
    ('est_wt', est_wt),
    ('est_wtscaled', est_wtscaled),
    ('est_ones', est_wt1),
    ('est_nowt', est_nowt),
]


def _comparison_frame(part, skl_attr):
    """Collect one group of estimates from every fit into a DataFrame.

    `part` selects the entry of each scipy estimator's ``inference()`` result
    ('linear' or 'intercept'); `skl_attr` names the matching attribute on the
    scikit-learn estimators ('coef_' or 'intercept_'). Rows are indexed by the
    index of the unweighted scipy fit's ``inference()[part]``.
    """
    columns = {
        'est_sklearn': getattr(est_skl, skl_attr),
        'est_wtsklearn': getattr(est_wtskl, skl_attr),
    }
    for label, estimator in _scipy_fits:
        columns[label] = estimator.inference()[part].values
    return pd.DataFrame(columns, index=est_nowt.inference()[part].index)


df_coef = _comparison_frame('linear', 'coef_')
df_int = _comparison_frame('intercept', 'intercept_')

# Stack coefficient rows over the intercept rows and show the columns in
# alphabetical order with per-row bars for visual comparison.
df = pd.concat([df_coef, df_int])
df[df.columns.sort_values()].style.bar(axis=1)

Unnamed: 0,est_nowt,est_ones,est_sklearn,est_wt,est_wtscaled,est_wtsklearn
X0,-2.18951,-2.18951,-2.18953,-2.19341,-2.19446,-2.19446
X1,-0.654724,-0.654724,-0.654661,-0.656224,-0.652624,-0.65262
X2,-0.536536,-0.536536,-0.536532,-0.552601,-0.551261,-0.551261
X3,-1.28862,-1.28862,-1.28859,-1.32018,-1.31935,-1.31934
X4,0.519195,0.519195,0.519189,0.536287,0.536452,0.53645
X5,1.0469,1.0469,1.04705,1.05104,1.05332,1.05332
X6,-0.407603,-0.407603,-0.407522,-0.424926,-0.425453,-0.425466
X7,0.389374,0.389374,0.389269,0.390334,0.389451,0.389472
X8,0.26432,0.26432,0.26431,0.285174,0.286693,0.286681
X9,1.02629,1.02629,1.02622,1.0037,1.00239,1.00238
