## Interpreting logistic regression coefficients!

---

In [2]:
import pandas as pd
import numpy as np
import sklearn.datasets as datasets
from sklearn.linear_model import LogisticRegression
import patsy

In [20]:
# Load scikit-learn's bundled breast cancer dataset; the returned container's
# .data, .feature_names and .target attributes are used in the next cell.
bc = datasets.load_breast_cancer()

In [21]:
# Feature matrix as a DataFrame so predictors can be selected by column name.
X = pd.DataFrame(bc.data, columns=bc.feature_names)
# Class labels. Per the scikit-learn dataset description, 0 = malignant (cancer)
# and 1 = benign (no cancer), so P(y = 1) below is the probability of NOT having cancer.
Y = bc.target

In [23]:
# Show the first two rows to check the feature columns.
X.head(2)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902


In [25]:
# First ten class labels.
Y[0:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

### 1. What are logistic regression coefficients, really?

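Logistic regression coefficients are expressed in log odds: each one is the change in the log odds of y = 1 for a one-unit increase in its predictor. That is not very intuitive on its own.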

$$\log\left(\frac{P(y = 1)}{1 - P(y = 1)}\right) = b_0 + b_1 x_1 + \dots + b_n x_n$$

Like any regression this is a linear combination of our predictors times their coefficients.

Just to refresh you on ordinary least squares regression:

$$E[y] = b_0 + b_1 x_1 + \dots + b_n x_n$$

In OLS the coefficients are very interpretable. For example, if b1 = 3.5, then every one-unit increase in x1 corresponds to a 3.5-unit increase in the expected value of y, holding the other predictors fixed.

Understanding logreg coefs is more of a process...

---

### 2. log odds --> odds, odds --> probability

In [47]:
def logodds_to_odds(lo):
    """Convert log odds to odds.

    np.exp computes e ** lo, the inverse of the natural log, so it undoes
    the "log" in "log odds". Works on a scalar or a NumPy array.
    """
    odds = np.exp(lo)
    return odds

def logodds_to_prob(lo):
    """Convert log odds to a probability with the logistic (sigmoid) function.

    Computes 1 / (1 + e ** -lo), which is algebraically the same as
    e ** lo / (1 + e ** lo) but does not turn into inf / inf = nan when lo
    is large and positive.

    lo may be a scalar or array-like; a scalar gives a scalar and an array
    gives an array of the same shape.
    """
    lo = np.asarray(lo, dtype=float)
    # For very negative lo, exp(-lo) overflows to inf and the result is 0.0,
    # which is the correct limit, so the overflow warning is just noise.
    with np.errstate(over='ignore'):
        return 1.0 / (1.0 + np.exp(-lo))

# Odds note (these are plain odds; an "odds ratio" compares two odds):
# odds of 1:1 --> P = 1 / (1 + 1) = 0.5
# odds are most commonly seen in horse racing
# odds of 4:1 --> P = 4 / (4 + 1) = 0.8
# odds of 1:5 --> P = 1 / (1 + 5) = 0.1667
#

### 3. Intercept in the logistic regression


In [31]:
# Keep two predictors and z-score them (subtract the column mean, divide by the
# column standard deviation) so each coefficient is "per standard deviation".
Xsub = X.loc[:, ['mean symmetry', 'worst radius']]
Xsub = Xsub.sub(Xsub.mean()).div(Xsub.std())
# Fit with default settings; fit() stays the last expression so the cell
# still echoes the fitted estimator.
lr = LogisticRegression()
lr.fit(Xsub, Y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [32]:
# Fitted coefficients (one per standardized predictor) and the intercept, in log odds.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(lr.coef_)
print(lr.intercept_)

[[-1.08904585 -4.69834081]]
[ 0.45678836]


In [41]:
# Predicted class probabilities [P(y=0), P(y=1)] for the observation at row 100.
# iloc[[100], :] keeps the row as a 1 x 2 DataFrame: predict_proba expects a 2-D
# (n_samples, n_features) input, and scikit-learn deprecates/rejects a 1-D Series.
y100_pp = lr.predict_proba(Xsub.iloc[[100], :])[0]
print(np.round(y100_pp, 3))

[ 0.363314  0.636686]




In [45]:
# Reproduce the prediction by hand: first pull out row 100's standardized predictor values...
X100 = Xsub.iloc[100, :].values
print(X100)

# ...then the fitted parameters (intercept is a length-1 array, coefs one value per predictor).
intercept = lr.intercept_
coefs = lr.coef_[0]

[-0.73909882  0.14913597]


In [46]:
# Log odds of y = 1 for row 100: b0 + b1*x1 + b2*x2.
X100_lo = lr.intercept_ + X100[0]*coefs[0] + X100[1]*coefs[1]
print(X100_lo)

[ 0.56100922]


In [48]:
# Convert row 100's manually computed log odds into P(y = 1);
# compare with the second entry returned by predict_proba.
print(logodds_to_prob(X100_lo[0]))

0.636686021862


In [50]:
# Observed proportion of class 1 in the sample. Class 1 means NOT having cancer
# (class 0 is cancer), so this is the base rate of not having cancer;
# 1 - np.mean(Y) is the proportion with cancer.
print(np.mean(Y))

0.627416520211


### 4. Interpret individual coefficients - their impact on the probability!

In [49]:
# The intercept alone is the log odds when every predictor is 0. The predictors were
# standardized, so 0 means "at the sample mean": this is P(y = 1) for an average observation.
intercept_p = logodds_to_prob(intercept)
print(intercept_p)

[ 0.61225201]


In [51]:
# what if mean symmetry was 1 standard deviation higher than the mean,
# while worst radius for the person stays at the mean?
# coefs[0] belongs to mean symmetry, coefs[1] to worst radius
print(coefs)

[-1.08904585 -4.69834081]


In [52]:
# P(y = 1) with mean symmetry at +1 std (standardized value 1)
# and worst radius at its mean (standardized value 0).
prob_1std_ms = logodds_to_prob(intercept + coefs[0]*1 + coefs[1]*0)
print(prob_1std_ms)

[ 0.34699883]


In [54]:
# log odds of different values turned into probabilities
# ('%s' formatting prints the same text on Python 2 and Python 3)
print('probability of logodds 1: %s' % logodds_to_prob(1))

# log odds of 0 means even odds, i.e. P = 0.5.
# A coefficient of 0 contributes nothing to the log odds whatever the predictor's
# value is (it is multiplied by 0), so that predictor has no effect on the probability.
print('probability of logodds 0: %s' % logodds_to_prob(0))

probability of logodds 1: 0.73105857863
probability of logodds 1: 0.5


In [93]:
# ok, what is the effect of a predictor on the probability, based on its coefficient?
# worst radius has a big, negative coefficient
# Here a 1 is NOT having cancer, a 0 IS having cancer
#
# The predictors were standardized before fitting, so a raw increase has to be divided
# by the column's standard deviation before it is multiplied by the coefficient.
# The increase of 1 below is therefore ONE RAW UNIT of worst radius (about 0.21 std),
# not one standard deviation, despite the "1std" in the variable name.
my_worst_radius_increase = 1
print(X['worst radius'].std())
worst_radius_unit_increase = my_worst_radius_increase / X['worst radius'].std()
print(worst_radius_unit_increase)
# NOTE(review): this is 1 - P(y = 1) for a log odds of coef * increase with the intercept
# left out, i.e. a probability of cancer under a zero intercept, not a change in
# probability. A true change would compare 1 - logodds_to_prob(intercept + coef * increase)
# against 1 - logodds_to_prob(intercept) -- confirm the intent before relying on it.
change_prob_having_cancer_1std_worst_radius =  (1 - (logodds_to_prob(coefs[1]*worst_radius_unit_increase)))
print(change_prob_having_cancer_1std_worst_radius)

# NOTE(review): here the 1 is used directly as a standardized value (1 std); it is not
# divided by mean_symmetry_std, unlike worst radius above. The intercept caveat applies too.
my_mean_symmetry_increase = 1
mean_symmetry_std = X['mean symmetry'].std()
#print(mean_symmetry_std)
change_prob_mean_symmetry = (1 - (logodds_to_prob(coefs[0]*my_mean_symmetry_increase)))
#print(change_prob_mean_symmetry)


4.83324158047
0.206900479389
0.725535677129


In [90]:
# Summary statistics of the raw (unstandardized) mean symmetry column.
X[['mean symmetry']].describe()

Unnamed: 0,mean symmetry
count,569.0
mean,0.181162
std,0.027414
min,0.106
25%,0.1619
50%,0.1792
75%,0.1957
max,0.304
