In [97]:
# Setpu
import pandas as pd
import sklearn
import re
import numpy as np

diabetes_df = pd.read_csv("../data/diabetes_df.csv")


In [98]:
readmitted_outcome = [0 if x == 'NO' else 1 for x in diabetes_df['readmitted']]
first_number_regex = re.compile(r'^\[(\d*)') # Matches a bracket followed by numbers (0 or more, greedy)

def extract_minimum_age(age):
    matching_number = first_number_regex.match(age).group(1) # first matching group only extract the number
    return int(matching_number)

In [99]:
diabetes_df['age'].apply(extract_minimum_age)
model_subset = diabetes_df.loc[:, ['encounter_id', 'race', 'gender', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses', 'time_in_hospital']].copy()
id_subset = diabetes_df.loc[:, ['encounter_id', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id']].copy()
ccs_subset = diabetes_df.loc[:, ['encounter_id', 'CCS Category Description 1', 'CCS Category Description 2', 'CCS Category Description 3']].copy()
model_subset = pd.get_dummies(model_subset, prefix = "ind_", dummy_na = True, drop_first = True)
id_subset = pd.get_dummies(id_subset, columns = ['admission_type_id', 'discharge_disposition_id', 'admission_source_id'],prefix = {x:x for x in id_subset.columns if x is not 'encounter_id'}, dummy_na = True, drop_first = True)
ccs_subset = pd.get_dummies(ccs_subset, prefix = "ind_", dummy_na = True, drop_first = True)
ccs_subset = ccs_subset.groupby(ccs_subset.columns, axis = 1).sum()
model_dataset = (model_subset.merge(id_subset, how = "left", on = "encounter_id")
                             .merge(ccs_subset, how = "left", on = "encounter_id")
                )

In [103]:
model_dataset.head()

Unnamed: 0,encounter_id,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,time_in_hospital,ind__Asian,...,ind__Ulcerat col,ind__Unclassified,ind__Urin stone,ind__Uterus cancr,ind__Varicose vn,ind__Viral infect,ind__Wht blood dx,ind__chf;nonhp,ind__gu cong anom,ind__nan_y
0,2278392,41,0,1,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,2
1,149190,59,0,18,0,0,0,9,3,0,...,0,0,0,0,0,0,0,0,0,0
2,64410,11,5,13,2,0,1,6,2,0,...,0,0,0,0,0,0,0,0,0,2
3,500364,44,1,16,0,0,0,7,2,0,...,0,0,0,0,0,0,0,0,0,0
4,16680,51,0,8,0,0,0,5,1,0,...,0,0,0,0,0,0,0,0,0,1


In [102]:
model_dataset.shape

(101766, 266)

So far, all of the methods that we have described assume that we have some data, $\textbf{X}$. Our goal has been to learn a mapping from the data, $\textbf{X}$, to some labels $\textbf{y}$. However, what if some elements of $\textbf{X}$ are not useful in this mapping?

> ## Variable Selection
Given some predictors $x_1, x_2, \dots$, how do we select the subset that are most useful in our model?

## Regression Selection Methods

For this example, let's say we have $p$ predictors. That is, our predictors are $x_j, j = 1 \dots p$. How do we determine which indices $j$ should belong in a final model?

### Best Subset Selection

The naive way to do this is to just fit every single possible model and then select the best one according to some criterion.


For example, we would fit every model:

$$ y = \beta_0 $$
$$ y = \beta_0 + \beta_1x_1 $$
$$ y = \beta_0 + \beta_1x_2 $$ 
$$ y = \beta_0 + \beta_1x_1 + \beta_2x_2 $$


This will result in $2^p$ models, which is ... a lot

Practically, this subset selection is done in stages, that is:

 1. for $ j = 1 \dots p $:
     - Fit all ${p \choose j}$ possible models that have $j$ predictors
     - Pick the best model among those according to the smallest RSS or largest $R^2$
 2. for $ j = 1 \dots p $ and the null model, $y = \beta_0$, select the model among the "best" models according to some criterion ($C_p$, BIC, adjusted $R^2$)

The problem with this method is obvious -- with $p$ = 20, there are over 1 million possible models to consider. Consider the problem of multiple comparisons.

How do we narrow down the number of models to consider?

## Stepwise Selection

ISL - http://www-bcf.usc.edu/~gareth/ISL/

### Forward Stepwise Selection

Instead of choosing from all $2^p$ models, we start with a model with no predictors, and then add predictors one at a time to see if it helps the model until either all the predictors are added, or there is no additional benefit to adding a predictor.

1. Start with the simple model: $y = \beta_0$

2. for $ j = 0 \dots, p-1 $:
    - Fit all $p - j$ models that add 1 predictor to the current model
    - Select the best among these
        + This can be done by looking at minimizing RSS, or looking at $R^2$
3. Select the best model according to $C_p$, BIC, or adjusted $R^2$

#### OR

1. Start with the simple model: $y = \beta_0$

2. for $ j = 0 \dots, p-1 $:
    - Fit all $p - j$ models that add 1 predictor to the current model
    - Select the best among these
        + Run a F-test to check for model significance. Pick the model with the highest $p-value$

3. Continue until the F-test is no longer significant at the desired $\alpha$

In [12]:
# From https://planspace.org/20150423-forward_selection_with_statsmodels/
import statsmodels.formula.api as smf

def forward_selected(data, response):
    """Linear model designed by forward selection.

    Parameters:
    -----------
    data : pandas DataFrame with all possible predictors and response

    response: string, name of response column in data

    Returns:
    --------
    model: an "optimal" fitted statsmodels linear model
           with an intercept
           selected by forward selection
           evaluated by adjusted R-squared
    """
    remaining = set(data.columns)
    remaining.remove(response)
    selected = []
    current_score, best_new_score = 0.0, 0.0
    while remaining and current_score == best_new_score:
        scores_with_candidates = []
        for candidate in remaining:
            formula = "{} ~ {} + 1".format(response,
                                           ' + '.join(selected + [candidate]))
            score = smf.ols(formula, data).fit().rsquared_adj
            scores_with_candidates.append((score, candidate))
        scores_with_candidates.sort()
        best_new_score, best_candidate = scores_with_candidates.pop()
        if current_score < best_new_score:
            remaining.remove(best_candidate)
            selected.append(best_candidate)
            current_score = best_new_score
    formula = "{} ~ {} + 1".format(response,
                                   ' + '.join(selected))
    model = smf.ols(formula, data).fit()
    return model

These methods are greedy -- they consider only a small subset of the possible models. 

### Backward Stepwise Selection

Exactly the opposite procedure of Forward Stepwise Selection

1. Start with the full model: $ y = \beta_0 + \beta_1x_1 + \beta_2x_2 \dots \beta_px_p $
2. for $j = p, p-1 \dots 1$:
    - Consider all $j$ models that contain every predictor except 1
    - Choose the best model out of these
3. Select the best model according to $C_p$, BIC, or adjusted $R^2$

These two approaches may arrive at different models! Backwards selection tends to select too many variables, while forwards selection selects too few

### Forward-Backward selection (Stepwise)

This procedure, which was inspired by the forward stepwise selection and backward stepwise selection procedure, works as follows:

Define $\alpha_1$ to be the significance cutoff to add a variable and $\alpha_2$ to be the significance level for removing a variable.

1. Start with the null model: $y = \beta_0$
2. Perform 1 step of the Forward Stepwise Selection Procedure, looking at the the F-test p-value to determine which variable to add.
3. After each Forward Stepwise Selection Procedure step, check the significance of each of the other predictors in the model, and remove them if they fall below $\alpha_2$.
4. Continue until all variables are in the model, or Forward Stepwise Selection Procedure falls above $\alpha_1$

## Choosing a best model

Remember that $R^2$, which is a function of RSS, always increases when you add predictors. Therefore, it is not a good measure for which model is best. 

RSS = Residual Sum of Squares = $\sum_{i=1}^{m} (y_i - f(\boldsymbol{x_i}))^2$

### $C_p$

If a model has $d$ predictors, $C_p$ estimates the *test* MSE:

$$ C_p = \frac{1}{n}(RSS + 2d\hat{\sigma}^2) $$

where $\sigma^2$ is an estimate of the variance of the error $\epsilon_i$. (For linear regression, the full form is $y = \beta_0 + \beta_1x_1 \dots \beta_dx_d + \epsilon_i, \epsilon_i \sim N(0, \sigma^2)$ )

Notice how this goes up as the the number of predictors increases

### Bayesian Information Criterion (BIC)

The BIC looks like $C_p$, but is derived from the bayesian point of view.

It is given by:

$$ BIC = \frac{1}{n}(RSS + log(n)d\hat{\sigma}^2) $$

It replaces the factor of 2 in the $C_p$ with log(n). However, for any n > 7, this will end up being >2. Therefore, the BIC is often said to result in smaller models than $C_p$

### Adjusted $R^2$

Recall that $R^2 = RSS/TSS$

OR

$$R^2 = \frac{var(\boldsymbol{\beta}^T\textbf{x})}{var(y)} = \frac{\Sigma_{i=1}^n(\hat{y}_i - \bar{y})^2}{\Sigma_{i=1}^n(y_i - \bar{y})} $$

The Adjusted $R^2$ is a method to use $R^2$ to select models. As we have seen, $R^2$ tends to increase when you add variables to a model. Therfore, adjusted $R^2$ penalizes the number of parameters:

$$Adjusted R^2 = 1 - \frac{RSS/(n-d-1)}{TSS/(n-1)} $$


## Bias-Variance Tradeoff

In linear regression, we try to minimize the mean square error, which in general is given by

$$ MSE = \frac{1}{n}\sum_{i=1}^{n}(y_i - \hat{f}(x_i))^2 $$ where $\hat{f}(x_i)$ is your model prediction.

Let's say the true model for Y is given by $Y = f(x)$

We can define a bias as being $E[\hat{f}(x)] - f(x)$

Then, we can decompose the expression as follows:

$$ E[(\hat{f}(x) - f(x))^2] = MSE$$

define $\mu = E[\hat{f}(x)]$

$$ E[(\hat{f}(x) - f(x))^2] = E[(\hat{f}(x) - \mu) + (\mu - f(x))^2] $$

$$ = E[(\hat{f}(x) - \mu)^2 + 2(\hat{f}(x) - \mu)(\mu - f(x)) + (\mu-f(x))^2] $$

Note that $E[\hat{f}(x) - \mu] = E[\hat{f}(x) - E[\hat{f}(x)]] = 0 $

$$ E[(\hat{f}(x) - f(x))^2] = E[(\hat{f}(x) - E[\hat{f}(x)])^2] + (E[\hat{f}(x)] - f(x))^2 $$
$$ = var(\hat{f}(x)) + bias(\hat{f}(x))^2 $$

*note:* Here, we've ommitted another term which is the *irreducible* error from $Var(\epsilon)$

What is $var(\hat{f}(x))$? What is the bias?

The *variance* refers to how much $\hat{f}(x)$ changes if it were estimated from a different training set. Ideally, this variance would be small, since small changes in the dataset should not produce a large change in the model estimate. 

In general, more flexible methods have higher variance. 


The *bias* arises because we are often trying to model a complex phenomenon, but the model is too simplistic, or does not have enough information to perfectly model the phenomenon. Does a linear regression actually capture all of the intracacies and nonlinear patterns in the data?

Generally, more flexible methods tend to have lower bias.

In general, as the methods become more flexible (more parameters, etc.), the variance will increase and the bias will decrease. This is where the trade-off comes from. The goal of model selection should be to find the right mix of bias and variance.

You could draw a line through every point (low bias, high variance) or draw just a horizontal line (high bias, low variance). 

![](./assets/biasvariance.png)
http://www-bcf.usc.edu/~gareth/ISL/

## Other variables selection methods: 

Other methods that can be used *prior* to fitting any models generally involve looking at how related each of the features are with the outcome of interest. For example, you could choose some metric (correlation, mutual information, etc.) and rank all of the features and then select the top *k* or *x%* of the features.

[Scitkit learn reference](https://scikit-learn.org/stable/modules/feature_selection.html)

For example, we could look at our matrix, which has 266 variables, and select the top 25

In [22]:
model_dataset.shape

(101766, 266)

In [26]:
model_dataset.head()

Unnamed: 0,encounter_id,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,time_in_hospital,ind__Asian,...,ind__Ulcerat col,ind__Unclassified,ind__Urin stone,ind__Uterus cancr,ind__Varicose vn,ind__Viral infect,ind__Wht blood dx,ind__chf;nonhp,ind__gu cong anom,ind__nan_y
0,2278392,41,0,1,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,2
1,149190,59,0,18,0,0,0,9,3,0,...,0,0,0,0,0,0,0,0,0,0
2,64410,11,5,13,2,0,1,6,2,0,...,0,0,0,0,0,0,0,0,0,2
3,500364,44,1,16,0,0,0,7,2,0,...,0,0,0,0,0,0,0,0,0,0
4,16680,51,0,8,0,0,0,5,1,0,...,0,0,0,0,0,0,0,0,0,1


In [24]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif

In [25]:
# Initialize the object
selector = SelectKBest(mutual_info_classif, 50)

In [29]:
selector.fit(X=model_dataset.iloc[:, 2:], y=model_dataset.iloc[:, 1])

SelectKBest(k=50, score_func=<function mutual_info_classif at 0xa20958d40>)

In [30]:
selector.scores_

array([1.65203387e-02, 4.50796479e-02, 9.79541668e-03, 0.00000000e+00,
       3.95922976e-03, 2.49536205e-02, 6.73975440e-02, 1.84576769e-03,
       5.97348107e-03, 0.00000000e+00, 6.09655091e-04, 0.00000000e+00,
       0.00000000e+00, 1.89419422e-03, 6.38848470e-03, 2.62574806e-02,
       0.00000000e+00, 4.08005845e-02, 1.11957573e-02, 1.39189889e-03,
       1.80932427e-03, 1.38836470e-03, 1.57159179e-03, 5.56767675e-03,
       7.32163538e-04, 1.00246477e-03, 0.00000000e+00, 0.00000000e+00,
       1.11667256e-04, 2.95424998e-03, 5.69640462e-03, 5.89616849e-03,
       6.68179474e-03, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 2.75013711e-03, 0.00000000e+00,
       0.00000000e+00, 8.80164792e-04, 0.00000000e+00, 9.26600912e-04,
       2.06453426e-03, 0.00000000e+00, 0.00000000e+00, 2.40303979e-04,
       4.40237361e-03, 0.00000000e+00, 4.39732772e-04, 1.83095728e-03,
       7.81963554e-04, 4.64377420e-02, 3.45312042e-03, 1.61844221e-03,
      

In [38]:
np.argsort(selector.scores_)[::-1]

array([  6,  60,  53,   1,  17,  15,   5,   0,  18,   2,  62, 146, 227,
       106, 123,  93,  32,  14, 125,   8,  31,  30,  23, 252,  98, 141,
       189, 110,  69, 261,  65, 185,  91, 231, 194,  83, 122,  48,  59,
       195, 236,  95, 217,   4,  70, 209, 213, 102, 222, 190,  54, 228,
       138, 226, 204, 120, 172, 147,  80, 158,  29, 116, 254, 165, 159,
        61,  38, 139, 238, 135, 229,  81,  77, 221, 119, 223, 107, 198,
       211, 182, 247,  44,  76, 109,  13, 215,  82, 156,   7, 150,  51,
       249,  20,  97, 230, 115,  55, 205,  22, 129, 202, 233,  89, 168,
       127, 212,  88,  19,  21,  73, 186, 248, 171,  58, 161, 126, 245,
       203, 243, 113, 155, 137, 262, 145, 259,  25, 225, 200, 128, 237,
        43, 256,  41, 169, 255, 101,  52, 220, 176, 179, 103, 143,  24,
       163, 175, 170, 144,  71, 131,  10, 208, 173,  57, 246, 193,  50,
       140,  56, 160, 177, 183,  47,  85, 133, 214,  28,  63, 178, 239,
        68, 206, 260,  37,  36,  39, 251,  40, 201,  42,  34, 19

In [41]:
[list(model_dataset.iloc[:, 2:].columns)[x] for x in np.argsort(selector.scores_)[::-1][0:50]]

['time_in_hospital',
 'admission_source_id_17.0',
 'admission_source_id_7.0',
 'num_medications',
 'admission_type_id_5.0',
 'admission_type_id_3.0',
 'number_diagnoses',
 'num_procedures',
 'admission_type_id_6.0',
 'number_outpatient',
 'admission_source_id_22.0',
 'ind__Int obstruct',
 'ind__Precere occl',
 'ind__DiabMel w/cm',
 'ind__Exam/eval',
 'ind__Cervix cancr',
 'discharge_disposition_id_12.0',
 'admission_type_id_2.0',
 'ind__Fluid/elc dx',
 'ind__Caucasian',
 'discharge_disposition_id_11.0',
 'discharge_disposition_id_10.0',
 'discharge_disposition_id_3.0',
 'ind__UTI',
 'ind__Complic proc',
 'ind__Htn complicn',
 'ind__Ot nutrit dx',
 'ind__E Codes: Fire/burn',
 'ind__Adjustment disorders',
 'ind__chf;nonhp',
 'ind__Abdomnl pain',
 'ind__Ot hematl dx',
 'ind__Cardiac anom',
 'ind__Rehab',
 'ind__Oth CNS infx',
 'ind__Bnign ut neo',
 'ind__Esophgeal dx',
 'admission_source_id_2.0',
 'admission_source_id_14.0',
 'ind__Oth bact inf',
 'ind__Septicemia',
 'ind__Coag/hemr dx',


In [32]:
selector.transform(model_dataset.iloc[:, 2:])

array([[ 0,  1,  0, ...,  0,  0,  0],
       [ 0, 18,  0, ...,  0,  0,  0],
       [ 5, 13,  2, ...,  0,  0,  0],
       ...,
       [ 0,  9,  1, ...,  1,  0,  0],
       [ 2, 21,  0, ...,  0,  0,  0],
       [ 3,  3,  0, ...,  0,  0,  0]])

## Shrinkage methods and Regularization

Rather than explicitly selecting variables, we can *constrain* or *regularize* the coefficient estimates. These are known as *shrinkage* methods because they *shrink* the coefficient estimates towards 0. The reason why this might be a good idea is that it can often dramatically reduce the variance in the estimates, which we will discuss later.

The two most popular methods for regularization are known as Ridge Regression and LASSO

## Ridge Regression

Remember that in linear regression, we are interested in minimizing the squared-error loss, or maximizing the log likelihood. 

$$ \hat{\beta} = \underset{\beta}{\arg\min} \sum_{i=1}^{n}(y_i - \boldsymbol{\beta}^T\boldsymbol{x_i})$$

In Ridge Regression, we add a penalty so that sum of squared $\beta$ terms does not get too large. 

$$ \hat{\beta} = \underset{\beta}{\arg\min} \sum_{i=1}^{n}(y_i - \beta_0 - \sum_{j=1}^{p}x_{ij}\beta_j)^2 + \lambda\sum_{j=1}^{p}\beta_j^2$$

Where $\lambda$ controls the amount of shrinkage. Higher values of $\lambda$ will result in more shrinkage. This applies for more than just the linear regression problem, and is known *weight decay* in neural networks.

Often, we want to standardize the inputs before running regularization so that the penalty is applied evenly. We can standardize a given variable x, by

$$ z_i = \frac{x_i - \bar{x}}{\sigma(x)} $$

in addition, we only want to penalize the non-intercept terms. If all of the predictors have been standardized, then $\beta_0$ is just the mean of the y values. 

## LASSO

The LASSO penalty is similar to the ridge penalty:

$$ \hat{\beta} = \underset{\beta}{\arg\min} \sum_{i=1}^{n}(y_i - \beta_0 - \sum_{j=1}^{p}x_{ij}\beta_j)^2 + \lambda\sum_{j=1}^{p}|\beta_j|$$

Again, the same $\lambda$ parameter controls the regularization. Unlike the ridge regression, this can actually shrink some of the coefficients, $\beta$ to 0. Therefore, unlike Ridge Regression, the LASSO also preforms *variable selection*. 

### Goal: 

![](./assets/full.png)
https://www.linkedin.com/pulse/intuitive-visual-explanation-differences-between-l1-l2-xiaoli-chen/

## Constraint (penalty)

![](./assets/constraint.png)

## MSE 

![](./assets/elipses.png)

## Increasing Penalty 

![](./assets/increase_penalty.png)

## Elastic Net

There are other methods that take both the ridge approach as well as the LASSO approach. One such method is known as the Elastic Net.

$$ \hat{\beta} = \underset{\beta}{\arg\min} \sum_{i=1}^{n}(y_i - \beta_0 - \sum_{j=1}^{p}x_{ij}\beta_j)^2 + \lambda_1\sum_{j=1}^{p}|\beta_j| + \lambda_2\sum_{j=1}^{p}\beta_j^2$$

This is often used for problems known as "large $p$, small $n$", where the number of predictors is quite large in comparison to the number of samples, which is common in genomics. Ridge Regression may not be able to run due to the large number of predictors, but the LASSO may select too few parameters (it will shrink too many to 0).

This is a good middle ground that will select groups of correlated predictors, in theory.

### Lasso in action: 

In [52]:
from sklearn.linear_model import Lasso

> class sklearn.linear_model.Lasso(alpha=1.0, fit_intercept=True, normalize=False, precompute=False, copy_X=True, max_iter=1000, tol=0.0001, warm_start=False, positive=False, random_state=None, selection='cyclic')[source]

In [77]:
# Initialize the object: 
lr = Lasso(alpha=.001, normalize=True)

In [78]:
lr.fit(X=model_dataset.iloc[:, 2:],
       y=model_dataset.iloc[:, 1])

Lasso(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=True, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [79]:
np.count_nonzero(lr.coef_)

41

In [83]:
[list(model_dataset.iloc[:, 2:].columns)[x] for x in np.argsort(lr.coef_)[::-1][0:41]]

['admission_type_id_6.0',
 'admission_source_id_6.0',
 'admission_source_id_7.0',
 'discharge_disposition_id_11.0',
 'ind__DiabMel w/cm',
 'ind__UTI',
 'ind__Septicemia',
 'ind__Fluid/elc dx',
 'admission_source_id_9.0',
 'ind__Alcohol-related disorders',
 'ind__Pancreas dx',
 'ind__Abdomnl pain',
 'ind__Hrt valve dx',
 'time_in_hospital',
 'ind__Wht blood dx',
 'ind__Pleurisy',
 'ind__Coag/hemr dx',
 'num_medications',
 'ind__Oth liver dx',
 'number_diagnoses',
 'ind__chf;nonhp',
 'ind__Acute MI',
 'ind__Bone/ct cncr',
 'ind__Brain/ns can',
 'ind__Coma/brn dmg',
 'ind__Colon cancer',
 'ind__Biliary dx',
 'ind__Chr kidney disease',
 'ind__Art embolism',
 'ind__Cervix cancr',
 'ind__Carditis',
 'ind__Cardiac anom',
 'ind__Bladder cncr',
 'ind__COPD',
 'ind__Anxiety disorders',
 'ind__Asp pneumon',
 'ind__Burns',
 'ind__Contraceptiv',
 'ind__Appendicitis',
 'ind__Bronchitis',
 'ind__Breast dx']

In [84]:
[list(model_dataset.iloc[:, 2:].columns)[x] for x in np.argsort(selector.scores_)[::-1][0:50]]

['time_in_hospital',
 'admission_source_id_17.0',
 'admission_source_id_7.0',
 'num_medications',
 'admission_type_id_5.0',
 'admission_type_id_3.0',
 'number_diagnoses',
 'num_procedures',
 'admission_type_id_6.0',
 'number_outpatient',
 'admission_source_id_22.0',
 'ind__Int obstruct',
 'ind__Precere occl',
 'ind__DiabMel w/cm',
 'ind__Exam/eval',
 'ind__Cervix cancr',
 'discharge_disposition_id_12.0',
 'admission_type_id_2.0',
 'ind__Fluid/elc dx',
 'ind__Caucasian',
 'discharge_disposition_id_11.0',
 'discharge_disposition_id_10.0',
 'discharge_disposition_id_3.0',
 'ind__UTI',
 'ind__Complic proc',
 'ind__Htn complicn',
 'ind__Ot nutrit dx',
 'ind__E Codes: Fire/burn',
 'ind__Adjustment disorders',
 'ind__chf;nonhp',
 'ind__Abdomnl pain',
 'ind__Ot hematl dx',
 'ind__Cardiac anom',
 'ind__Rehab',
 'ind__Oth CNS infx',
 'ind__Bnign ut neo',
 'ind__Esophgeal dx',
 'admission_source_id_2.0',
 'admission_source_id_14.0',
 'ind__Oth bact inf',
 'ind__Septicemia',
 'ind__Coag/hemr dx',


In [85]:
np.intersect1d([list(model_dataset.iloc[:, 2:].columns)[x] for x in np.argsort(lr.coef_)[::-1][0:41]], [list(model_dataset.iloc[:, 2:].columns)[x] for x in np.argsort(selector.scores_)[::-1][0:50]])

array(['admission_source_id_7.0', 'admission_type_id_6.0',
       'discharge_disposition_id_11.0', 'ind__Abdomnl pain',
       'ind__Alcohol-related disorders', 'ind__Cardiac anom',
       'ind__Cervix cancr', 'ind__Coag/hemr dx', 'ind__DiabMel w/cm',
       'ind__Fluid/elc dx', 'ind__Pleurisy', 'ind__Septicemia',
       'ind__UTI', 'ind__chf;nonhp', 'num_medications',
       'number_diagnoses', 'time_in_hospital'], dtype='<U50')

## Ridge doesn't select 

In [87]:
from sklearn.linear_model import Ridge

> sklearn.linear_model.Ridge(alpha=1.0, fit_intercept=True, normalize=False, copy_X=True, max_iter=None, tol=0.001, solver='auto', random_state=None)

In [88]:
ridge_reg = Ridge(alpha=10, normalize=True)

In [89]:
ridge_reg.fit(X=model_dataset.iloc[:, 2:],
       y=model_dataset.iloc[:, 1])

Ridge(alpha=10, copy_X=True, fit_intercept=True, max_iter=None, normalize=True,
      random_state=None, solver='auto', tol=0.001)

In [90]:
ridge_reg.coef_

array([ 5.63141555e-02,  5.57795899e-02, -1.06345410e-02, -9.85371200e-03,
        4.19108084e-02,  1.20198868e-01,  1.78330056e-01, -1.63561261e-01,
       -9.02865313e-02, -1.62663655e-02,  2.82350828e-02,  1.10341920e-01,
       -4.48945520e-03, -7.40645651e-01, -2.45934158e-01, -8.50890319e-01,
        3.45694555e-01, -1.47567534e+00,  9.25172976e-01,  3.53737523e-01,
        1.77048200e-01,  0.00000000e+00,  1.19028012e-01,  2.02286478e-01,
       -1.29325954e-01,  3.06342153e-01,  6.50002532e-02,  1.87483249e-01,
        1.72275517e-01,  2.51683006e-04,  9.22448441e-01,  7.74883598e-01,
       -4.69846619e-01,  3.40267457e-01,  3.64956200e-01,  1.41041267e-01,
       -2.07732440e+00, -2.10051542e+00,  1.67666335e-03, -9.15946557e-02,
       -7.44698531e-01, -9.27818252e-02,  1.53752283e-01, -1.76740830e-01,
       -4.14179079e-01,  3.50081458e-01,  3.14408491e-01,  0.00000000e+00,
       -1.28779517e-01, -1.90840564e-01, -3.58385458e-01,  3.31973463e-01,
        7.30968826e-01,  