# Stepwise Regression

### Forward selection with adjusted R-squared:

In [4]:
# code from http://planspace.org/20150423-forward_selection_with_statsmodels/

import statsmodels.formula.api as smf

def _formula_term(name):
    """Return a column name in a form that is safe inside a patsy formula.

    Names that are ordinary Python identifiers are returned unchanged, so the
    formula for well-behaved column names reads exactly as it always did.
    Any other string (spaces, punctuation, Python keywords such as ``class``)
    is wrapped in patsy's ``Q()`` quoting helper.  Non-string labels are
    passed through untouched.
    """
    import keyword  # local import keeps this cell self-contained

    if not isinstance(name, str):
        return name
    if name.isidentifier() and not keyword.iskeyword(name):
        return name
    return "Q({!r})".format(name)


def forward_selection(data, response):
    """Linear model designed by forward selection.

    Starting from an intercept-only model, repeatedly add the single
    remaining predictor that raises adjusted R-squared the most, and stop as
    soon as no remaining predictor strictly improves the score.

    Parameters:
    -----------
    data : pandas DataFrame with all possible predictors and response

    response: string, name of response column in data

    Returns:
    --------
    model: an "optimal" fitted statsmodels linear model
           with an intercept
           selected by forward selection
           evaluated by adjusted R-squared

    Raises:
    -------
    KeyError: if `response` is not one of the columns of `data`
    """
    remaining = set(data.columns)
    remaining.remove(response)  # the dependent variable is never a candidate
    selected = []  # chosen predictors, in the order they were added
    lhs = _formula_term(response)

    def make_formula(predictors):
        # The intercept is listed last, so an empty selection gives "y ~ 1".
        terms = [_formula_term(p) for p in predictors] + ['1']
        return "{} ~ {}".format(lhs, ' + '.join(terms))

    current_score = 0.0  # score to beat; candidates must exceed it to be added
    while remaining:
        # Score every model that extends the current selection by one predictor.
        scores_with_candidates = [
            (smf.ols(make_formula(selected + [candidate]), data).fit().rsquared_adj,
             candidate)
            for candidate in remaining
        ]
        best_new_score, best_candidate = max(scores_with_candidates)

        # Stop unless the score strictly improves.  Treating a tie as "no
        # improvement" is essential: a tied candidate (e.g. a duplicated or
        # perfectly collinear column) would otherwise never be removed from
        # `remaining` and the loop would spin forever.  Writing the test as
        # `not a < b` also stops on a NaN score.
        if not current_score < best_new_score:
            break

        remaining.remove(best_candidate)
        selected.append(best_candidate)
        current_score = best_new_score

    # Refit on the final selection and hand back the fitted model.
    return smf.ols(make_formula(selected), data).fit()

The `statsmodels` library prefers `pandas` data frames over the `numpy` arrays that `scikit-learn` uses, but luckily `pandas` has some nice methods that can read in data directly from the web. Let's grab a dataset from a Princeton class used for linear regression. The data consists "of observations on six variables for 52 tenure-track professors in a small college."

In [5]:
import pandas as pd

# Location of the salary data set used in the rest of the notebook.
url = "http://data.princeton.edu/wws509/datasets/salary.dat"
# The separator is a regular expression matching runs of whitespace,
# so the file is parsed as whitespace-delimited columns.
data = pd.read_csv(url, sep='\\s+')

data description: http://data.princeton.edu/wws509/datasets/#salary 

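- sx = Sex, coded 1 for female and 0 for male
- rk = Rank, coded
    - 1 for assistant professor,
    - 2 for associate professor, and
    - 3 for full professor
- yr = Number of years in current rank
- dg = Highest degree, coded 1 if doctorate, 0 if masters
- yd = Number of years since highest degree was earned
- sl = Academic year salary, in dollars.

Note: the coding above is the data set's original description. In the `salary.dat` file loaded here, `sx`, `rk` and `dg` are stored as text labels (e.g. `male`, `full`, `doctorate`) rather than numeric codes, as the table below shows.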

Used in: S. Weisberg (1985). Applied Linear Regression, Second Edition. New York: John Wiley and Sons. Page 194.

In [6]:
# Display the loaded data frame to check that it was parsed as expected.
data

Unnamed: 0,sx,rk,yr,dg,yd,sl
0,male,full,25,doctorate,35,36350
1,male,full,13,doctorate,22,35350
2,male,full,10,doctorate,23,28200
3,female,full,7,doctorate,27,26775
4,male,full,19,masters,30,33696
5,male,full,16,doctorate,21,28516
6,female,full,0,masters,32,24900
7,male,full,16,doctorate,18,31909
8,male,full,13,masters,30,31850
9,male,full,13,masters,31,32850


In [7]:
# Run forward selection with the salary column (`sl`) as the response.
model = forward_selection(data, 'sl')

In [8]:
# Formula of the fitted model, i.e. which predictors were selected.
model.model.formula

'sl ~ rk + yr + 1'

In [9]:
# Adjusted R-squared of the fitted model (the score used during selection).
model.rsquared_adj

0.83519076053798602

In [10]:
# Full regression summary of the fitted model.
model.summary()

0,1,2,3
Dep. Variable:,sl,R-squared:,0.845
Model:,OLS,Adj. R-squared:,0.835
Method:,Least Squares,F-statistic:,87.15
Date:,"Fri, 17 Feb 2017",Prob (F-statistic):,1.95e-19
Time:,13:40:07,Log-Likelihood:,-476.48
No. Observations:,52,AIC:,961.0
Df Residuals:,48,BIC:,968.8
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,1.62e+04,638.677,25.370,0.000,1.49e+04 1.75e+04
rk[T.associate],4262.2847,882.891,4.828,0.000,2487.113 6037.457
rk[T.full],9454.5232,905.830,10.437,0.000,7633.230 1.13e+04
yr,375.6956,70.918,5.298,0.000,233.106 518.285

0,1,2,3
Omnibus:,26.224,Durbin-Watson:,1.808
Prob(Omnibus):,0.0,Jarque-Bera (JB):,51.449
Skew:,1.516,Prob(JB):,6.73e-12
Kurtosis:,6.814,Cond. No.,32.2


### scikit-learn's F Regression

In [11]:
import numpy as np

# Text-valued columns to expand into one indicator column per category.
cols_to_transform = ["sx", "rk", "dg"]
df_with_dummies = pd.get_dummies(data, columns=cols_to_transform)

# Show the predictor columns, i.e. everything except the response `sl`.
# `axis` is passed by keyword: pandas >= 2.0 no longer accepts it positionally.
print(df_with_dummies.drop("sl", axis=1))

    yr  yd  sx_female  sx_male  rk_assistant  rk_associate  rk_full  \
0   25  35          0        1             0             0        1   
1   13  22          0        1             0             0        1   
2   10  23          0        1             0             0        1   
3    7  27          1        0             0             0        1   
4   19  30          0        1             0             0        1   
5   16  21          0        1             0             0        1   
6    0  32          1        0             0             0        1   
7   16  18          0        1             0             0        1   
8   13  30          0        1             0             0        1   
9   13  31          0        1             0             0        1   
10  12  22          0        1             0             0        1   
11  15  19          0        1             0             1        0   
12   9  17          0        1             0             0        1   
13   9

In [12]:
from sklearn.feature_selection import f_regression
# Univariate F-test of each predictor column against salary (`sl`);
# the result is a pair of arrays: F statistics and their p-values.
# `axis` is passed by keyword: pandas >= 2.0 no longer accepts it positionally.
f_regression(np.array(df_with_dummies.drop("sl", axis=1)), np.array(data["sl"]), center=True)

(array([ 48.21967542,  41.81524639,   3.4130356 ,   3.4130356 ,
         63.73033808,   0.20821107,  83.42291275,   0.24427162,   0.24427162]),
 array([  7.34137944e-09,   4.10171640e-08,   7.06039364e-02,
          7.06039364e-02,   1.76627996e-10,   6.50149119e-01,
          3.09650171e-12,   6.23302309e-01,   6.23302309e-01]))