# Question 1 [20 Points] Linear Model Selection

In [125]:
import pandas as pd
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
import itertools
import time

In [2]:
boston_df = pd.read_csv('./data/boston.csv')

In [3]:
boston_df = boston_df.drop([boston_df.columns[0],'medv', 'town', 'tract'],axis=1)

In [47]:
y = boston_df.cmedv

In [48]:
x = boston_df.drop(['cmedv'],axis=1)

In [49]:
X2 = sm.add_constant(x)

  return ptp(axis=axis, out=out, **kwargs)


In [50]:
est = sm.OLS(y, X2)

In [51]:
est2 = est.fit()

In [140]:
boston_df.drop(['cmedv'],axis=1).columns

Index(['lon', 'lat', 'crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis',
       'rad', 'tax', 'ptratio', 'b', 'lstat'],
      dtype='object')

In [53]:
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:                  cmedv   R-squared:                       0.746
Model:                            OLS   Adj. R-squared:                  0.738
Method:                 Least Squares   F-statistic:                     95.82
Date:                Fri, 20 Sep 2019   Prob (F-statistic):          5.77e-135
Time:                        22:18:45   Log-Likelihood:                -1492.9
No. Observations:                 506   AIC:                             3018.
Df Residuals:                     490   BIC:                             3086.
Df Model:                          15                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       -434.9958    303.210     -1.435      0.1

## 5A - [5 Points] Report the most significant variable from this full model with all features.

Answer: 

One of the most significant variables is `nox`.

### 5B - [5 Points] Starting from this full model, use stepwise regression with both forward and backward and BIC criterion to select the best model. Which variables are removed from the full model?

In [95]:
def diff(first, second):
        second = set(second)
        return [item for item in first if item not in second]

In [105]:
def forward_regression(x, y,p_threshold):
    initial_list = []
    included = list(initial_list)
    while True:
        changed=False
        excluded = list(set(x.columns)-set(included))
        new_pval = pd.Series(index=excluded)
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(pd.DataFrame(x[included+[new_column]]))).fit()
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()
        if best_pval < p_threshold:
            best_feature = new_pval.idxmin()
            included.append(best_feature)
            changed=True

        if not changed:
            break

    return diff(list(x.columns), included)

In [115]:
def backward_regression(x, y, p_threshold):
    included=list(x.columns)
    while True:
        changed=False
        model = sm.OLS(y, sm.add_constant(pd.DataFrame(x[included]))).fit()
        pvalues = model.pvalues.iloc[1:]
        worst_pval = pvalues.max() 
        if worst_pval > p_threshold:
            changed=True
            worst_feature = pvalues.idxmax()
            included.remove(worst_feature)
        if not changed:
            break
    return diff(list(x.columns), included)

In [116]:
forward_regression(x, y, 0.05)

['lon', 'lat', 'indus', 'age']

In [117]:
backward_regression(x,y,0.05)

['lon', 'lat', 'indus', 'age']

Answer:

The features removed are: `lon, lat, indus, and age.`

## 5C - [5 Points] Starting from this full model, use the best subset selection and list the best model of each model size.

In [127]:
def processSubset(feature_set):
    model = sm.OLS(y,x[list(feature_set)])
    regr = model.fit()
    RSS = ((regr.predict(x[list(feature_set)]) - y) ** 2).sum()
    return {"model":regr, "RSS":RSS}

In [136]:
def getBestModel(k):
    tic = time.time()
    
    results = []
    
    for combo in itertools.combinations(x.columns, k):
        results.append(processSubset(combo))
    
    models = pd.DataFrame(results)
    
    best_model = models.loc[models['RSS'].values.argmin()]
    
    toc = time.time()
    print("Processed", models.shape[0], "models on", k, "predictors in", (toc-tic), "seconds.")
    
    return best_model

In [137]:
models_best = pd.DataFrame(columns=["RSS", "model"])

In [144]:
tic = time.time()
for i in range(1,15):
    models_best.loc[i] = getBestModel(i)

toc = time.time()
print("Total elapsed time:", (toc-tic), "seconds.")

Processed 15 models on 1 predictors in 0.05645895004272461 seconds.
Processed 105 models on 2 predictors in 0.27325010299682617 seconds.
Processed 455 models on 3 predictors in 1.111016035079956 seconds.
Processed 1365 models on 4 predictors in 4.137294054031372 seconds.
Processed 3003 models on 5 predictors in 8.624067068099976 seconds.
Processed 5005 models on 6 predictors in 14.818722248077393 seconds.
Processed 6435 models on 7 predictors in 20.54478406906128 seconds.
Processed 6435 models on 8 predictors in 22.214275121688843 seconds.
Processed 5005 models on 9 predictors in 15.250459909439087 seconds.
Processed 3003 models on 10 predictors in 10.26376724243164 seconds.
Processed 1365 models on 11 predictors in 4.095693826675415 seconds.
Processed 455 models on 12 predictors in 1.39158034324646 seconds.
Processed 105 models on 13 predictors in 0.3173072338104248 seconds.
Processed 15 models on 14 predictors in 0.04716086387634277 seconds.
Total elapsed time: 103.86371397972107 sec

In [145]:
models_best

Unnamed: 0,RSS,model
1,29423.376266,<statsmodels.regression.linear_model.Regressio...
2,15226.522528,<statsmodels.regression.linear_model.Regressio...
3,14181.164611,<statsmodels.regression.linear_model.Regressio...
4,13395.36578,<statsmodels.regression.linear_model.Regressio...
5,12980.2895,<statsmodels.regression.linear_model.Regressio...
6,12257.279438,<statsmodels.regression.linear_model.Regressio...
7,11931.141396,<statsmodels.regression.linear_model.Regressio...
8,11657.539412,<statsmodels.regression.linear_model.Regressio...
9,11464.331057,<statsmodels.regression.linear_model.Regressio...
10,11339.974458,<statsmodels.regression.linear_model.Regressio...


# Question 2 (50 Points) Code Your Own Lasso