# Step Forward Variable Selection
This notebook defines a function that performs forward stepwise variable selection on an input dataset.
In this example I use linear regression with adjusted R-squared as the selection criterion, but the appropriate model and metric depend on the problem being solved.

In [0]:
from IPython.display import HTML, display

import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style("darkgrid")

import pandas as pd
import numpy as np

In [0]:
import statsmodels.formula.api as smf

def forward_selected(data, response):
    """Linear model designed by forward selection.

    Starting with no predictors, each round fits one OLS model per remaining
    candidate column (added to the predictors selected so far) and keeps the
    candidate with the highest adjusted R-squared. Selection stops as soon as
    the best candidate fails to improve on the current score, or when no
    candidates remain. The starting score is 0.0, so a first predictor is
    only added if its adjusted R-squared is strictly positive.

    Parameters:
    -----------
    data : pandas DataFrame with all possible predictors and response
           (column names are inserted verbatim into the formula string)

    response: string, name of response column in data

    Returns:
    --------
    model: an "optimal" fitted statsmodels linear model
           with an intercept
           selected by forward selection
           evaluated by adjusted R-squared
           (an intercept-only model if no predictor improves the score)

    Raises:
    -------
    KeyError: if response is not a column of data
    """
    # Candidates come from the frame that was passed in, not from a global.
    # dict.fromkeys drops duplicate names while keeping column order, so the
    # search visits candidates in a reproducible order.
    columns = list(dict.fromkeys(data.columns))
    if response not in columns:
        raise KeyError(
            "response column {!r} not found in data".format(response))
    remaining = [column for column in columns if column != response]

    selected = []
    current_score = 0.0
    while remaining:
        scores_with_candidates = []
        for candidate in remaining:
            formula = "{} ~ {} + 1".format(response,
                                           ' + '.join(selected + [candidate]))
            score = smf.ols(formula, data).fit().rsquared_adj
            scores_with_candidates.append((score, candidate))
        # Highest score wins; ties are broken by the larger candidate name.
        best_new_score, best_candidate = max(scores_with_candidates)
        # Stop on "no strict improvement". Testing equality of the two scores
        # instead would spin forever when the best candidate exactly ties
        # the current score, because nothing would ever be removed.
        if not current_score < best_new_score:
            break
        remaining.remove(best_candidate)
        selected.append(best_candidate)
        current_score = best_new_score

    # Appending '1' as the last term yields "y ~ a + b + 1" as before, and a
    # well-formed "y ~ 1" when nothing was selected.
    formula = "{} ~ {}".format(response, ' + '.join(selected + ['1']))
    model = smf.ols(formula, data).fit()
    return model

In [0]:
# Run forward selection with 'y' as the response column, then report the
# chosen formula followed by the adjusted R-squared of the fitted model.
# `your_df` is a placeholder name: bind it to your own DataFrame first.
model = forward_selected(your_df, 'y')

print(model.model.formula, model.rsquared_adj, sep="\n")

In [0]:
# Refit the best formula found above (pasted in by hand) so that the full
# regression results table can be displayed.
# NOTE(review): this import rebinds `sm`, which the first cell bound to
# statsmodels.api; after this cell runs, `sm` refers to the formula API.
import statsmodels.formula.api as sm
model = sm.ols(
    formula=(
        'age_diff ~ Age + CD8T + CD4T + NK + hdl + Bcell + alcoholisme'
        ' + CI + talla + linfos + ldl + colling + monos'
    ),
    data=your_df,
)
fitted1 = model.fit()
# Last expression of the cell: the notebook renders the summary as a table.
fitted1.summary()