# Selekcja atrybutów - stepwise

### Algorytm (forward)

1. Zaczynamy od pustego modelu (0 zmiennych wybranych) i przyjmujemy, że wartość kryterium (np. BIC, AIC): C = Inf
2. Szukamy zmiennej, której dodanie do modelu najbardziej zmniejszy kryterium C. 
   * Jeśli wartość C jest mniejsza od poprzedniej wartości - dodajemy zmienną i kontynuujemy
   * Jeśli wartość C dla najlepszego znalezionego w tym kroku modelu nie spadła w stosunku do poprzedniego kroku,
     to nie dołączamy zmiennej i kończymy

In [75]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
from sklearn import model_selection 
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
import sklearn.metrics

In [103]:
def forward_selection(X,y):
    (n,p) = X.shape
    
    #print(n)
    #print(p)
    
    model = linear_model.LinearRegression()
    kf = model_selection.KFold(n_splits=5,shuffle=True)
    
    bic_min = np.inf
    i_min = 0
    
    indexes = []
    
    #print(bic_min)
    
    while(i_min >= 0):
        
        print('bic_min = {}'.format(bic_min))
        
        if bic_min != np.inf:
            indexes.append(i_min)
        
        print('indexes = {}'.format(indexes))
        i_min = -1

        for i in range(p):
            
            if i in indexes:
                #print('{} already in list'.format(i))
                continue
                
            indexes_i = indexes.copy()
            indexes_i.append(i)

            X_i = X.iloc[:,indexes_i]
            #print(indexes_i)
        
            y_pred = model_selection.cross_val_predict(model,X_i,y,cv=kf)
            mse = sklearn.metrics.mean_squared_error(y,y_pred)
            bic = n * np.log(mse) + p* np.log(n)
            
            #print('{} : bic = {}'.format(i,bic))
            if bic < bic_min:
                bic_min = bic
                i_min = i


    return indexes


## Test

In [104]:
df_adv = pd.read_csv('slack/winequality-red.csv',sep=';')
df_adv.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [105]:
y = df_adv['alcohol']

X = df_adv.iloc[:,0:10]

In [109]:
best_indexes = forward_selection(X,y)
print("Best parameters ({}) : {}".format(len(best_indexes),df_adv.columns[best_indexes]))

bic_min = inf
indexes = []
bic_min = -170.71214819551125
indexes = [7]
bic_min = -476.98627351944987
indexes = [7, 0]
bic_min = -849.5324406304541
indexes = [7, 0, 8]
bic_min = -1257.4511747364108
indexes = [7, 0, 8, 3]
bic_min = -1415.4189071839137
indexes = [7, 0, 8, 3, 9]
bic_min = -1436.5464206458976
indexes = [7, 0, 8, 3, 9, 6]
bic_min = -1445.271313903757
indexes = [7, 0, 8, 3, 9, 6, 2]
bic_min = -1465.3659607089558
indexes = [7, 0, 8, 3, 9, 6, 2, 5]
Best parameters (8) : Index(['density', 'fixed acidity', 'pH', 'residual sugar', 'sulphates',
       'total sulfur dioxide', 'citric acid', 'free sulfur dioxide'],
      dtype='object')
