### Data Mining and Machine Learning
### Feature Selection in Supervised Classification: Wrapper Methods
### Edgar Acuna

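Forward selection and backward elimination using the `SequentialFeatureSelector` class from the mlxtend module, and recursive feature elimination using the `RFE` class from scikit-learn. Only the LDA and Gaussian Naive Bayes classifiers are used.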

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_selection import RFE
%matplotlib inline

  _nan_object_mask = _nan_object_array != _nan_object_array


In [2]:
# Read the Pima diabetes data set. header=None means no line of the file is
# treated as a header, so the column names are passed in explicitly.
url = "http://academic.uprm.edu/eacuna/diabetes.dat"
names = [
    'preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class',
]
data = pd.read_table(url, header=None, names=names)
print(data.shape)

(768, 9)


### Forward using the LDA classifier

In [3]:
# Split the diabetes frame into the class label and the first eight columns
# (the predictors).
y = data['class']
X = data.iloc[:, 0:8]
# NumPy arrays for the feature selectors. `.values` replaces `.as_matrix()`,
# which was deprecated in pandas 0.23 and removed in pandas 1.0.
y1 = y.values
X1 = X.values
# Predictor column labels, used later to label the RFE ranking.
names = X.columns
estimator = LinearDiscriminantAnalysis()

In [4]:
# Sequential forward selection with the LDA estimator, growing the subset up
# to 3 features. cv=0 turns off cross-validation in mlxtend, so every
# 'accuracy' score is measured on the same data the estimator was fitted on.
sfs1 = SFS(
    estimator,
    k_features=3,
    forward=True,
    floating=False,
    scoring='accuracy',
    cv=0,
    verbose=0,
)
sfs1 = sfs1.fit(X1, y1)

In [5]:
# Show, for each subset size, the selected feature indices and their score.
sfs1.subsets_

{1: {'avg_score': 0.74609375,
  'cv_scores': array([ 0.74609375]),
  'feature_idx': (1,)},
 2: {'avg_score': 0.77083333333333337,
  'cv_scores': array([ 0.77083333]),
  'feature_idx': (1, 5)},
 3: {'avg_score': 0.77213541666666663,
  'cv_scores': array([ 0.77213542]),
  'feature_idx': (1, 5, 6)}}

Comment: the three best variables are plas, mass, and pedi (zero-based column indices 1, 5, and 6).

### Forward using Gaussian Naive Bayes

In [6]:
# Sequential forward selection again, this time with Gaussian Naive Bayes as
# the classifier. cv=0 turns off cross-validation in mlxtend, so the scores
# are accuracies on the fitting data itself.
clf = GaussianNB()
sfs1 = SFS(
    clf,
    k_features=3,
    forward=True,
    floating=False,
    scoring='accuracy',
    cv=0,
    verbose=0,
)
sfs1 = sfs1.fit(X1, y1)

In [7]:
# Show, for each subset size, the selected feature indices and their score.
sfs1.subsets_

{1: {'avg_score': 0.75, 'cv_scores': array([ 0.75]), 'feature_idx': (1,)},
 2: {'avg_score': 0.76692708333333337,
  'cv_scores': array([ 0.76692708]),
  'feature_idx': (1, 5)},
 3: {'avg_score': 0.76692708333333337,
  'cv_scores': array([ 0.76692708]),
  'feature_idx': (1, 2, 5)}}

Comment: the three best variables are plas, pres, and mass (zero-based column indices 1, 2, and 5).

### Backward Elimination using Gaussian Naive Bayes for Diabetes

In [8]:
# Sequential backward elimination (forward=False) with Gaussian Naive Bayes,
# shrinking the feature set down to 3 features. cv=0 turns off
# cross-validation in mlxtend, so scores are accuracies on the fitting data.
clf = GaussianNB()
sfs1 = SFS(
    clf,
    k_features=3,
    forward=False,
    floating=False,
    scoring='accuracy',
    cv=0,
    verbose=0,
)
sfs1 = sfs1.fit(X1, y1)
# Last expression of the cell: display the subset recorded at each size.
sfs1.subsets_

{3: {'avg_score': 0.76692708333333337,
  'cv_scores': array([ 0.76692708]),
  'feature_idx': (1, 2, 5)},
 4: {'avg_score': 0.77213541666666663,
  'cv_scores': array([ 0.77213542]),
  'feature_idx': (1, 2, 5, 6)},
 5: {'avg_score': 0.77604166666666663,
  'cv_scores': array([ 0.77604167]),
  'feature_idx': (0, 1, 2, 5, 6)},
 6: {'avg_score': 0.7734375,
  'cv_scores': array([ 0.7734375]),
  'feature_idx': (0, 1, 2, 4, 5, 6)},
 7: {'avg_score': 0.76822916666666663,
  'cv_scores': array([ 0.76822917]),
  'feature_idx': (0, 1, 2, 4, 5, 6, 7)},
 8: {'avg_score': 0.76302083333333337,
  'cv_scores': array([ 0.76302083]),
  'feature_idx': (0, 1, 2, 3, 4, 5, 6, 7)}}

### Recursive Feature Elimination for Diabetes

In [9]:
# Select the three best features with RFE, removing one feature per step.
# NOTE(review): RFE ranks features by the magnitude of the fitted estimator's
# coefficients, which depends on each feature's scale, and the inputs are not
# standardized in this notebook -- consider scaling before trusting the ranking.
# n_features_to_select is passed by keyword; newer scikit-learn releases no
# longer accept it positionally.
selector = RFE(estimator, n_features_to_select=3, step=1)
# Fit on the same NumPy arrays that predict() receives below, instead of
# fitting on the DataFrame and predicting on the array.
selector = selector.fit(X1, y1)
# print(...) with a single argument works on both Python 2 and Python 3.
print("Features sorted by their rank:")
# Ranks are integers (1 = selected); they are cast to float only to keep the
# printed format the original round(x, 4) call produced.
ranked = sorted((float(rank), name) for rank, name in zip(selector.ranking_, names))
print(ranked)
# Evaluate the model restricted to the 3 selected features, on the same data
# it was fitted on.
pred_y = selector.predict(X1)
# Accuracy: number of correct predictions over the number of samples.
acc = (y1 == pred_y).sum()
print("Accuracy: %.2f%%" % (acc * 100.0 / float(len(y1))))

Features sorted by their rank:
[(1.0, 'mass'), (1.0, 'pedi'), (1.0, 'preg'), (2.0, 'plas'), (3.0, 'age'), (4.0, 'pres'), (5.0, 'test'), (6.0, 'skin')]
Accuracy: 69.53%


### Forward using LDA for Vehicle

In [10]:
# Forward selection with LDA on the vehicle data set.
# NOTE(review): this is an absolute, machine-specific path -- adjust it to
# wherever vehicle.csv lives before re-running.
data = pd.read_csv("c://PW-PR/vehicle.csv")
# Class label plus the first 18 columns as predictors.
y = data['Class']
X = data.iloc[:, 0:18]
# `.values` replaces `.as_matrix()`, which was deprecated in pandas 0.23 and
# removed in pandas 1.0.
y1 = y.values
X1 = X.values
names = X.columns
# print(...) with a single argument works on both Python 2 and Python 3.
print(names)
estimator = LinearDiscriminantAnalysis()
# cv=0 turns off cross-validation in mlxtend, so the reported accuracies are
# measured on the fitting data itself.
sfs1 = SFS(estimator, k_features=3, forward=True, floating=False, verbose=0, scoring='accuracy', cv=0)
sfs1 = sfs1.fit(X1, y1)
# Last expression of the cell: display the subset recorded at each size.
sfs1.subsets_

Index([u'COMPACTNESS', u'CIRCULARITY', u'DISTANCE_CIRCULARITY',
       u'RADIUS_RATIO', u'PR.AXIS_ASPECT_RATIO', u'MAX.LENGTH_ASPECT_RATIO',
       u'SCATTER_RATIO', u'ELONGATEDNESS', u'PR.AXIS_RECTANGULARITY',
       u'MAX.LENGTH_RECTANGULARITY', u'SCALED_VARIANCE_MAJOR',
       u'SCALED_VARIANCE_MINOR', u'SCALED_RADIUS_OF_GYRATION',
       u'SKEWNESS_ABOUT_MAJOR', u'SKEWNESS_ABOUT_MINOR',
       u'KURTOSIS_ABOUT_MAJOR', u'KURTOSIS_ABOUT_MINOR', u'HOLLOWS_RATIO'],
      dtype='object')


{1: {'avg_score': 0.41843971631205673,
  'cv_scores': array([ 0.41843972]),
  'feature_idx': (10,)},
 2: {'avg_score': 0.6028368794326241,
  'cv_scores': array([ 0.60283688]),
  'feature_idx': (5, 10)},
 3: {'avg_score': 0.62293144208037821,
  'cv_scores': array([ 0.62293144]),
  'feature_idx': (2, 5, 10)}}

These three features are the most important ones: 'DISTANCE_CIRCULARITY', 'MAX.LENGTH_ASPECT_RATIO', and 'SCALED_VARIANCE_MAJOR' (zero-based column indices 2, 5, and 10).

### RFE for Vehicle

In [11]:
# Select the six best features with RFE, removing one feature per step.
# NOTE(review): RFE ranks features by the magnitude of the fitted estimator's
# coefficients, which depends on each feature's scale, and the inputs are not
# standardized in this notebook -- consider scaling before trusting the ranking.
# n_features_to_select is passed by keyword; newer scikit-learn releases no
# longer accept it positionally.
selector = RFE(estimator, n_features_to_select=6, step=1)
# Fit on the same NumPy arrays that predict() receives below, instead of
# fitting on the DataFrame and predicting on the array.
selector = selector.fit(X1, y1)
# print(...) with a single argument works on both Python 2 and Python 3.
print("Features sorted by their rank:")
# Ranks are integers (1 = selected); they are cast to float only to keep the
# printed format the original round(x, 4) call produced.
ranked = sorted((float(rank), name) for rank, name in zip(selector.ranking_, names))
print(ranked)
# Evaluate the model restricted to the 6 selected features, on the same data
# it was fitted on.
pred_y = selector.predict(X1)
# Accuracy: number of correct predictions over the number of samples.
acc = (y1 == pred_y).sum()
print("Accuracy: %.2f%%" % (acc * 100.0 / float(len(y1))))

Features sorted by their rank:
[(1.0, 'CIRCULARITY'), (1.0, 'ELONGATEDNESS'), (1.0, 'HOLLOWS_RATIO'), (1.0, 'KURTOSIS_ABOUT_MINOR'), (1.0, 'MAX.LENGTH_RECTANGULARITY'), (1.0, 'PR.AXIS_RECTANGULARITY'), (2.0, 'PR.AXIS_ASPECT_RATIO'), (3.0, 'RADIUS_RATIO'), (4.0, 'MAX.LENGTH_ASPECT_RATIO'), (5.0, 'COMPACTNESS'), (6.0, 'DISTANCE_CIRCULARITY'), (7.0, 'SKEWNESS_ABOUT_MAJOR'), (8.0, 'SKEWNESS_ABOUT_MINOR'), (9.0, 'SCATTER_RATIO'), (10.0, 'SCALED_VARIANCE_MINOR'), (11.0, 'KURTOSIS_ABOUT_MAJOR'), (12.0, 'SCALED_RADIUS_OF_GYRATION'), (13.0, 'SCALED_VARIANCE_MAJOR')]
Accuracy: 64.66%


In [12]:
# Forward selection with the Gaussian Naive Bayes classifier on Landsat.
url = 'http://academic.uprm.edu/eacuna/landsat.txt'
# sep=r'\s+' splits fields on runs of whitespace; it replaces
# delim_whitespace=True, which is deprecated in recent pandas releases.
data = pd.read_table(url, header=None, sep=r'\s+')
# Column 36 is used as the class label and columns 0-35 as the predictors.
y = data.iloc[:, 36]
X = data.iloc[:, 0:36]
# `.values` replaces `.as_matrix()`, which was deprecated in pandas 0.23 and
# removed in pandas 1.0.
y1 = y.values
X1 = X.values
# Same contents as X1 / y1; not used in this cell, kept in case other cells
# refer to these names.
features, labels = X.values, y.values
clf = GaussianNB()
# cv=0 turns off cross-validation in mlxtend, so the reported accuracies are
# measured on the fitting data itself.
sfs1 = SFS(clf, k_features=5, forward=True, floating=False, verbose=0, scoring='accuracy', cv=0)
sfs1 = sfs1.fit(X1, y1)
# Last expression of the cell: display the subset recorded at each size.
sfs1.subsets_

{1: {'avg_score': 0.58151071025930101,
  'cv_scores': array([ 0.58151071]),
  'feature_idx': (19,)},
 2: {'avg_score': 0.77700112739571592,
  'cv_scores': array([ 0.77700113]),
  'feature_idx': (16, 19)},
 3: {'avg_score': 0.80180383314543402,
  'cv_scores': array([ 0.80180383]),
  'feature_idx': (16, 17, 19)},
 4: {'avg_score': 0.80270574971815112,
  'cv_scores': array([ 0.80270575]),
  'feature_idx': (16, 17, 19, 27)},
 5: {'avg_score': 0.80789177001127399,
  'cv_scores': array([ 0.80789177]),
  'feature_idx': (9, 16, 17, 19, 27)}}