### Data Mining and Machine Learning
### Feature Selection in Supervised  Classification: Wrapper Methods 
### Edgar Acuna
#### Febrero 2019

#### Forward and backward selection using the mlxtend module and the RFE class from scikit-learn. Only the LDA and Naive Bayes classifiers are used.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_selection import RFE
%matplotlib inline

In [2]:
# Read the pima-diabetes data set. The file has no header row, so the
# column names are supplied explicitly.
url = "http://academic.uprm.edu/eacuna/diabetes.dat"
names = [
    'preg', 'plas', 'pres', 'skin',
    'test', 'mass', 'pedi', 'age',
    'class',
]
data = pd.read_table(url, header=None, names=names)
print(data.shape)

(768, 9)


### Forward using the  LDA classifier

In [3]:
# Split the diabetes frame into the class label (y) and the first eight
# columns as predictors (X).
y = data['class']
X = data.iloc[:, 0:8]
# DataFrame.as_matrix() is deprecated and was removed in pandas 1.0;
# .values returns the same NumPy arrays and works on every pandas version.
y1 = y.values
X1 = X.values
# Predictor names, used further down to label the RFE feature ranking.
names = X.columns
estimator = LinearDiscriminantAnalysis()

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [4]:
# Forward sequential selection of 3 features with the LDA estimator,
# scored by accuracy. cv=0 disables cross-validation in mlxtend.
sfs1 = SFS(
    estimator,
    k_features=3,
    forward=True,
    floating=False,
    verbose=0,
    scoring='accuracy',
    cv=0,
)
sfs1 = sfs1.fit(X1, y1)

In [5]:
# Display the selection history: for each subset size, the chosen feature
# indices (0-based column positions of X1) and the accuracy obtained.
sfs1.subsets_

{1: {'avg_score': 0.74609375,
  'cv_scores': array([0.74609375]),
  'feature_idx': (1,),
  'feature_names': ('1',)},
 2: {'avg_score': 0.7708333333333334,
  'cv_scores': array([0.77083333]),
  'feature_idx': (1, 5),
  'feature_names': ('1', '5')},
 3: {'avg_score': 0.7721354166666666,
  'cv_scores': array([0.77213542]),
  'feature_idx': (1, 5, 6),
  'feature_names': ('1', '5', '6')}}

Comentario. las mejores tres variables son: plas, mass y pedi

### Forward using Gaussian Naive Bayes

In [6]:
# Forward sequential selection of 3 features, this time with a Gaussian
# Naive Bayes classifier; same scoring and no cross-validation (cv=0).
clf = GaussianNB()
sfs1 = SFS(
    clf,
    k_features=3,
    forward=True,
    floating=False,
    verbose=0,
    scoring='accuracy',
    cv=0,
)
sfs1 = sfs1.fit(X1, y1)

In [7]:
# Display the selection history: for each subset size, the chosen feature
# indices (0-based column positions of X1) and the accuracy obtained.
sfs1.subsets_

{1: {'avg_score': 0.75,
  'cv_scores': array([0.75]),
  'feature_idx': (1,),
  'feature_names': ('1',)},
 2: {'avg_score': 0.7669270833333334,
  'cv_scores': array([0.76692708]),
  'feature_idx': (1, 5),
  'feature_names': ('1', '5')},
 3: {'avg_score': 0.7669270833333334,
  'cv_scores': array([0.76692708]),
  'feature_idx': (1, 2, 5),
  'feature_names': ('1', '2', '5')}}

Comentario. las 3 mejores variables son: plas,pres y mass

Backward Elimination using Gaussian Naive Bayes for Diabetes

In [8]:
# Backward elimination with Gaussian Naive Bayes: forward=False starts from
# all features and removes one per step until 3 remain.
clf = GaussianNB()
sfs1 = SFS(
    clf,
    k_features=3,
    forward=False,
    floating=False,
    verbose=0,
    scoring='accuracy',
    cv=0,
)
sfs1 = sfs1.fit(X1, y1)
# Last expression of the cell: show the subset found at every size.
sfs1.subsets_

{3: {'avg_score': 0.7669270833333334,
  'cv_scores': array([0.76692708]),
  'feature_idx': (1, 2, 5),
  'feature_names': ('1', '2', '5')},
 4: {'avg_score': 0.7721354166666666,
  'cv_scores': array([0.77213542]),
  'feature_idx': (1, 2, 5, 6),
  'feature_names': ('1', '2', '5', '6')},
 5: {'avg_score': 0.7760416666666666,
  'cv_scores': array([0.77604167]),
  'feature_idx': (0, 1, 2, 5, 6),
  'feature_names': ('0', '1', '2', '5', '6')},
 6: {'avg_score': 0.7734375,
  'cv_scores': array([0.7734375]),
  'feature_idx': (0, 1, 2, 4, 5, 6),
  'feature_names': ('0', '1', '2', '4', '5', '6')},
 7: {'avg_score': 0.7682291666666666,
  'cv_scores': array([0.76822917]),
  'feature_idx': (0, 1, 2, 4, 5, 6, 7),
  'feature_names': ('0', '1', '2', '4', '5', '6', '7')},
 8: {'avg_score': 0.7630208333333334,
  'cv_scores': array([0.76302083]),
  'feature_idx': (0, 1, 2, 3, 4, 5, 6, 7),
  'feature_names': ('0', '1', '2', '3', '4', '5', '6', '7')}}

### Recursive Feature Elimination for Diabetes

In [10]:
# Select the three best features with RFE.
# n_features_to_select must be passed by keyword: current scikit-learn
# releases accept only the estimator as a positional argument.
selector = RFE(estimator, n_features_to_select=3, step=1)
selector = selector.fit(X, y)
print("Features sorted by their rank:")
# ranking_ holds integer ranks (rank 1 = selected feature); tolist() turns
# them into plain Python ints so the printed tuples stay clean.
print(sorted(zip(selector.ranking_.tolist(), names)))
# Evaluate the model restricted to the selected features. Predict on the same
# DataFrame used for fitting so the input type matches the fit call.
pred_y = selector.predict(X)
# Accuracy measured on the same data the selector was fitted on.
acc = (y1 == pred_y).sum()
print("Accuracy: %.2f%%" % (acc * 100.0 / len(y1)))

Features sorted by their rank:
[(1, 'mass'), (1, 'pedi'), (1, 'preg'), (2, 'plas'), (3, 'age'), (4, 'pres'), (5, 'test'), (6, 'skin')]
Accuracy: 69.53%


Forward using LDA for Vehicle

In [11]:
# Load the vehicle data set.
# NOTE(review): hard-coded absolute local path; this cell only runs on a
# machine that has the file at this exact location.
data = pd.read_csv("c://PW-PR/vehicle.csv")
y = data['Class']
X = data.iloc[:, 0:18]
# DataFrame.as_matrix() is deprecated and was removed in pandas 1.0;
# .values returns the same NumPy arrays.
y1 = y.values
X1 = X.values
names = X.columns
print(names)
# Forward selection of 3 features with LDA, accuracy scoring, no
# cross-validation (cv=0).
estimator = LinearDiscriminantAnalysis()
sfs1 = SFS(estimator, k_features=3, forward=True, floating=False, verbose=0,
           scoring='accuracy', cv=0)
sfs1 = sfs1.fit(X1, y1)
# feature_idx values in the output are 0-based positions into `names`.
sfs1.subsets_

Index(['COMPACTNESS', 'CIRCULARITY', 'DISTANCE_CIRCULARITY', 'RADIUS_RATIO',
       'PR.AXIS_ASPECT_RATIO', 'MAX.LENGTH_ASPECT_RATIO', 'SCATTER_RATIO',
       'ELONGATEDNESS', 'PR.AXIS_RECTANGULARITY', 'MAX.LENGTH_RECTANGULARITY',
       'SCALED_VARIANCE_MAJOR', 'SCALED_VARIANCE_MINOR',
       'SCALED_RADIUS_OF_GYRATION', 'SKEWNESS_ABOUT_MAJOR',
       'SKEWNESS_ABOUT_MINOR', 'KURTOSIS_ABOUT_MAJOR', 'KURTOSIS_ABOUT_MINOR',
       'HOLLOWS_RATIO'],
      dtype='object')


  after removing the cwd from sys.path.
  """


{1: {'avg_score': 0.41843971631205673,
  'cv_scores': array([0.41843972]),
  'feature_idx': (10,),
  'feature_names': ('10',)},
 2: {'avg_score': 0.6028368794326241,
  'cv_scores': array([0.60283688]),
  'feature_idx': (5, 10),
  'feature_names': ('5', '10')},
 3: {'avg_score': 0.6229314420803782,
  'cv_scores': array([0.62293144]),
  'feature_idx': (2, 5, 10),
  'feature_names': ('2', '5', '10')}}

These three features are the most important ones: 'DISTANCE_CIRCULARITY', 'MAX.LENGTH_ASPECT_RATIO', 'SCALED_VARIANCE_MAJOR' (the 0-based indices 2, 5 and 10 in the column list above).

RFE for vehicle

In [15]:
# Select the six best features with RFE.
# n_features_to_select must be passed by keyword: current scikit-learn
# releases accept only the estimator as a positional argument.
selector = RFE(estimator, n_features_to_select=6, step=1)
selector = selector.fit(X, y)
print("Features sorted by their rank:")
# ranking_ holds integer ranks (rank 1 = selected feature); tolist() turns
# them into plain Python ints so the printed tuples stay clean.
print(sorted(zip(selector.ranking_.tolist(), names)))
# Evaluate the model restricted to the selected features. Predict on the same
# DataFrame used for fitting so the input type matches the fit call.
pred_y = selector.predict(X)
# Accuracy measured on the same data the selector was fitted on.
acc = (y1 == pred_y).sum()
print("Accuracy: %.2f%%" % (acc * 100.0 / len(y1)))

Features sorted by their rank:
[(1, 'HOLLOWS_RATIO'), (1, 'KURTOSIS_ABOUT_MAJOR'), (1, 'KURTOSIS_ABOUT_MINOR'), (2, 'PR.AXIS_RECTANGULARITY'), (3, 'MAX.LENGTH_RECTANGULARITY'), (5, 'SCALED_RADIUS_OF_GYRATION'), (6, 'SKEWNESS_ABOUT_MAJOR'), (11, 'COMPACTNESS'), (12, 'CIRCULARITY'), (13, 'SKEWNESS_ABOUT_MINOR'), (16, 'PR.AXIS_ASPECT_RATIO'), (17, 'SCALED_VARIANCE_MINOR'), (20, 'SCALED_VARIANCE_MAJOR'), (24, 'MAX.LENGTH_ASPECT_RATIO'), (26, 'RADIUS_RATIO'), (27, 'SCATTER_RATIO'), (28, 'ELONGATEDNESS'), (31, 'DISTANCE_CIRCULARITY')]
Accuracy: 82.89%


In [16]:
# Forward selection with the Naive Bayes classifier on the Landsat data.
url = 'http://academic.uprm.edu/eacuna/landsat.txt'
# sep=r'\s+' is the documented equivalent of delim_whitespace=True, which is
# deprecated in recent pandas releases.
data = pd.read_table(url, header=None, sep=r'\s+')
# Column 36 is used as the class label; columns 0-35 as predictors.
y = data.iloc[:, 36]
X = data.iloc[:, 0:36]
# DataFrame.as_matrix() is deprecated and was removed in pandas 1.0;
# .values returns the same NumPy arrays.
y1 = y.values
X1 = X.values
# Not used within this cell; kept in case later cells rely on these names.
features, labels = X.values, y.values
clf = GaussianNB()
sfs1 = SFS(clf, k_features=5, forward=True, floating=False, verbose=0,
           scoring='accuracy', cv=0)
sfs1 = sfs1.fit(X1, y1)
sfs1.subsets_

  
  import sys


{1: {'avg_score': 0.581510710259301,
  'cv_scores': array([0.58151071]),
  'feature_idx': (19,),
  'feature_names': ('19',)},
 2: {'avg_score': 0.7770011273957159,
  'cv_scores': array([0.77700113]),
  'feature_idx': (16, 19),
  'feature_names': ('16', '19')},
 3: {'avg_score': 0.801803833145434,
  'cv_scores': array([0.80180383]),
  'feature_idx': (16, 17, 19),
  'feature_names': ('16', '17', '19')},
 4: {'avg_score': 0.8027057497181511,
  'cv_scores': array([0.80270575]),
  'feature_idx': (16, 17, 19, 27),
  'feature_names': ('16', '17', '19', '27')},
 5: {'avg_score': 0.807891770011274,
  'cv_scores': array([0.80789177]),
  'feature_idx': (9, 16, 17, 19, 27),
  'feature_names': ('9', '16', '17', '19', '27')}}