In [2]:
# Load the COIL20 dataset from its MATLAB .mat file
import scipy.io
mat = scipy.io.loadmat(file_name="./data/COIL20.mat")

In [3]:
# Pull the array stored under the 'X' key of the loaded .mat contents and show it
X = mat["X"]
print(X)

[[ 0.01568627  0.01568627  0.01568627 ...,  0.01568627  0.01568627
   0.01568627]
 [ 0.01960784  0.01960784  0.01960784 ...,  0.01960784  0.01960784
   0.01960784]
 [ 0.01568627  0.01568627  0.01568627 ...,  0.01568627  0.01568627
   0.01568627]
 ..., 
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.          0.         ...,  0.          0.          0.        ]]


In [4]:
# 'Y' is stored as a 2-D array; keep only its first column as the label vector
labels_2d = mat["Y"]
y = labels_2d[:, 0]
print(y)

[ 1  1  1 ..., 20 20 20]


In [7]:
# Shape of the data matrix: rows are unpacked as samples, columns as features
import numpy as np
data_shape = X.shape
n_samples, n_features = data_shape
print(data_shape)

(1440, 1024)


In [8]:
# Shape of the label vector.
# NOTE: despite its name, n_labels holds the shape tuple of y, not an integer count.
n_labels = y.shape
print(n_labels)

(1440,)


In [9]:
"""
    For Supervised Learning Problems
        Split Data into Training and Testing Set
"""
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=40)
print(X_train)
print(X_test)
print(y_train)
print(y_test)

[[ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.02352941  0.02352941  0.02352941 ...,  0.02352941  0.02352941
   0.02352941]
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 ..., 
 [ 0.01568627  0.01568627  0.01568627 ...,  0.01568627  0.01568627
   0.01568627]
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.          0.         ...,  0.          0.          0.        ]]
[[ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.01568627  0.01568627  0.01568627 ...,  0.01568627  0.01568627
   0.01568627]
 ..., 
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.          0.         ...,  0.          0.          0.        ]]
[17  2 12 ...,  1 



In [13]:
"""
    Perform Feature Selection on the Training Set
"""
# Compute fisher score and output the score of each feature
from skfeature.function.similarity_based import fisher_score
score = fisher_score.fisher_score(X_train, y_train)
print(score)

[ 13.96904931   0.5376816    0.19923194 ...,   3.71944606  14.01720752
  14.05075518]


In [15]:
# Rank the features in descending order according to their Fisher scores and output the ranking index
idx = fisher_score.feature_ranking(score)
print(idx)

[1023 1022   31 ...,   34   97  897]


In [16]:
# Number of top-ranked features kept for the evaluation (e.g. 5).
num_fea = 5

# Take the first num_fea entries of the ranking once and apply the same
# column selection to both the training and the test matrices.
top_ranked = idx[:num_fea]
selected_features_train = X_train[:, top_ranked]
selected_features_test = X_test[:, top_ranked]

print(selected_features_train)
print(selected_features_test)

[[ 0.          0.          0.          0.          0.        ]
 [ 0.02352941  0.02352941  0.02352941  0.02352941  0.02352941]
 [ 0.          0.          0.          0.          0.        ]
 ..., 
 [ 0.01568627  0.01568627  0.01568627  0.01568627  0.01568627]
 [ 0.          0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.        ]]
[[ 0.          0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.        ]
 [ 0.01568627  0.01568627  0.01568627  0.01568627  0.01568627]
 ..., 
 [ 0.          0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.        ]]


In [17]:
"""
    Training a Classification Model with Selected Features
"""
# Here we choose the linear SVM as an example.
from sklearn import svm
clf = svm.LinearSVC()

# Then we train a classifiction model with the selected features on the training set
clf.fit(selected_features_train, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [19]:
# Prediction phase: apply the fitted classifier to the selected test-set
# features and print the predicted labels
y_predict = clf.predict(selected_features_test)
print(y_predict)

[8 8 2 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 2 8 8 8 8 8 8
 8 8 2 8 8 8 8 2 8 2 2 8 8 8 8 2 8 8 8 8 8 8 8 8 8 8 8 8 2 8 8 2 8 8 8 8 8
 8 8 2 8 8 8 8 8 8 8 8 8 8 8 8 2 2 8 8 8 8 8 2 8 2 8 8 8 8 2 2 5 8 8 8 8 8
 8 2 8 8 8 8 8 2 8 8 8 8 8 2 8 1 8 8 2 8 8 8 8 8 2 2 8 8 8 8 8 8 8 8 8 8 8
 8 8 8 8 2 8 8 8 8 8 8 8 8 8 2 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8
 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 2 8 8 8 8 8 8 8 8 8 8 8 2 8 8 8
 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 2 8 8 8 2 8 8
 8 8 2 8 8 8 8 2 8 8 8 8 8 8 8 2 8 8 8 8 8 8 8 8 8 8 8 8 8]


In [20]:
# Performance evaluation: accuracy of the predictions against the held-out test labels
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_true=y_test, y_pred=y_predict)
print(acc)

0.09375


In [22]:
"""
    For Unsupervised Learning Problems
"""
# Feature Selection
from skfeature.utility import construct_W
kwargs_W = {'metric':'euclidean', 'neighbor': 'knn', 'weight_mode': 'heat_kernel', 'k': 5, 't': 1}
W = construct_W.construct_W(X, **kwargs_W)
print(W)

  (0, 0)	1.0
  (1, 0)	0.447090840654
  (2, 0)	0.0285384322997
  (3, 0)	0.00118838361012
  (69, 0)	0.0058397303046
  (70, 0)	0.0743253379721
  (71, 0)	0.318867729788
  (0, 1)	0.447090840654
  (1, 1)	1.0
  (2, 1)	0.207676333998
  (3, 1)	0.0160395254514
  (4, 1)	0.000287234973176
  (70, 1)	0.00630719411365
  (71, 1)	0.040773272425
  (0, 2)	0.0285384322997
  (1, 2)	0.207676333998
  (2, 2)	1.0
  (3, 2)	0.416556692173
  (4, 2)	0.0237129133917
  (5, 2)	0.000133833899097
  (71, 2)	0.00124441604146
  (0, 3)	0.00118838361012
  (1, 3)	0.0160395254514
  (2, 3)	0.416556692173
  (3, 3)	1.0
  :	:
  (1436, 1436)	1.0
  (1437, 1436)	0.242840724568
  (1438, 1436)	0.016476242794
  (1439, 1436)	0.00811143521271
  (1368, 1437)	0.0121318038093
  (1434, 1437)	0.00517555721693
  (1435, 1437)	0.0119552113312
  (1436, 1437)	0.242840724568
  (1437, 1437)	1.0
  (1438, 1437)	0.158641913241
  (1439, 1437)	0.0360336994113
  (1368, 1438)	0.0185343985505
  (1372, 1438)	0.0105227419161
  (1435, 1438)	0.00715919482434
  

In [23]:
# Compute the Laplacian score of each feature from the full data matrix X and
# the affinity matrix W, then print the scores.
# NOTE: this rebinds `score`, which earlier held the Fisher scores.
from skfeature.function.similarity_based import lap_score
score = lap_score.lap_score(X, W=W)
print(score)

[ 0.01269462  0.00637613  0.00333286 ...,  0.0123851   0.01271441
  0.01269681]


In [25]:
# Rank the features in ascending order according to their Laplacian scores and output the ranking index.
# NOTE: this rebinds `idx`, which earlier held the Fisher-score ranking.
idx = lap_score.feature_ranking(score)
print(idx)

[ 34  65 966 ...,  28 963 996]


In [26]:
# Number of top-ranked features kept for the evaluation
num_fea = 5
# Keep only the columns of X named by the first num_fea entries of the ranking
top_ranked = idx[:num_fea]
selected_features = X[:, top_ranked]
print(selected_features)

[[ 0.01568627  0.01568627  0.01568627  0.01568627  0.01568627]
 [ 0.01960784  0.01960784  0.01960784  0.01960784  0.01960784]
 [ 0.01568627  0.01568627  0.01568627  0.01568627  0.01568627]
 ..., 
 [ 0.          0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.        ]]


In [30]:
# Performance evaluation setup: use the number of distinct label values in y
# as the number of clusters
from skfeature.utility import unsupervised_evaluation
import numpy as np
distinct_labels = np.unique(y)
num_cluster = len(distinct_labels)
print(num_cluster)

20


In [36]:
# Evaluate the selected features against the labels y. The two returned values
# are unpacked as the normalized mutual information score (NMI) and accuracy (ACC).
# NOTE: this rebinds `acc`, which earlier held the supervised accuracy.
nmi, acc = unsupervised_evaluation.evaluation(X_selected=selected_features, n_clusters=num_cluster, y=y)
print(nmi)
print(acc)

0.409604275742
0.170138888889
