In [1]:
from utils import *

%matplotlib widget

# libraries for Dimensionality Reduction
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.manifold import Isomap

# libraries for features selectiona
# ANOVA F-test
from sklearn.feature_selection import f_classif
from sklearn import datasets

# library for standardize features
from sklearn.preprocessing import StandardScaler

# libraries for epsilon parameter
from sklearn.neighbors import NearestNeighbors

# libraries for Clustering
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans

# libraries for Adjusted Rand Index
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import silhouette_score

# libraries for general utilities
import numpy as np 
from matplotlib import pyplot as plt

# global variables from work specifications
SEED = 42
set_seed(SEED)
NUMBER_FEATURES = 6
NEIGHBOURS = 5

In [2]:
def findEpsParams(features):
    
    neigh = NearestNeighbors(n_neighbors=NEIGHBOURS)
    nbrs = neigh.fit(features)
    distances, indices = nbrs.kneighbors(features)
    
    print(distances)

    distances = np.sort(distances, axis=0)[:,NEIGHBOURS-1]
    plt.plot(distances)
    plt.axhline(y=3.4, xmin=0, xmax=250, linestyle='-', c='red')
    plt.show()

    return 3.4

In [3]:
##
## DATASET LOADING
##

# create the loader class and load dataset and labels
loader = Loader().load()

# get dataset, labels, and dataset splitted by classes
dataset = loader.getDataset()
labels = loader.getLabels()
class_split_dataset = loader.getClassSplitDataset()



In [4]:
dataset.shape

(563, 2500)

In [5]:
##
## FEATURES CREATION
##

# extract 6 features wirh PCA
pca = PCA(n_components=NUMBER_FEATURES)
pca_dataset_embedded = pca.fit_transform(dataset)

# extract 6 features wirh t-sne
tsne = TSNE(n_components=NUMBER_FEATURES, method='exact')
tsne_dataset_embedded = tsne.fit_transform(dataset)

# extract 6 features wirh isomap
isomap = Isomap(n_components=NUMBER_FEATURES)
isomap_dataset_embedded = isomap.fit_transform(dataset)

# features concatenate (6,6,6) = 18 features
features = np.concatenate((pca_dataset_embedded,tsne_dataset_embedded, isomap_dataset_embedded), axis=1)

In [6]:
pca_dataset_embedded.shape

(563, 6)

In [7]:
# standardization
scaler = StandardScaler()
features_std = scaler.fit_transform(features)
labels = labels[:,1]

In [8]:
print(features_std.shape)
print(labels.shape)

(563, 18)
(563,)


In [9]:
from mpl_toolkits.mplot3d import Axes3D

feat = features_std[labels!=0]
lbs = labels[labels!=0]

fig = plt.figure(1, figsize=(8, 6))
ax = Axes3D(fig, elev=-150, azim=110)
ax.scatter(feat[:, 13], feat[:, 2], feat[:, 1], c=lbs,
           cmap=plt.cm.Set1, edgecolor='k', s=40)
ax.set_title("First three Features Standardized")
ax.set_xlabel("1st feature (1 element of isomap)")
ax.w_xaxis.set_ticklabels([])
ax.set_ylabel("2nd feature (2d eigen PCA)")
ax.w_yaxis.set_ticklabels([])
ax.set_zlabel("3rd feature (1d eigen PCA)")
ax.w_zaxis.set_ticklabels([])

plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [10]:
fig = plt.figure(2, figsize=(8, 6))
ax = Axes3D(fig, elev=-150, azim=110)
ax.scatter(features_std[:, 13], features_std[:, 2], features_std[:, 1], c=labels,
           cmap=plt.cm.Set1, edgecolor='k', s=40)
ax.set_title("First three Features Standardized (also non labeled features)")
ax.set_xlabel("1st feature (1 element of isomap)")
ax.w_xaxis.set_ticklabels([])
ax.set_ylabel("2nd feature (2d eigen PCA)")
ax.w_yaxis.set_ticklabels([])
ax.set_zlabel("3rd feature (1d eigen PCA)")
ax.w_zaxis.set_ticklabels([])

plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [11]:
# #############################################################################
# Univariate feature selection with F-test for feature scoring
# We use the default selection function to select the four
# most significant features
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, Normalizer, StandardScaler
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split

fig = plt.figure(3, figsize=(8, 6))

feat = features_std[labels!=0]
lbs = labels[labels!=0]

k_selection = 5

# Some noisy data not correlated
#E = np.random.RandomState(42).uniform(0, 0.1, size=(feat.shape[0], 20))

# Add the noisy data to the informative features
#feat = np.hstack((feat, E))

# Split dataset to select feature and evaluate the classifier
X_train, X_test, y_train, y_test = train_test_split(
        feat, lbs, stratify=lbs#, random_state=0
)

selector = SelectKBest(f_classif, k=k_selection)
selector.fit(X_train, y_train)
scores = -np.log10(selector.pvalues_)
scores /= scores.max()
X_indices = np.arange(feat.shape[-1])
plt.bar(X_indices - .65, scores, width=.2,
        label=r'Univariate score ($-Log(p_{value})$)', color='darkorange')#,edgecolor='black')

# #############################################################################
# Compare to the weights of an SVM
clf = make_pipeline(MinMaxScaler(), LinearSVC())
clf.fit(X_train, y_train)
print('Classification accuracy without selecting features: {:.3f}'
      .format(clf.score(X_test, y_test)))

svm_weights = np.abs(clf[-1].coef_).sum(axis=0)
svm_weights /= svm_weights.sum()

plt.bar(X_indices - .45, svm_weights, width=.2, label='SVM weight',
        color='navy')#,edgecolor='black')

# #############################################################################
# SVM on Normalized input

clf_selected = make_pipeline(
        SelectKBest(f_classif, k=k_selection), #MinMaxScaler(), 
        Normalizer(),
    LinearSVC()
)
clf_selected.fit(X_train, y_train)
print('Classification accuracy after selection (Normalized): {:.3f}'
      .format(clf_selected.score(X_test, y_test)))

svm_weights_selected = np.abs(clf_selected[-1].coef_).sum(axis=0)
svm_weights_selected /= svm_weights_selected.sum()

plt.bar(X_indices[selector.get_support()] - .25, svm_weights_selected,
        width=.2, label='SVM selection (Normalized)', color='c')#,edgecolor='black')


# #############################################################################
# SVM on Standardize input

clf_selected = make_pipeline(
        SelectKBest(f_classif, k=k_selection), #MinMaxScaler(), 
        StandardScaler(),
    LinearSVC()
)
clf_selected.fit(X_train, y_train)
print('Classification accuracy after selection (Standardize): {:.3f}'
      .format(clf_selected.score(X_test, y_test)))

svm_weights_selected = np.abs(clf_selected[-1].coef_).sum(axis=0)
svm_weights_selected /= svm_weights_selected.sum()

plt.bar(X_indices[selector.get_support()] - .05, svm_weights_selected,
        width=.2, label='SVM selection (Standardize)', color='pink')#,edgecolor='black')


# #############################################################################
# SVM on MinMax Normalized input

clf_selected = make_pipeline(
        SelectKBest(f_classif, k=k_selection), 
    MinMaxScaler(),
    LinearSVC()
)
clf_selected.fit(X_train, y_train)
print('Classification accuracy after selection (MinMax norm): {:.3f}'
      .format(clf_selected.score(X_test, y_test)))

svm_weights_selected = np.abs(clf_selected[-1].coef_).sum(axis=0)
svm_weights_selected /= svm_weights_selected.sum()

plt.bar(X_indices[selector.get_support()] + .15, svm_weights_selected,
        width=.2, label='SVM selection (MinMax norm)', color='red')#,edgecolor='black')

plt.title("Comparing feature selection")
plt.xlabel('Feature number')
plt.yticks(())
plt.axis('tight')
plt.legend(loc='upper right')


plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Classification accuracy without selecting features: 0.762
Classification accuracy after selection (Normalized): 0.762
Classification accuracy after selection (Standardize): 0.762
Classification accuracy after selection (MinMax norm): 0.762


In [12]:
selector.get_params()

{'k': 5,
 'score_func': <function sklearn.feature_selection.univariate_selection.f_classif(X, y)>}

In [13]:
from pandas.plotting import scatter_matrix
from pandas import DataFrame

fig = plt.figure(4, figsize=(8, 6))
scatter_matrix(DataFrame(features_std), alpha=0.5, diagonal='kde')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x1c21365490>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1c21383450>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1c213b60d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1c213f7190>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1c2143ff50>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1c2146b1d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1c214a09d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1c214e1210>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1c214f3850>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1c21535150>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1c21589a50>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1c215c9290>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1c215fca90>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1c216

In [14]:
fig = plt.figure(5, figsize=(6, 4))
corr = abs(DataFrame(features_std).corr())
corr.style.background_gradient(cmap='coolwarm').format("{:.3}")

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,1.0,6.9e-17,2.15e-17,3.8e-17,7.67e-18,7.69e-18,0.165,0.218,0.416,0.0331,0.333,0.26,0.943,0.145,0.139,0.0152,0.0518,0.0286
1,6.9e-17,1.0,2.47e-16,2.3e-17,5.12e-17,4.49e-18,0.252,0.157,0.0647,0.0853,0.123,0.0296,0.207,0.61,0.573,0.23,0.0886,0.0434
2,2.15e-17,2.47e-16,1.0,3.94e-17,1.09e-17,4.55e-17,0.179,0.0848,0.0603,0.0292,0.119,0.21,0.0185,0.658,0.578,0.117,0.182,0.0338
3,3.8e-17,2.3e-17,3.94e-17,1.0,4.09e-17,3.34e-16,0.116,0.0498,0.0871,0.0837,0.00982,0.0655,0.00618,0.0926,0.173,0.728,0.351,0.139
4,7.67e-18,5.12e-17,1.09e-17,4.09e-17,1.0,2.51e-16,0.00432,0.0337,0.0894,0.0148,0.0282,0.0574,0.0571,0.0288,0.287,0.29,0.705,0.155
5,7.69e-18,4.49e-18,4.55e-17,3.34e-16,2.51e-16,1.0,0.0776,0.0532,0.0284,0.00997,0.032,0.155,0.0178,0.0116,0.0117,0.287,0.0282,0.278
6,0.165,0.252,0.179,0.116,0.00432,0.0776,1.0,0.168,0.267,0.00294,0.261,0.0275,0.192,0.272,0.0638,0.0177,0.0238,0.0287
7,0.218,0.157,0.0848,0.0498,0.0337,0.0532,0.168,1.0,0.02,0.281,0.0931,0.211,0.244,0.021,0.128,0.0218,0.042,0.0866
8,0.416,0.0647,0.0603,0.0871,0.0894,0.0284,0.267,0.02,1.0,0.145,0.138,0.178,0.447,0.00592,0.073,0.0144,0.148,0.0165
9,0.0331,0.0853,0.0292,0.0837,0.0148,0.00997,0.00294,0.281,0.145,1.0,0.118,0.0691,0.0233,0.126,0.0612,0.0116,0.0389,0.0571


In [15]:
(xs, ys) = np.where((corr>0.5)==True)

In [16]:
pairs = [ [x, y] for x,y in zip(xs, ys) if x>y ]

In [17]:
indexes = np.array([int(x) for x in np.union1d(np.ravel(pairs),[])])

In [18]:
pairs

[[12, 0], [13, 1], [13, 2], [14, 1], [14, 2], [15, 3], [16, 4]]

In [19]:
fig = plt.figure(6, figsize=(10, 10))
Axes = scatter_matrix(DataFrame(features_std[:,indexes], columns=indexes), alpha=0.5, diagonal='kde')

#y ticklabels
[plt.setp(item.yaxis.get_majorticklabels(), 'size', 5) for item in Axes.ravel()]
#x ticklabels
[plt.setp(item.xaxis.get_majorticklabels(), 'size', 5) for item in Axes.ravel()]
#y labels
[plt.setp(item.yaxis.get_label(), 'size', 13) for item in Axes.ravel()]
#x labels
[plt.setp(item.xaxis.get_label(), 'size', 13) for item in Axes.ravel()]

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

[[None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, None],
 [None, No

In [20]:
##
## FEATURES REMOVING
##

b = np.arange(features_std.shape[1])
b = np.setdiff1d(b,np.array(pairs)[:,0])

features_sel = features_std[:,b]

In [21]:
b

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 17])

In [22]:
##
## FEATURES EXTRACTION
##

# selection with ANOVA F-test
f,prob = f_classif(features_sel, labels)
indexes = np.arange(len(f))
app = [ [i, ele] for i, ele in enumerate(f)]
app.sort(key = lambda app: app[1], reverse = True)
indexes = np.array([ int(a) for a in np.array(app)[:,0]])
print(indexes)
f = f[indexes]
print(np.round(f, 3))
prob = prob[indexes]
print(np.round(prob, 3))

[ 2  1  0 10  6  4 11  7  5 12  8  3  9]
[32.718 19.248 12.796  5.209  3.523  3.433  3.134  1.991  1.899  1.393
  0.88   0.632  0.346]
[0.    0.    0.    0.001 0.015 0.017 0.025 0.114 0.129 0.244 0.451 0.595
 0.792]


In [23]:
##
## FEATURES SELECTION
##
fig = plt.figure(7, figsize=(8, 6))
plt.plot(f)
plt.show()

In [24]:
##
## FEATURES SELECTION
##
fig = plt.figure(8, figsize=(8, 6))
plt.plot(prob)
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …