In [58]:
from sklearn.datasets import load_iris
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA, FastICA
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.datasets import fetch_mldata
import numpy as np
import itertools
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

In [226]:
def best_confusion(r):
    n = r.shape[0]
    inp_list = [i for i in range(n)]
    permutations = list(itertools.permutations(inp_list))
    max_sum = 0
    max_list = (i for i in range(n))
    for i in permutations:
        cur = np.sum(np.diagonal(r[:,i]))
        if cur > max_sum:
            max_sum = cur
            max_list = i
    return r[:,max_list], max_sum      

In [3]:
### load data
mnist = fetch_mldata('MNIST original', data_home = './datasets')
X, y = mnist['data'], mnist['target'] 
print(X.shape)
print(y.shape)
X_s, X_t, y_s, y_t = train_test_split(X, y, test_size = 0.5, random_state = 0)

(70000, 784)
(70000,)




## 1. Clustering

In [9]:
iris_dataset = load_iris()

In [10]:
iris_cluster = iris_dataset['data']

In [11]:
iris_cluster[0:10]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1]])

### 1.1 Original data

In [35]:
estimator = KMeans(n_clusters=3)
estimator.fit(iris_cluster)
label_pred = estimator.labels_

In [45]:
r = confusion_matrix(iris_dataset['target'],label_pred)
r

array([[50,  0,  0],
       [ 0, 48,  2],
       [ 0, 14, 36]], dtype=int64)

In [44]:
estimator1 = GaussianMixture(n_components=3)
estimator1.fit(iris_cluster)
label_pred1 = estimator1.predict(iris_cluster)

In [46]:
r1 = confusion_matrix(iris_dataset['target'],label_pred1)
r1

array([[ 0, 50,  0],
       [ 5,  0, 45],
       [50,  0,  0]], dtype=int64)

In [48]:
r1[:,[1,2,0]]

array([[50,  0,  0],
       [ 0, 45,  5],
       [ 0,  0, 50]], dtype=int64)

In [190]:
estimator = KMeans(n_clusters=10)
estimator.fit(X)
label_pred = estimator.labels_

In [191]:
r = confusion_matrix(y,label_pred)
r

array([[ 292,  248,   34, 5506,  643,   14,   38,   82,    0,   46],
       [   9,   10, 3457,    0,    8,    9,    7,    4, 4364,    9],
       [ 446,  188,  477,   74,  164,   48,   72,  246,  368, 4907],
       [4563,   68,   91,   37, 1406,  166,   48,   83,  426,  253],
       [   0,  148,  241,    4,   11, 1861, 1818, 2604,  116,   21],
       [2062,  143,  772,   68, 2169,  318,  394,  228,  139,   20],
       [  34, 5476,  219,   91,  181,    2,    1,  464,  295,  113],
       [   3,    5,  279,   14,    3, 2813, 3182,  707,  248,   39],
       [1578,   65,  438,   36, 3680,  245,  279,  152,  301,   51],
       [  93,    9,   84,   40,   55, 2908, 1893, 1718,  149,    9]],
      dtype=int64)

In [212]:
r[:,(3,8,9,0,7,2,1,6,4,5)]

array([[5506,    0,   46,  292,   82,   34,  248,   38,  643,   14],
       [   0, 4364,    9,    9,    4, 3457,   10,    7,    8,    9],
       [  74,  368, 4907,  446,  246,  477,  188,   72,  164,   48],
       [  37,  426,  253, 4563,   83,   91,   68,   48, 1406,  166],
       [   4,  116,   21,    0, 2604,  241,  148, 1818,   11, 1861],
       [  68,  139,   20, 2062,  228,  772,  143,  394, 2169,  318],
       [  91,  295,  113,   34,  464,  219, 5476,    1,  181,    2],
       [  14,  248,   39,    3,  707,  279,    5, 3182,    3, 2813],
       [  36,  301,   51, 1578,  152,  438,   65,  279, 3680,  245],
       [  40,  149,    9,   93, 1718,   84,    9, 1893,   55, 2908]],
      dtype=int64)

In [214]:
np.sum(np.diagonal(r[:,(3,8,9,0,7,2,1,6,4,5)]))

37962

In [227]:
a, b  = best_confusion(r)

In [228]:
print(a)
print(b)

[[5506    0   46  292   82   34  248   38  643   14]
 [   0 4364    9    9    4 3457   10    7    8    9]
 [  74  368 4907  446  246  477  188   72  164   48]
 [  37  426  253 4563   83   91   68   48 1406  166]
 [   4  116   21    0 2604  241  148 1818   11 1861]
 [  68  139   20 2062  228  772  143  394 2169  318]
 [  91  295  113   34  464  219 5476    1  181    2]
 [  14  248   39    3  707  279    5 3182    3 2813]
 [  36  301   51 1578  152  438   65  279 3680  245]
 [  40  149    9   93 1718   84    9 1893   55 2908]]
37962


In [237]:
estimator1 = GaussianMixture(n_components=10)
estimator1.fit(X_s)
label_pred1 = estimator1.predict(X_s)

In [238]:
r1 = confusion_matrix(y_s,label_pred1)
r1

array([[  11, 2412,    1,   37,    2,  886,   11,   76,   37,    2],
       [  18,   48,   11,   13,    0,  543,   16, 3261,   15,    0],
       [ 129, 1550,    6,  175,   13,  212,   20,  237, 1178,    4],
       [  12,  981,    2,  611,   10, 1086,   95,  710,  117,    7],
       [  18,  689,   10,   77,  166,  621, 1335,  341,   28,  119],
       [  22,  692,   21,  141,   16, 1876,  102,  239,   16,   18],
       [2385,  403,   12,    5,   69,   88,    1,  391,   56,    0],
       [   0,   48,   13,   50,  117,  222, 2228,  306,    6,  663],
       [   6,  491,    5,  106,   19, 1754,  333,  646,    5,   24],
       [   0,   49,    6,   42,  203,  245, 2159,  578,    8,  156]],
      dtype=int64)

In [240]:
a, b  = best_confusion(r1)

In [241]:
print(a)
print(b)

[[2412   76   37   37    2  886   11    2    1   11]
 [  48 3261   15   13    0  543   18    0   11   16]
 [1550  237 1178  175   13  212  129    4    6   20]
 [ 981  710  117  611   10 1086   12    7    2   95]
 [ 689  341   28   77  166  621   18  119   10 1335]
 [ 692  239   16  141   16 1876   22   18   21  102]
 [ 403  391   56    5   69   88 2385    0   12    1]
 [  48  306    6   50  117  222    0  663   13 2228]
 [ 491  646    5  106   19 1754    6   24    5  333]
 [  49  578    8   42  203  245    0  156    6 2159]]
14716


### 1.2 Standrized Data

#### 1.2.1 normalizaztion

In [29]:
iris_cluster_s = (iris_cluster - iris_cluster.mean(axis = 0))/iris_cluster.std(axis = 0)

array([[-0.90068117,  1.01900435, -1.34022653, -1.3154443 ],
       [-1.14301691, -0.13197948, -1.34022653, -1.3154443 ],
       [-1.38535265,  0.32841405, -1.39706395, -1.3154443 ],
       [-1.50652052,  0.09821729, -1.2833891 , -1.3154443 ],
       [-1.02184904,  1.24920112, -1.34022653, -1.3154443 ],
       [-0.53717756,  1.93979142, -1.16971425, -1.05217993],
       [-1.50652052,  0.78880759, -1.34022653, -1.18381211],
       [-1.02184904,  0.78880759, -1.2833891 , -1.3154443 ],
       [-1.74885626, -0.36217625, -1.34022653, -1.3154443 ],
       [-1.14301691,  0.09821729, -1.2833891 , -1.44707648]])

In [37]:
estimator2 = KMeans(n_clusters=3)
estimator2.fit(iris_cluster_s)
label_pred2 = estimator2.labels_

In [49]:
r2 = confusion_matrix(iris_dataset['target'],label_pred2)
r2
r2[:,[1,0,2]]

array([[50,  0,  0],
       [ 0, 39, 11],
       [ 0, 14, 36]], dtype=int64)

In [51]:
estimator3 = GaussianMixture(n_components=3)
estimator3.fit(iris_cluster_s)
label_pred3 = estimator3.predict(iris_cluster_s)

In [53]:
r3 = confusion_matrix(iris_dataset['target'],label_pred3)
r3
r3[:,[1,2,0]]

array([[50,  0,  0],
       [ 0, 45,  5],
       [ 0,  0, 50]], dtype=int64)

#### 1.2.2 Minmax

In [54]:
iris_cluster_s2 = (iris_cluster - iris_cluster.min(axis = 0))/iris_cluster.max(axis = 0)

In [55]:
estimator4 = KMeans(n_clusters=3)
estimator4.fit(iris_cluster_s2)
label_pred4 = estimator4.labels_
r4 = confusion_matrix(iris_dataset['target'],label_pred4)
r4

array([[50,  0,  0],
       [ 0, 48,  2],
       [ 0,  4, 46]], dtype=int64)

In [67]:
estimator5 = GaussianMixture(n_components=3)
estimator5.fit(iris_cluster_s2)
label_pred5 = estimator5.predict(iris_cluster_s2)
r5 = confusion_matrix(iris_dataset['target'],label_pred5)
r5

array([[50,  0,  0],
       [ 0, 45,  5],
       [ 0,  0, 50]], dtype=int64)

## 2. Dimensionality Reduction

### 2.1 PCA

In [12]:
pca = PCA(n_components ='mle')
iris_pca = pca.fit_transform(iris_cluster)
iris_pca[0:10]

array([[-2.68412563,  0.31939725, -0.02791483],
       [-2.71414169, -0.17700123, -0.21046427],
       [-2.88899057, -0.14494943,  0.01790026],
       [-2.74534286, -0.31829898,  0.03155937],
       [-2.72871654,  0.32675451,  0.09007924],
       [-2.28085963,  0.74133045,  0.16867766],
       [-2.82053775, -0.08946138,  0.25789216],
       [-2.62614497,  0.16338496, -0.02187932],
       [-2.88638273, -0.57831175,  0.02075957],
       [-2.6727558 , -0.11377425, -0.19763272]])

In [13]:
pca.explained_variance_ratio_

array([0.92461872, 0.05306648, 0.01710261])

### 2.2 ICA

In [14]:
ica = FastICA()
iris_ica = ica.fit_transform(iris_cluster)
iris_ica[0:10]

array([[ 0.11369237,  0.03062106,  0.02135024,  0.00241868],
       [ 0.10855025, -0.07938029,  0.0315086 , -0.00666514],
       [ 0.11016515, -0.02850761, -0.02916445, -0.01300413],
       [ 0.09837921, -0.03104032, -0.07189678,  0.02545228],
       [ 0.11209209,  0.06012077, -0.01635511,  0.00882567],
       [ 0.10341204,  0.12038116,  0.02122351, -0.02066626],
       [ 0.10273859,  0.02570093, -0.08556198, -0.02592626],
       [ 0.1067219 ,  0.01996159, -0.00664445,  0.02172212],
       [ 0.09888465, -0.07673952, -0.08367323,  0.00660026],
       [ 0.10530034, -0.03954967, -0.0037228 ,  0.05722642]])

In [16]:
# ica_mnist = FastICA()
# mnist_ica = ica_mnist.fit_transform(X_s)
# mnist_ica[0:10]

### 2.3 Randomized Projections

In [31]:
rca = GaussianRandomProjection(n_components = 3, random_state = 10)
iris_rca = rca.fit_transform(iris_cluster)
iris_rca[0:10]

array([[4.11612083, 0.6015643 , 0.14874258],
       [3.75587921, 0.7376894 , 0.19864978],
       [3.77393787, 0.56746606, 0.15299238],
       [3.47731458, 0.60382601, 0.21282672],
       [4.0805383 , 0.52411729, 0.13841426],
       [4.24330571, 0.60140827, 0.32308081],
       [3.68994422, 0.47004126, 0.22704169],
       [3.90872127, 0.62259493, 0.18357614],
       [3.33018664, 0.59989931, 0.2074915 ],
       [3.70843617, 0.7051775 , 0.14411262]])

In [32]:
rca.components_

array([[ 0.76879183,  0.41296651, -0.89223727, -0.00484042],
       [ 0.35872849, -0.41574159,  0.15329319,  0.06267052],
       [ 0.00247766, -0.10080548,  0.25000779,  0.69457395]])

In [30]:
# rca_mnist = GaussianRandomProjection(random_state = 10)
# mnist_rca = rca_mnist.fit_transform(X_s)
# mnist_rca[0:10]

### 2.4 Decison Tree Feature Selection

In [35]:
X_new = SelectKBest(chi2,k=3).fit_transform(iris_dataset['data'],iris_dataset['target'])
X_new[0:10]

array([[5.1, 1.4, 0.2],
       [4.9, 1.4, 0.2],
       [4.7, 1.3, 0.2],
       [4.6, 1.5, 0.2],
       [5. , 1.4, 0.2],
       [5.4, 1.7, 0.4],
       [4.6, 1.4, 0.3],
       [5. , 1.5, 0.2],
       [4.4, 1.4, 0.2],
       [4.9, 1.5, 0.1]])

## 3 Reproduce

In [39]:
estimator = KMeans(n_clusters=3)
estimator.fit(iris_pca)
label_pred = estimator.labels_

In [40]:
r = confusion_matrix(iris_dataset['target'],label_pred)
r

array([[ 0, 50,  0],
       [ 2,  0, 48],
       [36,  0, 14]], dtype=int64)

In [41]:
r[:,[1,2,0]]

array([[50,  0,  0],
       [ 0, 48,  2],
       [ 0, 14, 36]], dtype=int64)

In [42]:
estimator = KMeans(n_clusters=3)
estimator.fit(iris_ica)
label_pred = estimator.labels_

In [43]:
r = confusion_matrix(iris_dataset['target'],label_pred)
r

array([[ 0, 50,  0],
       [41,  0,  9],
       [19,  0, 31]], dtype=int64)

In [44]:
r[:,[1,0,2]]

array([[50,  0,  0],
       [ 0, 41,  9],
       [ 0, 19, 31]], dtype=int64)

In [45]:
estimator = KMeans(n_clusters=3)
estimator.fit(iris_rca)
label_pred = estimator.labels_
r = confusion_matrix(iris_dataset['target'],label_pred)
r

array([[50,  0,  0],
       [ 0, 48,  2],
       [ 0,  0, 50]], dtype=int64)

In [46]:
estimator = KMeans(n_clusters=3)
estimator.fit(X_new)
label_pred = estimator.labels_
r = confusion_matrix(iris_dataset['target'],label_pred)
r

array([[ 0, 50,  0],
       [ 2,  0, 48],
       [36,  0, 14]], dtype=int64)

In [47]:
r[:,[1,2,0]]

array([[50,  0,  0],
       [ 0, 48,  2],
       [ 0, 14, 36]], dtype=int64)

In [49]:
estimator1 = GaussianMixture(n_components=3)
estimator1.fit(iris_pca)
label_pred1 = estimator1.predict(iris_pca)
r1 = confusion_matrix(iris_dataset['target'],label_pred1)
r1

array([[ 0, 50,  0],
       [49,  0,  1],
       [ 6,  0, 44]], dtype=int64)

In [51]:
r1[:,[1,0,2]]

array([[50,  0,  0],
       [ 0, 49,  1],
       [ 0,  6, 44]], dtype=int64)

In [52]:
estimator1 = GaussianMixture(n_components=3)
estimator1.fit(iris_ica)
label_pred1 = estimator1.predict(iris_ica)
r1 = confusion_matrix(iris_dataset['target'],label_pred1)
r1

array([[50,  0,  0],
       [ 0, 39, 11],
       [ 0, 40, 10]], dtype=int64)

In [53]:
r1[:,[0,2,1]]

array([[50,  0,  0],
       [ 0, 11, 39],
       [ 0, 10, 40]], dtype=int64)

In [54]:
estimator1 = GaussianMixture(n_components=3)
estimator1.fit(iris_rca)
label_pred1 = estimator1.predict(iris_rca)
r1 = confusion_matrix(iris_dataset['target'],label_pred1)
r1

array([[ 0, 50,  0],
       [ 4,  0, 46],
       [50,  0,  0]], dtype=int64)

In [55]:
r1[:,[1,2,0]]

array([[50,  0,  0],
       [ 0, 46,  4],
       [ 0,  0, 50]], dtype=int64)

In [56]:
estimator1 = GaussianMixture(n_components=3)
estimator1.fit(X_new)
label_pred1 = estimator1.predict(X_new)
r1 = confusion_matrix(iris_dataset['target'],label_pred1)
r1

array([[ 0, 50,  0],
       [ 2,  0, 48],
       [49,  0,  1]], dtype=int64)

In [57]:
r1[:,[1,2,0]]

array([[50,  0,  0],
       [ 0, 48,  2],
       [ 0,  1, 49]], dtype=int64)

## 4 rerun neural network

In [80]:
X_train, X_test, Y_train, Y_test = train_test_split(iris_dataset['data'], iris_dataset['target'], test_size = 0.5, random_state = 0)
NN_model = MLPClassifier(hidden_layer_sizes = (100), random_state = 0, max_iter = 10000).fit(X_train, Y_train)
print('train data accuracy:', NN_model.score(X_train, Y_train))
print('test data accuracy:', NN_model.score(X_test, Y_test))

train data accuracy: 0.9866666666666667
test data accuracy: 0.9733333333333334


In [81]:
X_train, X_test, Y_train, Y_test = train_test_split(iris_pca, iris_dataset['target'], test_size = 0.5, random_state = 0)
NN_model = MLPClassifier(hidden_layer_sizes = (100), random_state = 0, max_iter = 10000).fit(X_train, Y_train)
print('train data accuracy:', NN_model.score(X_train, Y_train))
print('test data accuracy:', NN_model.score(X_test, Y_test))

train data accuracy: 0.9866666666666667
test data accuracy: 0.9333333333333333


In [82]:
X_train, X_test, Y_train, Y_test = train_test_split(iris_ica, iris_dataset['target'], test_size = 0.5, random_state = 0)
NN_model = MLPClassifier(hidden_layer_sizes = (100), random_state = 0, max_iter = 10000).fit(X_train, Y_train)
print('train data accuracy:', NN_model.score(X_train, Y_train))
print('test data accuracy:', NN_model.score(X_test, Y_test))

train data accuracy: 0.9866666666666667
test data accuracy: 0.88


In [83]:
X_train, X_test, Y_train, Y_test = train_test_split(iris_rca, iris_dataset['target'], test_size = 0.5, random_state = 0)
NN_model = MLPClassifier(hidden_layer_sizes = (100), random_state = 0, max_iter = 10000).fit(X_train, Y_train)
print('train data accuracy:', NN_model.score(X_train, Y_train))
print('test data accuracy:', NN_model.score(X_test, Y_test))

train data accuracy: 0.9866666666666667
test data accuracy: 0.9866666666666667


In [84]:
X_train, X_test, Y_train, Y_test = train_test_split(X_new, iris_dataset['target'], test_size = 0.5, random_state = 0)
NN_model = MLPClassifier(hidden_layer_sizes = (100), random_state = 0, max_iter = 10000).fit(X_train, Y_train)
print('train data accuracy:', NN_model.score(X_train, Y_train))
print('test data accuracy:', NN_model.score(X_test, Y_test))

train data accuracy: 0.9866666666666667
test data accuracy: 0.96


## 5 clustering + supervised learning

In [85]:
estimator = KMeans(n_clusters=3)
estimator.fit(iris_cluster)
label_pred = estimator.labels_

In [90]:
irirs_cluster1 = np.c_[iris_cluster,label_pred]

In [91]:
X_train, X_test, Y_train, Y_test = train_test_split(irirs_cluster1, iris_dataset['target'], test_size = 0.5, random_state = 0)
NN_model = MLPClassifier(hidden_layer_sizes = (100), random_state = 0, max_iter = 10000).fit(X_train, Y_train)
print('train data accuracy:', NN_model.score(X_train, Y_train))
print('test data accuracy:', NN_model.score(X_test, Y_test))

train data accuracy: 0.9866666666666667
test data accuracy: 0.9333333333333333


In [92]:
estimator1 = GaussianMixture(n_components=3)
estimator1.fit(iris_cluster)
label_pred1 = estimator1.predict(iris_cluster)

In [93]:
irirs_cluster2 = np.c_[iris_cluster,label_pred1]

In [94]:
X_train, X_test, Y_train, Y_test = train_test_split(irirs_cluster2, iris_dataset['target'], test_size = 0.5, random_state = 0)
NN_model = MLPClassifier(hidden_layer_sizes = (100), random_state = 0, max_iter = 10000).fit(X_train, Y_train)
print('train data accuracy:', NN_model.score(X_train, Y_train))
print('test data accuracy:', NN_model.score(X_test, Y_test))

train data accuracy: 0.9866666666666667
test data accuracy: 0.9733333333333334
