In [17]:
## Trying different clustering methods on the extracted feature arrays
## More note can be found in the trial and test note.docx


from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.pipeline import make_pipeline
from time import time

import pandas as pd
import numpy as np


#import vectorized feature arrays for acts and scenes
act_Vect = pd.read_csv('VectorizedFeatures/AllComplied/AllAct_Features_Vectorize.txt',sep=',',header = None)
scene_Vect = pd.read_csv('VectorizedFeatures/AllComplied/AllScene_Features_Vectorize.txt',sep=',',header = None)

#import LSA reduced vectorized feature arrays for acts and scenes
act_Vect_LSA = pd.read_csv('LSA_VectorizedFeatures/Act_Features_Vectorize_LSA.txt',sep=',',header = None, \
                        dtype={'0': np.float64} )
scene_Vect_LSA = pd.read_csv('LSA_VectorizedFeatures/Scene_Features_Vectorize_LSA.txt',sep=',',header = None, \
                        dtype={'0': np.float64} )

#     explained_variance = svd.explained_variance_ratio_.sum()
#     print(explained_variance)



In [18]:
### K cluster the 5000 feature vectorized array, much better clustering than LSA reduced ones
true_k = 37
km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)

print("Clustering acts sparse data with %s" % km)
t0 = time()
km.fit(act_Vect)
print("done in %0.3fs" % (time() - t0))
print(km.labels_)

print("Clustering scene sparse data with %s" % km)
t0 = time()
km.fit(scene_Vect)
print(km.labels_)
print("done in %0.3fs" % (time() - t0))

Clustering acts sparse data with KMeans(copy_x=True, init='k-means++', max_iter=100, n_clusters=37, n_init=1,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)
done in 0.372s
[ 7  7  7  7  7  1  1  1  1  1  3  3  3  3  3  7  7  7  7  7 11 11 11 11  7
 35 35 35 35 35  8  8  8  8  8 19 19 19 19 19 20 20 20 20 20 16 16 16 16 16
 23 23 23 23 23 17 17 17 32 32 32 21 21 21 21 21  4  4  4  4  4  9  9  9  9
  9 27 27 27 27 27 15 15 15 15 15 36 36 36 36 36 26 26 26 26 26 26 29 29 34
 34 34 33 33 33 33 33  2  2  2  2  2  2  2 25 25 25 13 13 13 13 13  7 29 36
 36 36  6  6  6  6  6 25 25 25  6  6 18 18 18 18 18 30 30 30 30 30 28 28 28
 28 28  5  5  5  5  5 22 22 22 22 22  0  0  0 31 31 22 14 14 14 14 24 24 24
 24 24 12 12 12 12 12 10 10 10 10 10]
Clustering scene sparse data with KMeans(copy_x=True, init='k-means++', max_iter=100, n_clusters=37, n_init=1,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)
[27 27 27  1  1 

In [19]:
### K cluster the LSA reduced feature vectorized array

print("Clustering acts sparse LSA reduced data with %s" % km)
t0 = time()
km.fit(act_Vect_LSA)
print("done in %0.3fs" % (time() - t0))
print(km.labels_)

print("Clustering scene sparse data LSA reduced with %s" % km)
t0 = time()
km.fit(scene_Vect_LSA)
print(km.labels_)
print("done in %0.3fs" % (time() - t0))

Clustering acts sparse LSA reduced data with KMeans(copy_x=True, init='k-means++', max_iter=100, n_clusters=37, n_init=1,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)
done in 0.014s
[30 17  4 27 17  6  4 17 12 24 24  3 23 32 32 23 29 17 17  4  4 17 17 35 35
  6 29 35 35 18 35 36 21 21 36 36  0 24 12  6 35  0 33 12 12 18  0  0 28 24
 18 20  2  4 17  4  4  0 21 28 24 12 20 30  4 27 30 33 18 17 17 29  8 15 15
 36 24 33  0 24 33 24  6 29 27 17 17  5  3 22 16 32  3 10 30 11 11 22 31 19
  5 34  1 19 19 25  7 25 14  7  1 19  9 14 31  7 13 14 23 23 11 16 16 26 11
 16 26 26  1  1  1 25 25  5  9  9  1  7 33 33 12  6 35 18 12 12  6 17  2  4
 29  4 27  8 21  8 28 18 30  2 30 20 20 30 27 27  2 30 30  6 24 24 33 18 35
 29  4 29 17 17 17  2  2 10 10 10 20 30]
Clustering scene sparse data LSA reduced with KMeans(copy_x=True, init='k-means++', max_iter=100, n_clusters=37, n_init=1,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    