In [1]:
import scipy.io as sio
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd

from sklearn.cluster import KMeans
from sklearn.metrics.cluster import adjusted_rand_score

In [2]:
# Load the MATLAB file once, then bind each stored variable to its own name.
DATA_ALL = sio.loadmat("subjects_40_v5.mat")

(SIG,            # raw sEMG signals
 FEAT,           # originally calculated features
 FEAT_N,         # normalized features
 LABEL,          # labels
 SUBJECT_ID,     # subject ID
 LABEL_VOWEL,
 VOWEL_REP,
 VFI) = (DATA_ALL[mat_key] for mat_key in ('DATA',
                                           'FEAT',
                                           'FEAT_N',
                                           'LABEL',
                                           'SUBJECT_ID',
                                           'LABEL_VOWEL',
                                           'VOWEL_REP',
                                           'SUBJECT_VFI'))

In [3]:
# Show which variables the .mat file holds, then the feature names stored
# at columns 1, 13, 25 and 37 of the feature-label array.
print(DATA_ALL.keys())
feature_names = DATA_ALL['FEAT_LABEl']
print(feature_names[:, [1, 13, 25, 37]])

dict_keys(['__header__', '__version__', '__globals__', 'DATA', 'FEAT', 'FEAT_LABEl', 'FEAT_N', 'I', 'LABEL', 'LABEL_VOWEL', 'SUBJECT_ID', 'SUBJECT_VFI', 'VOWEL_REP'])
[[array(['ZC_Ch1'], dtype='<U6') array(['ZC_Ch2'], dtype='<U6')
  array(['ZC_Ch3'], dtype='<U6') array(['ZC_Ch4'], dtype='<U6')]]


In [4]:
# Combine the per-subject arrays into single arrays with one row per sample.
def _stack_subjects(per_subject, num_columns):
    """Concatenate every subject's block of a loaded cell array along axis 0.

    Parameters
    ----------
    per_subject : ndarray
        Object array indexed as ``per_subject[i_sub, 0]``; each entry is a 2-D
        array with ``num_columns`` columns.
    num_columns : int
        Expected number of columns of every block.

    Returns
    -------
    ndarray
        All blocks stacked row-wise, in subject order.
    """
    blocks = [per_subject[i_sub, 0] for i_sub in range(per_subject.shape[0])]
    # The leading empty float block reproduces the start value of the former
    # np.append chain: same dtype promotion, same column-count check, and a
    # well-defined (0, num_columns) result if there are no subjects at all.
    # A single concatenate avoids re-copying the growing array per subject.
    return np.concatenate([np.zeros((0, num_columns))] + blocks, axis=0)


features_all = _stack_subjects(FEAT_N, 48)
labels_all   = _stack_subjects(LABEL, 1)
subject_id   = _stack_subjects(SUBJECT_ID, 1)
labels_vowel = _stack_subjects(LABEL_VOWEL, 1)
vowels_rep   = _stack_subjects(VOWEL_REP, 1)
subject_vfi  = _stack_subjects(VFI, 1)

num_data = np.size(features_all, 0)
labels_indices = np.arange(num_data)
print('Total # of samples: %d'%num_data)

Total # of samples: 6472


In [None]:
# Zero-crossing features of the four channels: columns 1, 13, 25 and 37 of
# the feature matrix (named ZC_Ch1..ZC_Ch4 in DATA_ALL['FEAT_LABEl']).
# The three-cluster analysis reads `features_freq`, so it has to be defined
# here for the notebook to run top to bottom on a fresh kernel.
features_freq = features_all[:, [1, 13, 25, 37]]
print(np.shape(features_freq))

In [None]:
# Re-order Mark's labels to be in the same sample sequence as the rest
notes_mark = pd.read_excel('Notes/acoustic_classifier_table.xls')

VOWEL_BY_CODE = {1: 'a', 2: 'u', 3: 'i'}   # vowel code -> letter used in the .wav names
MISSING_LABEL = -1                          # placeholder when no usable label exists

# Build the filename -> label lookup once instead of filtering the whole
# table for every sample.
label_by_filename = {}
for wav_name, wav_label in zip(notes_mark['Filename'], notes_mark['idx']):
    # Keep the first row if a filename appears more than once.
    label_by_filename.setdefault(wav_name, wav_label)

# Pre-allocating keeps labels_Mark aligned with the samples even when a
# recording has no row in the table (previously such samples were skipped,
# shifting every later label by one position).
labels_Mark = np.full(num_data, MISSING_LABEL, dtype=int)

for i in range(num_data):
    sid  = subject_id[i][0]
    vow  = labels_vowel[i][0]
    vrep = vowels_rep[i][0]

    if vow not in VOWEL_BY_CODE:
        # Previously this silently reused the vowel letter of the prior sample.
        raise ValueError('Unexpected vowel code %r for sample %d' % (vow, i + 1))
    v = VOWEL_BY_CODE[vow]

    # A recording is stored as R###_, R###S1_ or R###S2_; try them in that order.
    candidates = ['R%03d%s_' % (sid, session) + v + '_%02d.wav' % vrep
                  for session in ('', 'S1', 'S2')]
    filename = next((name for name in candidates if name in label_by_filename),
                    candidates[-1])
    label = label_by_filename.get(filename, np.nan)

    if np.isnan(label):
        # No row for this recording, or the row carries no label: report the
        # 1-based sample number and filename, and keep the placeholder.
        print(i + 1)
        print(filename)
        continue

    labels_Mark[i] = int(label)


## Three Clusters

In [None]:
num_cluster = 3

# `features_freq` is expected to be defined by an earlier cell.
X = features_freq

# Fixed seed (same value as the four-cluster section) so the cluster
# assignment is reproducible across kernel restarts.
random_state = 170

# Fit once. The model used to be fitted twice in a row, with the first
# result immediately overwritten by the second.
Y_pred = KMeans(n_clusters=num_cluster,
                random_state=random_state,
                verbose=1).fit_predict(X)

In [None]:
# Per-cluster report: share of all samples, vowel mix, Mark's labels,
# positive/negative balance and mean VFI. Each row pairs the exact print
# template with the label value that is counted for it.
vowel_rows = (('# of  /a/: %.2f%%', 1),
              ('# of  /u/: %.2f%%', 2),
              ('# of  /i/: %.2f%%', 3))
mark_rows = (('# of label 1 from Mark: %.2f%%', 1),
             ('# of label 2 from Mark: %.2f%%', 2),
             ('# of label 3 from Mark: %.2f%%', 3))
class_rows = (('Positive Samples: %.2f%% ', 1),
              ('Negative Samples: %.2f%% ', -1))

for i in range(num_cluster):
    idx = (Y_pred == i)
    n_in_cluster = idx.sum()

    print('K-Mean Cluster %d' % i)
    print('Percentage: %.2f%%' % (100 * n_in_cluster/np.size(Y_pred)))
    print('# of Samples: %d' % n_in_cluster)
    for template, code in vowel_rows:
        print(template % (100 * (labels_vowel[idx] == code).sum()/n_in_cluster))

    for template, code in mark_rows:
        print(template % (100 * (labels_Mark[idx] == code).sum()/n_in_cluster))

    for template, code in class_rows:
        print(template % (100 * (labels_all[idx] == code).sum()/n_in_cluster))

    print('Average VFI: %.2f ' % np.ravel(subject_vfi[idx]).mean())

    print()


## Plots


In [None]:
# (Removed a stray `y_idx` inspection: `y_idx` is first assigned in the
# plotting cell that follows, so evaluating it here raised NameError after
# Restart Kernel -> Run All.)

In [None]:
# Mean and standard deviation of VFI inside each K-Means cluster, drawn in
# order of increasing mean.
x = np.array([1, 2, 3])
labels =['Cluster 1', 'Cluster 2', 'Cluster 3']

y = np.zeros((3))
e = np.zeros((3))
for k in range(3):
    idx = (Y_pred == k)
    vfi_in_cluster = np.ravel(subject_vfi[idx])
    y[k] = np.mean(vfi_in_cluster)
    e[k] = np.std(vfi_in_cluster)

# Cluster indices ordered from lowest to highest mean VFI.
y_idx = np.argsort(y)

plt.figure(figsize=(8,6))
plt.errorbar(
    x,
    y[y_idx],
    e[y_idx],
    linestyle='None',
    marker='X',
    ecolor=['#1f77b4','#ff7f0e','#d62728'],
    elinewidth=2,
)

plt.xticks(x, labels, rotation='vertical')
plt.ylabel("VFI-1")

plt.show()

In [None]:
# Sort subjects based on their VFI-1
# (only needs to be run once per session)
num_subjects = SUBJECT_ID.shape[0]

# Each per-subject entry is a 2-D single-column array, so take element [0, 0]
# to get a true scalar. Indexing with [0] alone yields a 1-element array,
# and storing that into a scalar slot relies on an implicit conversion that
# newer NumPy releases deprecate.
subject_sort = np.array([SUBJECT_ID[i, 0][0, 0] for i in range(num_subjects)],
                        dtype=float)
vfi_sort     = np.array([VFI[i, 0][0, 0] for i in range(num_subjects)],
                        dtype=float)

# Ascending VFI order, applied to both arrays so they stay paired.
sort_idx = np.argsort(vfi_sort)

subject_sort = subject_sort[sort_idx]
vfi_sort = vfi_sort[sort_idx]

print(subject_sort)
print(vfi_sort)

In [None]:
# Largest of the three per-cluster mean VFI values (y_idx sorts y ascending).
y[y_idx[-1]]

In [None]:
%matplotlib inline
# Plot the stacked bar graph based on the sorted subject list.
# 'Cluster 1'..'Cluster 3' follow y_idx, i.e. increasing mean VFI.
num_subjects = len(subject_sort)
cluster_colors = ['#1f77b4', '#ff7f0e', '#d62728']
sample_count = np.zeros((num_subjects, 3))

labels = []
for i in range(num_subjects):

    sub = subject_sort[i]
    vfi = vfi_sort[i]
    labels.append('R%03d, VFI=%2d'%(sub, vfi))

    # The subject mask does not depend on the cluster, so build it once.
    idx_sub = np.ravel(subject_id == sub)
    for c in range(3):
        idx_cluster = np.ravel(Y_pred == y_idx[c])

        sample_count[i,c] = (idx_sub & idx_cluster).sum()

fig, ax = plt.subplots(figsize=(16,12))
# Each cluster's bars start where the previous cluster's bars end. Without
# `bottom` the three series are drawn on top of each other and the later
# ones hide the earlier ones.
bottom = np.zeros(num_subjects)
for c in range(3):
    ax.bar(labels, sample_count[:,c], 0.25, bottom=bottom,
           label='Cluster %d' % (c + 1), color=cluster_colors[c])
    bottom = bottom + sample_count[:,c]

ax.set_ylabel('Sample Counts')
ax.legend(loc='upper right')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Zoomed view: stacked per-cluster sample counts for entries 10-19 of the
# VFI-sorted subject list.
subject_slice = slice(10, 20)
cluster_colors = ['#1f77b4', '#ff7f0e', '#d62728']   # same colours as the full plot

fig, ax = plt.subplots(figsize=(16,12))
# Stack the series with `bottom`; drawn at the same x without it, the bars
# overlap and the later clusters hide the earlier ones.
bottom = np.zeros(len(labels[subject_slice]))
for c in range(3):
    counts = sample_count[subject_slice, c]
    ax.bar(labels[subject_slice], counts, 0.35, bottom=bottom,
           label='Cluster %d' % (c + 1), color=cluster_colors[c])
    bottom = bottom + counts

ax.set_ylabel('Sample Counts')
ax.legend(loc='upper right')

plt.show()

In [None]:
# Zoomed view: stacked per-cluster sample counts for entries 20-29 of the
# VFI-sorted subject list.
subject_slice = slice(20, 30)
cluster_colors = ['#1f77b4', '#ff7f0e', '#d62728']   # same colours as the full plot

fig, ax = plt.subplots(figsize=(16,12))
# Stack the series with `bottom`; drawn at the same x without it, the bars
# overlap and the later clusters hide the earlier ones.
bottom = np.zeros(len(labels[subject_slice]))
for c in range(3):
    counts = sample_count[subject_slice, c]
    ax.bar(labels[subject_slice], counts, 0.35, bottom=bottom,
           label='Cluster %d' % (c + 1), color=cluster_colors[c])
    bottom = bottom + counts

ax.set_ylabel('Sample Counts')
ax.legend(loc='upper right')

plt.show()

In [None]:
# Zoomed view: stacked per-cluster sample counts for entries 30-39 of the
# VFI-sorted subject list.
subject_slice = slice(30, 40)
cluster_colors = ['#1f77b4', '#ff7f0e', '#d62728']   # same colours as the full plot

fig, ax = plt.subplots(figsize=(16,12))
# Stack the series with `bottom`; drawn at the same x without it, the bars
# overlap and the later clusters hide the earlier ones.
bottom = np.zeros(len(labels[subject_slice]))
for c in range(3):
    counts = sample_count[subject_slice, c]
    ax.bar(labels[subject_slice], counts, 0.35, bottom=bottom,
           label='Cluster %d' % (c + 1), color=cluster_colors[c])
    bottom = bottom + counts

ax.set_ylabel('Sample Counts')
ax.legend(loc='upper right')

plt.show()

In [None]:
# Report, for each of Mark's labels (1..3): its share of all samples, the
# vowel mix, the K-Means cluster mix, the positive/negative balance and the
# mean VFI.
for i in range(3):
    idx = (labels_Mark == i+1)
    n_in_group = idx.sum()

    print('Mark Cluster %d' % (i+1))
    print('Percentage: %.2f%%' % (100 * n_in_group/np.size(labels_Mark)))
    # labels_vowel is a single-column 2-D array, so flatten the selection
    # before counting. Otherwise the count is a 1-element array rather than
    # a scalar when it reaches the '%.2f' format.
    vowel_in_group = np.ravel(labels_vowel[idx])
    print('# of  /a/: %.2f%%' % (100 * (vowel_in_group == 1).sum()/n_in_group))
    print('# of  /u/: %.2f%%' % (100 * (vowel_in_group == 2).sum()/n_in_group))
    print('# of  /i/: %.2f%%' % (100 * (vowel_in_group == 3).sum()/n_in_group))
    print('# of cluster 0 from K-Means: %.2f%%' % (100 * (Y_pred[idx] == 0).sum()/n_in_group))
    print('# of cluster 1 from K-Means: %.2f%%' % (100 * (Y_pred[idx] == 1).sum()/n_in_group))
    print('# of cluster 2 from K-Means: %.2f%%' % (100 * (Y_pred[idx] == 2).sum()/n_in_group))

    print('Positive Samples: %.2f%% ' % (100 * (np.ravel(labels_all[idx]) == 1).sum()/n_in_group))
    print('Negative Samples: %.2f%% ' % (100 * (np.ravel(labels_all[idx]) == -1).sum()/n_in_group))
    print('Average VFI: %.2f ' % (np.mean(np.ravel(subject_vfi[idx]))))

    print()

In [None]:
# Adjusted Rand index between the vowel labels and Mark's labels.
# `adjusted_rand_score` is already imported in the notebook's import cell,
# so it is not re-imported here.
adjusted_rand_score(np.ravel(labels_vowel), labels_Mark)

In [None]:
# Shape of the vowel labels once flattened to 1-D.
np.ravel(labels_vowel).shape

## Four Clusters

In [None]:
# Cluster the complete feature matrix into four groups with a fixed seed.
num_cluster = 4
random_state = 170

X = features_all

kmeans_model = KMeans(n_clusters=num_cluster, random_state=random_state)
Y_pred = kmeans_model.fit_predict(X)

In [None]:
# Per-cluster report in absolute counts: cluster size, vowel counts, counts
# of Mark's labels and positive/negative counts. Each row pairs the exact
# print template with the label value that is counted for it.
vowel_count_rows = (('# of  /a/: %d', 1),
                    ('# of  /u/: %d', 2),
                    ('# of  /i/: %d', 3))
mark_count_rows = (('# of label 1 from Mark: %d', 1),
                   ('# of label 2 from Mark: %d', 2),
                   ('# of label 3 from Mark: %d', 3))
class_count_rows = (('Positive Samples: %d ', 1),
                    ('Negative Samples: %d ', -1))

for i in range(num_cluster):
    idx = (Y_pred == i)
    n_in_cluster = idx.sum()

    print('Cluster %d' % i)
    print('Percentage: %.2f%%' % (100 * n_in_cluster/np.size(Y_pred)))
    print('# of Samples: %d' % n_in_cluster)
    for template, code in vowel_count_rows:
        print(template % (labels_vowel[idx] == code).sum())

    for template, code in mark_count_rows:
        print(template % (labels_Mark[idx] == code).sum())

    for template, code in class_count_rows:
        print(template % (labels_all[idx] == code).sum())
    print()


## Correlations with Acoustic Labels

In [None]:
# Toy example: two labelings that group the nine items identically but use
# different label values; show their Pearson correlation matrix.
x = [value for value in (1, 2, 3) for _ in range(3)]
y = [value for value in (3, 1, 2) for _ in range(3)]
np.corrcoef(x, y)

In [None]:
# Adjusted Rand index between the vowel labels and the acoustic labels.
# `adjusted_rand_score` is already imported in the notebook's import cell.
# labels_vowel is a single-column 2-D array, so it is flattened first (as in
# the earlier comparison against labels_Mark); the metric expects 1-D labels.
# NOTE(review): `label_acoustic` is not assigned in any visible cell -
# presumably it is the acoustic-classifier labels (`labels_Mark`); confirm
# and define it before this cell.
adjusted_rand_score(np.ravel(labels_vowel), label_acoustic.astype('int32'))

In [None]:
# Display the stacked per-sample subject IDs built in the feature-combination cell.
subject_id