In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from os import path
from scipy.cluster.vq import vq
import seaborn as sns
import os

In [2]:
DATA_PATH = os.path.join(os.getcwd(),'data/HMP_Dataset')
filelist = [os.path.join(DATA_PATH,f) for f in os.listdir('data/HMP_Dataset') if (not f.endswith(".txt") and (not f.endswith(".m")))] # I put my data in this folder
def generate_segments(data,time_unit):
    no_of_rows = np.shape(data)[0]  
    mod = no_of_rows % time_unit  
    if mod != 0:
        data_to_segment = np.array(data)[:-mod, :]  
    else:
        data_to_segment = np.array(data)
    vector_segment = data_to_segment.reshape(int(no_of_rows / time_unit),
                                             time_unit * 3)
    return pd.DataFrame(vector_segment)

def read_attribute_from_all_file(dir,time_unit):
    files = os.listdir(dir) 
    full_data_train = pd.DataFrame() 
    for file in files: 
        file_path = os.path.join(dir, file)  
        data = pd.read_csv(file_path, sep=" ", index_col=None, names=['x', 'y', 'z'],
                           skip_blank_lines=True).dropna() 
        segmented_data_train = generate_segments(data,
                                                 time_unit) 
        full_data_train = full_data_train.append(segmented_data_train,
                                                 ignore_index=True) 
    return full_data_train
def generate_vectors(n_cluster, time_unit):
    feature_vector = []
    for folder_path in filelist:
        vec = read_attribute_from_all_file(folder_path,time_unit)
        feature_vector.append(vec)
    return np.vstack(feature_vector)
def generate_classifier_feature(feature_vector, n_cluster, time_unit):
    k_means = KMeans(n_clusters=n_cluster, random_state=0).fit(feature_vector)
    train_classifier = pd.DataFrame()
    test_classifier = pd.DataFrame()
    for dir in filelist:
        train, test = create_feature_for_classifier(k_means, dir, n_cluster, time_unit)
        train_classifier = train_classifier.append(train)
        test_classifier = test_classifier.append(test)
    return train_classifier.append(test_classifier)
def create_feature_for_classifier(model, dir, n_cluster, time_unit):
    files = os.listdir(dir)
    train_per = int(0.67 * len(files))
    feature_train = pd.DataFrame()
    feature_test = pd.DataFrame()
    for file in files[:train_per]: 
        file_path = os.path.join(dir, file) 
        data = pd.read_csv(file_path, sep=" ", index_col=None, names=['x', 'y', 'z'], skip_blank_lines=True).dropna()
        segmented_data_train = generate_segments(data,
                                                 time_unit)

        assignment = vq(segmented_data_train,
                        model.cluster_centers_)[0]
        assignment_array = np.array(assignment)
        feature = [0 for s in
                   range(n_cluster + 1)]
        for i in assignment_array: 
            feature[i] += 1
        feature[n_cluster] = filelist.index(dir) + 1 
        feature_df = pd.DataFrame(np.array(feature).reshape(1, n_cluster + 1))
        feature_df.columns = range(1, n_cluster + 2) 
        feature_train = feature_train.append(feature_df) 
    for file in files[train_per:]:
        file_path = os.path.join(dir, file)
        data = pd.read_csv(file_path, sep=" ", index_col=None, names=['x', 'y', 'z'], skip_blank_lines=True).dropna()
        segmented_data_test = generate_segments(data,
                                                time_unit)
        assignment = vq(segmented_data_test,
                        model.cluster_centers_)[0] 
        assignment_array = np.array(assignment)
        feature = [0 for s in
                   range(0, n_cluster + 1)] 
        for i in assignment_array:
            feature[i] += 1 
        feature[n_cluster] = filelist.index(dir) + 1
        feature_df = pd.DataFrame(np.array(feature).reshape(1, n_cluster + 1))
        feature_df.columns = range(1, n_cluster + 2)
        feature_test = feature_test.append(feature_df)
    return feature_train, feature_test

In [31]:
ks = [160,320]
segment_lengths = [16,32]
for n_cluster in ks:
    for segment_l in segment_lengths :
        kf = KFold(3,shuffle = True)
        feature_vector = generate_vectors(n_cluster, segment_l)
        X = generate_classifier_feature(feature_vector, n_cluster, segment_l)
        acc = []
        for train_index,test_index in kf.split(X):
            random_forest_model = RandomForestClassifier(max_depth=32, random_state=8, n_estimators=90)
            random_forest_model.fit(X.iloc[train_index, :n_cluster], X.iloc[train_index, n_cluster])
            prediction = random_forest_model.predict(X.iloc[test_index, :n_cluster])
            acc.append(accuracy_score(X.iloc[test_index, n_cluster], prediction))
        print("Average accuracy achieved by 3-fold validation is " + str(np.mean(acc) * 100) + "%")
        print("Error rate for the classifier with k value " + str(n_cluster) +  " and segment length " + str(segment_l) + " is " + str((1-np.mean(acc))*100)+"%")

Average accuracy achieved by 3-fold validation is 75.0878989588667%
Error rate for the classifier with k value 160 and segment length 16 is 24.912101041133294%
Average accuracy achieved by 3-fold validation is 73.78093531319338%
Error rate for the classifier with k value 160 and segment length 32 is 26.219064686806615%
Average accuracy achieved by 3-fold validation is 74.97269158559482%
Error rate for the classifier with k value 320 and segment length 16 is 25.02730841440518%
Average accuracy achieved by 3-fold validation is 72.58320532514082%
Error rate for the classifier with k value 320 and segment length 32 is 27.41679467485918%


In [32]:
names = [f for f in os.listdir('data/HMP_Dataset') if (not f.endswith(".txt") and (not f.endswith(".m")))]
best_k = 320
best_l = 16
kf = KFold(3,shuffle = True)
feature_vector = generate_vectors(best_k,best_l)
X = generate_classifier_feature(feature_vector,best_k, best_l)
cms = []
acc = []
for train_index,test_index in kf.split(X):
    random_forest_model = RandomForestClassifier(max_depth=32, random_state=8, n_estimators=90)
    random_forest_model.fit(X.iloc[train_index, :best_k], X.iloc[train_index, best_k])
    prediction = random_forest_model.predict(X.iloc[test_index, :best_k])
    acc.append(accuracy_score(X.iloc[test_index, n_clust], prediction))
    cms.append(confusion_matrix(y_true = X.iloc[test_index, best_k]-1, y_pred = prediction-1,labels=range(14)))
plt.figure(figsize=(30,1))
ax = plt.subplot(1,1,1, frame_on=False) # no visible frame
ax.xaxis.set_visible(False)  # hide the x axis
ax.yaxis.set_visible(False)  # hide the y axis
pd.plotting.table(ax,data = pd.DataFrame(cms[best_fold_idx],columns= names,index = names))
plt.savefig('mytable.png')
plt.show()

NameError: name 'n_clust' is not defined

In [None]:
def generate_seperate_vectors(n_cluster, time_unit):
    feature_vector = []
    for folder_path in filelist:
        vec = read_attribute_from_all_file(folder_path,time_unit)
        feature_vector.append(vec.values)
    return feature_vector

In [None]:
feature_vector = generate_vectors(best_k, best_l)
seperate_vector = generate_seperate_vectors(best_k,best_l)
k_means = KMeans(n_clusters=n_cluster, random_state=0).fit(feature_vector)

In [None]:
fig = plt.figure(figsize=(30,20))
for i in range(len(filelist)):
    plt.subplot(4,4,i+1)
    assignment = vq(seperate_vector[i],
                    k_means.cluster_centers_)[0]
    fre = np.zeros(best_k)
    for j in assignment:
        fre[j] +=1
    fre = fre / len(os.listdir(filelist[i]))
    plt.bar(range(best_k),fre)
    plt.hist(assignment,bins = best_k,density=True)
    plt.figtext(.5, .9, "K value is 320",fontsize = 40)
    plt.title(names[i])
    plt.xlabel('cluster center')
    plt.ylabel('average frequency per file')
plt.savefig('histogram.png')

In [None]:
plt.figure(figsize=(30,1))
ax = plt.subplot(1,1,1, frame_on=False) # no visible frame
ax.xaxis.set_visible(False)  # hide the x axis
ax.yaxis.set_visible(False)  # hide the y axis
pd.plotting.table(ax,data = pd.DataFrame(cms[best_fold_idx],columns= names,index = names))
plt.savefig('mytable.png')
plt.show()

In [None]:
' '.join(['A', 'B', 'A', 'B', 'A', 'B', 'A']).count('A B A B A')

In [None]:
",but ,but".count(',but')

In [None]:
[i for i in range(1,4)]