In [57]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from clean_data import *
from hmm import DiscreteDistr, GaussDistr, GaussMixDistr

In [58]:
# Load the five ADL recordings (ADL1..ADL5) of one subject.
# Sequences 1~3 are used for training, 4~5 for testing.
# NOTE(review): this comment originally said "person 4" while the code loads
# person 3 -- confirm which subject is intended.
person = 3
# Raw string for the separator regex: '\s' is not a valid Python escape
# sequence, so the non-raw form triggers a warning on Python 3. The runtime
# value of the pattern is unchanged.
sadl_n = [
    pd.read_table('data/S%d-ADL%d.dat' % (person, n), sep=r'\s+', header=None, dtype=float)
    for n in range(1, 6)
]

In [59]:
# Smooth data, time: col 0, features: col 1~36, labels: col 244 
winsize = 15
stepsize = 8

# train data
train_sample = np.empty((0, 36))
train_labels = np.empty((0))
train_len = []
for i in range(0, 3):
    features = moving_avg(sadl_n[i].iloc[:, 1:37], winsize, stepsize)
    labels = moving_vote_majority(sadl_n[i].iloc[:, 244], winsize, stepsize)
    train_sample = np.concatenate((train_sample, features), axis=0)
    train_len.append(features.shape[0])
    train_labels = np.concatenate( (train_labels, labels) )
train_len = np.array(train_len)

print "train_sample size: ", train_sample.shape
print "train_labels size: ", train_labels.shape
print "subsequence length: ", train_len, ". Sum of length: ", np.sum(train_len)

# test data
test_sample = np.empty((0, 36))
test_labels = np.empty((0))
test_len = []
for i in range(3, 5):
    features = moving_avg(sadl_n[i].iloc[:, 1:37], winsize, stepsize)
    labels = moving_vote_majority(sadl_n[i].iloc[:, 244], winsize, stepsize)
    test_sample = np.concatenate((test_sample, features), axis=0)
    test_len.append(features.shape[0])
    test_labels = np.concatenate( (test_labels, labels) )
test_len = np.array(test_len)  

print "test_sample size: ", test_sample.shape
print "test_labels size: ", test_labels.shape
print "subsequence length: ", test_len, ". Sum of length: ", np.sum(test_len)

train_sample size:  (11469, 36)
train_labels size:  (11469,)
subsequence length:  [4652 3478 3339] . Sum of length:  11469
test_sample size:  (6771, 36)
test_labels size:  (6771,)
subsequence length:  [3460 3311] . Sum of length:  6771


In [60]:
# Fill missing values
# `train` / `test` are the outputs of fill_missing (imported from clean_data) and are
# the feature matrices used by every later cell.
# NOTE(review): col_threshold and the trailing positional True are passed straight
# through; presumably the threshold is a per-column missing-value ratio -- confirm
# both against fill_missing's definition.
col_threshold = 0.5
train, test = fill_missing(train_sample, test_sample, col_threshold, True)

## Train gmm using raw features

In [61]:
# Train gmm using raw features

def make_outputdistr(train, train_labels, class2label, n_mix=None):
    """Fit one Gaussian-mixture output distribution per class label.

    For every label in `class2label`, the rows of `train` whose entry in
    `train_labels` equals that label are handed to search_num_mixtures, which
    selects the number of mixture components and returns the fitted model.

    Parameters
    ----------
    train : array of training features, indexable with a boolean row mask
    train_labels : array of labels, aligned with the rows of `train`
    class2label : sequence of labels; its order defines the order of the output
    n_mix : optional sequence of candidate mixture counts to search over;
        defaults to range(1, 10)

    Returns
    -------
    outputdistr_gmm : list with one fitted model per label, in class2label order
    outputdistr_stats : DataFrame indexed by label with columns
        'n_mixture' and 'mean_logprob' as returned by search_num_mixtures
    """
    if n_mix is None:
        n_mix = range(1, 10)
    outputdistr_stats = pd.DataFrame(index=class2label, columns=['n_mixture', 'mean_logprob'])
    outputdistr_gmm = []
    for i, label in enumerate(class2label):
        # Only the samples belonging to this label are used to fit its model.
        gmm, n_components, logprob = search_num_mixtures(train[train_labels == label], n_mix)
        outputdistr_gmm.append(gmm)
        outputdistr_stats.iloc[i, :] = [n_components, logprob]
    return outputdistr_gmm, outputdistr_stats

In [62]:
from sklearn.model_selection import KFold
def search_num_mixtures(features, n_mix):
    """Select the number of Gaussian mixtures by 3-fold cross-validation.

    For every candidate count in `n_mix`, a GaussMixDistr is initialised and
    trained on each training fold and scored by the mean of gmm.logprob on the
    held-out fold. The candidate with the highest score averaged over the
    folds is then refit on all of `features`.

    Parameters
    ----------
    features : 2-D array of samples, indexed as features[rows, :]
    n_mix : non-empty indexable sequence of candidate mixture counts

    Returns
    -------
    gmm_opt : the GaussMixDistr refit on all data with the selected count
    n_components : the selected number of gaussian mixtures
    likelihood_score : mean of gmm_opt.logprob over `features` (computed on
        the same data the model was refit on, not the cross-validation score)

    Raises
    ------
    ValueError : if `n_mix` is empty
    """
    if len(n_mix) == 0:
        raise ValueError("n_mix must contain at least one candidate number of mixtures")

    n_folds = 3
    # No random_state: the folds are not shuffled, so a seed has no effect, and
    # recent scikit-learn raises ValueError when random_state is set without
    # shuffle=True. The splitter does not depend on the candidate, so build it once.
    kf = KFold(n_splits=n_folds)

    mean_logprob = []  # mean held-out logP(Xt | gmm), one entry per candidate
    for N in n_mix:
        # K-fold cross validation
        likelihood_scores = []
        for train_idx, val_idx in kf.split(features):
            gmm = GaussMixDistr(gauss=N)
            gmm.init_by_data(features[train_idx, :])
            gmm.train(features[train_idx, :])
            logprob_x = gmm.logprob(features[val_idx, :])
            likelihood_scores.append(np.mean(logprob_x))
        mean_logprob.append(np.mean(likelihood_scores))

    # Refit gmm using all data with selected number of mixtures
    i_gmm = np.argmax(mean_logprob)
    n_components = n_mix[i_gmm]
    gmm_opt = GaussMixDistr(gauss=n_components)
    gmm_opt.init_by_data(features)
    gmm_opt.train(features)
    likelihood_score = np.mean(gmm_opt.logprob(features))

    return gmm_opt, n_components, likelihood_score

In [63]:
# Fit one GMM per class label on the missing-value-filled features (no normalization yet).
# The trailing expression displays the selected mixture count and mean log-probability per label.
outputdistr_gmm, outputdistr_stats = make_outputdistr(train, train_labels, [0, 101, 102, 103, 104, 105])
outputdistr_stats

Unnamed: 0,n_mixture,mean_logprob
0,7,-250.095
101,6,-228.8917
102,9,-221.7732
103,4,-223.4718
104,5,-239.5752
105,9,-232.8739


In [64]:
# Evaluate gmm on test data using a naive Bayes classifier
# argmax over label of P(label | X_t) = P(label, X_t) / P(X_t), which is proportional to P(X_t | label) * P(label)

def naive_bayes_gmm_predict(val, gmms, priors, class2label):
    """Assign each sample the label with the largest log-posterior.

    Input
    val: samples to classify; val.shape[0] is the number of samples
    gmms: list of gmm distributions; the whole list is handed to
          gmms[0].logprob together with the samples
    priors: [n_states, ] prior probabilities
    class2label: indexable, maps a winning row index to its label
    Returns:
    list of labels, one per sample
    """
    sample_count = val.shape[0]
    # Log-likelihood table; assumed to be laid out as (n_states, n_samples)
    # with rows in the order of `gmms` -- TODO confirm against hmm.logprob.
    loglik = gmms[0].logprob(val, gmms)
    # Replicate the log-prior column once per sample so it lines up with the table.
    log_prior_column = np.log(priors)[:, np.newaxis]
    log_prior = np.tile(log_prior_column, (1, sample_count))
    # Winning row per sample (maximum a posteriori).
    best_rows = np.argmax(loglik + log_prior, axis=0)
    predicted = []
    for row in best_rows:
        predicted.append(class2label[row])
    return predicted

def compute_priors(train_labels, class2label):
    """Return the relative frequency of every label in `train_labels`.

    train_labels: array of labels (compared element-wise against each label)
    class2label: indexable sequence of labels; the result follows its order
    Returns: list with, per label, count(label) / len(train_labels)
    """
    priors = []
    for k in range(len(class2label)):
        matches = np.sum(train_labels == class2label[k])
        priors.append(matches / float(len(train_labels)))
    return priors

In [65]:
# Evaluate the GMMs trained on the unnormalized features.
# Priors are estimated from the training labels only; accuracy is the fraction of
# samples whose predicted label equals the ground-truth label.
class2label = [0, 101, 102, 103, 104, 105]
priors = compute_priors(train_labels, class2label)
print "priors: ", priors
train_label_pred = naive_bayes_gmm_predict(train, outputdistr_gmm, priors, class2label)
print "accuracy on train data: ", np.sum(train_labels == train_label_pred) / float(len(train_label_pred))

test_label_pred = naive_bayes_gmm_predict(test, outputdistr_gmm, priors, class2label)
print "accuracy on test data: ", np.sum(test_labels == test_label_pred) / float(len(test_label_pred))


priors:  [0.14962071671462202, 0.094254076205423309, 0.16235068445374487, 0.2370738512511989, 0.096085098962420443, 0.26061557241259048]
accuracy on train data:  0.714186066789
accuracy on test data:  0.590902377788


## Train gmm using feature normalization and dimension reduction

In [66]:
# Normalize features
from sklearn.preprocessing import StandardScaler
scalar = StandardScaler() # center to mean and normalize to unit variance
# The scaling statistics are fit on the training data only; the test data is
# transformed with those same statistics.
train_normalized = scalar.fit_transform(train)
test_normalized = scalar.transform(test)

In [67]:
# Train gmm using normalized features
# One GMM per class label; the trailing expression displays the selected
# mixture count and mean log-probability per label.
gmms_normalized, gmms_normalized_stats = make_outputdistr(train_normalized, train_labels, [0, 101, 102, 103, 104, 105])
gmms_normalized_stats

Unnamed: 0,n_mixture,mean_logprob
0,5,-44.49249
101,3,-57.33226
102,8,-17.56501
103,4,-16.02311
104,3,-35.01792
105,7,-25.54102


In [68]:
# Evaluate the GMMs trained on normalized features.
# Reuses `priors` and `class2label` from the raw-feature evaluation cell.
train_label_pred = naive_bayes_gmm_predict(train_normalized, gmms_normalized, priors, class2label)
print "accuracy on train data: ", np.sum(train_labels == train_label_pred) / float(len(train_label_pred))

test_label_pred = naive_bayes_gmm_predict(test_normalized, gmms_normalized, priors, class2label)
print "accuracy on test data: ", np.sum(test_labels == test_label_pred) / float(len(test_label_pred))

accuracy on train data:  0.707908274479
accuracy on test data:  0.589573179737


In [69]:
# Dimension reduction
from sklearn.decomposition import PCA
# First pass: fit a PCA with all components, only to read the explained-variance ratios.
pca = PCA()
pca.fit(train_normalized)
var_thres = 0.95 # keep components to up to 95% total variance
# Count the leading components whose cumulative variance ratio is still below the
# threshold, plus one: the smallest component count whose cumulative ratio reaches var_thres.
n_comp = (pca.explained_variance_ratio_.cumsum() < var_thres).sum() + 1
# NOTE(review): the message below has typos ("compoments", "retrain" for "retain");
# left unchanged here because it is runtime output.
print "Keep %d compoments to retrain %f variance" % (n_comp, var_thres)

# Second pass: refit with exactly n_comp components on the training data and
# project both train and test data with that fitted model.
pca_train = PCA(n_components=n_comp)
pca_train.fit(train_normalized)
train_reduced = pca_train.transform(train_normalized)
test_reduced = pca_train.transform(test_normalized)
print "Size of reduced dimension training data: ", train_reduced.shape
print "Size of reduced dimension testing data: ", test_reduced.shape

Keep 21 compoments to retrain 0.950000 variance
Size of reduced dimension training data:  (11469, 21)
Size of reduced dimension testing data:  (6771, 21)


In [70]:
# Train one GMM per class label on the PCA-reduced features; the trailing
# expression displays the selected mixture count and mean log-probability per label.
gmms_reduced, gmms_reduced_stats = make_outputdistr(train_reduced, train_labels, [0, 101, 102, 103, 104, 105])
gmms_reduced_stats

Unnamed: 0,n_mixture,mean_logprob
0,5,-14.4257
101,5,-12.76097
102,3,-23.11768
103,4,-16.46831
104,2,-26.89163
105,3,-24.83486


In [71]:
# Evaluate the GMMs trained on PCA-reduced features.
# Reuses `priors` and `class2label` from the raw-feature evaluation cell.
train_label_pred = naive_bayes_gmm_predict(train_reduced, gmms_reduced, priors, class2label)
print "accuracy on train data: ", np.sum(train_labels == train_label_pred) / float(len(train_label_pred))

test_label_pred = naive_bayes_gmm_predict(test_reduced, gmms_reduced, priors, class2label)
print "accuracy on test data: ", np.sum(test_labels == test_label_pred) / float(len(test_label_pred))

accuracy on train data:  0.718720027901
accuracy on test data:  0.616452518092
