In [1]:
import os
import numpy as np
import pandas as pd
import joblib
import pickle

from scipy.io.wavfile import read as wav_read

from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold

from keras.layers import Convolution1D, Dense, Dropout, Input, concatenate, GlobalMaxPooling1D
from keras.models import Model, load_model, save_model
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint

from utils import *

Using TensorFlow backend.


In [2]:
# Directory holding the signal index (list.txt) and the wav files.
SIGNAL_PATH = '../new_data/'
# Directory where the trained networks are saved.
MODEL_PATH = 'models/'
# Directory where the per-iteration train/test splits are saved.
INPUT_PATH = 'data/'

# Make sure both output directories exist before anything is written to them.
for out_dir in (MODEL_PATH, INPUT_PATH):
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

Read data

In [3]:
# Parallel lists, one entry per person listed in the index file:
#   people[k]  - person id
#   signals[k] - list of that person's signals (arrays returned by wav_read)
#   labels[k]  - integer class label
people = []
signals = []
labels = []

# list.txt is tab-separated with one header line. Column 0 is the person id,
# column 1 the label and columns 5+ the signal names; columns 2-4 are not used.
# The `with` block guarantees the file is closed, also when a wav file fails to load.
with open(SIGNAL_PATH + 'list.txt', 'r') as list_file:
    next(list_file, None)  # skip the header line
    for line in list_file:
        fields = line.strip().split('\t')
        if not fields[0]:
            # the first row without a person id ends the listing
            break
        people_id = int(fields[0])
        label = int(fields[1])
        # positive and negative recordings live in different sub-directories
        subdir = 'True/filtered/' if label else 'False/filtered/'
        tmp_signals = []
        for sig in fields[5:]:
            w = wav_read(SIGNAL_PATH + subdir + sig + '-1K.wav')[1]
            if len(w) < 20000:
                # filter some short signals
                continue
            tmp_signals.append(w)
        # a person is kept even when all of their signals were filtered out
        signals.append(tmp_signals)
        people.append(people_id)
        labels.append(label)

labels = np.array(labels)

Fix sizes and random seeds

In [4]:
# Windowing of a whole recording for the per-person evaluation: rows of
# `window` samples, taken once from the start and once shifted by `step`.
window, step = 1000, 500

# Width of one feature row per representation; used to reshape the averaged
# feature vector back into a single-row matrix before predict_proba.
prep_size, fft_size, wv_size = 150, 500, 1010

# Slice length passed to the batch generator during network training.
slice_len = 1000

In [5]:
# n_iter: how many times the data is re-split and a network is trained;
# n_fold: number of StratifiedKFold splits inside each iteration.
n_iter, n_fold = 5, 5

In [6]:
# One seed per iteration (list index = iteration number) for every seeded step.
# numpy seed set right before the network is built
seeds_nn_init = [34, 190, 405, 12, 23]
# numpy seed set before the person-level train/test shuffle
seeds_nn_tr = [120, 180, 222, 89, 130]
# numpy seed set before the random slices of each fold are drawn
seeds_ts = [50, 11, 123, 911, 55]
# random_state of the StratifiedKFold splitter
seeds_cv = [345, 12, 100, 735, 90]
# random_state of the random forest classifiers
seeds_rf = [45, 90, 449, 100, 23]

Split the data in half for MCNN training. The second half is used later in the experiments.

In [7]:
for it in xrange(n_iter):
    print 'start iter:', it+1
    np.random.seed(seeds_nn_tr[it])
    index = np.random.permutation(np.arange(len(signals)))
    train_count  = int(len(index) * 0.5)
    train_data   = [signals[i] for i in index[:train_count]]
    train_labels = [labels[i] for i in index[:train_count]]
    test_data    = [signals[i] for i in index[train_count:]]
    test_labels  = [labels[i] for i in index[train_count:]]
    print 'mean target train/test:', np.mean(train_labels), np.mean(test_labels)
    print 'count signal train/test:', sum([len(x) for x in train_data]), sum([len(x) for x in test_data])
    joblib.dump((train_data, test_data, train_labels, test_labels), 
                INPUT_PATH + 'data_iter_' + str(it))

start iter: 1
mean target train/test: 0.489208633094 0.507142857143
count signal train/test: 910 887
start iter: 2
mean target train/test: 0.496402877698 0.5
count signal train/test: 1032 765
start iter: 3
mean target train/test: 0.510791366906 0.485714285714
count signal train/test: 1008 789
start iter: 4
mean target train/test: 0.482014388489 0.514285714286
count signal train/test: 847 950
start iter: 5
mean target train/test: 0.525179856115 0.471428571429
count signal train/test: 812 985


Train 5 MCNNs, one for each training split.

In [99]:
for it in xrange(n_iter):
    print 'start iter:', it+1
    train_data, test_data, train_labels, test_labels = joblib.load(INPUT_PATH + 'data_iter_' + str(it))
        
    np.random.seed(seeds_nn_init[it])
    nn = make_network()
    earlyStopping = EarlyStopping(monitor='val_acc', patience=10)
    modelCheckpoing = ModelCheckpoint('tmp.hdf5', save_best_only=True, monitor='val_acc')
    
    nb_epoch        = 200
    steps_per_epoch = 10
    val_steps       = 10
    batch_size      = 500

    h = nn.fit_generator(generator=generator(batch_size=batch_size, slice_len=slice_len, data=train_data, labels=train_labels), 
                         validation_data=generator(batch_size=batch_size, slice_len=slice_len, data=test_data, labels=test_labels), 
                         steps_per_epoch=steps_per_epoch, 
                         epochs=nb_epoch, 
                         validation_steps=val_steps, 
                         callbacks=[earlyStopping, modelCheckpoing], 
                         verbose=0)
    ind = np.argmax(h.history['val_acc'])
    nn.load_weights('tmp.hdf5')
    save_model(nn, MODEL_PATH + 'nn_iter_' + str(it))
    print 'acc tr/val:', h.history['acc'][ind], h.history['val_acc'][ind]

start iter: 1
acc tr/val: 0.685000002384 0.66819999218
start iter: 2
acc tr/val: 0.708999997377 0.683000010252
start iter: 3
acc tr/val: 0.749599993229 0.679199999571
start iter: 4
acc tr/val: 0.727599990368 0.696399998665
start iter: 5
acc tr/val: 0.736799997091 0.686199998856


In [8]:
# result[iteration][fold][experiment name] -> metrics of that experiment.
# Folds are keyed by their integer index: the experiment loop addresses them
# as result[it][fold] with an integer fold counter, so string keys ('0', '1', ...)
# would raise KeyError on the first write.
result = {it: {fold: {} for fold in xrange(n_fold)} for it in xrange(n_iter)}

Run all experiments (random slices, averaged probabilities, averaged states) for every iteration and fold

In [55]:
for it in xrange(n_iter):
    print 'start iter:', it+1
    train_data, test_data, train_labels, test_labels = joblib.load(INPUT_PATH + 'data_iter_' + str(it))
    
    # load network
    nn = load_model(MODEL_PATH + 'nn_iter_' + str(it))

    # make preprocess model with trained weights
    weights = nn.get_weights()
    preprocess = make_preprocess(weights)
    
    # split other part 
    cv = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seeds_cv[it])
    fold = 0
    for tr, ts in cv.split(test_data, test_labels):
        print 'start fold:', fold+1

        train_data_2   = [test_data[t] for t in tr]
        train_labels_2 = np.array([test_labels[t] for t in tr])
        test_data_2    = [test_data[t] for t in ts]
        test_labels_2  = np.array([test_labels[t] for t in ts])
        
        # random slice test
        np.random.seed(seeds_ts[it])
        X_tr_slice, y_tr = make_dataset(50000, train_data_2, train_labels_2)
        X_ts_slice, y_ts = make_dataset(50000, test_data_2,  test_labels_2)
        X_tr_preprocess = preprocess_dataset(X_tr_slice)
        X_ts_preprocess = preprocess_dataset(X_ts_slice)
        X_tr_fft = fft_dataset(X_tr_slice)
        X_ts_fft = fft_dataset(X_ts_slice)
        X_tr_wv = wv_preprocess(X_tr_slice)
        X_ts_wv = wv_preprocess(X_ts_slice)
        
        pr = []
        for j in xrange(len(X_ts_slice)):
            pr.append(nn.predict(resample_slice(X_ts_slice[j]))[0][0])
        pr = np.array(pr)
        result[it][fold]['nn'] = calc_result(y_ts, pr)
        
        clf_prep = RandomForestClassifier(n_estimators=1000, criterion='entropy', n_jobs=-1, random_state=seeds_rf[it])
        clf_prep.fit(X_tr_preprocess, y_tr)
        pr = clf_prep.predict_proba(X_ts_preprocess)[:, 1]
        result[it][fold]['prep'] = calc_result(y_ts, pr)
        
        clf_fft = RandomForestClassifier(n_estimators=1000, criterion='entropy', n_jobs=-1, random_state=seeds_rf[it])
        clf_fft.fit(X_tr_fft, y_tr)
        pr = clf_fft.predict_proba(X_ts_fft)[:, 1]
        result[it][fold]['fft'] = calc_result(y_ts, pr)
        
        clf_wv = RandomForestClassifier(n_estimators=1000, criterion='entropy', n_jobs=-1, random_state=seeds_rf[it])
        clf_wv.fit(X_tr_wv, y_tr)
        pr = clf_wv.predict_proba(X_ts_wv)[:, 1]
        result[it][fold]['wv'] = calc_result(y_ts, pr)
        
        print 'random slice:', result[it][fold]['nn']['acc'], result[it][fold]['prep']['acc'], result[it][fold]['fft']['acc'], result[it][fold]['wv']['acc']
        
        # avg probabilities
        pr_nn, pr_prep, pr_fft, pr_wv = [], [], [], []
        for i in xrange(len(test_data_2)):
            people_prob_nn, people_prob_prep, people_prob_fft, people_prob_wv = [], [], [], []
            for j in xrange(len(test_data_2[i])):
                data = np.vstack((test_data_2[i][j][:window*(len(test_data_2[i][j])/window)].reshape((-1, window)),
                                  test_data_2[i][j][step:window*((len(test_data_2[i][j])-step)/window)+step].reshape((-1, window))))
                data = preprocess_signal_all(data)
                people_prob_nn.append(np.mean([nn.predict(resample_slice(data[y])) for y in xrange(len(data))]))
                people_prob_prep.append(np.mean(clf_prep.predict_proba(preprocess_dataset(data))[:, 1]))
                people_prob_fft.append(np.mean(clf_fft.predict_proba(fft_dataset(data))[:, 1]))
                people_prob_wv.append(np.mean(clf_wv.predict_proba(wv_preprocess(data))[:, 1]))
            pr_nn.append(np.mean(people_prob_nn))
            pr_prep.append(np.mean(people_prob_prep))
            pr_fft.append(np.mean(people_prob_fft))
            pr_wv.append(np.mean(people_prob_wv))
        result[it][fold]['nn_avg_prob']   = calc_result(test_labels_2, np.array(pr_nn))
        result[it][fold]['prep_avg_prob'] = calc_result(test_labels_2, np.array(pr_prep))
        result[it][fold]['fft_avg_prob']  = calc_result(test_labels_2, np.array(pr_fft))
        result[it][fold]['wv_avg_prob']   = calc_result(test_labels_2, np.array(pr_wv))
        print 'avg probs:', result[it][fold]['nn_avg_prob']['acc'], result[it][fold]['prep_avg_prob']['acc'], result[it][fold]['fft_avg_prob']['acc'], result[it][fold]['wv_avg_prob']['acc']
        
        # avg states
        pr_prep, pr_fft, pr_wv = [], [], []
        for i in xrange(len(test_data_2)):
            people_prob_prep, people_prob_fft, people_prob_wv = [], [], []
            for j in xrange(len(test_data_2[i])):
                data = np.vstack((test_data_2[i][j][:window*(len(test_data_2[i][j])/window)].reshape((-1, window)),
                                  test_data_2[i][j][step:window*((len(test_data_2[i][j])-step)/window)+step].reshape((-1, window))))
                data = preprocess_signal_all(data)
                people_prob_prep.append(clf_prep.predict_proba(np.mean(preprocess_dataset(data), axis=0).reshape((-1, prep_size)))[:, 1][0])
                people_prob_fft.append(clf_fft.predict_proba(np.mean(fft_dataset(data), axis=0).reshape((-1, fft_size)))[:, 1][0])
                people_prob_wv.append(clf_wv.predict_proba(np.mean(wv_preprocess(data), axis=0).reshape((-1, wv_size)))[:, 1][0])
            pr_prep.append(np.mean(people_prob_prep))
            pr_fft.append(np.mean(people_prob_fft))
            pr_wv.append(np.mean(people_prob_wv))
        result[it][fold]['prep_avg_states'] = calc_result(test_labels_2, np.array(pr_prep))
        result[it][fold]['fft_avg_states']  = calc_result(test_labels_2, np.array(pr_fft))
        result[it][fold]['wv_avg_states']   = calc_result(test_labels_2, np.array(pr_wv))
        print 'avg states:', result[it][fold]['prep_avg_states']['acc'], result[it][fold]['fft_avg_states']['acc'], result[it][fold]['wv_avg_states']['acc']
        
        fold += 1
        pickle.dump(result, open('result.pkl', 'wb'))
        print ''

start iter: 4
start fold: 1
random slice: 0.64834 0.6411 0.62896 0.66692
avg probs: 0.655172413793 0.689655172414 0.655172413793 0.793103448276
avg states: 0.620689655172 0.620689655172 0.51724137931

start fold: 2
random slice: 0.72976 0.63872 0.5927 0.66552
avg probs: 0.793103448276 0.689655172414 0.620689655172 0.793103448276
avg states: 0.586206896552 0.586206896552 0.51724137931

start fold: 3
random slice: 0.71846 0.71232 0.69698 0.61146
avg probs: 0.75 0.75 0.75 0.607142857143
avg states: 0.785714285714 0.892857142857 0.5

start fold: 4
random slice: 0.67252 0.66496 0.63598 0.6129
avg probs: 0.740740740741 0.703703703704 0.703703703704 0.62962962963
avg states: 0.666666666667 0.62962962963 0.518518518519

start fold: 5
random slice: 0.68318 0.69226 0.59904 0.65782
avg probs: 0.703703703704 0.740740740741 0.62962962963 0.703703703704
avg states: 0.703703703704 0.62962962963 0.518518518519



Results

In [10]:
# Summary of the random-slice experiments.
# Float initialisation keeps the column dtype stable when metrics are written,
# and .loc replaces the deprecated .ix indexer (removed in pandas 1.0).
table_result = pd.DataFrame(0.0, index=['FFT + RF', 'WV + RF', 'MCNN', 'MCNN + RF'],
                            columns=['AUC', 'Accuracy', 'Max accuracy'])

# (table row, key under which the experiment is stored in `result`)
for row_name, result_key in [('MCNN', 'nn'),
                             ('MCNN + RF', 'prep'),
                             ('FFT + RF', 'fft'),
                             ('WV + RF', 'wv')]:
    metrics = exctract_result(result, result_key)
    table_result.loc[row_name, 'AUC']          = metrics['auc']
    table_result.loc[row_name, 'Accuracy']     = metrics['acc']
    table_result.loc[row_name, 'Max accuracy'] = metrics['max_acc']

table_result

Unnamed: 0,AUC,Accuracy,Max accuracy
FFT + RF,0.689146,0.636525,0.64333
WV + RF,0.701787,0.642387,0.657521
MCNN,0.754368,0.680747,0.691955
MCNN + RF,0.739687,0.670424,0.679658


In [11]:
# Summary of the per-person experiments with averaged probabilities.
# Float initialisation keeps the column dtype stable when metrics are written,
# and .loc replaces the deprecated .ix indexer (removed in pandas 1.0).
table_result = pd.DataFrame(0.0, index=['FFT + RF', 'WV + RF', 'MCNN', 'MCNN + RF'],
                            columns=['AUC', 'Accuracy', 'Max accuracy'])

# (table row, key under which the experiment is stored in `result`)
for row_name, result_key in [('MCNN', 'nn_avg_prob'),
                             ('MCNN + RF', 'prep_avg_prob'),
                             ('FFT + RF', 'fft_avg_prob'),
                             ('WV + RF', 'wv_avg_prob')]:
    metrics = exctract_result(result, result_key)
    table_result.loc[row_name, 'AUC']          = metrics['auc']
    table_result.loc[row_name, 'Accuracy']     = metrics['acc']
    table_result.loc[row_name, 'Max accuracy'] = metrics['max_acc']

table_result

Unnamed: 0,AUC,Accuracy,Max accuracy
FFT + RF,0.751713,0.679951,0.701136
WV + RF,0.767171,0.699909,0.723832
MCNN,0.816246,0.712474,0.738491
MCNN + RF,0.811204,0.716933,0.75652


In [12]:
# Summary of the per-person experiments with averaged states (no plain MCNN row).
# Float initialisation keeps the column dtype stable when metrics are written,
# and .loc replaces the deprecated .ix indexer (removed in pandas 1.0).
table_result = pd.DataFrame(0.0, index=['FFT + RF', 'WV + RF', 'MCNN + RF'],
                            columns=['AUC', 'Accuracy', 'Max accuracy'])

# (table row, key under which the experiment is stored in `result`)
for row_name, result_key in [('MCNN + RF', 'prep_avg_states'),
                             ('FFT + RF', 'fft_avg_states'),
                             ('WV + RF', 'wv_avg_states')]:
    metrics = exctract_result(result, result_key)
    table_result.loc[row_name, 'AUC']          = metrics['auc']
    table_result.loc[row_name, 'Accuracy']     = metrics['acc']
    table_result.loc[row_name, 'Max accuracy'] = metrics['max_acc']

table_result

Unnamed: 0,AUC,Accuracy,Max accuracy
FFT + RF,0.750503,0.674733,0.684168
WV + RF,0.485195,0.505619,0.564791
MCNN + RF,0.775216,0.678878,0.703604


### DTW + kNN

Evaluated separately and on a smaller sample, because DTW is slow.

In [51]:
import mlpy # not from pip
import multiprocessing

In [15]:
def knn_predict_one(r):
    """Score one query series with DTW-based k nearest neighbours.

    `r` is a single tuple (X_train, y_train, x, n_neighbors) so the function
    can be used directly with Pool.map. Returns the mean of the training
    labels of the `n_neighbors` series with the smallest DTW distance to `x`.
    `y_train` is indexed with an index array, so it must be a numpy array.
    """
    train_series, train_targets, query, k = r
    distances = []
    for candidate in train_series:
        distances.append(mlpy.dtw_std(candidate, query))
    nearest = np.argsort(distances)[:k]
    return np.mean(train_targets[nearest])

def knn_predict(X_train, y_train, X_test, n_neighbors=5):
    """Score every series in X_test with DTW kNN, in parallel.

    Each test series becomes one task for knn_predict_one and the tasks are
    spread over a process pool with the default number of workers.

    Returns a numpy array with one score per test series, in the order of
    X_test.

    The pool is always cleaned up: on success it is closed and joined; if the
    map fails or is interrupted the workers are terminated before the
    exception is re-raised, so no worker processes are left behind.
    """
    tasks = [(X_train, y_train, x, n_neighbors) for x in X_test]
    pool = multiprocessing.Pool()
    try:
        pred = pool.map(knn_predict_one, tasks)
    except BaseException:
        # includes KeyboardInterrupt: stop the workers instead of leaking them
        pool.terminate()
        raise
    else:
        pool.close()
    finally:
        pool.join()
    return np.array(pred)

Make datasets

In [32]:
# Load the split explicitly instead of relying on whatever train_data /
# test_data an earlier cell left in the kernel. In a top-to-bottom run those
# variables hold the split of the last iteration, which is the one loaded here.
train_data, test_data, train_labels, test_labels = joblib.load(INPUT_PATH + 'data_iter_' + str(n_iter - 1))

# Much smaller samples than in the main experiments (10000 train / 1000 test slices).
np.random.seed(78)
X_train, y_train = make_dataset(10000, train_data, train_labels)
X_test,  y_test  = make_dataset(1000, test_data, test_labels)

In [33]:
%%time
# Score every test slice with the DTW kNN defined above; each score is the mean
# label of the nearest training slices. The recorded wall time is several hours.
y_pr = knn_predict(X_train, y_train, X_test)

CPU times: user 20.1 s, sys: 8.34 s, total: 28.4 s
Wall time: 8h 32min 45s


AUC

In [34]:
# AUC of the kNN scores (mean neighbour label) against the true slice labels.
roc_auc_score(y_test, y_pr)

0.66828172108538064

Accuracy

In [35]:
# Accuracy after thresholding the kNN score: positive only when strictly above 0.5.
accuracy_score(y_test, y_pr > 0.5)

0.60899999999999999

Max accuracy

In [37]:
# calc_max_acc comes from utils; its four return values are unpacked here as two
# accuracy/threshold pairs and the reported figure is the mean of the two accuracies.
# NOTE(review): what distinguishes the two pairs is defined in utils - confirm there.
acc_1, thr_1, acc_2, thr_2 = calc_max_acc(y_test, y_pr)
(acc_1 + acc_2) / 2.0

0.629