In [4]:
%matplotlib inline

import os
import tqdm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random

from scipy.signal import resample
from scipy.io.wavfile import read as wav_read
from scipy.fftpack import fft

from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import LabelKFold
from sklearn.feature_extraction.text import CountVectorizer

In [5]:
people = []
signals = []
labels = []

pwd = 'ibs_true/'
files = os.listdir(pwd)
for f in files:
    w = wav_read(pwd + f)[1]
    if len(w) < 20000:
        # filter some short signals
        continue
    people.append(int(f.split('_')[0]))
    signals.append(w)
    labels.append(1)
    
pwd = 'ibs_false/'
files = os.listdir(pwd)
for f in files:
    w = wav_read(pwd + f)[1]
    if len(w) < 20000:
        # filter some short signals
        continue
    people.append(int(f.split('_')[0]))
    signals.append(w)
    labels.append(0)
    
people = np.array(people)
signals = np.array(signals)
labels = np.array(labels)

In [6]:
cv = LabelKFold(people, n_folds=20)

Разделим выборку (сид подобран так, чтобы баланс в целевой переменной сохранялся).

In [7]:
np.random.seed(10)
index = np.arange(0, 20)
index = np.random.permutation(index)
train_mask = np.array([i in index[:16] for i in cv.idxs])
test_mask = ~train_mask

In [8]:
train_data = signals[train_mask]
train_labels = labels[train_mask]
test_data = signals[test_mask]
test_labels = labels[test_mask]

In [9]:
np.mean(train_labels)

0.30023640661938533

In [10]:
np.mean(test_labels)

0.3392568659127625

In [11]:
slice_len = 1000

Подготовим выборку для тестирования обученной сети.

In [12]:
random.seed(123)

testX = []
testy = []
test_size = 1000

for j in range(test_size):
    i = random.randint(0, len(test_data) - 1)
    X = test_data[i].reshape((-1, 1))
    y = test_labels[i]
    
    slice_start = random.randint(0, len(X) - slice_len)
    slice_end = slice_start + slice_len
    slice_x = X[slice_start:slice_end]
    slice_x = np.array(slice_x, dtype=np.float32)
    slice_x -= np.mean(slice_x)
    slice_x = slice_x / (np.max(slice_x) + 1e-10)
    slice_y = y
    
    x_250 = resample(slice_x, 250).reshape((1, -1, 1))
    x_500 = resample(slice_x, 500).reshape((1, -1, 1))
    x_1000 = slice_x.reshape((1, -1, 1))
    
    testX.append([x_250, x_500, x_1000])
    testy.append(slice_y)

Генераторы батчей для сети:

In [13]:
def generate_slice(slice_len, data, labels):
    i = random.randint(0, len(data) - 1)
    X = data[i].reshape((-1, 1))
    y = labels[i]
    
    slice_start = random.randint(0, len(X) - slice_len)
    slice_end = slice_start + slice_len
    slice_x = X[slice_start:slice_end]
    slice_x = np.array(slice_x, dtype=np.float32)
    slice_x -= np.mean(slice_x)
    slice_x = slice_x / (np.max(slice_x) + 1e-10)
    
    return slice_x, y

In [14]:
def generator(batch_size, slice_len, data, labels):
    while True:
        batch_x = []
        batch_y = []
        
        for i in range(0, batch_size):
            x, y = generate_slice(slice_len, data, labels)
            batch_x.append(x)
            batch_y.append(y)
            
        y = np.array(batch_y)
        
        x_250 = np.array([resample(i, 250) for i in batch_x])
        x_500 = np.array([resample(i, 500) for i in batch_x])
        x = np.array([i for i in batch_x])
        yield ([x_250, x_500, x], y)

In [15]:
from keras.layers import Convolution1D, Dense, Dropout, Input, merge, GlobalMaxPooling1D, MaxPooling1D
from keras.models import Model, load_model
from keras.optimizers import RMSprop, Adam, SGD
from keras.callbacks import EarlyStopping, ModelCheckpoint

Using TensorFlow backend.


Базовый блок сети, каждый из которых применяется на своём масштабе.

In [42]:
def get_base_model(input_len, fsize):
    input_seq = Input(shape=(input_len, 1))
    nb_filters = 50
    convolved = Convolution1D(nb_filters, fsize, border_mode="same", activation="tanh")(input_seq)
    processed = GlobalMaxPooling1D()(convolved)
    compressed = Dense(150, activation="tanh")(processed)
    compressed = Dropout(0.3)(compressed)
    compressed = Dense(150, activation="tanh")(compressed)
    model = Model(input=input_seq, output=compressed)            
    return model

In [43]:
input250_seq = Input(shape=(250, 1))
input500_seq = Input(shape=(500, 1))
input1000_seq = Input(shape=(1000, 1))
    
base_network250 = get_base_model(250, 15) # 4
base_network500 = get_base_model(500, 25) # 7
base_network1000 = get_base_model(1000, 35) # 10
embedding_250 = base_network250(input250_seq)
embedding_500 = base_network500(input500_seq)
embedding_1000 = base_network1000(input1000_seq)
    
merged = merge([embedding_250, embedding_500, embedding_1000], mode="concat")
merged = Dense(150, activation="tanh")(merged)
merged = Dropout(0.3)(merged)
out = Dense(1, activation='sigmoid')(merged)
model = Model(input=[input250_seq, input500_seq, input1000_seq], output=out)
    
# opt = RMSprop(lr=0.005, clipvalue=10**6)
opt = SGD(lr=0.001, momentum=0.9, nesterov=True)
# opt = Adam(lr=0.001)
model.compile(loss="binary_crossentropy", optimizer=opt)

In [44]:
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_35 (InputLayer)            (None, 250, 1)        0                                            
____________________________________________________________________________________________________
input_36 (InputLayer)            (None, 500, 1)        0                                            
____________________________________________________________________________________________________
input_37 (InputLayer)            (None, 1000, 1)       0                                            
____________________________________________________________________________________________________
model_18 (Model)                 (None, 150)           31100       input_35[0][0]                   
___________________________________________________________________________________________

In [22]:
nb_epoch = 100
earlyStopping = EarlyStopping(monitor='val_loss', patience=5, verbose=0, mode='auto')
modelCheckpoing = ModelCheckpoint('weights.hdf5')
samples_per_epoch = 10000

model.fit_generator(generator(batch_size=100, slice_len=slice_len, data=train_data, labels=train_labels), 
                    samples_per_epoch, 
                    nb_epoch, 
                    validation_data=generator(batch_size=100, slice_len=slice_len, data=test_data, labels=test_labels), 
                    nb_val_samples=1000,
                    callbacks=[earlyStopping, modelCheckpoing], 
                    verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100


<keras.callbacks.History at 0x1676caad0>

Тестирование на отложенной выборке:

In [23]:
model.load_weights('weights.hdf5')

In [24]:
pr = []
for j in range(len(testX)):
    pr.append(model.predict(testX[j])[0][0])
pr = np.array(pr)

In [25]:
roc_auc_score(testy, pr)

0.66419227099860167

In [27]:
space = np.linspace(np.min(pr), np.max(pr), 100)
max([accuracy_score(testy, pr > thr) for thr in space])

0.76200000000000001

Подготовим преобразователь датасета:

In [33]:
weights = model.get_weights()

In [49]:
def base_model_preprocess(input_len, nb_filters, fsize, weights):
    input_seq = Input(shape=(input_len, 1))
    convolved = Convolution1D(nb_filters, fsize, border_mode="same", activation="tanh", 
                              weights=[weights[0], weights[1]])(input_seq)
    processed = GlobalMaxPooling1D()(convolved)
    compressed = Dense(150, activation="tanh", weights=[weights[2], weights[3]])(processed)
    compressed = Dropout(0.3)(compressed)
    compressed = Dense(150, activation="tanh", weights=[weights[4], weights[5]])(compressed)
    model = Model(input=input_seq, output=compressed) 
    return model

In [50]:
len(weights)

22

In [52]:
input250_seq = Input(shape=(250, 1))
input500_seq = Input(shape=(500, 1))
input1000_seq = Input(shape=(1000, 1))

model_250 = base_model_preprocess(250, 50, 15, weights[0:6])
model_500 = base_model_preprocess(500, 50, 25, weights[6:12])
model_1000 = base_model_preprocess(1000, 50, 35, weights[12:18])

merged = merge([model_250(input250_seq), model_500(input500_seq), model_1000(input1000_seq)], mode="concat")
merged = Dense(150, activation="tanh", weights=[weights[18], weights[19]])(merged)

preprocess = Model(input=[input250_seq, input500_seq, input1000_seq], output=merged)

In [53]:
def preprocess_data(size, data, labels):
    prep = np.empty((size, 150))
    prep_y = np.empty(size)
    for j in tqdm.tqdm_notebook(xrange(size)):
        i = random.randint(0, len(data) - 1)
        X = data[i].reshape((-1, 1))
        y = labels[i]

        slice_start = random.randint(0, len(X) - slice_len)
        slice_end = slice_start + slice_len
        slice_x = X[slice_start:slice_end]
        slice_x = np.array(slice_x, dtype=np.float32)
        slice_x -= np.mean(slice_x)
        slice_x = slice_x / (np.max(slice_x) + 1e-10)
        
        x_250 = resample(slice_x, 250).reshape((1, -1, 1))
        x_500 = resample(slice_x, 500).reshape((1, -1, 1))
        x_1000 = slice_x.reshape((1, -1, 1))

        prep[j] = preprocess.predict([x_250, x_500, x_1000])
        prep_y[j] = y
        
    return prep, prep_y

In [54]:
X_train, y_train = preprocess_data(10000, train_data, train_labels)
X_test, y_test = preprocess_data(10000, test_data, test_labels)





In [68]:
clf = RandomForestClassifier(n_estimators=1000, criterion='entropy', n_jobs=-1)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=1000, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False)

In [69]:
pr = clf.predict_proba(X_test)[:, 1]

In [70]:
roc_auc_score(y_test, pr)

0.60274590797997207

In [71]:
space = np.linspace(np.min(pr), np.max(pr), 100)
max([accuracy_score(y_test, pr > thr) for thr in space])

0.68959999999999999

То есть результат хуже, чем у сетки.

Была идея усреднять показания внутри одного сигнала, но ничего хорошего пока не вышло:

In [111]:
window = 1000
step = 500

pr = []
for s in range(len(test_data)):
    count = int(float(len(test_data[s]) - window) / float(step))
    prob_sum = 0
    for j in range(count):
        slice_x = test_data[s][j*step:j*step+window].reshape((-1, 1))
        slice_x = np.array(slice_x, dtype=np.float32)
        slice_x -= np.mean(slice_x)
        slice_x = slice_x / (np.max(slice_x) + 1e-10)
        
        x_250 = resample(slice_x, 250).reshape((1, -1, 1))
        x_500 = resample(slice_x, 500).reshape((1, -1, 1))
        x_1000 = slice_x.reshape((1, -1, 1))
        
        prob_sum += model.predict([x_250, x_500, x_1000])[0][0]
    pr.append(prob_sum / float(count))

print roc_auc_score(test_labels, pr)

0.388310629875


In [112]:
pr = np.array(pr)
space = np.linspace(np.min(pr), np.max(pr), 100)
max([accuracy_score(test_labels, pr > thr) for thr in space])

0.67366720516962841

А теперь прикинем, что было бы, если бы работали с коэффициентами Фурье с таких же коротких окон.

In [102]:
random.seed(777)

trainX = []
trainy = []
train_size = 10000

for j in range(train_size):
    i = random.randint(0, len(train_data) - 1)
    X = train_data[i].reshape((-1, 1))
    y = train_labels[i]
    
    slice_start = random.randint(0, len(X) - slice_len)
    slice_end = slice_start + slice_len
    slice_x = X[slice_start:slice_end]
    slice_x = np.array(slice_x, dtype=np.float32)
    slice_x = np.abs(fft(slice_x)[:500]).reshape(-1)
    slice_y = y
    
    trainX.append(slice_x)
    trainy.append(slice_y)

In [103]:
random.seed(777)

testX = []
testy = []
test_size = 10000

for j in range(test_size):
    i = random.randint(0, len(test_data) - 1)
    X = test_data[i].reshape((-1, 1))
    y = test_labels[i]
    
    slice_start = random.randint(0, len(X) - slice_len)
    slice_end = slice_start + slice_len
    slice_x = X[slice_start:slice_end]
    slice_x = np.array(slice_x, dtype=np.float32)
    slice_x = np.abs(fft(slice_x)[:500]).reshape(-1)
    slice_y = y
    
    testX.append(slice_x)
    testy.append(slice_y)

In [104]:
clf = RandomForestClassifier(n_estimators=1000, criterion='entropy', n_jobs=-1)
clf.fit(trainX, trainy)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=1000, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False)

In [106]:
pr = clf.predict_proba(testX)[:, 1]

In [107]:
roc_auc_score(testy, pr)

0.69743460725431583

In [108]:
space = np.linspace(np.min(pr), np.max(pr), 100)
max([accuracy_score(y_test, pr > thr) for thr in space])

0.66569999999999996

Другое дело было, когда эти признки усреднялись для разных окон.