In [2]:
%matplotlib inline

import os
import tqdm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random

from scipy.signal import resample
from scipy.io.wavfile import read as wav_read
from scipy.fftpack import fft

from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import LabelKFold
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
people = []
signals = []
labels = []

pwd = 'ibs_true/'
files = os.listdir(pwd)
for f in files:
    w = wav_read(pwd + f)[1]
    if len(w) < 20000:
        # filter some short signals
        continue
    people.append(int(f.split('_')[0]))
    signals.append(w)
    labels.append(1)
    
pwd = 'ibs_false/'
files = os.listdir(pwd)
for f in files:
    w = wav_read(pwd + f)[1]
    if len(w) < 20000:
        # filter some short signals
        continue
    people.append(int(f.split('_')[0]))
    signals.append(w)
    labels.append(0)
    
people = np.array(people)
signals = np.array(signals)
labels = np.array(labels)

In [4]:
cv = LabelKFold(people, n_folds=20)

Разделим выборку (сид подобран так, чтобы баланс в целевой переменной сохранялся).

In [5]:
np.random.seed(10)
index = np.arange(0, 20)
index = np.random.permutation(index)
train_mask = np.array([i in index[:16] for i in cv.idxs])
test_mask = ~train_mask

In [6]:
train_data = signals[train_mask]
train_labels = labels[train_mask]
test_data = signals[test_mask]
test_labels = labels[test_mask]

In [7]:
np.mean(train_labels)

0.30023640661938533

In [8]:
np.mean(test_labels)

0.3392568659127625

In [9]:
slice_len = 1000

Подготовим выборку для тестирования обученной сети.

In [10]:
random.seed(123)

testX = []
testy = []
test_size = 1000

for j in range(test_size):
    i = random.randint(0, len(test_data) - 1)
    X = test_data[i].reshape((-1, 1))
    y = test_labels[i]
    
    slice_start = random.randint(0, len(X) - slice_len)
    slice_end = slice_start + slice_len
    slice_x = X[slice_start:slice_end]
    slice_x = np.array(slice_x, dtype=np.float32)
    slice_x -= np.mean(slice_x)
    slice_x = slice_x / (np.max(slice_x) + 1e-10)
    slice_y = y
    
    x_250 = resample(slice_x, 250).reshape((1, -1, 1))
    x_500 = resample(slice_x, 500).reshape((1, -1, 1))
    x_1000 = slice_x.reshape((1, -1, 1))
    
    testX.append([x_250, x_500, x_1000])
    testy.append(slice_y)

Генераторы батчей для сети:

In [11]:
def generate_slice(slice_len, data, labels):
    i = random.randint(0, len(data) - 1)
    X = data[i].reshape((-1, 1))
    y = labels[i]
    
    slice_start = random.randint(0, len(X) - slice_len)
    slice_end = slice_start + slice_len
    slice_x = X[slice_start:slice_end]
    slice_x = np.array(slice_x, dtype=np.float32)
    slice_x -= np.mean(slice_x)
    slice_x = slice_x / (np.max(slice_x) + 1e-10)
    
    return slice_x, y

In [12]:
def generator(batch_size, slice_len, data, labels):
    while True:
        batch_x = []
        batch_y = []
        
        for i in range(0, batch_size):
            x, y = generate_slice(slice_len, data, labels)
            batch_x.append(x)
            batch_y.append(y)
            
        y = np.array(batch_y)
        
        x_250 = np.array([resample(i, 250) for i in batch_x])
        x_500 = np.array([resample(i, 500) for i in batch_x])
        x = np.array([i for i in batch_x])
        yield ([x_250, x_500, x], y)

In [13]:
from keras.layers import Convolution1D, Dense, Dropout, Input, merge, GlobalMaxPooling1D
from keras.models import Model, load_model
from keras.optimizers import RMSprop, Adam, SGD
from keras.callbacks import EarlyStopping, ModelCheckpoint

Using TensorFlow backend.


Базовый блок сети, каждый из которых применяется на своём масштабе.

In [14]:
def get_base_model(input_len, fsize):
    input_seq = Input(shape=(input_len, 1))
    nb_filters = 150
    convolved = Convolution1D(nb_filters, fsize, border_mode="same", activation="tanh")(input_seq)
    processed = GlobalMaxPooling1D()(convolved)
    compressed = Dense(150, activation="tanh")(processed)
    compressed = Dropout(0.3)(compressed)
    compressed = Dense(150, activation="tanh")(compressed)
    model = Model(input=input_seq, output=compressed)            
    return model

In [15]:
input250_seq = Input(shape=(250, 1))
input500_seq = Input(shape=(500, 1))
input1000_seq = Input(shape=(1000, 1))
    
base_network250 = get_base_model(250, 4) # 4
base_network500 = get_base_model(500, 7) # 7
base_network1000 = get_base_model(1000, 10) # 10
embedding_250 = base_network250(input250_seq)
embedding_500 = base_network500(input500_seq)
embedding_1000 = base_network1000(input1000_seq)
    
merged = merge([embedding_250, embedding_500, embedding_1000], mode="concat")
out = Dense(1, activation='sigmoid')(merged)
model = Model(input=[input250_seq, input500_seq, input1000_seq], output=out)
    
# opt = RMSprop(lr=0.005, clipvalue=10**6)
opt = SGD(lr=0.001, momentum=0.9, nesterov=True)
# opt = Adam(lr=0.001)
model.compile(loss="binary_crossentropy", optimizer=opt)

In [16]:
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 250, 1)        0                                            
____________________________________________________________________________________________________
input_2 (InputLayer)             (None, 500, 1)        0                                            
____________________________________________________________________________________________________
input_3 (InputLayer)             (None, 1000, 1)       0                                            
____________________________________________________________________________________________________
model_1 (Model)                  (None, 150)           46050       input_1[0][0]                    
___________________________________________________________________________________________

In [17]:
nb_epoch = 100
earlyStopping = EarlyStopping(monitor='val_loss', patience=5, verbose=0, mode='auto')
modelCheckpoing = ModelCheckpoint('weights.hdf5')
samples_per_epoch = 10000

model.fit_generator(generator(batch_size=100, slice_len=slice_len, data=train_data, labels=train_labels), 
                    samples_per_epoch, 
                    nb_epoch, 
                    validation_data=generator(batch_size=100, slice_len=slice_len, data=test_data, labels=test_labels), 
                    nb_val_samples=1000,
                    callbacks=[earlyStopping, modelCheckpoing], 
                    verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100


<keras.callbacks.History at 0x1624608d0>

Тестирование на отложенной выборке:

In [18]:
pr = []
for j in range(len(testX)):
    pr.append(model.predict(testX[j])[0][0])
pr = np.array(pr)

In [22]:
1 - roc_auc_score(testy, pr)

0.62041115046329576

In [20]:
space = np.linspace(np.min(pr), np.max(pr), 100)
max([accuracy_score(testy, pr < thr) for thr in space])

0.65100000000000002

Подготовим преобразователь датасета:

In [23]:
weights = model.get_weights()

In [24]:
def base_model_preprocess(input_len, nb_filters, fsize, weights):
    input_seq = Input(shape=(input_len, 1))
    convolved = Convolution1D(nb_filters, fsize, border_mode="same", activation="tanh", 
                              weights=[weights[0], weights[1]])(input_seq)
    processed = GlobalMaxPooling1D()(convolved)
    compressed = Dense(150, activation="tanh", weights=[weights[2], weights[3]])(processed)
    compressed = Dropout(0.3)(compressed)
    compressed = Dense(150, activation="tanh", weights=[weights[4], weights[5]])(compressed)
    model = Model(input=input_seq, output=compressed) 
    return model

In [25]:
input250_seq = Input(shape=(250, 1))
input500_seq = Input(shape=(500, 1))
input1000_seq = Input(shape=(1000, 1))

model_250 = base_model_preprocess(250, 150, 4, weights[0:6])
model_500 = base_model_preprocess(500, 150, 7, weights[6:12])
model_1000 = base_model_preprocess(1000, 150, 10, weights[12:18])

merged = merge([model_250(input250_seq), model_500(input500_seq), model_1000(input1000_seq)], mode="concat")
preprocess = Model(input=[input250_seq, input500_seq, input1000_seq], output=merged)

In [26]:
def preprocess_data(size, data, labels):
    prep = np.empty((size, 450))
    prep_y = np.empty(size)
    for j in tqdm.tqdm_notebook(xrange(size)):
        i = random.randint(0, len(data) - 1)
        X = data[i].reshape((-1, 1))
        y = labels[i]

        slice_start = random.randint(0, len(X) - slice_len)
        slice_end = slice_start + slice_len
        slice_x = X[slice_start:slice_end]
        slice_x = np.array(slice_x, dtype=np.float32)
        slice_x -= np.mean(slice_x)
        slice_x = slice_x / (np.max(slice_x) + 1e-10)
        
        x_250 = resample(slice_x, 250).reshape((1, -1, 1))
        x_500 = resample(slice_x, 500).reshape((1, -1, 1))
        x_1000 = slice_x.reshape((1, -1, 1))

        prep[j] = preprocess.predict([x_250, x_500, x_1000])
        prep_y[j] = y
        
    return prep, prep_y

In [28]:
X_train, y_train = preprocess_data(10000, train_data, train_labels)
X_test, y_test = preprocess_data(10000, test_data, test_labels)





In [32]:
clf = GradientBoostingClassifier(max_depth=5)
clf.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=5,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False)

In [33]:
pr = clf.predict_proba(X_test)[:, 1]

In [35]:
roc_auc_score(y_test, pr)

0.63747215304560223

In [36]:
space = np.linspace(np.min(pr), np.max(pr), 100)
max([accuracy_score(y_test, pr < thr) for thr in space])

0.65890000000000004

Была идея усреднять показания внутри одного сигнала, но это даёт малый прирост.

In [82]:
window = 1000
step = 500

pr = []
for s in range(len(test_data)):
    count = int(float(len(test_data[s]) - window) / float(step))
    prob_sum = 0
    for j in range(count):
        slice_x = test_data[s][j*step:j*step+window].reshape((-1, 1))
        slice_x = np.array(slice_x, dtype=np.float32)
        slice_x -= np.mean(slice_x)
        slice_x = slice_x / (np.max(slice_x) + 1e-10)
        prob_sum += model.predict([np.array([i]) for i in multiscale(slice_x)])[0][0]
    pr.append(prob_sum / float(count))

print roc_auc_score(test_labels, pr)

0.352078239609


In [85]:
pr = np.array(pr)
space = np.linspace(np.min(pr), np.max(pr), 100)
max([accuracy_score(test_labels, pr < thr) for thr in space])

0.6607431340872375