In [None]:
%matplotlib inline

import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from scipy.io.wavfile import read as wav_read
from scipy.fftpack import fft

from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import LabelKFold
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
def _load_labelled_signals(directory, label, min_len=20000):
    """Read every WAV file in ``directory`` and tag it with ``label``.

    File names must start with an integer person id followed by an
    underscore (``<person>_...``); the id is parsed with ``int``.

    Parameters
    ----------
    directory : str
        Folder containing the WAV files.
    label : int
        Class label attached to every recording found in ``directory``.
    min_len : int, optional
        Recordings with fewer samples than this are skipped.

    Returns
    -------
    (people, signals, labels) : tuple of three parallel lists
        Person id, raw sample array and class label of each kept recording.
    """
    people, signals, labels = [], [], []
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # recording order (and every seeded random draw that indexes into it)
    # reproducible across machines.
    for fname in sorted(os.listdir(directory)):
        samples = wav_read(os.path.join(directory, fname))[1]
        if len(samples) < min_len:
            # filter some short signals
            continue
        people.append(int(fname.split('_')[0]))
        signals.append(samples)
        labels.append(label)
    return people, signals, labels


people = []
signals = []
labels = []

# 'ibs_true/' recordings get label 1, 'ibs_false/' recordings get label 0.
for pwd, class_label in (('ibs_true/', 1), ('ibs_false/', 0)):
    dir_people, dir_signals, dir_labels = _load_labelled_signals(pwd, class_label)
    people.extend(dir_people)
    signals.extend(dir_signals)
    labels.extend(dir_labels)

people = np.array(people)
labels = np.array(labels)

# Recordings have different lengths, so store them in an explicit object
# array: np.array() on a ragged list raises ValueError on NumPy >= 1.24.
signal_array = np.empty(len(signals), dtype=object)
for idx, samples in enumerate(signals):
    signal_array[idx] = samples
signals = signal_array

In [None]:
import random
from scipy.signal import resample

In [None]:
# Assign every recording to one of 20 folds, using the person id as the
# grouping label (LabelKFold is documented to keep all samples that share a
# label in the same fold -- confirm against the installed scikit-learn).
# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20;
# the successor is sklearn.model_selection.GroupKFold, which does not expose
# the `idxs` attribute the next cell relies on.
cv = LabelKFold(people, n_folds=20)

In [None]:
# Boolean masks over the recordings: entries of cv.idxs below 16 go to
# training, everything else is held out.
# NOTE(review): assumes cv.idxs is the per-recording fold number -- confirm.
train_mask = cv.idxs < 16
test_mask = ~train_mask

In [None]:
def _scale_by_max(signal):
    """Return ``signal`` as float32, divided by its largest sample value.

    A recording whose maximum is exactly 0 is returned unscaled rather than
    divided by zero, which would fill it with inf/NaN.
    """
    scaled = np.asarray(signal, dtype=np.float32)
    # ndarray.max is vectorised; the builtin max() walks the array one
    # Python object at a time.
    peak = scaled.max()
    if peak == 0:
        return scaled
    return scaled / peak


normalised = [_scale_by_max(sig) for sig in signals]

# Recordings have different lengths: build the container as an explicit
# object array, because np.array() on a ragged list raises on NumPy >= 1.24.
signals = np.empty(len(normalised), dtype=object)
for idx, sig in enumerate(normalised):
    signals[idx] = sig
del normalised

In [None]:
# Apply the fold masks to the recordings and their labels in lockstep.
train_data, train_labels = signals[train_mask], labels[train_mask]
test_data, test_labels = signals[test_mask], labels[test_mask]

In [None]:
# Number of consecutive samples cut from a recording for one example.
# Must stay equal to the length of the largest model input, which is
# hard-coded as Input(shape=(1000, 1)) where the network is assembled.
slice_len = 1000

In [None]:
# Build a fixed evaluation set: 1000 random windows cut from the held-out
# recordings, each stored at three resolutions as [250-pt, 500-pt, original].
random.seed(123)

testX, testy = [], []

for j in range(1000):
    # Draw order (recording first, offset second) is part of the seeded
    # sequence and must not change.
    rec_idx = random.randint(0, len(test_data) - 1)
    column = test_data[rec_idx].reshape((-1, 1))

    offset = random.randint(0, len(column) - slice_len)
    window = column[offset:offset + slice_len]

    testX.append([resample(window, 250), resample(window, 500), window])
    testy.append(test_labels[rec_idx])

In [None]:
def generate_slice(slice_len, data=None, labels=None):
    """Cut one random window out of a randomly chosen recording.

    Parameters
    ----------
    slice_len : int
        Number of consecutive samples in the returned window.
    data : sequence of 1-D arrays, optional
        Recordings to sample from. Defaults to the global ``train_data``.
    labels : sequence, optional
        Label of each recording in ``data``. Defaults to the global
        ``train_labels``.

    Returns
    -------
    (slice_x, slice_y)
        ``slice_x`` is a ``(slice_len, 1)`` array; ``slice_y`` is the label
        of the recording the window was taken from.

    Raises
    ------
    ValueError
        If the chosen recording has fewer than ``slice_len`` samples.
    """
    if data is None:
        data = train_data
    if labels is None:
        labels = train_labels

    # Two draws from the global `random` state, in this order: recording
    # index first, then the window offset.
    rec_idx = random.randint(0, len(data) - 1)
    column = data[rec_idx].reshape((-1, 1))
    if len(column) < slice_len:
        raise ValueError(
            "recording %d has %d samples, fewer than slice_len=%d"
            % (rec_idx, len(column), slice_len)
        )

    offset = random.randint(0, len(column) - slice_len)
    return column[offset:offset + slice_len], labels[rec_idx]

In [None]:
def data_generator(batch_size, slice_len):
    """Endlessly yield random training batches at three resolutions.

    Every batch is assembled from ``batch_size`` calls to
    ``generate_slice(slice_len)``.

    Yields
    ------
    ([x_250, x_500, x], y)
        ``x`` stacks the raw windows, ``x_250`` / ``x_500`` stack the same
        windows resampled to 250 and 500 points, and ``y`` is the array of
        their labels.
    """
    while True:
        samples = [generate_slice(slice_len) for _ in range(batch_size)]
        windows = [window for window, _ in samples]
        targets = np.array([target for _, target in samples])

        x_250 = np.array([resample(window, 250) for window in windows])
        x_500 = np.array([resample(window, 500) for window in windows])
        x = np.array(windows)
        yield ([x_250, x_500, x], targets)

In [None]:
def val_generator(batch_size, slice_len):
    """Endlessly yield random validation batches cut from ``test_data``.

    Each example is a window of ``slice_len`` samples taken at a random
    offset from a randomly chosen recording in the global ``test_data``;
    its target is the matching entry of ``test_labels``.

    Yields
    ------
    ([x_250, x_500, x], y)
        The raw windows (``x``), the same windows resampled to 250 and 500
        points, and the array of labels.
    """
    while True:
        windows = []
        targets = []

        for _ in range(batch_size):
            # Recording index is drawn before the offset.
            rec_idx = random.randint(0, len(test_data) - 1)
            column = test_data[rec_idx].reshape((-1, 1))

            offset = random.randint(0, len(column) - slice_len)
            windows.append(column[offset:offset + slice_len])
            targets.append(test_labels[rec_idx])

        y = np.array(targets)

        x_250 = np.array([resample(window, 250) for window in windows])
        x_500 = np.array([resample(window, 500) for window in windows])
        x = np.array(windows)
        yield ([x_250, x_500, x], y)

In [None]:
# Sanity check: first evaluation window at its original resolution
# (index 2 of every testX entry is the un-resampled slice).
plt.plot(testX[0][2])
plt.title('Evaluation window 0: original resolution')
plt.xlabel('sample')
plt.ylabel('amplitude (scaled by recording max)');

In [None]:
# Same window after resampling to 250 points (index 0 of every testX entry).
plt.plot(testX[0][0])
plt.title('Evaluation window 0: resampled to 250 points')
plt.xlabel('sample')
plt.ylabel('amplitude (scaled by recording max)');

In [None]:
from keras.layers import Convolution1D, Dense, Dropout, Input, merge, GlobalMaxPooling1D
from keras.models import Model, load_model
from keras.optimizers import RMSprop, Adam, SGD

In [None]:
def get_base_model(input_len, fsize, nb_filters=150, dense_units=150, dropout=0.3):
    """Build the per-resolution embedding network.

    Architecture: 1-D convolution (tanh, "same" border) -> global max
    pooling over time -> Dense(tanh) -> Dropout -> Dense(tanh).

    Parameters
    ----------
    input_len : int
        Number of time steps of the single-channel input sequence.
    fsize : int
        Width of the convolution filters.
    nb_filters : int, optional
        Number of convolution filters.
    dense_units : int, optional
        Width of both dense layers, i.e. the size of the returned embedding.
    dropout : float, optional
        Dropout rate applied between the two dense layers.

    Returns
    -------
    keras.models.Model
        Model mapping an ``(input_len, 1)`` sequence to a ``dense_units``
        embedding.
    """
    input_seq = Input(shape=(input_len, 1))
    convolved = Convolution1D(nb_filters, fsize, border_mode="same", activation="tanh")(input_seq)
    # Keep only the strongest response of each filter across the sequence.
    processed = GlobalMaxPooling1D()(convolved)
    compressed = Dense(dense_units, activation="tanh")(processed)
    compressed = Dropout(dropout)(compressed)
    compressed = Dense(dense_units, activation="tanh")(compressed)
    return Model(input=input_seq, output=compressed)

In [None]:
# One single-channel input per resolution of the same window.
# NOTE(review): 1000 is hard-coded here and must agree with slice_len.
input250_seq = Input(shape=(250, 1))
input500_seq = Input(shape=(500, 1))
input1000_seq = Input(shape=(1000, 1))
    
# A separate embedding network per resolution; the filter width passed as
# the second argument grows with the input length (4, 7, 10).
base_network250 = get_base_model(250, 4)
base_network500 = get_base_model(500, 7)
base_network1000 = get_base_model(1000, 10)
embedding_250 = base_network250(input250_seq)
embedding_500 = base_network500(input500_seq)
embedding_1000 = base_network1000(input1000_seq)
    
# Join the three embeddings and map them to one sigmoid output, matching
# the 1 / 0 labels assigned when the recordings were loaded.
merged = merge([embedding_250, embedding_500, embedding_1000], mode="concat")
out = Dense(1, activation='sigmoid')(merged)
    
# Input order here is the order every generator must yield: [250, 500, 1000].
model = Model(input=[input250_seq, input500_seq, input1000_seq], output=out)
    
# Alternative optimizer kept for reference:
# opt = RMSprop(lr=0.005, clipvalue=10**6)
opt = SGD(lr=0.001, momentum=0.9)
model.compile(loss="binary_crossentropy", optimizer=opt)

In [None]:
# Print the layer-by-layer summary of the combined three-input model.
model.summary()

In [None]:
from keras.callbacks import EarlyStopping

# Training budget: at most 100 epochs of 10000 generated samples each.
nb_epoch = 100
samples_per_epoch = 10000

# Stop once val_loss has gone 10 epochs without improving.
# NOTE(review): validation uses only 100 freshly sampled windows per epoch
# (nb_val_samples below), so val_loss is a noisy stopping signal.
earlyStopping = EarlyStopping(monitor='val_loss', patience=10, verbose=0, mode='auto')

model.fit_generator(
    data_generator(batch_size=50, slice_len=slice_len),
    samples_per_epoch=samples_per_epoch,
    nb_epoch=nb_epoch,
    verbose=1,
    callbacks=[earlyStopping],
    validation_data=val_generator(batch_size=50, slice_len=slice_len),
    nb_val_samples=100,
)

In [None]:
def multiscale(chunk):
    """Return ``chunk`` at the three resolutions the model consumes.

    Returns
    -------
    list
        ``[chunk resampled to 250 points, chunk resampled to 500 points,
        chunk itself]`` -- the last element is the input object, not a copy.
    """
    views = [resample(chunk, target_len) for target_len in (250, 500)]
    views.append(chunk)
    return views

In [None]:
# Score the fixed evaluation windows in a single batched forward pass.
# Every testX entry already holds [250-pt, 500-pt, original] views of its
# window, so the resampling is not repeated; stacking position k across all
# entries gives the k-th model input.
batched_inputs = [np.array([views[k] for views in testX]) for k in range(3)]

# The model has one sigmoid output per window; column 0 is that score.
pr = list(model.predict(batched_inputs)[:, 0])

In [None]:
# ROC AUC of the predicted scores against the labels of the fixed
# evaluation windows (1000 windows drawn from the held-out recordings).
roc_auc_score(testy, pr)