In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from numpy.random import seed

from tensorflow import set_random_seed

from keras.models import Sequential
from keras.layers import Dense, CuDNNGRU
from keras.optimizers import adam
from keras.callbacks import ModelCheckpoint

from tqdm import tqdm

Using TensorFlow backend.


In [2]:
%matplotlib inline

In [3]:
seed(639)
set_random_seed(5944)

In [4]:

print(os.listdir("../input"))

['unarch', 'lanl-features.zip', 'test.zip', 'sample_submission.csv', 'train.csv.zip', 'test', 'train']


In [7]:
float_data = pd.read_csv("../input/train/train.csv", dtype={"acoustic_data": np.float32, "time_to_failure": np.float32}).values

In [8]:
def extract_features(z):
     return np.c_[z.mean(axis=1), 
                  np.transpose(np.percentile(np.abs(z), q=[0, 50, 75, 100], axis=1)),
                  z.std(axis=1)]

In [26]:
def create_X(x, last_index=None, n_steps=150, step_length=1000):
    if last_index == None:
        last_index=len(x)
    
    print("\n")
    print("createX, x.shape:", x.shape)
    print("createX, last_index:", last_index)
    print("createX, n_steps:", n_steps)
    print("createX, step_length:", step_length)
    assert last_index - n_steps * step_length >= 0

    # Reshaping and approximate standardization with mean 5 and std 3.
    # ORIGINAL: I changed this becuase I got an No OpKernel was registered to support Op 'CudnnRNN' error
    #temp = (x[(last_index - n_steps * step_length):last_index].reshape(n_steps, -1) - 5 ) / 3
    # MY CHANGE: This doesn't fix things, I get the same errors
    temp = (x[(last_index - n_steps * step_length):last_index].reshape(n_steps, -1).astype(np.float32) - 5 ) / 3
    temp = x[(last_index - n_steps * step_length):last_index]
    print("createX, temp.shape before reshape:", temp.shape)
    temp = temp.reshape(n_steps, -1).astype(np.float32)
    print("create_X, temp.shape after reshape:", temp.shape)
    
    # Extracts features of sequences of full length 1000, of the last 100 values and finally also 
    # of the last 10 observations.
    print("createX, extract_features(temp).shape:", extract_features(temp).shape)
    print("createX, extract_features(temp[:, -step_length // 10:]).shape", extract_features(temp[:, -step_length // 10:]).shape)
    print("createX, extract_features(temp[:, -step_length // 100:]).shape", extract_features(temp[:, -step_length // 100:]).shape)
    print()
    result = np.c_[extract_features(temp),
                 extract_features(temp[:, -step_length // 10:]),
                 extract_features(temp[:, -step_length // 100:])]
    print("createX, result shape:", result.shape)
    print()
    '''
    return np.c_[extract_features(temp),
                 extract_features(temp[:, -step_length // 10:]),
                 extract_features(temp[:, -step_length // 100:])]
    '''
    return result

In [33]:
# Query "create_X" to figure out the number of features
n_features = create_X(float_data[0:150000]).shape[1]
print("Our RNN is based on %i features"% n_features)
    
# The generator endlessly selects "batch_size" ending positions of sub-time series. For each ending position,
# the "time_to_failure" serves as target, while the features are created by the function "create_X".
def generator(data, min_index=0, max_index=None, batch_size=16, n_steps=150, step_length=1000):
    if max_index is None:
        max_index = len(data) - 1
    print("\n")   
    print("generator, data.shape:", data.shape)
    while True:
        # Pick indices of ending positions
        rows = np.random.randint(min_index + n_steps * step_length, max_index, size=batch_size)
        print("generator, type(rows):", type(rows))
        print("generator, rows.shape:", rows.shape)
        print("generator rows:\n", rows)
         
        # Initialize feature matrices and targets
        samples = np.zeros((batch_size, n_steps, n_features))
        targets = np.zeros(batch_size, )
        print("generator, samples.shape:", samples.shape)
        print("generator, targets.shape:", targets.shape)
        
        for j, row in enumerate(rows):
            print("generator j: {}, row: {}".format(j, row))
            samples[j] = create_X(data[:, 0], last_index=row, n_steps=n_steps, step_length=step_length)
            print("generator, samples[{}].shape:".format(j), samples[j].shape)
            print("generator, row - 1:", row - 1)
            targets[j] = data[row - 1, 1]
            print("generator, targets[{}].shape:".format(j), targets[j].shape)
            print()
        yield samples, targets
        
batch_size = 32

# Position of second (of 16) earthquake. Used to have a clean split
# between train and validation
second_earthquake = 50085877
float_data[second_earthquake, 1]

# Initialize generators
# train_gen = generator(float_data, batch_size=batch_size) # Use this for better score
train_gen = generator(float_data, batch_size=batch_size, min_index=second_earthquake + 1)
valid_gen = generator(float_data, batch_size=batch_size, max_index=second_earthquake)



createX, x.shape: (150000, 2)
createX, last_index: 150000
createX, n_steps: 150
createX, step_length: 1000
createX, temp.shape before reshape: (150000, 2)
create_X, temp.shape after reshape: (150, 2000)
createX, extract_features(temp).shape: (150, 6)
createX, extract_features(temp[:, -step_length // 10:]).shape (150, 6)
createX, extract_features(temp[:, -step_length // 100:]).shape (150, 6)

createX, result shape: (150, 18)

Our RNN is based on 18 features


In [None]:
cb = [ModelCheckpoint("model.hdf5", save_best_only=True, period=3)]

model = Sequential()
model.add(CuDNNGRU(48, input_shape=(None, n_features)))
model.add(Dense(10, activation='relu'))
model.add(Dense(1))

model.summary()

In [None]:
model.compile(optimizer=adam(lr=0.0005), loss="mae")

history = model.fit_generator(train_gen,
                              steps_per_epoch=1000,
                              epochs=30,
                              verbose=0,
                              callbacks=cb,
                              validation_data=valid_gen,
                              validation_steps=200)

In [None]:
def perf_plot(history, what = 'loss'):
    x = history.history[what]
    val_x = history.history['val_' + what]
    epochs = np.asarray(history.epoch) + 1
    
    plt.plot(epochs, x, 'bo', label = "Training " + what)
    plt.plot(epochs, val_x, 'b', label = "Validation " + what)
    plt.title("Training and validation " + what)
    plt.xlabel("Epochs")
    plt.legend()
    plt.show()
    return None

perf_plot(history)

In [None]:
submission = pd.read_csv('../input/sample_submission.csv', index_col='seg_id', dtype={"time_to_failure": np.float32})

In [None]:
for i, seg_id in enumerate(tqdm(submission.index)):
  #  print(i)
    seg = pd.read_csv('../input/test/' + seg_id + '.csv')
    x = seg['acoustic_data'].values
    submission.time_to_failure[i] = model.predict(np.expand_dims(create_X(x), 0))

submission.head()

In [None]:
submission.to_csv('submission.csv')

In [5]:
pd.read_csv?

In [34]:
train_gen.__next__()



generator, data.shape: (629145480, 2)
generator, type(rows): <class 'numpy.ndarray'>
generator, rows.shape: (32,)
generator rows:
 [427141575 572587773 321268504 591703083 570683405 440058321 277957370
 173653120 225683999 436407911 454773149 262854533 353524447 331226096
 273846742 415008385 593772138 393169613 173531598 554572650 455304795
 355135602 518841530 401639256 521701670 312714309 296918499 453366533
 240063513 301911715 119834041 405811470]
generator, samples.shape: (32, 150, 18)
generator, targets.shape: (32,)
generator j: 0, row: 427141575


createX, x.shape: (629145480,)
createX, last_index: 427141575
createX, n_steps: 150
createX, step_length: 1000
createX, temp.shape before reshape: (150000,)
create_X, temp.shape after reshape: (150, 1000)
createX, extract_features(temp).shape: (150, 6)
createX, extract_features(temp[:, -step_length // 10:]).shape (150, 6)
createX, extract_features(temp[:, -step_length // 100:]).shape (150, 6)

createX, result shape: (150, 18)

gener

createX, result shape: (150, 18)

generator, samples[17].shape: (150, 18)
generator, row - 1: 393169612
generator, targets[17].shape: ()

generator j: 18, row: 173531598


createX, x.shape: (629145480,)
createX, last_index: 173531598
createX, n_steps: 150
createX, step_length: 1000
createX, temp.shape before reshape: (150000,)
create_X, temp.shape after reshape: (150, 1000)
createX, extract_features(temp).shape: (150, 6)
createX, extract_features(temp[:, -step_length // 10:]).shape (150, 6)
createX, extract_features(temp[:, -step_length // 100:]).shape (150, 6)

createX, result shape: (150, 18)

generator, samples[18].shape: (150, 18)
generator, row - 1: 173531597
generator, targets[18].shape: ()

generator j: 19, row: 554572650


createX, x.shape: (629145480,)
createX, last_index: 554572650
createX, n_steps: 150
createX, step_length: 1000
createX, temp.shape before reshape: (150000,)
create_X, temp.shape after reshape: (150, 1000)
createX, extract_features(temp).shape: (150, 6)
create

(array([[[ 5.12900019,  0.        ,  5.        , ...,  7.        ,
           8.        ,  2.0591259 ],
         [ 5.08500004,  0.        ,  5.        , ...,  7.75      ,
           8.        ,  2.29999995],
         [ 4.9749999 ,  0.        ,  5.        , ...,  6.75      ,
           8.        ,  1.84661853],
         ...,
         [ 5.31799984,  0.        ,  5.        , ...,  6.75      ,
           8.        ,  2.23606801],
         [ 5.50600004,  0.        ,  5.        , ...,  9.        ,
          11.        ,  2.72763634],
         [ 5.46700001,  0.        ,  5.        , ...,  6.75      ,
          10.        ,  2.93428016]],
 
        [[ 4.08099985,  0.        ,  4.        , ...,  5.25      ,
           7.        ,  2.76405478],
         [ 4.40199995,  0.        ,  5.        , ..., 12.25      ,
          18.        ,  8.93532276],
         [ 4.80700016,  0.        ,  6.5       , ...,  9.75      ,
          13.        ,  3.55105615],
         ...,
         [ 4.05299997,  0.       