In [None]:
!kaggle competitions download -c ventilator-pressure-prediction

import zipfile
import os
DIR = ''
file = 'ventilator-pressure-prediction.zip'
with zipfile.ZipFile(os.path.join(DIR, file), 'r') as zip_ref:
    zip_ref.extractall()

In [None]:
import tensorflow.keras as keras
import os
import numpy as np
import pandas as pd
import sklearn as sk
import matplotlib.pyplot as plt
import tensorflow as tf
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import gc
# Notebook-wide pandas display limits. 'display.max_columns' ends up at 100,
# the value that was effective after the original sequence of calls.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.width', 1000),
    ('display.max_columns', 100),
):
    pd.set_option(option_name, option_value)

# Single scaler instance shared by the notebook: fitted on the training
# features and reused unchanged to transform the test features.
scaler = RobustScaler()

In [None]:
def read_prepare_csv(csv_name = 'train.csv', lo=-2, hi=8):
  """Load a competition CSV and derive the per-breath features fed to the model.

  Every cumulative, lag and rolling feature is computed inside its own
  ``breath_id`` group, so values never carry over from one breath to the next.

  Args:
    csv_name: Path of the CSV to read. The columns ``breath_id``,
      ``time_step``, ``u_in``, ``u_out``, ``R`` and ``C`` are required.
    lo: Unused. Kept only so existing calls that pass it keep working.
    hi: Largest lag, in rows. Lag and difference features are generated for
      every shift in ``1..hi`` for both ``u_in`` and ``u_out``.

  Returns:
    A DataFrame in the CSV's row order holding the original non-categorical
    columns, the engineered features, and one-hot columns for ``R``, ``C``
    and the combined ``R__C`` key (stored as ``uint8``).
  """
  df = pd.read_csv(csv_name)

  # Grouped views of the raw signals; these columns are never modified below,
  # so the views can be reused for every feature.
  u_in_by_breath = df.groupby('breath_id')['u_in']
  u_out_by_breath = df.groupby('breath_id')['u_out']
  time_by_breath = df.groupby('breath_id')['time_step']

  df['cross'] = df['u_in'] * df['u_out']
  df['cross2'] = df['time_step'] * df['u_out']
  df['area'] = (df['time_step'] * df['u_in']).groupby(df['breath_id']).cumsum()
  df['time_step_cumsum'] = time_by_breath.cumsum()
  df['time_step_diff'] = time_by_breath.diff().fillna(0)
  df['u_in_cumsum'] = u_in_by_breath.cumsum()
  # cumcount() is the 0-based row position inside the breath, so +1 is the
  # number of rows seen so far -- the denominator of the running mean.
  df['u_in_cummean'] = df['u_in_cumsum'] / (u_in_by_breath.cumcount() + 1)

  # Lagged values and the difference to them; rows with no predecessor inside
  # the breath get a lag value of 0. All u_in columns come before u_out ones.
  for column, grouped in (('u_in', u_in_by_breath), ('u_out', u_out_by_breath)):
    for shift in range(1, hi + 1):
      lag_name = '{}_lag_{}'.format(column, shift)
      df[lag_name] = grouped.shift(shift).fillna(0)
      df['{}_dff_{}'.format(column, shift)] = df[column] - df[lag_name]

  # Trailing-window statistics of u_in. min_periods=0 lets the first rows of a
  # breath use however many samples are available.
  for window in (4, 8):
    rolling = u_in_by_breath.rolling(window, min_periods=0)
    for stat in ('sum', 'mean', 'max', 'median'):
      values = getattr(rolling, stat)()
      # The grouped result is indexed by (breath_id, original index); dropping
      # the first level restores alignment with df.
      df['rolling_{}_{}'.format(stat, window)] = values.fillna(0).reset_index(level=0, drop=True)

  # R and C are treated as categories, individually and as a combined key.
  df['R'] = df['R'].astype(str)
  df['C'] = df['C'].astype(str)
  df['R__C'] = df['R'] + '__' + df['C']

  # Pin the indicator dtype: newer pandas defaults to bool, which makes the
  # frame mixed bool/float and forces an object-array detour when it is later
  # converted to a float32 matrix. uint8 is the historical default.
  return pd.get_dummies(df, dtype='uint8')

In [None]:
# Build the engineered training table once and reuse it on later runs.
if os.path.isfile('prepared.csv'):
  train_df = pd.read_csv('prepared.csv')
else:
  train_df = read_prepare_csv('train.csv')
  # Persist the result so the cache branch above can actually be taken next
  # time; index=False keeps the cached file's columns identical to train_df.
  train_df.to_csv('prepared.csv', index=False)

# NOTE: despite its name, `test` holds the regression target (pressure) of the
# training data, not the competition test set. The name is kept because later
# cells refer to it.
test = np.array(train_df['pressure'], dtype=np.float32)
train = np.array(train_df.drop(columns=['breath_id', 'id', 'pressure']), dtype=np.float32)

gc.collect()
# The scaler is fitted here on all training rows and reused for test.csv later.
train = scaler.fit_transform(train)
# Regroup the flat rows into sequences of 80 rows each; the reshape fails
# unless the row count is a multiple of 80.
train = train.reshape(-1, 80, train.shape[1])
test = test.reshape(-1, 80)

In [None]:
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.layers import Concatenate, LSTM, GRU
from tensorflow.keras.layers import Bidirectional, Multiply


def build_model(seed, shape=train.shape[-2:]):
    """Build and compile the stacked bidirectional LSTM/GRU regressor.

    A stack of five bidirectional LSTMs (768, 512, 384, 256, 128 units) is
    paired with a chain of bidirectional GRUs. From the third level on, each
    GRU reads the element-wise product of the matching LSTM output and the
    previous GRU output, batch-normalised. The last LSTM output and all GRU
    outputs are concatenated and passed through Dense(128, elu) -> Dense(1),
    giving one value per time step.

    Args:
        seed: Seed applied to both TensorFlow's and NumPy's global generators
            before any layer is created.
        shape: Per-sample input shape. The default is read from the global
            `train` array once, when this `def` statement is executed.

    Returns:
        A `keras.Model` named 'gb-vpp-model', compiled with Nadam (learning
        rate 0.001) and mean absolute error loss.
    """
    tf.random.set_seed(seed)
    np.random.seed(seed)

    def bidirectional(cell, units, tensor):
        # Sequence-to-sequence recurrent layer wrapped to run in both directions.
        return Bidirectional(cell(units=units, return_sequences=True))(tensor)

    def gated_gru(lstm_out, gru_out, units):
        # Multiply the two branches element-wise, normalise, then run a GRU.
        gated = Multiply()([lstm_out, gru_out])
        gated = BatchNormalization()(gated)
        return bidirectional(GRU, units, gated)

    x_input = Input(shape=shape)

    # LSTM stack. Layers are created in the same order as before so that
    # seeded initialisation and automatic layer naming stay the same.
    lstm_outputs = []
    current = x_input
    for width in (768, 512, 384, 256, 128):
        current = bidirectional(LSTM, width, current)
        lstm_outputs.append(current)
    _, x2, x3, x4, x5 = lstm_outputs

    # GRU chain; the unit counts are chosen so each product has matching widths.
    z2 = bidirectional(GRU, 384, x2)
    z3 = gated_gru(x3, z2, 256)
    z4 = gated_gru(x4, z3, 128)
    z5 = gated_gru(x5, z4, 64)

    merged = Concatenate(axis=2)([x5, z2, z3, z4, z5])
    hidden = Dense(units=128, activation='elu')(merged)
    x_output = Dense(units=1)(hidden)

    model = keras.Model(inputs=x_input, outputs=x_output, name='gb-vpp-model')
    model.compile(keras.optimizers.Nadam(.001), keras.losses.mean_absolute_error)
    return model

In [None]:
def scheduler(epoch, lr):
    """Return the learning rate to use for `epoch`, given the current rate `lr`.

    The rate is passed through unchanged while `epoch` is below 50; from epoch
    50 onwards the incoming rate is multiplied by 0.985.
    """
    return lr if epoch < 50 else lr * .985

seeds = [42, 1337, 666666]

# One checkpoint callback (and one output file) per seed. Sharing a single
# callback would make every model write to the same file and compare itself
# against the best score recorded for an earlier model.
# NOTE(review): the path assumes Google Drive is mounted at ./gdrive -- confirm.
callbacks=[]
for i in range(len(seeds)):
  callbacks.append(keras.callbacks.ModelCheckpoint(filepath='gdrive/MyDrive/' + 'model_{}.h5'.format(i), save_best_only=True))

gc.collect()
tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()
tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)
models = []
with tpu_strategy.scope():
  for index,seed in enumerate(seeds):
    # build_model also seeds NumPy's global generator, which the shuffle and
    # split below draw from because no random_state is passed to them.
    models.append(build_model(seed))
    print(len(models), index)
    train, test = shuffle(train, test)
    gc.collect()
    # Hold out 15% of the sequences for validation / checkpoint selection.
    X_train, X_test, y_train, y_test = train_test_split(train, test, test_size=0.15)
    X_train = tf.convert_to_tensor(X_train, dtype=tf.float32)
    y_train = tf.convert_to_tensor(y_train, dtype=tf.float32)
    X_test = tf.convert_to_tensor(X_test, dtype=tf.float32)
    y_test = tf.convert_to_tensor(y_test, dtype=tf.float32)
    lrscheduler = keras.callbacks.LearningRateScheduler(scheduler)
    # Use this model's own checkpoint callback so its best weights are kept
    # in model_<index>.h5.
    models[index].fit(X_train, y_train, batch_size=512, epochs=150, validation_data=(X_test, y_test), callbacks=[lrscheduler, callbacks[index]])

In [None]:
# Release the training arrays before the test features are built.
del train, test, X_train, X_test, y_train, y_test
gc.collect()

# Same feature pipeline as for training; the identifier columns are not model inputs.
test_df = read_prepare_csv('test.csv').drop(columns=['id', 'breath_id'])
test_vals = np.array(test_df, dtype=np.float32)
del test_df
gc.collect()

# Apply the already-fitted scaler (no refit), then regroup the rows into
# sequences of 80 rows each.
n_features = test_vals.shape[1]
test_vals = scaler.transform(test_vals).reshape(-1, 80, n_features)


In [None]:
# Collect one flattened prediction vector per trained model.
gc.collect()
preds = [model.predict(test_vals).reshape(-1) for model in models]

In [None]:
sub_a = np.median(preds, 0)
indices = np.arange(1, sub_a.shape[0] + 1)
sub_ = pd.DataFrame({'id': indices, 'pressure': sub_a})
sub_.to_csv("vote_submit_lstm.csv", index=False)
!kaggle competitions submit -c ventilator-pressure-prediction -f vote_submit_lstm.csv -m "vs"