# Transform CSVs to Keras Input

In [64]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
from pathlib import Path
import seaborn as sns
from tqdm.notebook import trange, tqdm

import wandb

from data import get_voltage_series

from keras.layers import Dense, LSTM
from keras.models import Sequential
from keras.utils import to_categorical

# Apply seaborn's default plot theme to matplotlib figures.
sns.set()

# Directory (relative to the notebook's working directory) that holds the CSV
# input and the pickled mapping files this notebook reads and writes.
DATA_DIR = Path('../data')

In [34]:
# Load the seawater readings. The first CSV column becomes the row index,
# leaving a 'voltage' column followed by one column per SW0_* series.
seawater = pd.read_csv(DATA_DIR.joinpath('seawater.csv'), index_col=0)
seawater

Unnamed: 0,voltage,SW0_0,SW0_1,SW0_2,SW0_3,SW0_4,SW0_5,SW0_6,SW0_7,SW0_8,...,SW0_70,SW0_71,SW0_72,SW0_73,SW0_74,SW0_75,SW0_76,SW0_77,SW0_78,SW0_79
0,1.000,-0.693472,-0.312375,-0.574770,-0.568522,-0.568522,-0.237405,-0.493552,-0.237405,-0.456067,...,-0.620375,-0.248150,-0.310188,-0.434263,-0.310188,-0.310188,-0.248150,-0.310188,-0.293632,-0.543532
1,0.996,-0.674730,-0.274890,-0.637245,-0.562275,-0.562275,-0.243652,-0.499800,-0.243652,-0.462315,...,-0.558338,-0.310188,-0.310188,-0.434263,-0.310188,-0.310188,-0.310188,-0.310188,-0.224910,-0.568522
2,0.992,-0.718462,-0.306127,-0.562275,-0.574770,-0.574770,-0.237405,-0.493552,-0.243652,-0.449820,...,-0.620375,-0.248150,-0.310188,-0.372225,-0.248150,-0.310188,-0.248150,-0.310188,-0.274890,-0.549780
3,0.988,-0.649740,-0.256147,-0.606007,-0.543532,-0.543532,-0.237405,-0.474810,-0.237405,-0.443572,...,-0.558338,-0.186113,-0.248150,-0.496300,-0.186113,-0.310188,-0.248150,-0.248150,-0.206167,-0.531037
4,0.984,-0.693472,-0.281137,-0.531037,-0.549780,-0.549780,-0.224910,-0.468562,-0.224910,-0.437325,...,-0.558338,-0.248150,-0.248150,-0.434263,-0.248150,-0.310188,-0.248150,-0.248150,-0.281137,-0.524790
5,0.980,-0.637245,-0.249900,-0.581017,-0.518542,-0.518542,-0.224910,-0.456067,-0.218662,-0.437325,...,-0.496300,-0.248150,-0.310188,-0.434263,-0.248150,-0.248150,-0.248150,-0.248150,-0.193672,-0.487305
6,0.976,-0.674730,-0.274890,-0.506047,-0.518542,-0.518542,-0.212415,-0.449820,-0.218662,-0.424830,...,-0.558338,-0.248150,-0.248150,-0.372225,-0.248150,-0.248150,-0.248150,-0.310188,-0.249900,-0.499800
7,0.972,-0.624750,-0.237405,-0.556027,-0.506047,-0.506047,-0.218662,-0.443572,-0.212415,-0.418582,...,-0.496300,-0.248150,-0.310188,-0.372225,-0.248150,-0.248150,-0.186113,-0.310188,-0.187425,-0.456067
8,0.968,-0.662235,-0.274890,-0.487305,-0.499800,-0.499800,-0.206167,-0.437325,-0.212415,-0.412335,...,-0.496300,-0.310188,-0.248150,-0.372225,-0.310188,-0.248150,-0.186113,-0.310188,-0.256147,-0.493552
9,0.964,-0.618502,-0.206167,-0.537285,-0.499800,-0.499800,-0.212415,-0.424830,-0.199920,-0.412335,...,-0.496300,-0.186113,-0.248150,-0.372225,-0.248150,-0.248150,-0.248150,-0.248150,-0.181177,-0.431077


# Save Index/Voltage Mappings

In [13]:
# Row index -> voltage, taken straight from the 'voltage' column.
index_to_voltage_mapping = seawater['voltage'].to_dict()

# Inverse lookup: voltage -> row index. If a voltage were repeated, the last
# row index seen would win.
# NOTE(review): the keys are raw floats (e.g. 0.9840000000000001), so exact
# lookups with a hand-typed voltage such as 0.984 can miss.
voltage_to_index_mapping = dict(
    zip(index_to_voltage_mapping.values(), index_to_voltage_mapping.keys())
)

In [17]:
# Persist both lookup tables next to the data so they can be reloaded later.
for pickle_name, mapping in (
    ('index_to_voltage_mapping.pkl', index_to_voltage_mapping),
    ('voltage_to_index_mapping.pkl', voltage_to_index_mapping),
):
    with open(DATA_DIR / pickle_name, 'wb') as f:
        pickle.dump(mapping, f)

In [18]:
# Round-trip check: read both pickled mappings back from disk.
# These files are written by this notebook; pickle.load must not be pointed
# at files from an untrusted source.
with (DATA_DIR / 'voltage_to_index_mapping.pkl').open('rb') as f:
    vi = pickle.load(f)  # voltage -> row index

with (DATA_DIR / 'index_to_voltage_mapping.pkl').open('rb') as f:
    iv = pickle.load(f)  # row index -> voltage

In [20]:
# Display the reloaded index -> voltage mapping to eyeball the round trip.
iv

{0: 1.0,
 1: 0.996,
 2: 0.992,
 3: 0.988,
 4: 0.9840000000000001,
 5: 0.98,
 6: 0.976,
 7: 0.972,
 8: 0.968,
 9: 0.9640000000000001,
 10: 0.96,
 11: 0.956,
 12: 0.9520000000000001,
 13: 0.948,
 14: 0.9440000000000001,
 15: 0.94,
 16: 0.936,
 17: 0.932,
 18: 0.928,
 19: 0.924,
 20: 0.92,
 21: 0.916,
 22: 0.912,
 23: 0.908,
 24: 0.904,
 25: 0.8999999999999999,
 26: 0.8959999999999999,
 27: 0.8919999999999999,
 28: 0.8879999999999999,
 29: 0.8839999999999999,
 30: 0.8799999999999999,
 31: 0.8759999999999999,
 32: 0.8719999999999999,
 33: 0.8679999999999999,
 34: 0.8639999999999999,
 35: 0.8599999999999999,
 36: 0.8559999999999999,
 37: 0.8519999999999999,
 38: 0.8479999999999999,
 39: 0.8439999999999999,
 40: 0.8399999999999999,
 41: 0.8359999999999999,
 42: 0.8319999999999999,
 43: 0.8279999999999998,
 44: 0.8239999999999998,
 45: 0.8199999999999998,
 46: 0.8159999999999998,
 47: 0.8119999999999998,
 48: 0.8079999999999998,
 49: 0.8039999999999998,
 50: 0.7999999999999998,
 51: 0.7959999

# Transform To Keras Input

## Using the Full Sequence as Input (length 1002)

In [38]:
# Shape after transposing and dropping the first ('voltage') row:
# one row per SW0_* series, one column per reading.
seawater.T.values[1:].shape

(80, 1002)

In [39]:
# Transpose so each series is a row, skip row 0 (the 'voltage' row), and
# append a trailing axis of size 1 so the array is 3-D for the LSTM.
full_input_arrays = seawater.T.values[1:, :, np.newaxis]

In [41]:
# Confirm the 3-D layout: (number of series, readings per series, 1).
full_input_arrays.shape

(80, 1002, 1)

In [101]:
def build_model(batch_size=10, seq_length=1002, num_features=1,
                num_classes=4, lstm_units=10):
    """Build and compile a small LSTM classifier.

    Called with no arguments, this keeps the original layer sizes and input
    shape: a 10-unit LSTM over batches of 10 sequences of 1002 single-feature
    steps, followed by a 4-way classification layer.

    Parameters
    ----------
    batch_size : int
        Fixed batch dimension baked into the LSTM's ``batch_input_shape``;
        ``model.fit`` must be called with the same ``batch_size``.
    seq_length : int
        Number of time steps per input sequence (e.g. 334 when each
        1002-step sequence is split into 3 chunks).
    num_features : int
        Number of features per time step.
    num_classes : int
        Number of mutually exclusive target classes (width of the one-hot
        labels).
    lstm_units : int
        Number of units in the LSTM layer.

    Returns
    -------
    keras.models.Sequential
        The compiled model.
    """
    model = Sequential([
        LSTM(lstm_units,
             batch_input_shape=(batch_size, seq_length, num_features)),
        # softmax, not sigmoid: the targets are one-hot (exactly one class
        # per sample) and categorical_crossentropy expects the outputs to
        # form a probability distribution over the classes.
        Dense(num_classes, activation='softmax')
    ])
    model.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])
    return model

In [72]:
# Placeholder labels: one random class in {0, 1, 2, 3} per input sequence,
# used only to check that the model trains end to end.
# A seeded generator keeps the labels identical across kernel restarts.
label_rng = np.random.RandomState(0)
y_train = label_rng.choice(4, size=len(full_input_arrays))

# Pin num_classes so the one-hot matrix always has 4 columns (matching the
# 4-unit output layer), even if the highest class is never drawn.
y_train_ohe = to_categorical(y_train, num_classes=4)

In [102]:
# Train a freshly built model on the full-length sequences.
# batch_size matches the batch dimension fixed in build_model's
# batch_input_shape; shuffle=False keeps the samples in their original order.
model = build_model()
model.fit(
    x=full_input_arrays,
    y=y_train_ohe,
    batch_size=10,
    epochs=3,
    shuffle=False,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.callbacks.History at 0x13cae2358>

## Not Using the Full Sequence Length as Input

In [75]:
# (rows, columns) of the raw frame: one row per reading, and one column for
# 'voltage' plus one per SW0_* series.
seawater.shape

(1002, 81)

In [79]:
# Transpose so each SW0_* series becomes a row, then skip row 0, which is
# the 'voltage' row rather than a measured series.
seawater_values_we_want_in_rows = seawater.T.iloc[1:]

In [80]:
# Expect (number of series, readings per series).
seawater_values_we_want_in_rows.shape

(80, 1002)

In [85]:
# Work out the 2-D shape obtained by cutting every full-length sequence into
# `num_chunks` equal, shorter sequences.
num_chunks = 3  # or 1, 2, 6 (these all divide 1002 exactly)

# Take both dimensions from the data instead of hard-coding the length.
num_full_length_sequences, SEQ_LENGTH = seawater_values_we_want_in_rows.shape

# Validate before dividing: a sequence can only be cut into equal chunks
# when num_chunks divides its length exactly.
assert SEQ_LENGTH % num_chunks == 0, (
    f'num_chunks={num_chunks} does not evenly divide SEQ_LENGTH={SEQ_LENGTH}'
)

# Each original row is divided into num_chunks rows, so the row count is
# multiplied by num_chunks while the column count is divided by it.
num_new_rows = num_full_length_sequences * num_chunks
num_new_cols = SEQ_LENGTH // num_chunks

final_shape = (num_new_rows, num_new_cols)
final_shape

(240, 334)

In [92]:
# Preview the first few series (rows) before they are split into chunks.
seawater_values_we_want_in_rows.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,992,993,994,995,996,997,998,999,1000,1001
SW0_0,-0.693472,-0.67473,-0.718462,-0.64974,-0.693472,-0.637245,-0.67473,-0.62475,-0.662235,-0.618502,...,4.29828,4.410735,4.454467,4.57317,4.610655,4.7481,4.779337,4.92303,4.966762,5.110455
SW0_1,-0.312375,-0.27489,-0.306127,-0.256147,-0.281137,-0.2499,-0.27489,-0.237405,-0.27489,-0.206167,...,3.386145,3.461115,3.561075,3.642292,3.767242,3.842212,3.992152,4.060875,4.22331,4.29828
SW0_2,-0.57477,-0.637245,-0.562275,-0.606007,-0.531037,-0.581017,-0.506047,-0.556027,-0.487305,-0.537285,...,3.779737,3.854707,3.985905,4.092112,4.19832,4.304527,4.435725,4.57317,4.704367,4.860555
SW0_3,-0.568522,-0.562275,-0.57477,-0.543532,-0.54978,-0.518542,-0.518542,-0.506047,-0.4998,-0.4998,...,3.729757,3.829717,3.929677,4.02339,4.142092,4.242052,4.367002,4.47321,4.62315,4.735605
SW0_4,-0.568522,-0.562275,-0.57477,-0.543532,-0.54978,-0.518542,-0.518542,-0.506047,-0.4998,-0.4998,...,3.729757,3.829717,3.929677,4.02339,4.142092,4.242052,4.367002,4.47321,4.62315,4.735605


In [100]:
# This works... but how can I feed it into the model?
seawater_3_rows_per_seq = seawater_values_we_want_in_rows.values.reshape(-1, 334, 1)

In [107]:
# Expect (number of series * chunks per series, readings per chunk, 1).
seawater_3_rows_per_seq.shape

(240, 334, 1)