**Imports**

In [21]:
import tensorflow as tf
import pandas as pd
import numpy as np
import os
import csv
import math
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tensorflow.keras.layers import Embedding, LSTM, Dense, Flatten, Reshape, Dropout, Conv2D
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler
from keras.constraints import MinMaxNorm
from keras.callbacks import LearningRateScheduler, EarlyStopping

**Check that TensorFlow can see a CUDA GPU**

In [22]:
# An empty device list is falsy, so this evaluates to True only when TensorFlow lists at least one GPU.
bool(tf.config.list_physical_devices('GPU'))

True

**Convert .txt to .csv**

In [None]:
input_folder = "C:/Users/Giannis/Documents/uva-thesis/data/conc_txt_test"
output_folder = "C:/Users/Giannis/Documents/uva-thesis/data/test_data"

# Create the output folder if it doesn't exist (exist_ok avoids the check-then-create race).
os.makedirs(output_folder, exist_ok=True)

# Convert every .txt file in the input folder to a .csv with the same base name.
for filename in os.listdir(input_folder):
    if not filename.endswith(".txt"):
        continue

    # Build the paths before the try block so the error message below can always name the file.
    input_filepath = os.path.join(input_folder, filename)
    output_filepath = os.path.join(output_folder, os.path.splitext(filename)[0] + ".csv")

    try:
        # Read the file, stripping surrounding whitespace and quotation marks from each line.
        with open(input_filepath, 'r') as file:
            lines = [line.strip().strip('"') for line in file]

        # Keep the even-indexed lines only (the odd-indexed ones are gap lines)
        # and split each kept line once into its comma-separated fields.
        rows = [line.split(',') for line in lines[::2]]

        # An empty file has no first row to take the field count from.
        if not rows:
            print(f"Skipping {input_filepath}: file is empty")
            continue

        # Only write the CSV when every row has the same number of fields as the first one.
        num_elements = len(rows[0])
        if all(len(row) == num_elements for row in rows):
            df = pd.DataFrame(rows)
            df.to_csv(output_filepath, index=False, header=False, quoting=csv.QUOTE_NONE, escapechar=' ')
            print(f"Converted {input_filepath} to {output_filepath}")
        else:
            print(f"Skipping {input_filepath}: Inconsistent number of elements in lines")
    except Exception as e:
        # Best effort: report the failing file and carry on with the remaining ones.
        print(f"Error converting {input_filepath}: {e}")

**Concatenate the cytokine CSV files, sort by `mcsteps`, and save the result in the 'test_data_concat' folder**

In [None]:
# Folder holding the per-file cytokine CSVs that are read and concatenated below.
base_cytokine_folder = "C:/Users/Giannis/Documents/uva-thesis/data/test_data"
# Folder that receives the single sorted, concatenated CSV.
output_folder = "C:/Users/Giannis/Documents/uva-thesis/data/test_data_concat"

def concat_files_in_folder(folder_path):
    """Read every ``.csv`` file in ``folder_path`` and stack them row-wise.

    Files are read in sorted filename order so the row order of the result is
    reproducible (``os.listdir`` returns entries in arbitrary order). Each
    file's own row index is kept, as in a plain ``pd.concat``.

    Parameters
    ----------
    folder_path : str
        Directory to scan (not recursive). Entries not ending in ``.csv`` are ignored.

    Returns
    -------
    pandas.DataFrame
        All rows of all CSV files, or an empty DataFrame when the folder
        contains no CSV files.
    """
    csv_names = sorted(name for name in os.listdir(folder_path) if name.endswith('.csv'))

    # Read everything first and concatenate once; concatenating inside the loop
    # re-copies the accumulated rows on every iteration.
    frames = [pd.read_csv(os.path.join(folder_path, name)) for name in csv_names]

    # pd.concat raises on an empty list, so return an empty frame explicitly.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

def sort_by_mcsteps(data):
    """Return a copy of ``data`` with its rows ordered by ascending ``mcsteps``.

    The input frame is left unmodified and the original row labels are kept.
    """
    ordered = data.sort_values(by=['mcsteps'])
    return ordered

# Stack all per-file CSVs, then order the rows by Monte Carlo step.
concatenated_data = concat_files_in_folder(base_cytokine_folder)

sorted_data = sort_by_mcsteps(concatenated_data)

# to_csv does not create missing directories, so make sure the target folder exists first.
os.makedirs(output_folder, exist_ok=True)

output_filepath = os.path.join(output_folder, "sorted_concatenated_data.csv")
sorted_data.to_csv(output_filepath, index=False)
print(f"Sorted and concatenated data saved to: {output_filepath}")

**Load data for test_data_concat and drop zCOM**

In [2]:
sorted_concatenated_csv = "C:/Users/Giannis/Documents/uva-thesis/data/test_data_concat/sorted_concatenated_data.csv"

# Load the concatenated table and discard the zCOM column in one chained expression.
data = pd.read_csv(sorted_concatenated_csv).drop(columns=['zCOM'])

print(data.head())


   mcsteps  xCOM  yCOM           il8  il1  il6  il10  tnf  tgf
0        0    33    22  3.750408e-07  0.0  0.0   0.0  0.0  0.0
1        0    11    32  4.793328e-09  0.0  0.0   0.0  0.0  0.0
2        0    30    35  8.997935e-09  0.0  0.0   0.0  0.0  0.0
3        0    17    40  1.781103e-08  0.0  0.0   0.0  0.0  0.0
4        0    21    23  4.789621e-09  0.0  0.0   0.0  0.0  0.0


In [3]:
# Replace `mcsteps` with an integer `time` column (mcsteps / 10000, truncated)
# and move it to the front of the frame.
# The guard makes the cell safe to re-run: once `mcsteps` has been dropped,
# running the original statements again would raise a KeyError.
if 'mcsteps' in data.columns:
    data = (
        data
        .assign(time=(data['mcsteps'] / 10000).astype(int))
        .drop(columns=['mcsteps'])
    )
    data = data[['time'] + [col for col in data.columns if col != 'time']]
print(data)

       time  xCOM  yCOM           il8           il1           il6  \
0         0    33    22  3.750408e-07  0.000000e+00  0.000000e+00   
1         0    11    32  4.793328e-09  0.000000e+00  0.000000e+00   
2         0    30    35  8.997935e-09  0.000000e+00  0.000000e+00   
3         0    17    40  1.781103e-08  0.000000e+00  0.000000e+00   
4         0    21    23  4.789621e-09  0.000000e+00  0.000000e+00   
...     ...   ...   ...           ...           ...           ...   
86155   100    49    35 -8.886183e-11  6.343394e-10  1.206925e-08   
86156   100    36     4  1.334245e-07  1.665160e-07  1.305691e-08   
86157   100    49    44 -1.233990e-10  1.178848e-09  1.615457e-07   
86158   100    45    10  3.257988e-07  4.403049e-08  1.749928e-07   
86159   100    35    38  2.107191e-05  9.170692e-08  5.693712e-19   

               il10           tnf           tgf  
0      0.000000e+00  0.000000e+00  0.000000e+00  
1      0.000000e+00  0.000000e+00  0.000000e+00  
2      0.000000e+00  

**Create arrays**

In [4]:
# define cytokines (this order is the order of the last array axis)
cytokines = ['il8', 'il1', 'il6', 'il10', 'tnf', 'tgf']

# side length of the square spatial grid; valid coordinates are 0 .. grid_size - 1
grid_size = 50

# get unique time values
unique_time = data['time'].unique()

# maps each time value to a (grid_size, grid_size, len(cytokines)) array
arrays = {}

# iterate over unique time values
for time in unique_time:
    # filter data for current value of time
    data_time = data[data['time'] == time]

    # get X and Y coordinates as plain integer arrays
    x = data_time['xCOM'].astype(int).to_numpy()
    y = data_time['yCOM'].astype(int).to_numpy()

    # numpy would silently wrap negative indices around to the opposite edge,
    # so reject any coordinate outside the grid instead of misplacing it
    out_of_grid = (x < 0) | (x >= grid_size) | (y < 0) | (y >= grid_size)
    if out_of_grid.any():
        raise ValueError(
            f"time {time}: {int(out_of_grid.sum())} row(s) have xCOM/yCOM outside "
            f"the {grid_size}x{grid_size} grid"
        )

    # get cytokine concentrations, using the single column list defined above
    concentrations = data_time[cytokines].values

    # initialize the array for the current value of time; cells without a row stay 0
    array = np.zeros((grid_size, grid_size, len(cytokines)))

    # assign cytokine concentrations to corresponding position in array
    # (if several rows share one (x, y) cell, the last row wins)
    array[x, y, :] = concentrations

    # store array for current value of time
    arrays[time] = array

**Print arrays**

In [5]:
# Quick sanity check of the per-time-step arrays.
n_arrays = len(arrays)
print("Number of arrays:", n_arrays)

# Inspect one time step (time == 91) and a single grid cell in it.
array = arrays[91]
array_shape = array.shape
print("Shape of the array:", array_shape)
print("Value at position (39,25):", array[39, 25])

Number of arrays: 101
Shape of the array: (50, 50, 6)
Value at position (39,25): [1.7285698e-05 7.6371320e-11 2.9109778e-14 5.1850766e-18 9.9240220e-17
 1.7688583e-11]


**Build sliding windows: use 10 consecutive time steps as input to predict the 11th**

In [6]:
# Number of consecutive time steps fed to the model for each prediction.
sequence_length = 10

# Order the per-time arrays chronologically and stack them into one array.
arrays_list = [arrays[key] for key in sorted(arrays.keys())]
arrays_np = np.array(arrays_list)

# Window s covers time steps s .. s+sequence_length-1 and is paired with the
# array at step s+sequence_length as its prediction target.
n_windows = len(arrays_np) - sequence_length
input_sequences = np.array(
    [arrays_np[start:start + sequence_length] for start in range(n_windows)]
)
output_values = np.array(
    [arrays_np[start + sequence_length] for start in range(n_windows)]
)

In [12]:
# Show the shapes of the model inputs and of the prediction targets, in that order.
for tensor in (input_sequences, output_values):
    print(tensor.shape)

(91, 10, 50, 50, 6)
(91, 50, 50, 6)


**Models**

In [None]:
def lr_schedule(epoch, lr):
    """Return the learning rate to use for ``epoch``.

    Epochs below 100 use 0.0001; from epoch 100 onward the rate is 0.001.
    The current rate ``lr`` is accepted but not used.
    """
    return 0.0001 if epoch < 100 else 0.001

# Callback that queries lr_schedule for the learning rate of each epoch.
lr_scheduler = LearningRateScheduler(lr_schedule)

# Stop after 100 epochs without val_loss improvement and restore the best weights seen.
early_stopping = EarlyStopping(monitor='val_loss', patience=100, verbose=1, restore_best_weights=True)

initial_lr = 0.0001

# Flatten each frame so every sample becomes (timesteps, features), as the LSTM expects.
input_sequences_reshaped = input_sequences.reshape(input_sequences.shape[0], input_sequences.shape[1], -1)

# Derive the layer sizes from the data instead of repeating the literals 10 and 50 * 50 * 6.
n_timesteps, n_features = input_sequences_reshaped.shape[1:]
frame_shape = output_values.shape[1:]
n_outputs = int(np.prod(frame_shape))

model = Sequential()
model.add(LSTM(units=64, input_shape=(n_timesteps, n_features)))
model.add(Dense(units=50, activation='relu'))
model.add(Dense(units=50, activation='relu'))
model.add(Dense(units=n_outputs, activation='linear'))  # output layer, linear activation
model.add(Reshape(frame_shape))  # back to the shape of one target frame
model.compile(optimizer=Adam(learning_rate=initial_lr), loss='mse')  # compile with adam, mse

# summary() prints the table itself and returns None, so it must not be wrapped in print().
model.summary()

history = model.fit(input_sequences_reshaped, output_values, epochs=500, batch_size=64,
                    validation_split=0.2, callbacks=[lr_scheduler, early_stopping])
print("Training Loss:", history.history['loss'])

# NOTE(review): this evaluates on the same samples that were passed to fit(), so the
# value labelled "Test Loss" is not a held-out loss — a separate test split is needed for that.
loss = model.evaluate(input_sequences_reshaped, output_values)
print("Test Loss:", loss)

In [28]:
def lr_schedule(epoch, lr):
    """Two-stage learning-rate schedule.

    Returns 0.001 from epoch 100 onward and 0.0001 for every earlier epoch.
    ``lr`` (the current rate) is accepted but ignored.
    """
    if epoch >= 100:
        return 0.001
    return 0.0001

# Callback that queries lr_schedule for the learning rate of each epoch.
lr_scheduler = LearningRateScheduler(schedule=lr_schedule)

# Stop once val_loss has not improved for 100 epochs and roll back to the best weights.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=100,
    verbose=1,
    restore_best_weights=True,
)

class SpatialTemporalAttention(tf.keras.layers.Layer):
    """Attention over axis 1 of an LSTM output, scored from both the LSTM output and the raw input.

    Both inputs are projected to ``hidden_size`` features, passed through tanh,
    multiplied element-wise and reduced to one score per position by a
    single-unit dense layer. A softmax over axis 1 turns the scores into
    weights, which are used to form a weighted sum of ``lstm_output``.
    """

    def __init__(self, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.W_s = tf.keras.layers.Dense(hidden_size)  # projection of the LSTM output
        self.W_t = tf.keras.layers.Dense(hidden_size)  # projection of the raw input
        self.V = tf.keras.layers.Dense(1)              # one score per position

    def call(self, lstm_output, input_data):
        """Return the attention-weighted combination of ``lstm_output``.

        Assumes both arguments are rank-3 tensors that share their first two
        axes (presumably batch and time) — TODO confirm against callers.
        """
        # attention weights
        projected_states = tf.math.tanh(self.W_s(lstm_output))
        projected_inputs = tf.math.tanh(self.W_t(input_data))
        scores = self.V(projected_states * projected_inputs)
        weights = tf.nn.softmax(scores, axis=1)

        # apply attention to LSTM output: swap the last two axes of the weights
        # so the matmul sums lstm_output over axis 1
        weights_t = tf.transpose(weights, perm=[0, 2, 1])
        return tf.matmul(weights_t, lstm_output)

class STALSTM(tf.keras.Model):
    """LSTM followed by SpatialTemporalAttention and a three-layer dense head.

    Parameters
    ----------
    hidden_size : int
        Number of LSTM units; also the projection size used by the attention layer.
    input_shape : tuple of int
        Shape of one sample before flattening. The first entry is skipped and
        ``input_shape[1:]`` is used as the shape of one output frame.
    """

    def __init__(self, hidden_size, input_shape):
        super(STALSTM, self).__init__()
        self.hidden_size = hidden_size
        self.lstm = tf.keras.layers.LSTM(hidden_size, return_sequences=True)
        self.attention = SpatialTemporalAttention(hidden_size)
        self.fc1 = tf.keras.layers.Dense(50, activation='relu')
        self.fc2 = tf.keras.layers.Dense(50, activation='relu')
        # Dense expects a plain Python int for `units`; tf.reduce_prod would hand it a tensor.
        n_outputs = int(np.prod(input_shape[1:]))
        self.fc3 = tf.keras.layers.Dense(n_outputs, activation='linear')  # one unit per output value
        self.reshape = tf.keras.layers.Reshape(input_shape[1:])  # reshape to match output shape
        self.input_shape_model = input_shape

    def call(self, input_data):
        """Run the forward pass and return a tensor shaped ``(batch,) + input_shape[1:]``.

        ``input_data`` is fed to the LSTM directly, so it is presumably
        (batch, timesteps, features) — TODO confirm against callers.
        """
        lstm_output = self.lstm(input_data)
        attended_output = self.attention(lstm_output, input_data)
        x = tf.reshape(attended_output, (-1, self.hidden_size))  # flatten for fully connected layers
        x = self.fc1(x)
        x = self.fc2(x)
        output = self.fc3(x)
        output = self.reshape(output)  # reshape flat output into one frame per sample
        return output

# reshape input sequences to (samples, timesteps, features) by flattening each frame
input_sequences_reshaped = input_sequences.reshape(input_sequences.shape[0], input_sequences.shape[1], -1)

# shape of one unflattened sample; STALSTM uses input_shape[1:] as the output frame shape
input_shape = input_sequences.shape[1:]
model = STALSTM(hidden_size=64, input_shape=input_shape)

# build the model by calling it on a batch of data
sample_input = tf.convert_to_tensor(input_sequences_reshaped[:1])  # take a sample batch
_ = model(sample_input)  # calling the model on a sample input to build it

initial_lr = 0.0001
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=initial_lr), loss='mse')

# summary() prints the table itself and returns None, so wrapping it in print() adds a stray "None".
model.summary()

# custom MSE loss function
def custom_mse(y_true, y_pred):
    """Mean squared error after reshaping ``y_pred`` to the layout of ``y_true``.

    Parameters
    ----------
    y_true : tensor or array
        Target values.
    y_pred : tensor or array
        Predictions holding the same number of values per sample as ``y_true``.

    Returns
    -------
    The result of ``tf.keras.losses.mean_squared_error(y_true, y_pred)``.
    """
    # Build the target shape from the dynamic shape of y_true. The static
    # `y_true.shape` is a TensorShape for tensor inputs and may contain unknown
    # dimensions, so it cannot be safely combined with a -1 placeholder.
    target_shape = tf.concat([[-1], tf.shape(y_true)[1:]], axis=0)
    y_pred = tf.reshape(y_pred, target_shape)
    return tf.keras.losses.mean_squared_error(y_true, y_pred)

# train
fit_options = dict(
    epochs=500,
    batch_size=128,
    validation_split=0.2,
    callbacks=[lr_scheduler, early_stopping],
)
history = model.fit(input_sequences_reshaped, output_values, **fit_options)

# evaluate
# NOTE(review): this scores the same samples that were passed to fit(), so the
# value printed as "Test Loss" is not a held-out loss.
loss = model.evaluate(input_sequences_reshaped, output_values)
print("Test Loss:", loss)


Model: "stalstm_18"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_18 (LSTM)              multiple                  3856640   
                                                                 
 spatial_temporal_attention_  multiple                 964289    
 18 (SpatialTemporalAttentio                                     
 n)                                                              
                                                                 
 dense_107 (Dense)           multiple                  3250      
                                                                 
 dense_108 (Dense)           multiple                  2550      
                                                                 
 dense_109 (Dense)           multiple                  765000    
                                                                 
 reshape_7 (Reshape)         multiple                  0

In [20]:
def lr_scheduler(epoch):
    """Return the learning rate for ``epoch``: 1e-4 below epoch 100, 1e-3 from epoch 100 on."""
    return 1e-4 if epoch < 100 else 1e-3

# Scale inputs to [0, 1]. MinMaxScaler works column-wise, so after flattening every
# (time step, x, y, cytokine) position is scaled independently across samples.
# NOTE(review): both scalers are fitted on all samples, including the ones Keras
# later holds out through validation_split — confirm this is intended.
scaler_input = MinMaxScaler()
input_sequences_scaled = scaler_input.fit_transform(input_sequences.reshape(-1, sequence_length * 50 * 50 * 6))
input_sequences_scaled = input_sequences_scaled.reshape(-1, sequence_length, 50, 50, 6)

# Targets get their own scaler so predictions can be mapped back to concentrations.
scaler_output = MinMaxScaler()
output_values_scaled = scaler_output.fit_transform(output_values.reshape(-1, 50 * 50 * 6))
output_values_scaled = output_values_scaled.reshape(-1, 50, 50, 6)

model = Sequential([
    Conv2D(filters=32, kernel_size=(3, 3), activation='relu', input_shape=(sequence_length, 50, 50, 6)),
    Reshape((sequence_length, -1)),  # flatten the conv features of each time step for the LSTM
    LSTM(units=64, return_sequences=False),
    Dense(units=50*50*6, activation='linear'),
    Reshape((50, 50, 6))
])

# `lr` is a deprecated alias that triggers a warning; `learning_rate` is the supported
# argument and matches the other model cells.
model.compile(optimizer=Adam(learning_rate=1e-4), loss='mse')

early_stopping = EarlyStopping(monitor='val_loss', patience=100, restore_best_weights=True)
lr_scheduler_callback = LearningRateScheduler(lr_scheduler)

history = model.fit(input_sequences_scaled, output_values_scaled, validation_split=0.2, epochs=500, batch_size=32, callbacks=[early_stopping, lr_scheduler_callback])

# NOTE(review): evaluated on the same samples that were passed to fit(), not on held-out data.
loss = model.evaluate(input_sequences_scaled, output_values_scaled)

predictions_scaled = model.predict(input_sequences_scaled)

# Undo the target scaling so errors are reported in the original concentration units.
predictions = scaler_output.inverse_transform(predictions_scaled.reshape(-1, 50 * 50 * 6))
predictions = predictions.reshape(-1, 50, 50, 6)

# The last axis holds the 6 cytokine channels, so these slices keep every channel.
true_cytokine_concentrations = output_values[:, :, :, :6]
predicted_cytokine_concentrations = predictions[:, :, :, :6]

cytokine_mse = mean_squared_error(true_cytokine_concentrations.flatten(), predicted_cytokine_concentrations.flatten())
cytokine_mae = mean_absolute_error(true_cytokine_concentrations.flatten(), predicted_cytokine_concentrations.flatten())

print("Cytokine Concentrations MSE:", cytokine_mse)
print("Cytokine Concentrations MAE:", cytokine_mae)

Epoch 1/500


  super().__init__(name, **kwargs)


Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78/500
Epoch 7