In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import os
import csv
import math
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.layers import Embedding, LSTM, Dense, Flatten, Reshape, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from keras.constraints import MinMaxNorm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# True when TensorFlow lists at least one physical GPU device, False otherwise.
bool(tf.config.list_physical_devices('GPU'))

In [None]:
input_folder = "C:/Users/Giannis/Documents/uva-thesis/data/conc_txt_test"
output_folder = "C:/Users/Giannis/Documents/uva-thesis/data/test_data"

# create output folder if it doesn't exist (exist_ok avoids a separate existence check)
os.makedirs(output_folder, exist_ok=True)

# convert every .txt file in the input folder to a same-named .csv in the output folder
for filename in os.listdir(input_folder):
    if not filename.endswith(".txt"):
        continue

    # construct input and output file paths
    input_filepath = os.path.join(input_folder, filename)
    output_filepath = os.path.join(output_folder, os.path.splitext(filename)[0] + ".csv")

    try:
        # read txt file and remove surrounding whitespace and quotation marks from each line
        with open(input_filepath, 'r') as file:
            lines = [line.strip().strip('"') for line in file]

        # keep even-indexed lines only; every other line is a gap line
        lines = lines[::2]

        # an empty file has no first line to take the column count from
        if not lines:
            print(f"Skipping {input_filepath}: File is empty")
            continue

        # split each line once and reuse the result for both the check and the DataFrame
        rows = [line.split(',') for line in lines]

        # only write the csv when all lines have the same number of elements
        num_elements = len(rows[0])
        if all(len(row) == num_elements for row in rows):
            # the rows are written verbatim: no index, no header, no quoting
            df = pd.DataFrame(rows)
            df.to_csv(output_filepath, index=False, header=False, quoting=csv.QUOTE_NONE, escapechar=' ')
            print(f"Converted {input_filepath} to {output_filepath}")
        else:
            print(f"Skipping {input_filepath}: Inconsistent number of elements in lines")
    except Exception as e:
        # best effort: report the failure and carry on with the remaining files
        print(f"Error converting {input_filepath}: {e}")

**Concatenate the per-file cytokine data into a single sorted table saved in the 'test_data_concat' folder**

In [None]:
# Folder holding the per-file cytokine CSVs (the same path the txt-to-csv conversion cell writes to).
base_cytokine_folder = "C:/Users/Giannis/Documents/uva-thesis/data/test_data"
# Folder that receives the single sorted, concatenated CSV.
# NOTE: this rebinds `output_folder`, which the conversion cell uses for a different path.
output_folder = "C:/Users/Giannis/Documents/uva-thesis/data/test_data_concat"

def concat_files_in_folder(folder_path):
    """Read every ``.csv`` file in ``folder_path`` and stack them row-wise.

    Files are read in sorted filename order so the row order of the result does
    not depend on the order in which the filesystem lists the directory. All
    frames are concatenated in a single ``pd.concat`` call instead of growing a
    DataFrame inside the loop, which would copy the accumulated data each time.

    Parameters
    ----------
    folder_path : str
        Directory to scan; entries not ending in ``.csv`` are ignored.

    Returns
    -------
    pandas.DataFrame
        The concatenated rows, each file keeping its own index labels, or an
        empty DataFrame when the folder contains no ``.csv`` files.
    """
    csv_names = sorted(name for name in os.listdir(folder_path) if name.endswith('.csv'))
    frames = [pd.read_csv(os.path.join(folder_path, name)) for name in csv_names]

    # pd.concat raises on an empty list, so keep the empty-DataFrame result explicit
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

def sort_by_mcsteps(data):
    """Return a copy of ``data`` with rows in ascending order of the 'mcsteps' column.

    The input frame is left unmodified and the rows keep their original index labels.
    """
    ordered = data.sort_values('mcsteps')
    return ordered

# Stack all per-file CSVs, then order the rows by Monte Carlo step.
concatenated_data = concat_files_in_folder(base_cytokine_folder)

sorted_data = sort_by_mcsteps(concatenated_data)

# Make sure the destination folder exists; to_csv does not create missing directories.
os.makedirs(output_folder, exist_ok=True)

output_filepath = os.path.join(output_folder, "sorted_concatenated_data.csv")
sorted_data.to_csv(output_filepath, index=False)
print(f"Sorted and concatenated data saved to: {output_filepath}")

**Load data for test_data_concat and drop zCOM**

In [2]:
sorted_concatenated_csv = "C:/Users/Giannis/Documents/uva-thesis/data/test_data_concat/sorted_concatenated_data.csv"

# Load the sorted, concatenated table and discard the zCOM column in one chained step.
data = pd.read_csv(sorted_concatenated_csv).drop(columns=['zCOM'])

print(data.head())


   mcsteps  xCOM  yCOM           il8  il1  il6  il10  tnf  tgf
0        0    33    22  3.750408e-07  0.0  0.0   0.0  0.0  0.0
1        0    11    32  4.793328e-09  0.0  0.0   0.0  0.0  0.0
2        0    30    35  8.997935e-09  0.0  0.0   0.0  0.0  0.0
3        0    17    40  1.781103e-08  0.0  0.0   0.0  0.0  0.0
4        0    21    23  4.789621e-09  0.0  0.0   0.0  0.0  0.0


In [3]:
# Replace 'mcsteps' with an integer 'time' column (mcsteps divided by 10000, truncated)
# placed first. The guard makes the cell safe to re-run: once 'mcsteps' has been
# replaced, the frame is left as it is instead of raising a KeyError.
if 'mcsteps' in data.columns:
    time_index = (data['mcsteps'] / 10000).astype(int)
    remaining_columns = [col for col in data.columns if col not in ('mcsteps', 'time')]
    # build a fresh frame rather than dropping in place on a column-selected view
    data = data[remaining_columns].copy()
    data.insert(0, 'time', time_index)
print(data)

       time  xCOM  yCOM           il8           il1           il6  \
0         0    33    22  3.750408e-07  0.000000e+00  0.000000e+00   
1         0    11    32  4.793328e-09  0.000000e+00  0.000000e+00   
2         0    30    35  8.997935e-09  0.000000e+00  0.000000e+00   
3         0    17    40  1.781103e-08  0.000000e+00  0.000000e+00   
4         0    21    23  4.789621e-09  0.000000e+00  0.000000e+00   
...     ...   ...   ...           ...           ...           ...   
86155   100    49    35 -8.886183e-11  6.343394e-10  1.206925e-08   
86156   100    36     4  1.334245e-07  1.665160e-07  1.305691e-08   
86157   100    49    44 -1.233990e-10  1.178848e-09  1.615457e-07   
86158   100    45    10  3.257988e-07  4.403049e-08  1.749928e-07   
86159   100    35    38  2.107191e-05  9.170692e-08  5.693712e-19   

               il10           tnf           tgf  
0      0.000000e+00  0.000000e+00  0.000000e+00  
1      0.000000e+00  0.000000e+00  0.000000e+00  
2      0.000000e+00  

In [6]:
import numpy as np

# define cytokines; this order is also the channel order of the last array axis
cytokines = ['il8', 'il1', 'il6', 'il10', 'tnf', 'tgf']

# side length of the square spatial grid; valid coordinates are 0 .. grid_size - 1
grid_size = 50

# get unique time values
unique_time = data['time'].unique()

# maps each time value to a (grid_size, grid_size, len(cytokines)) array
arrays = {}

# iterate over unique time values
for time in unique_time:
    # filter data for current value of time
    data_time = data[data['time'] == time]

    # get X and Y coordinates
    x = data_time['xCOM'].astype(int)
    y = data_time['yCOM'].astype(int)

    # negative indices would silently wrap around to the opposite edge of the grid,
    # so reject anything outside the grid explicitly
    out_of_bounds = (x < 0) | (x >= grid_size) | (y < 0) | (y >= grid_size)
    if out_of_bounds.any():
        raise IndexError(
            f"time {time}: {int(out_of_bounds.sum())} coordinate(s) fall outside "
            f"the {grid_size}x{grid_size} grid"
        )

    # initialize the grid for current value of time; cells without a data row stay zero
    array = np.zeros((grid_size, grid_size, len(cytokines)))

    # get cytokine concentrations, reusing the single `cytokines` definition
    concentrations = data_time[cytokines].values

    # assign cytokine concentrations to corresponding position in array
    # (if several rows share a coordinate, only one of them is kept)
    array[x, y, :] = concentrations

    # store array for current value of time
    arrays[time] = array



In [9]:
# Spot-check one time step: the label below is built from the same indices used for
# the lookup, so the printed position always matches the value shown.
probe_time = 91
probe_x, probe_y = 39, 25

print("Number of arrays:", len(arrays))
array = arrays[probe_time]
print("Shape of the array:", array.shape)
print(f"Value at position ({probe_x},{probe_y}):", array[probe_x, probe_y])

Number of arrays: 101
Shape of the array: (50, 50, 6)
Value at position (35,20): [1.7285698e-05 7.6371320e-11 2.9109778e-14 5.1850766e-18 9.9240220e-17
 1.7688583e-11]


In [10]:
sequence_length = 10

# Order the per-time-step grids by ascending time and stack them into one array
# whose first axis is the time step.
arrays_list = [arrays[key] for key in sorted(arrays.keys())]
arrays_np = np.array(arrays_list)

# Sliding window over the time axis: each sample is `sequence_length` consecutive
# grids, and its target is the grid at the step that follows the window.
window_starts = range(len(arrays_np) - sequence_length)
input_sequences = np.array([arrays_np[start:start + sequence_length] for start in window_starts])
output_values = np.array([arrays_np[start + sequence_length] for start in window_starts])

In [11]:
print(input_sequences.shape)
print(output_values.shape)

(91, 10, 50, 50, 6)
(91, 50, 50, 6)


In [25]:
# Import the callbacks from tensorflow.keras, the same namespace the layers, model and
# optimizer used in this cell come from, instead of the standalone `keras` package.
from tensorflow.keras.callbacks import LearningRateScheduler, EarlyStopping

def lr_schedule(epoch, lr):
    """Return the learning rate to use for the given epoch index.

    Epochs below 100 use 0.0001; every later epoch uses 0.001. The ``lr``
    argument (the rate handed in by the caller) is ignored.

    NOTE(review): the rate steps *up* after epoch 100 - confirm this is intended.
    """
    return 0.0001 if epoch < 100 else 0.001

# Callbacks: per-epoch learning rate from lr_schedule, and early stopping on the
# validation loss that restores the best weights seen.
lr_scheduler = LearningRateScheduler(lr_schedule)

early_stopping = EarlyStopping(monitor='val_loss', patience=100, verbose=1, restore_best_weights=True)

initial_lr = 0.0001

# Flatten each grid to one feature vector per time step: (samples, timesteps, features)
input_sequences_reshaped = input_sequences.reshape(input_sequences.shape[0], input_sequences.shape[1], -1)

# Take the layer sizes from the data so the model follows the sequence length and
# grid shape actually produced upstream instead of repeating them as literals.
n_timesteps, n_features = input_sequences_reshaped.shape[1:]
grid_shape = tuple(output_values.shape[1:])

model = Sequential()
model.add(LSTM(units=64, input_shape=(n_timesteps, n_features)))
model.add(Dense(units=50, activation='relu'))
model.add(Dense(units=50, activation='relu'))
model.add(Dense(units=int(np.prod(grid_shape)), activation='linear'))  # output layer, linear activation
model.add(Reshape(grid_shape))  # back to the grid shape of the targets
model.compile(optimizer=Adam(learning_rate=initial_lr), loss='mse')  # compile with adam, mse

# summary() prints the table itself and returns None, so it is not wrapped in print()
model.summary()

history = model.fit(input_sequences_reshaped, output_values, epochs=500, batch_size=64,
                    validation_split=0.2, callbacks=[lr_scheduler, early_stopping])
print("Training Loss:", history.history['loss'])

# This evaluates on the same arrays that were passed to fit(), i.e. the training and
# validation samples together; it is not a held-out test score.
loss = model.evaluate(input_sequences_reshaped, output_values)
print("Loss on full dataset (training + validation samples):", loss)

Model: "sequential_13"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_13 (LSTM)              (None, 64)                3856640   
                                                                 
 dense_27 (Dense)            (None, 50)                3250      
                                                                 
 dense_28 (Dense)            (None, 50)                2550      
                                                                 
 dense_29 (Dense)            (None, 15000)             765000    
                                                                 
 reshape_12 (Reshape)        (None, 50, 50, 6)         0         
                                                                 
Total params: 4,627,440
Trainable params: 4,627,440
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/500
Epoch 2/500
Epoch 3/500
Ep