**Imports**

In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import os
import csv
import math
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.layers import Embedding, LSTM, Dense, Flatten, Reshape, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from keras.constraints import MinMaxNorm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

**CUDA**

In [2]:
len(tf.config.list_physical_devices('GPU'))>0

True

# **Data Preparation**

**Convert .txt to .csv**

In [None]:
input_folder = "C:/Users/Ioannis/Documents/UvA thesis/UvA-thesis/data/concentrations_txt/S8"
output_folder = "C:/Users/Ioannis/Documents/UvA thesis/UvA-thesis/data/concentrations/S8"

# create output folder if it doesn't exist
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# iterate through files in input folder
for filename in os.listdir(input_folder):
    if filename.endswith(".txt"):
        try:
            # construct input and output file paths
            input_filepath = os.path.join(input_folder, filename)
            output_filepath = os.path.join(output_folder, os.path.splitext(filename)[0] + ".csv")

            # read txt file and remove leading and trailing quotation marks from each line
            with open(input_filepath, 'r') as file:
                lines = [line.strip().strip('"') for line in file.readlines()]

            # convert to df and save as csv
            df = pd.DataFrame([line.split(',') for line in lines])
            df.to_csv(output_filepath, index=False, header=False, quoting=csv.QUOTE_NONE, escapechar=' ')
            print(f"Converted {input_filepath} to {output_filepath}")
        except Exception as e:
            print(f"Error converting {input_filepath}: {e}")


**Merge data in batches to create the 'merged_data' folder containing all the available data**

In [None]:
cellcounts_folder = 'C:/Users/Ioannis/Documents/UvA thesis/UvA-thesis/data/cellcounts'
base_cytokine_folder = 'C:/Users/Ioannis/Documents/UvA thesis/UvA-thesis/data/concentrations'
output_folder = 'C:/Users/Ioannis/Documents/UvA thesis/UvA-thesis/data/merged_data'

if not os.path.exists(output_folder):
    os.makedirs(output_folder)

cellcounts_batch_size = 8
cytokine_batch_size = 101

for batch_index in range(1, cellcounts_batch_size + 1):
    print(f"Processing batch {batch_index}")
    cellcounts_file = os.path.join(cellcounts_folder, f'cellcount S{batch_index}.csv')
    print("Cellcounts file:", cellcounts_file)
    cytokine_subfolder = f'S{batch_index}'
    cytokine_folder = os.path.join(base_cytokine_folder, cytokine_subfolder)
    df_cellcounts_batch = pd.read_csv(cellcounts_file)
    print("Cellcounts batch shape:", df_cellcounts_batch.shape)
    df_cytokine_batch = pd.DataFrame()
    for cytokine_file in os.listdir(cytokine_folder):
        if cytokine_file.endswith('.csv'):
            file_path = os.path.join(cytokine_folder, cytokine_file)
            df_cytokine = pd.read_csv(file_path)
            df_cytokine_batch = pd.concat([df_cytokine_batch, df_cytokine], ignore_index=True)

    print("Cytokine batch shape:", df_cytokine_batch.shape)

    merged_data = pd.merge(df_cellcounts_batch, df_cytokine_batch, on='mcsteps')
    print("Merged data shape:", merged_data.shape)

    output_filename = os.path.join(output_folder, f'combined_data_batch_{batch_index}.csv')
    merged_data.to_csv(output_filename, index=False)
    print(f"Saved merged data to {output_filename}")

**Concat the cytokines data to create the 'all_data' folder**

In [5]:
base_cytokine_folder = 'C:/Users/Ioannis/Documents/UvA thesis/UvA-thesis/data/concentrations'
output_folder = 'C:/Users/Ioannis/Documents/UvA thesis/UvA-thesis/data/all_data'

def create_simulation_subfolders(output_folder, simulation_names):
    for sim_name in simulation_names:
        sim_folder = os.path.join(output_folder, sim_name)
        os.makedirs(sim_folder, exist_ok=True)


def concat_files_in_folder(folder_path):
    all_data = pd.DataFrame()
    for filename in os.listdir(folder_path):
        if filename.endswith('.csv'):
            file_path = os.path.join(folder_path, filename)
            df = pd.read_csv(file_path)
            all_data = pd.concat([all_data, df])
    return all_data

def sort_by_mcsteps(data):
    return data.sort_values(by='mcsteps')

simulation_names = ['S1', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8']
create_simulation_subfolders(output_folder, simulation_names)

for sim_name in simulation_names:
    sim_folder_path = os.path.join(output_folder, sim_name)
    sim_data = concat_files_in_folder(os.path.join(base_cytokine_folder, sim_name))
    sorted_data = sort_by_mcsteps(sim_data)
    output_file_path = os.path.join(sim_folder_path, f'{sim_name}_data_sorted.csv')
    sorted_data.to_csv(output_file_path, index=False)

**Load data to cover all initializations (df1 to df8) and drop zCOM column as we have 2D spatial data**

In [5]:
output_folder = "C:/Users/Giannis/Documents/uva-thesis/data/all_data"

def load_and_drop_zCOM(folder_path):
    df = pd.concat([pd.read_csv(os.path.join(folder_path, file)) for file in os.listdir(folder_path) if file.endswith('.csv')])
    df.drop(columns=['zCOM'], inplace=True)
    return df

dfs = []
for i in range(1, 9):
    sim_folder_path = os.path.join(output_folder, f'S{i}')
    df = load_and_drop_zCOM(sim_folder_path)
    dfs.append(df)

df1, df2, df3, df4, df5, df6, df7, df8 = dfs

**Change 'mcsteps' to 'time' and print dataframe to make sure it works as intended**

In [6]:
df8['time'] = (df8['mcsteps'] / 10000).astype(int)
df8 = df8[['time'] + [col for col in df8.columns if col != 'time']]
df8.drop(columns=['mcsteps'], inplace=True)
print(df8)

        time  xCOM  yCOM           il8           il1           il6  \
0          0   147   116  8.603181e-10  0.000000e+00  0.000000e+00   
1          0   251   364  9.141505e-10  0.000000e+00  0.000000e+00   
2          0   279   112  9.350631e-10  0.000000e+00  0.000000e+00   
3          0    83   288  9.888261e-10  0.000000e+00  0.000000e+00   
4          0   171   371  1.105985e-09  0.000000e+00  0.000000e+00   
...      ...   ...   ...           ...           ...           ...   
569270   100   172   192  1.493359e-08  1.195303e-11  3.136481e-27   
569271   100   141   107  1.232809e-08  1.575511e-11  3.546226e-15   
569272   100    81   257  1.258248e-08  1.729580e-11  1.002741e-11   
569273   100   127   264  1.046952e-08  9.687031e-12  1.480615e-14   
569274   100   298   363  1.142387e-08  7.674055e-12  5.401783e-25   

                il10           tnf           tgf  
0       0.000000e+00  0.000000e+00  0.000000e+00  
1       0.000000e+00  0.000000e+00  0.000000e+00  
2     

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df8.drop(columns=['mcsteps'], inplace=True)


In [7]:
# Select columns containing cytokine data
cytokine_columns = ['il8', 'il1', 'il6', 'il10', 'tnf', 'tgf']

# Find the smallest values for each cytokine
smallest_values = df8[cytokine_columns].min()

# Find the largest values for each cytokine
largest_values = df8[cytokine_columns].max()

print("Smallest values for each cytokine:")
print(smallest_values)

print("\nLargest values for each cytokine:")
print(largest_values)

Smallest values for each cytokine:
il8    -1.802096e-11
il1     0.000000e+00
il6     0.000000e+00
il10    0.000000e+00
tnf     0.000000e+00
tgf     0.000000e+00
dtype: float64

Largest values for each cytokine:
il8     1.817857e-08
il1     7.833237e-09
il6     6.361151e-09
il10    1.409022e-09
tnf     5.365540e-08
tgf     1.111927e-08
dtype: float64


**Create an array for each unique mcsteps value, should taken a couple minutes to run**

In [5]:
# define cytokines
cytokines = ['il8', 'il1', 'il6', 'il10', 'tnf', 'tgf']

# get unique time values
unique_time = df8['time'].unique()

arrays = {}

# iterate over unique time values
for time in unique_time:
    # filter data for current value of time
    df_time = df8[df8['time'] == time]
    
    # initialize 500x500 array for current value of time
    array = np.zeros((500, 500, len(cytokines)))
    
    # iterate over rows in filtered df
    for index, row in df_time.iterrows():
        # get X and Y coordinates
        x = int(row['xCOM'])
        y = int(row['yCOM'])
        
        # get cytokine concentrations
        concentrations = row[['il8', 'il1', 'il6', 'il10', 'tnf', 'tgf']].values
        
        # assign cytokine concentrations to corresponding position in array
        array[x, y] = concentrations
    
    # store array for current value of time
    arrays[time] = array

**Print to make sure array works as intended**

In [6]:
print("Number of arrays:", len(arrays))
array = arrays[100]
print("Shape of the array:", array.shape)
print("Value at position (356,200):", array[356,200])

Number of arrays: 101
Shape of the array: (500, 500, 6)
Value at position (356,200): [1.1378244e-08 2.0458439e-11 5.3863960e-15 5.2224680e-23 7.6039260e-18
 1.2120631e-11]


**Create input_sequences and output_values for the LSTM to use in order to be able to predict output_values from input_sequences**

In [18]:
sequence_length = 10
input_sequences = []
output_values = []

# convert dictionary values to a list of arrays
arrays_list = [arrays[key] for key in sorted(arrays.keys())]

# convert 'arrays' list to numpy array
arrays_np = np.array(arrays_list)

for i in range(len(arrays_np) - sequence_length):
    input_seq = arrays_np[i:i+sequence_length]  # input sequence of arrays
    output_val = arrays_np[i+sequence_length]   # array at next time step
    
    input_sequences.append(input_seq)
    output_values.append(output_val)

# convert lists to numpy arrays
input_sequences = np.array(input_sequences)
output_values = np.array(output_values)

**input_sequences has a shape of (91, 10, 500, 500, 6), which means we have 91 samples, each consisting of 10 arrays of shape (500, 500, 6).**

**output_values has a shape of (91, 500, 500, 6), indicating that each sample has an output array of shape (500, 500, 6).**

In [19]:
print(input_sequences.shape)
print(output_values.shape)

(99, 2, 500, 500, 6)
(99, 500, 500, 6)


# **Models**

In [20]:
model = Sequential()
model.add(LSTM(units=64, input_shape=(10, 500 * 500 * 6)))  # 10 for a sequence length of 10 as defined above
#model.add(Dense(units=100, activation='relu'))  # 100 neurons, first hidden layer, relu
#model.add(Dense(units=100, activation='relu'))  # 100 neurons, second hidden layer, relu
model.add(Dense(units=500 * 500 * 6, activation='linear'))  # output layer, linear activation
model.add(Reshape((500, 500, 6)))
model.compile(optimizer='adam', loss='mse')  # compile with adam, mse
print(model.summary())

input_sequences_reshaped = input_sequences.reshape(input_sequences.shape[0], 10, -1)

# train
history = model.fit(input_sequences_reshaped, output_values, epochs=10, batch_size=32, validation_split=0.2)
print("Training Loss:", history.history['loss'])

# evaluate
loss = model.evaluate(input_sequences, output_values)
print("Test Loss:", loss)

ResourceExhaustedError: {{function_node __wrapped__Mul_device_/job:localhost/replica:0/task:0/device:GPU:0}} failed to allocate memory [Op:Mul]

**Stuff I tried previously**

In [None]:
class Model(tf.keras.Model):
    def __init__(self, model_path, train_mode=True, input_dim=19, lstm_size=500, batch_size=4, e_learning_rate=1e-5):
        super(Model, self).__init__()
        self.model_path = model_path
        self.train_mode = train_mode
        self.input_dim = input_dim
        self.lstm_size = lstm_size
        self.batch_size = batch_size
        self.e_learning_rate = e_learning_rate
        self.lstm_layer = tf.keras.layers.LSTM(units=self.lstm_size, return_sequences=True)
        self.output_layer = tf.keras.layers.Dense(units=2, activation=None)
        self.reshape_layer = tf.keras.layers.Reshape((500, 500))
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=self.e_learning_rate)
        self.loss_fn = tf.keras.losses.MeanSquaredError()

    def call(self, inputs):
        inputs = tf.expand_dims(inputs, axis=1)
        x = self.lstm_layer(inputs)
        output = self.output_layer(x)
        output = self.reshape_layer(output)
        return output

    def train_step(self, xtrain, ytrain):
        with tf.GradientTape() as tape:
            y_pred = self(xtrain, training=True)
            loss = self.loss_fn(ytrain, y_pred)
        gradients = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
        return loss

    def train(self, train_set, valid_set, maxEpoch=10):
        x_train, y_train = train_set
        x_valid, y_valid = valid_set
        
        for epoch in range(maxEpoch):
            train_loss = self.train_step(x_train, y_train)
            valid_loss = self.loss_fn(y_valid, self(x_valid, training=False))
            print(f"Epoch {epoch + 1}, Train Loss: {train_loss}, Valid Loss: {valid_loss}")

if __name__ == "__main__":

    df1 = data_frames[0]
    
    #convert the df to numpy array and cast the data to float
    results = df1.to_numpy(dtype='float')

    #define input indices and output indices
    input_indices = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]
    output_indices = [11, 12] 

   # Split data into train and valid sets
    train_size = int(len(results) * 0.9)
    train_features = results[:train_size, input_indices]
    train_targets = results[:train_size, output_indices]
    valid_features = results[train_size:, input_indices]
    valid_targets = results[train_size:, output_indices]

    # Create train and valid sets with input features and targets
    train_set = (train_features, train_targets)
    valid_set = (valid_features, valid_targets)

    #initialize and train the model
    mymodel = Model(model_path="saved_model")
    mymodel.train(train_set, valid_set, maxEpoch=500)   


In [None]:
df7 = data_frames[6]
X = df7.drop(columns=['xCOM', 'yCOM']).values
y = df7[['xCOM', 'yCOM']].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train_reshaped = X_train_scaled.reshape((X_train_scaled.shape[0], 1, X_train_scaled.shape[1]))
X_test_reshaped = X_test_scaled.reshape((X_test_scaled.shape[0], 1, X_test_scaled.shape[1]))

custom_optimizer = Adam(learning_rate=0.001)

model = Sequential()
model.add(LSTM(256, input_shape=(X_train_reshaped.shape[1], X_train_reshaped.shape[2]), return_sequences=True))
#model.add(Dropout(0.2))
model.add(LSTM(256, return_sequences=True))
#model.add(Dropout(0.2)) 
model.add(LSTM(256))
model.add(Dense(2))  # output layer with 2 neurons for xCOM and yCOM
model.compile(loss='mean_squared_error', optimizer=custom_optimizer)
# train
model.fit(X_train_reshaped, y_train, epochs=100, batch_size=128, validation_data=(X_test_reshaped, y_test))
# evaluate
loss = model.evaluate(X_test_reshaped, y_test)
print('Test Loss:', loss)

predictions = model.predict(X_test_reshaped)

In [None]:
class MyLSTMModel:
    def __init__(self, data_frames):
        self.data_frames = data_frames
        self.X_train_reshaped = None
        self.X_test_reshaped = None
        self.y_train = None
        self.y_test = None
        self.model = None

    def prepare_data(self, test_size=0.1):
        df7 = self.data_frames[6]
        X = df7.drop(columns=['xCOM', 'yCOM']).values
        y = df7[['xCOM', 'yCOM']].values

        X_train, X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=test_size)

        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        self.X_train_reshaped = X_train_scaled.reshape((X_train_scaled.shape[0], 1, X_train_scaled.shape[1]))
        self.X_test_reshaped = X_test_scaled.reshape((X_test_scaled.shape[0], 1, X_test_scaled.shape[1]))

    def build_model(self, optimizer='adam'):
        custom_optimizer = Adam(learning_rate=0.001) if optimizer == 'adam' else optimizer

        self.model = Sequential([
            LSTM(256, input_shape=(self.X_train_reshaped.shape[1], self.X_train_reshaped.shape[2]), return_sequences=True),
            LSTM(256, return_sequences=True),
            LSTM(256),
            Dense(2)
        ])
        self.model.compile(loss='mean_squared_error', optimizer=custom_optimizer)

    def train_model(self, epochs=50, batch_size=256):
        self.model.fit(self.X_train_reshaped, self.y_train, epochs=epochs, batch_size=batch_size, validation_data=(self.X_test_reshaped, self.y_test))

    def evaluate_model(self):
        loss = self.model.evaluate(self.X_test_reshaped, self.y_test)
        print('Test Loss:', loss)

    def predict(self):
        predictions = self.model.predict(self.X_test_reshaped)
        return predictions

data_frames
lstm_model = MyLSTMModel(data_frames)
lstm_model.prepare_data()
lstm_model.build_model()
lstm_model.train_model()
lstm_model.evaluate_model()
predictions = lstm_model.predict()