**Imports**

In [24]:
import tensorflow as tf
import pandas as pd
import numpy as np
import os
import csv
import math
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.layers import Embedding, LSTM, Dense, Flatten, Reshape, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from keras.constraints import MinMaxNorm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

**CUDA**

In [2]:
#os.environ["CUDA_VISIBLE_DEVICES"] = "0"
print(tf.config.list_physical_devices('GPU'))

[]


# **Data Preparation**

**Convert .txt to .csv**

In [None]:
input_folder = "C:/Users/Ioannis/Documents/UvA thesis/UvA-thesis/data/concentrations_txt/S8"
output_folder = "C:/Users/Ioannis/Documents/UvA thesis/UvA-thesis/data/concentrations/S8"

# create output folder if it doesn't exist
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# iterate through files in input folder
for filename in os.listdir(input_folder):
    if filename.endswith(".txt"):
        try:
            # construct input and output file paths
            input_filepath = os.path.join(input_folder, filename)
            output_filepath = os.path.join(output_folder, os.path.splitext(filename)[0] + ".csv")

            # read txt file and remove leading and trailing quotation marks from each line
            with open(input_filepath, 'r') as file:
                lines = [line.strip().strip('"') for line in file.readlines()]

            # convert to df and save as csv
            df = pd.DataFrame([line.split(',') for line in lines])
            df.to_csv(output_filepath, index=False, header=False, quoting=csv.QUOTE_NONE, escapechar=' ')
            print(f"Converted {input_filepath} to {output_filepath}")
        except Exception as e:
            print(f"Error converting {input_filepath}: {e}")


**Merge data in batches to create the 'merged_data' folder containing all the available data**

In [None]:
cellcounts_folder = 'C:/Users/Ioannis/Documents/UvA thesis/UvA-thesis/data/cellcounts'
base_cytokine_folder = 'C:/Users/Ioannis/Documents/UvA thesis/UvA-thesis/data/concentrations'
output_folder = 'C:/Users/Ioannis/Documents/UvA thesis/UvA-thesis/data/merged_data'

if not os.path.exists(output_folder):
    os.makedirs(output_folder)

cellcounts_batch_size = 8
cytokine_batch_size = 101

for batch_index in range(1, cellcounts_batch_size + 1):
    print(f"Processing batch {batch_index}")
    cellcounts_file = os.path.join(cellcounts_folder, f'cellcount S{batch_index}.csv')
    print("Cellcounts file:", cellcounts_file)
    cytokine_subfolder = f'S{batch_index}'
    cytokine_folder = os.path.join(base_cytokine_folder, cytokine_subfolder)
    df_cellcounts_batch = pd.read_csv(cellcounts_file)
    print("Cellcounts batch shape:", df_cellcounts_batch.shape)
    df_cytokine_batch = pd.DataFrame()
    for cytokine_file in os.listdir(cytokine_folder):
        if cytokine_file.endswith('.csv'):
            file_path = os.path.join(cytokine_folder, cytokine_file)
            df_cytokine = pd.read_csv(file_path)
            df_cytokine_batch = pd.concat([df_cytokine_batch, df_cytokine], ignore_index=True)

    print("Cytokine batch shape:", df_cytokine_batch.shape)

    merged_data = pd.merge(df_cellcounts_batch, df_cytokine_batch, on='mcsteps')
    print("Merged data shape:", merged_data.shape)

    output_filename = os.path.join(output_folder, f'combined_data_batch_{batch_index}.csv')
    merged_data.to_csv(output_filename, index=False)
    print(f"Saved merged data to {output_filename}")

**Concat the cytokines data to create the 'all_data' folder**

In [5]:
base_cytokine_folder = 'C:/Users/Ioannis/Documents/UvA thesis/UvA-thesis/data/concentrations'
output_folder = 'C:/Users/Ioannis/Documents/UvA thesis/UvA-thesis/data/all_data'

def create_simulation_subfolders(output_folder, simulation_names):
    for sim_name in simulation_names:
        sim_folder = os.path.join(output_folder, sim_name)
        os.makedirs(sim_folder, exist_ok=True)


def concat_files_in_folder(folder_path):
    all_data = pd.DataFrame()
    for filename in os.listdir(folder_path):
        if filename.endswith('.csv'):
            file_path = os.path.join(folder_path, filename)
            df = pd.read_csv(file_path)
            all_data = pd.concat([all_data, df])
    return all_data

def sort_by_mcsteps(data):
    return data.sort_values(by='mcsteps')

simulation_names = ['S1', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8']
create_simulation_subfolders(output_folder, simulation_names)

for sim_name in simulation_names:
    sim_folder_path = os.path.join(output_folder, sim_name)
    sim_data = concat_files_in_folder(os.path.join(base_cytokine_folder, sim_name))
    sorted_data = sort_by_mcsteps(sim_data)
    output_file_path = os.path.join(sim_folder_path, f'{sim_name}_data_sorted.csv')
    sorted_data.to_csv(output_file_path, index=False)

**Load data to cover all initializations (df1 to df8) and drop zCOM column as we have 2D spatial data**

In [6]:
output_folder = 'C:/Users/Ioannis/Documents/UvA thesis/UvA-thesis/data/all_data'

def load_and_drop_zCOM(folder_path):
    df = pd.concat([pd.read_csv(os.path.join(folder_path, file)) for file in os.listdir(folder_path) if file.endswith('.csv')])
    df.drop(columns=['zCOM'], inplace=True)
    return df

dfs = []
for i in range(1, 9):
    sim_folder_path = os.path.join(output_folder, f'S{i}')
    df = load_and_drop_zCOM(sim_folder_path)
    dfs.append(df)

df1, df2, df3, df4, df5, df6, df7, df8 = dfs

**Print dataframe to make sure it works as intended**

In [18]:
print(df8.info)

<bound method DataFrame.info of         mcsteps  xCOM  yCOM           il8           il1           il6  \
0             0   147   116  8.603181e-10  0.000000e+00  0.000000e+00   
1             0   251   364  9.141505e-10  0.000000e+00  0.000000e+00   
2             0   279   112  9.350631e-10  0.000000e+00  0.000000e+00   
3             0    83   288  9.888261e-10  0.000000e+00  0.000000e+00   
4             0   171   371  1.105985e-09  0.000000e+00  0.000000e+00   
...         ...   ...   ...           ...           ...           ...   
569270  1000000   172   192  1.493359e-08  1.195303e-11  3.136481e-27   
569271  1000000   141   107  1.232809e-08  1.575511e-11  3.546226e-15   
569272  1000000    81   257  1.258248e-08  1.729580e-11  1.002741e-11   
569273  1000000   127   264  1.046952e-08  9.687031e-12  1.480615e-14   
569274  1000000   298   363  1.142387e-08  7.674055e-12  5.401783e-25   

                il10           tnf           tgf  
0       0.000000e+00  0.000000e+00  0.00

**Create an array for each unique mcsteps value**

In [21]:
# Define cytokines
cytokines = ['il8', 'il1', 'il6', 'il10', 'tnf', 'tgf']
# Get unique 'mcsteps' values
unique_mcsteps = df8['mcsteps'].unique()

# Define array to store results
arrays = {}

# Iterate over unique 'mcsteps' values
for mcstep in unique_mcsteps:
    # Filter data for current 'mcsteps' value
    df_mcstep = df8[df8['mcsteps'] == mcstep]
    
    # Initialize 500x500 array for current 'mcsteps' value
    array = np.zeros((500, 500, len(cytokines)))
    
    # Iterate over rows in filtered DataFrame
    for index, row in df_mcstep.iterrows():
        # Get X and Y coordinates
        x = int(row['xCOM'])
        y = int(row['yCOM'])
        
        # Get cytokine concentrations
        concentrations = row[['il8', 'il1', 'il6', 'il10', 'tnf', 'tgf']].values
        
        # Assign cytokine concentrations to corresponding position in array
        array[x, y] = concentrations
    
    # Store array for current 'mcsteps' value
    arrays[mcstep] = array

**Print to make sure array works as intended**

In [50]:
print("Number of arrays:", len(arrays))
array = arrays[1000000]
print("Shape of the array:", array.shape)
print("Value at position (356,200):", array[356,200])

Number of arrays: 101
Shape of the array: (500, 500, 6)
Value at position (356,200): [1.1378244e-08 2.0458439e-11 5.3863960e-15 5.2224680e-23 7.6039260e-18
 1.2120631e-11]


# **Models**

In [55]:
# Assuming you have an array named arrays where each element represents an array with shape (500, 500, 6)
# The indices for accessing the arrays are distinct and range from some arbitrary values

# Generate corresponding target data (just as placeholders)
y = np.random.rand(len(arrays), 2)  # Placeholder for target data, adjust as per your actual targets

# Split data into train and test sets
arrays_train, arrays_test, y_train, y_test = train_test_split(arrays, y, test_size=0.1, random_state=30000)

# Stack the arrays to create a 4D array
X_train = np.stack(arrays_train)
X_test = np.stack(arrays_test)

# Build the LSTM model
model = Sequential([
    LSTM(256, input_shape=(500, 6), return_sequences=True),
    LSTM(256, return_sequences=True),
    LSTM(256),
    Dense(2)  # Assuming 2 output values
])

# Compile the model
model.compile(loss='mean_squared_error', optimizer='adam')

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model
loss = model.evaluate(X_test, y_test)
print('Test Loss:', loss)

# Make predictions
predictions = model.predict(X_test)
print('Predictions shape:', predictions.shape)

KeyError: 65

In [None]:
with tf.device('/GPU:0'):
 class Model(tf.keras.Model):
    def __init__(self, model_path, train_mode=True, input_dim=19, lstm_size=500, batch_size=4, e_learning_rate=1e-5):
        super(Model, self).__init__()
        self.model_path = model_path
        self.train_mode = train_mode
        self.input_dim = input_dim
        self.lstm_size = lstm_size
        self.batch_size = batch_size
        self.e_learning_rate = e_learning_rate

        #define LSTM layer
        self.lstm_layer = tf.keras.layers.LSTM(units=self.lstm_size, return_sequences=True)

        #define output layer
        self.output_layer = tf.keras.layers.Dense(units=2, activation=None)
        # Define reshape layer
        self.reshape_layer = tf.keras.layers.Reshape((500, 500))  # Reshape to match spatial dimensions

        #define optimizer
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=self.e_learning_rate)

        #define MSE as loss function
        self.loss_fn = tf.keras.losses.MeanSquaredError()

        
        
    def call(self, inputs):
        # Reshape inputs to add sequence length dimension
        inputs = tf.expand_dims(inputs, axis=1)
        
        # Input shape: (batch_size, sequence_length, input_dim)
        x = self.lstm_layer(inputs)
        # Output shape: (batch_size, sequence_length, lstm_size)
        output = self.output_layer(x)
        # Output shape: (batch_size, sequence_length, 2) - 2 for outputs
        
        # Reshape output to match spatial dimensions
        output = self.reshape_layer(output)
        return output

    def train_step(self, xtrain, ytrain):
        with tf.GradientTape() as tape:
            y_pred = self(xtrain, training=True)
            loss = self.loss_fn(ytrain, y_pred)
        gradients = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
        return loss

    def train(self, train_set, valid_set, maxEpoch=10):
        x_train, y_train = train_set
        x_valid, y_valid = valid_set
        
        for epoch in range(maxEpoch):
            train_loss = self.train_step(x_train, y_train)
            valid_loss = self.loss_fn(y_valid, self(x_valid, training=False))
            print(f"Epoch {epoch + 1}, Train Loss: {train_loss}, Valid Loss: {valid_loss}")



if __name__ == "__main__":

    df1 = data_frames[0]
    
    #convert the df to numpy array and cast the data to float
    results = df1.to_numpy(dtype='float')

    #define input indices and output indices
    input_indices = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]
    output_indices = [11, 12] 

   # Split data into train and valid sets
    train_size = int(len(results) * 0.9)
    train_features = results[:train_size, input_indices]
    train_targets = results[:train_size, output_indices]
    valid_features = results[train_size:, input_indices]
    valid_targets = results[train_size:, output_indices]

    # Create train and valid sets with input features and targets
    train_set = (train_features, train_targets)
    valid_set = (valid_features, valid_targets)

    #initialize and train the model
    mymodel = Model(model_path="saved_model")
    mymodel.train(train_set, valid_set, maxEpoch=500)   


In [97]:
print(df1.info)
print(df1.shape)

<bound method DataFrame.info of        mcsteps     1       2      3      4     5    6      7    8      9  \
0            0  10.0  1072.0  972.0  109.0  25.0  0.0  121.0  0.0    0.0   
1            0  10.0  1072.0  972.0  109.0  25.0  0.0  121.0  0.0    0.0   
2            0  10.0  1072.0  972.0  109.0  25.0  0.0  121.0  0.0    0.0   
3            0  10.0  1072.0  972.0  109.0  25.0  0.0  121.0  0.0    0.0   
4            0  10.0  1072.0  972.0  109.0  25.0  0.0  121.0  0.0    0.0   
...        ...   ...     ...    ...    ...   ...  ...    ...  ...    ...   
74753  1000000  10.0    14.0   84.0  105.0   2.0  0.0   38.0  0.0  307.0   
74754  1000000  10.0    14.0   84.0  105.0   2.0  0.0   38.0  0.0  307.0   
74755  1000000  10.0    14.0   84.0  105.0   2.0  0.0   38.0  0.0  307.0   
74756  1000000  10.0    14.0   84.0  105.0   2.0  0.0   38.0  0.0  307.0   
74757  1000000  10.0    14.0   84.0  105.0   2.0  0.0   38.0  0.0  307.0   

          10  xCOM  yCOM           il8           il1   

In [None]:
df7 = data_frames[6]
# separate features (X) and target variables (y)
X = df7.drop(columns=['xCOM', 'yCOM']).values
y = df7[['xCOM', 'yCOM']].values

# split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# normalize features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# reshape input data to 3D for LSTM input
# the input shape is samples, timesteps, features
X_train_reshaped = X_train_scaled.reshape((X_train_scaled.shape[0], 1, X_train_scaled.shape[1]))
X_test_reshaped = X_test_scaled.reshape((X_test_scaled.shape[0], 1, X_test_scaled.shape[1]))

custom_optimizer = Adam(learning_rate=0.001) # learning rate

model = Sequential()
model.add(LSTM(256, input_shape=(X_train_reshaped.shape[1], X_train_reshaped.shape[2]), return_sequences=True))
#model.add(Dropout(0.2))  # dropout regularization
model.add(LSTM(256, return_sequences=True))
#model.add(Dropout(0.2)) 
model.add(LSTM(256))
model.add(Dense(2))  # output layer with 2 neurons for xCOM and yCOM
model.compile(loss='mean_squared_error', optimizer=custom_optimizer)

# train
model.fit(X_train_reshaped, y_train, epochs=100, batch_size=128, validation_data=(X_test_reshaped, y_test))

# evaluate
loss = model.evaluate(X_test_reshaped, y_test)
print('Test Loss:', loss)

predictions = model.predict(X_test_reshaped)

In [138]:
class MyLSTMModel:
    def __init__(self, data_frames):
        self.data_frames = data_frames
        self.X_train_reshaped = None
        self.X_test_reshaped = None
        self.y_train = None
        self.y_test = None
        self.model = None

    def prepare_data(self, test_size=0.1):
        df7 = self.data_frames[6]
        X = df7.drop(columns=['xCOM', 'yCOM']).values
        y = df7[['xCOM', 'yCOM']].values

        X_train, X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=test_size)

        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        self.X_train_reshaped = X_train_scaled.reshape((X_train_scaled.shape[0], 1, X_train_scaled.shape[1]))
        self.X_test_reshaped = X_test_scaled.reshape((X_test_scaled.shape[0], 1, X_test_scaled.shape[1]))

    def build_model(self, optimizer='adam'):
        custom_optimizer = Adam(learning_rate=0.001) if optimizer == 'adam' else optimizer

        self.model = Sequential([
            LSTM(256, input_shape=(self.X_train_reshaped.shape[1], self.X_train_reshaped.shape[2]), return_sequences=True),
            LSTM(256, return_sequences=True),
            LSTM(256),
            Dense(2)
        ])
        self.model.compile(loss='mean_squared_error', optimizer=custom_optimizer)

    def train_model(self, epochs=50, batch_size=256):
        self.model.fit(self.X_train_reshaped, self.y_train, epochs=epochs, batch_size=batch_size, validation_data=(self.X_test_reshaped, self.y_test))

    def evaluate_model(self):
        loss = self.model.evaluate(self.X_test_reshaped, self.y_test)
        print('Test Loss:', loss)

    def predict(self):
        predictions = self.model.predict(self.X_test_reshaped)
        return predictions

data_frames
lstm_model = MyLSTMModel(data_frames)
lstm_model.prepare_data()
lstm_model.build_model()
lstm_model.train_model()
lstm_model.evaluate_model()
predictions = lstm_model.predict()

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Test Loss: 12387.54296875
