In [None]:
# Mount Google Drive so files stored there can be read from this Colab runtime
from google.colab import drive
drive.mount('/content/drive')

import numpy as np # linear algebra
import pandas as pd

file_path = '/content/drive/My Drive/Colab Notebooks/Blood_brain_pred/ENSG00000096060_blood_brain.csv'

# Read the CSV and promote its first column to the row index in one
# non-mutating expression.
data = pd.read_csv(file_path).pipe(
    lambda frame: frame.set_index(frame.columns[0])
)

Mounted at /content/drive


In [None]:
# Transpose the dataframe so each sample becomes a row and every other label of
# the original index becomes a feature column.
data_transposed = data.T

# Separate the target variable.
# After the transpose the target is a *column* of `data_transposed`; the name
# `target_row` refers to its position in the original (untransposed) table and
# is kept so that any cell relying on it keeps working.
target_row = 'ENSG00000096060'
y = data_transposed[target_row]
X = data_transposed.drop(columns=[target_row])

# Check the shapes to ensure they are as expected: X is (samples, features)
# and y is (samples,).
print(X.shape)
print(y.shape)


(81, 18706)
(81,)


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers.legacy import Adam
from tensorflow.keras.regularizers import l1

# Convert the feature table and the target into plain NumPy arrays
X, y = np.array(X), np.array(y)

# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Number of input features fed to (and reconstructed by) the autoencoders
input_dim = X.shape[1]

# Function to create a basic autoencoder
def create_basic_autoencoder(input_dim, encoding_dim, activation, optimizer, loss):
    """Build and compile a single-hidden-layer autoencoder.

    The network maps ``input_dim`` features to an ``encoding_dim``-unit dense
    layer using ``activation`` and then back to ``input_dim`` outputs through a
    linear dense layer.

    Args:
        input_dim: Width of the input (and of the reconstructed output).
        encoding_dim: Number of units in the encoding layer.
        activation: Activation passed to the encoding layer.
        optimizer: Optimizer handed to ``compile``.
        loss: Loss handed to ``compile``.

    Returns:
        The compiled Keras ``Model``; the encoding layer is the layer at
        index 1 (index 0 is the input layer).
    """
    inputs = Input(shape=(input_dim,))
    bottleneck = Dense(encoding_dim, activation=activation)(inputs)
    reconstruction = Dense(input_dim, activation='linear')(bottleneck)

    model = Model(inputs=inputs, outputs=reconstruction)
    model.compile(optimizer=optimizer, loss=loss)
    return model

# Function to create a sparse autoencoder
def create_sparse_autoencoder(input_dim, encoding_dim, activation, optimizer, loss, sparsity_weight):
    """Build and compile a single-hidden-layer autoencoder with an L1 activity penalty.

    Same layout as the basic autoencoder (dense encoding layer followed by a
    linear dense reconstruction layer), except that the encoding layer carries
    ``activity_regularizer=l1(sparsity_weight)``.

    Args:
        input_dim: Width of the input (and of the reconstructed output).
        encoding_dim: Number of units in the encoding layer.
        activation: Activation passed to the encoding layer.
        optimizer: Optimizer handed to ``compile``.
        loss: Loss handed to ``compile``.
        sparsity_weight: Coefficient given to the ``l1`` activity regularizer.

    Returns:
        The compiled Keras ``Model``; the encoding layer is the layer at
        index 1 (index 0 is the input layer).
    """
    inputs = Input(shape=(input_dim,))
    sparse_code = Dense(
        encoding_dim,
        activation=activation,
        activity_regularizer=l1(sparsity_weight),
    )(inputs)
    reconstruction = Dense(input_dim, activation='linear')(sparse_code)

    model = Model(inputs=inputs, outputs=reconstruction)
    model.compile(optimizer=optimizer, loss=loss)
    return model

# Grid search parameters
encoding_dims = [5, 10, 15]
activations = ['relu']
# These optimizer instances are used as *templates* only. Every candidate model
# is compiled with a fresh copy built from the template's config, so optimizer
# state (step counter, accumulated moments) never carries over from one model
# to the next.
optimizers = [Adam(learning_rate=0.0001), Adam(learning_rate=0.001), Adam(learning_rate=0.01)]
losses = ['mean_squared_error']
sparsity_weights = [1e-5, 1e-4, 1e-3]  # For the sparse autoencoder
autoencoder_types = ['basic', 'sparse']

best_mse = float('inf')
best_r2 = float('-inf')
best_params = {}

# Grid search
# NOTE(review): candidates are ranked by their MSE on the held-out test split,
# so the "best" score printed at the end is an optimistic estimate. Consider
# selecting on a separate validation split and scoring the winner once on the
# test split.
for encoding_dim in encoding_dims:
    for activation in activations:
        for optimizer in optimizers:
            # Human-readable description of the optimizer; the default repr is
            # only "<... object at 0x...>".
            learning_rate = optimizer.get_config().get('learning_rate')
            optimizer_label = f'{type(optimizer).__name__}(learning_rate={learning_rate})'

            for loss in losses:
                for autoencoder_type in autoencoder_types:
                    # The basic autoencoder has no sparsity setting, so it is
                    # trained once; the sparse one is trained once per weight.
                    if autoencoder_type == 'basic':
                        candidate_weights = [None]
                    elif autoencoder_type == 'sparse':
                        candidate_weights = sparsity_weights
                    else:
                        raise ValueError(f'Unknown autoencoder type: {autoencoder_type!r}')

                    for sparsity_weight in candidate_weights:
                        # Fresh, unused optimizer with the template's hyperparameters
                        fresh_optimizer = type(optimizer).from_config(optimizer.get_config())

                        if autoencoder_type == 'sparse':
                            autoencoder = create_sparse_autoencoder(input_dim, encoding_dim, activation, fresh_optimizer, loss, sparsity_weight)
                        else:
                            autoencoder = create_basic_autoencoder(input_dim, encoding_dim, activation, fresh_optimizer, loss)

                        # Train the autoencoder to reconstruct its own input
                        autoencoder.fit(X_train, X_train, epochs=250, batch_size=10, validation_split=0.2, verbose=0)

                        # Create the encoder: input -> encoding layer (layer index 1)
                        encoder = Model(inputs=autoencoder.input, outputs=autoencoder.get_layer(index=1).output)
                        X_train_encoded = encoder.predict(X_train, verbose=0)
                        X_test_encoded = encoder.predict(X_test, verbose=0)

                        print(f'Encoded train shape ({autoencoder_type}): {X_train_encoded.shape}')
                        print(f'Encoded test shape ({autoencoder_type}): {X_test_encoded.shape}')

                        run_summary = f'Autoencoder type: {autoencoder_type}, Encoding dim: {encoding_dim}, Activation: {activation}, Optimizer: {optimizer_label}, Loss: {loss}'
                        if autoencoder_type == 'sparse':
                            run_summary += f', Sparsity weight: {sparsity_weight}'

                        # A diverged training run yields NaN/inf encodings, which
                        # would make LinearRegression raise and abort the whole
                        # search; skip that candidate instead.
                        if not (np.isfinite(X_train_encoded).all() and np.isfinite(X_test_encoded).all()):
                            print(run_summary)
                            print('Skipped: encoded features are not finite (training diverged)')
                            continue

                        # Use the encoded features in a regression model
                        regressor = LinearRegression()
                        regressor.fit(X_train_encoded, y_train)

                        # Predict and evaluate the model
                        y_pred = regressor.predict(X_test_encoded)

                        mse = mean_squared_error(y_test, y_pred)
                        r2 = r2_score(y_test, y_pred)

                        print(run_summary)
                        print(f'Mean Squared Error: {mse}, R-squared: {r2}')

                        # Update best parameters if current model is better
                        if mse < best_mse:
                            best_mse = mse
                            best_r2 = r2
                            best_params = {
                                'autoencoder_type': autoencoder_type,
                                'encoding_dim': encoding_dim,
                                'activation': activation,
                                # The untouched template, safe to clone or compile later
                                'optimizer': optimizer,
                                'learning_rate': learning_rate,
                                'loss': loss,
                            }
                            if autoencoder_type == 'sparse':
                                best_params['sparsity_weight'] = sparsity_weight

print('Best parameters found:')
print(best_params)
print(f'Best Mean Squared Error: {best_mse}')
print(f'Best R-squared: {best_r2}')


Encoded train shape (basic): (64, 5)
Encoded test shape (basic): (17, 5)
Autoencoder type: basic, Encoding dim: 5, Activation: relu, Optimizer: <keras.src.optimizers.legacy.adam.Adam object at 0x7dda0a9cc340>, Loss: mean_squared_error
Mean Squared Error: 0.5806929510613764, R-squared: -0.0723206301289625
Encoded train shape (sparse): (64, 5)
Encoded test shape (sparse): (17, 5)
Autoencoder type: sparse, Encoding dim: 5, Activation: relu, Optimizer: <keras.src.optimizers.legacy.adam.Adam object at 0x7dda0a9cc340>, Loss: mean_squared_error, Sparsity weight: 1e-05
Mean Squared Error: 0.5696714568005572, R-squared: -0.05196809158148641
Encoded train shape (sparse): (64, 5)
Encoded test shape (sparse): (17, 5)
Autoencoder type: sparse, Encoding dim: 5, Activation: relu, Optimizer: <keras.src.optimizers.legacy.adam.Adam object at 0x7dda0a9cc340>, Loss: mean_squared_error, Sparsity weight: 0.0001
Mean Squared Error: 0.5696714568005572, R-squared: -0.05196809158148641
Encoded train shape (spar