In [None]:
# Load libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import shap
import os
import optuna
import scipy
from pathlib import Path
from shap import Explanation
from ydata_profiling import ProfileReport
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split, learning_curve, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, roc_curve, auc, classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE, MDS
from sklearn.ensemble import IsolationForest
from sklearn.impute import KNNImputer
from sklearn.feature_selection import VarianceThreshold
from scipy.spatial.distance import pdist, squareform
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor, HistGradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from catboost import CatBoostRegressor
from scipy.stats import uniform, randint
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import VarianceThreshold
from scipy.stats import uniform, randint
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import ExtraTreesRegressor, BaggingRegressor, HistGradientBoostingRegressor
from umap.umap_ import UMAP
from sklearn.preprocessing import KBinsDiscretizer
from ctgan import CTGAN
import tensorflow as tf
from tensorflow.keras import layers, regularizers
from utils import plot_all_learning_curves, evaluate_models, prefix_params

import warnings
# Suppress every warning category for the rest of the session; the second call
# repeats the suppression explicitly for FutureWarning.
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

# Let pandas display every column and every row of a frame (None removes the cap)
for display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(display_option, None)

# Use the directory the kernel was started in as the working directory
base_path = Path.cwd()
os.chdir(base_path)

# Show where relative paths such as "input/..." will be resolved from
print(base_path)

In [None]:
# Load the three raw input tables used by the rest of the notebook
print("Read in data...")

# Input locations, relative to the working directory
expression_csv = "input/CCLE_expression.csv"
sample_info_csv = "input/sample_info.csv"
dose_response_xlsx = "input/GDSC2_fitted_dose_response_25Feb20.xlsx"

# Gene expression dataset
expr = pd.read_csv(expression_csv)

# Metadata of the samples
metadata = pd.read_csv(sample_info_csv)

# Drug sensitivity data
sens = pd.read_excel(dose_response_xlsx)

In [None]:
# Align the sample metadata with the expression table and join everything into one frame
print("Data wrangling...")

# Sample identifiers as they appear in the expression table
expr_sample_ids = expr['Unnamed: 0']

# Keep only metadata rows whose DepMap_ID occurs in the expression table, then put
# them in the expression table's row order. The merge below expects the restored
# identifier column to be called 'Unnamed: 0'.
# NOTE: `metadata` is overwritten here, so this cell cannot be re-run without
# reloading the raw metadata first.
has_expression = metadata['DepMap_ID'].isin(expr_sample_ids)
metadata = (
    metadata[has_expression]
    .set_index('DepMap_ID')
    .reindex(expr_sample_ids)
    .reset_index()
)

# Only the Lapatinib rows of the sensitivity table are modelled
lapatinib_sens = sens[sens['DRUG_NAME'] == 'Lapatinib']

# Bring together all tables into one wide table:
# expression + metadata (by sample id), then + drug response (by Sanger model id)
expr_with_metadata = expr.merge(metadata, left_on='Unnamed: 0', right_on='Unnamed: 0')
merged_df = expr_with_metadata.merge(
    lapatinib_sens, left_on='Sanger_Model_ID', right_on='SANGER_MODEL_ID'
)

# Replace unknown sex with NaN so it can be imputed later
sex_is_unknown = merged_df["sex"] == "Unknown"
merged_df.loc[sex_is_unknown, "sex"] = np.nan

In [None]:
# Feature matrix: the columns of merged_df at the positions of the expression table's
# columns, skipping the leading identifier column at position 0. No rescaling is done here.
n_expr_columns = expr.shape[1]
subset_df = merged_df.iloc[:, 1:n_expr_columns]

# Hold out 20% of the rows for testing; the target is LN_IC50 and the seed is fixed.
# These four names are reassigned in a later cell once more features are appended.
ln_ic50 = merged_df['LN_IC50']
X_train, X_test, y_train, y_test = train_test_split(
    subset_df,
    ln_ic50,
    test_size=0.2,
    random_state=17,
)

In [None]:
# Baseline regressors, keyed by class name
print("Define pipeline and models...")

# Linear family
linear_models = {
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'ElasticNet': ElasticNet(),
}

# Single tree and tree ensembles
tree_models = {
    'DecisionTreeRegressor': DecisionTreeRegressor(),
    'RandomForestRegressor': RandomForestRegressor(),
    'ExtraTreesRegressor': ExtraTreesRegressor(),
    'GradientBoostingRegressor': GradientBoostingRegressor(),
    'AdaBoostRegressor': AdaBoostRegressor(),
    'BaggingRegressor': BaggingRegressor(),
    'HistGradientBoostingRegressor': HistGradientBoostingRegressor(),
}

# Kernel, neighbour-based and CatBoost models (CatBoost's training log is silenced)
other_models = {
    'SVR': SVR(),
    'KNeighborsRegressor': KNeighborsRegressor(),
    'CatBoostRegressor': CatBoostRegressor(verbose=0),
}

# Merge the groups; insertion order is linear -> tree -> other
models = {**linear_models, **tree_models, **other_models}

# Define models and the hyperparameter distributions to search for each of them
def _search_space(estimator, **distributions):
    """Bundle an estimator with the hyperparameter distributions to sample for it.

    Returns a dict with the estimator under 'model' and the keyword arguments
    (hyperparameter name -> distribution or list of choices) under 'params'.
    """
    return {'model': estimator, 'params': distributions}


# Reading the distributions: scipy's uniform(loc, scale) covers [loc, loc + scale],
# and randint(low, high) draws integers from low up to but excluding high.
model_params = {
    'LinearRegression': _search_space(LinearRegression()),
    'Ridge': _search_space(Ridge(), alpha=uniform(0.1, 10)),
    'Lasso': _search_space(Lasso(), alpha=uniform(0.1, 10)),
    'ElasticNet': _search_space(
        ElasticNet(),
        alpha=uniform(0.1, 10),
        l1_ratio=uniform(0, 1),
    ),
    'DecisionTreeRegressor': _search_space(
        DecisionTreeRegressor(),
        max_depth=randint(2, 20),
        min_samples_split=randint(2, 20),
    ),
    'RandomForestRegressor': _search_space(
        RandomForestRegressor(),
        n_estimators=randint(50, 200),
        max_depth=randint(2, 20),
    ),
    'ExtraTreesRegressor': _search_space(
        ExtraTreesRegressor(),
        n_estimators=randint(50, 200),
        max_depth=randint(2, 20),
    ),
    'GradientBoostingRegressor': _search_space(
        GradientBoostingRegressor(),
        n_estimators=randint(50, 200),
        learning_rate=uniform(0.01, 0.3),
    ),
    'AdaBoostRegressor': _search_space(
        AdaBoostRegressor(),
        n_estimators=randint(50, 200),
        learning_rate=uniform(0.01, 1.0),
    ),
    'BaggingRegressor': _search_space(
        BaggingRegressor(),
        n_estimators=randint(10, 100),
    ),
    'HistGradientBoostingRegressor': _search_space(
        HistGradientBoostingRegressor(),
        learning_rate=uniform(0.01, 0.3),
        max_depth=randint(2, 20),
    ),
    'SVR': _search_space(
        SVR(),
        C=uniform(0.1, 10),
        gamma=uniform(0.01, 1),
    ),
    'KNeighborsRegressor': _search_space(
        KNeighborsRegressor(),
        n_neighbors=randint(2, 20),
        weights=['uniform', 'distance'],
    ),
    'CatBoostRegressor': _search_space(
        CatBoostRegressor(verbose=0),
        learning_rate=uniform(0.01, 0.3),
        depth=randint(2, 10),
    ),
}

In [None]:
categorical_cols = ['primary_disease', 'sex']
numerical_cols = ['age']  # Keep age numeric
clinical_cols = categorical_cols + numerical_cols

# Assemble the feature table: expression columns first, clinical covariates appended last
expression_df = merged_df.iloc[:, 1:(expr.shape)[1]]
# Name the expression columns explicitly instead of slicing them off by position
# (previously `columns[:-3]`), so that editing the clinical column lists above
# cannot silently route the wrong columns into the expression pipeline.
expression_cols = list(expression_df.columns)
subset_df = pd.concat([expression_df, merged_df[clinical_cols]], axis=1)

# Split the data (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(subset_df, merged_df['LN_IC50'], test_size=0.2, random_state=17)

# Define preprocessing steps; each transformer only sees the columns listed for it
preprocessor = ColumnTransformer(
    transformers=[
        # Expression columns: drop features with variance <= 0.02, then standardise
        ('num', Pipeline([
            ('variancethreshold', VarianceThreshold(threshold=0.02)),
            ('scaler', StandardScaler())
        ]), expression_cols),

        # Categorical covariates: fill gaps with the most frequent level, then one-hot
        # encode; levels unseen during fit are ignored at transform time
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_cols),

        # Numeric covariates (currently just age): mean-impute, then standardise.
        # The transformer keeps its original name 'age'.
        ('age', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ]), numerical_cols)
    ]
)

In [None]:
print("Train GAN...")

# Extract relevant features for GAN training
columns_to_synthesize = categorical_cols + numerical_cols + list(X_train.columns[:-3]) # Features to generate
discrete_columns = categorical_cols  # Specify categorical columns

# Train a CTGAN model
ctgan = CTGAN(epochs=150, batch_size=100, verbose=True)
ctgan.fit(X_train[columns_to_synthesize], discrete_columns)


In [None]:
# Generate synthetic samples
num_samples = 500
synthetic_data = ctgan.sample(num_samples)

In [None]:
# Merge real and synthetic data
X_train = pd.concat([X_train[columns_to_synthesize], synthetic_data], axis=0)

# Ensure target variable has the same number of labels (you can either duplicate y_train or generate labels another way)
y_train = pd.concat([y_train, y_train.sample(n=num_samples, replace=True)], axis=0)

## Create a deep learning (DL) model

In [None]:

def create_model(input_dim, l2_reg=0.01, hidden_layers=((512, 0.3), (256, 0.3), (128, 0.2))):
    """Build an uncompiled fully connected network with a single linear output.

    Args:
        input_dim: Number of input features per sample.
        l2_reg: L2 penalty factor applied to the kernel of every hidden Dense layer.
        hidden_layers: Iterable of ``(units, dropout_rate)`` pairs, one per hidden
            block, in order from input to output. The default reproduces the
            original fixed 512/256/128 architecture.

    Returns:
        A ``tf.keras.Sequential`` model: input batch normalisation, then for each
        hidden block Dense(relu) -> BatchNormalization -> Dropout, then Dense(1).
    """
    # Input layer followed by normalisation of the raw features
    stack = [
        layers.InputLayer(input_shape=(input_dim,)),
        layers.BatchNormalization(),
    ]

    # One Dense -> BatchNorm -> Dropout block per requested hidden layer
    for units, dropout_rate in hidden_layers:
        stack.extend([
            layers.Dense(units, activation='relu', kernel_regularizer=regularizers.l2(l2_reg)),
            layers.BatchNormalization(),
            layers.Dropout(dropout_rate),
        ])

    # Output layer: one unit, no activation (linear) for regression
    stack.append(layers.Dense(1))

    return tf.keras.Sequential(stack)


In [None]:
# Fit the preprocessing on the training rows, then apply the fitted
# transformation to the test rows without refitting
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Densify the outputs if the transformer returned scipy sparse matrices;
# anything else is left untouched
if scipy.sparse.issparse(X_train_preprocessed):
    X_train_preprocessed = X_train_preprocessed.toarray()
if scipy.sparse.issparse(X_test_preprocessed):
    X_test_preprocessed = X_test_preprocessed.toarray()


In [None]:
print("Compile DL...", flush=True)

# The network's input width is the number of columns left after preprocessing
input_dim = X_train_preprocessed.shape[1]
model = create_model(input_dim)

# Adam optimiser, mean squared error as the loss, and MSE also tracked as a metric
mse_loss = tf.keras.losses.MeanSquaredError()
model.compile(
    optimizer='adam',
    loss=mse_loss,
    metrics=['mse'],
)

# Show the layer-by-layer architecture
model.summary()

In [None]:
# Define early stopping: stop after 10 epochs without validation-loss improvement
# and roll back to the best weights seen
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=10, restore_best_weights=True
)

# Keras takes the `validation_split` fraction from the END of the arrays, before any
# shuffling. The synthetic rows were appended after the real ones, so without
# shuffling the validation set (and early stopping) would consist mostly of
# synthetic samples. Shuffle once with a fixed seed so the held-out 20% is a
# random mix of the training rows.
shuffle_rng = np.random.default_rng(17)
fit_order = shuffle_rng.permutation(len(X_train_preprocessed))
X_fit = X_train_preprocessed[fit_order]
y_fit = np.asarray(y_train)[fit_order]

# Train the model
history = model.fit(
    X_fit, y_fit,
    epochs=250,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stopping],
    verbose=1
)


In [None]:
# r2_score is not part of the notebook's top import cell, so it is imported here
from sklearn.metrics import r2_score

print("Evaluate DL...", flush=True)

# Evaluate on test set. The model was compiled with metrics=['mse'], so the second
# value returned is the mean squared error (not MAE, and not negated).
test_loss, test_mse = model.evaluate(X_test_preprocessed, y_test, verbose=0)
print(f"Test MSE: {test_mse:.4f}", flush=True)

# Make predictions
y_pred = model.predict(X_test_preprocessed)

# Calculate R-squared and MSE from the predictions with scikit-learn
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print(f"R-squared: {r2:.4f}", flush=True)
print(f"MSE: {mse:.4f}", flush=True)

In [None]:
print("Visualize DL...", flush=True)

# Two panels side by side: the optimised loss and the tracked MSE metric
fig, (loss_ax, mse_ax) = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: training vs validation loss per epoch
loss_ax.plot(history.history['loss'], label='Training Loss')
loss_ax.plot(history.history['val_loss'], label='Validation Loss')
loss_ax.set_title('Model Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.legend()

# Right panel: the 'mse' metric. The curves are mean squared error, so they are
# labelled as MSE (the previous labels said MAE / neg_mean_squared_error).
mse_ax.plot(history.history['mse'], label='Training MSE')
mse_ax.plot(history.history['val_mse'], label='Validation MSE')
mse_ax.set_title('Model MSE')
mse_ax.set_xlabel('Epoch')
mse_ax.set_ylabel('MSE')
mse_ax.legend()

fig.tight_layout()

# Make sure the output directory exists so saving does not fail on a fresh checkout
output_pdf = Path("output/gan_neural_network_learning.pdf")
output_pdf.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(output_pdf)
plt.show()
