In [1]:
# Basic imports
import pandas as pd
import numpy as np
import logging
import warnings
import os
from collections import Counter
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
# PyTorch imports
import torch

# PyTorch Tabular imports (model classes and the categorical embedding encoder)
from pytorch_tabular import TabularModel
from pytorch_tabular.models.category_embedding import CategoryEmbeddingModel

from pytorch_tabular.categorical_encoders import CategoricalEmbeddingTransformer
import sys
sys.path.append('../')
from src.data.process_data import load_dataset, split_dataset

# Keep cell output readable: only let pytorch_lightning log errors, and
# ignore the warning categories listed below.
logging.getLogger("pytorch_lightning").setLevel(logging.ERROR)
for _silenced_category in (UserWarning, FutureWarning, pd.errors.PerformanceWarning):
    # Filters are registered in the same order as the original three calls.
    warnings.filterwarnings("ignore", category=_silenced_category)
del _silenced_category  # do not leave the loop variable in the notebook namespace


In [2]:
# Resolve the three input files inside the shared data directory
data_path = '../data/'
drug_syn_path, cell_lines_path, drug_portfolio_path = (
    os.path.join(data_path, csv_name)
    for csv_name in ('drug_synergy.csv', 'cell_lines.csv', 'drug_portfolio.csv')
)

# Load the full dataset together with its column-type dictionary
full_dataset_df, column_type_dict = load_dataset(
    drug_syn_path,
    cell_lines_path,
    drug_portfolio_path,
)

# Split the dataset into training, testing, and leaderboard sets
datasets = split_dataset(full_dataset_df)

# Column names per feature type, as listed in the column-type dictionary
categorical_cols = column_type_dict['categorical']['col_names']
continuous_cols = column_type_dict['numerical']['col_names']

In [3]:
# Row count and column index of the full dataset
print(len(full_dataset_df), full_dataset_df.columns, sep='\n')

# Feature columns by type: the two counts on one line, then the name lists
print(len(categorical_cols), len(continuous_cols))
print(categorical_cols, continuous_cols, sep='\n')

# Number of rows per value of the 'Dataset' column
print(Counter(full_dataset_df['Dataset']))

# Number of Counter keys for the combination and cell-line columns
print(
    *(
        len(Counter(full_dataset_df[key_col]))
        for key_col in ('Combination ID', 'Cell line name')
    ),
    sep='\n',
)

# Number of rows per tissue descriptor
print(Counter(full_dataset_df['GDSC tissue descriptor 2']))

3475
Index(['Cell line name', 'Compound A', 'Compound B',
       'GDSC tissue descriptor 2', 'MSI', 'Growth properties',
       'Putative target_A', 'Function_A', 'Pathway_A', 'Putative target_B',
       'Function_B', 'Pathway_B', 'Max. conc. A', 'IC50 A', 'H A', 'Einf A',
       'Max. conc. B', 'IC50 B', 'H B', 'Einf B', 'Synergy score',
       'Combination ID', 'Dataset'],
      dtype='object')
12 8
['Cell line name', 'Compound A', 'Compound B', 'GDSC tissue descriptor 2', 'MSI', 'Growth properties', 'Putative target_A', 'Function_A', 'Pathway_A', 'Putative target_B', 'Function_B', 'Pathway_B']
['Max. conc. A', 'IC50 A', 'H A', 'Einf A', 'Max. conc. B', 'IC50 B', 'H B', 'Einf B']
Counter({'train': 1795, 'test': 1089, 'LB': 591})
167
85
Counter({'breast': 1942, 'lung_NSCLC_adenocarcinoma': 550, 'Bladder': 465, 'large_intestine': 191, 'lung_NSCLC_squamous_cell_carcinoma': 137, 'melanoma': 72, 'lung_small_cell_carcinoma': 45, 'lung_NSCLC_large cell': 38, 'stomach': 22, 'prostate': 7, 'm

In [4]:
# Read the cell-line table, keep only rows where the 'AZ-DREAM ' column
# equals 1 (note the trailing space in that column name), then drop
# exact duplicate rows.
cell_lines_df = (
    pd.read_csv(cell_lines_path)
    .loc[lambda frame: frame['AZ-DREAM '] == 1]
    .drop_duplicates()
)

# Rows per tissue descriptor, followed by the number of unique cell line names
print(Counter(cell_lines_df['GDSC tissue descriptor 1']))
print(len(cell_lines_df['Cell line name'].unique()))

Counter({'breast': 34, 'lung_NSCLC': 21, 'urogenital_system': 14, 'large_intestine': 10, 'skin': 2, 'digestive_system': 2, 'lung_SCLC': 1, 'myeloma': 1})
85


In [5]:
# The drug portfolio file is parsed as tab-separated text
drug_portfolio_df = pd.read_csv(drug_portfolio_path, delimiter='\t')

# Row count and column index, then the number of unique challenge drug names
print(len(drug_portfolio_df), drug_portfolio_df.columns, sep='\n')
print(len(drug_portfolio_df['Challenge drug name'].unique()))

118
Index(['Drug name', 'Challenge drug name', 'Putative target', 'Function',
       'Pathway', 'HBA', 'HBD', 'Molecular weight', 'cLogP', 'Lipinski',
       'SMILES'],
      dtype='object')
118
