In [None]:


#     Source material
#     ==================================================================
#     
#     Public DeepCDR
#        https://github.com/kimmo1019/DeepCDR
#        
#     Our DeepCDR
#        https://github.com/KatanaGraph/solutions/tree/main/recipes/hls/DeepCDR
#  
#        Parent repo:
#           https://github.com/KatanaGraph/solutions
#           
#           
#     Source data
#     ==================================================================
#     
#     https://console.cloud.google.com/storage/browser/hls-dataset-bucket
#     https://console.cloud.google.com/storage/browser/hls-dataset-bucket/DeepCDR_data   
#  
#          391,018     cell_line_gene_expression_edges.csv
#          453,289     cell_line_gene_methylation_edges.csv
#        19,451,554    cell_line_gene_mutation_edges.csv
#            1,462     cell_lines.csv
#             239      drugs.csv
#          257,755     gdsc_cell_line_edges.csv
#             267      gdsc.csv
#             267      gdsc_drug_edges.csv
#             720      genes.csv
#        20,556,571    total
    
    

In [1]:

%load_ext autoreload
%autoreload 2

from katana import remote
from katana.remote import import_data, export_data


# Create the Katana remote client; it is used below to create the graph.
my_client = remote.Client()

# Report the server version so this run can be matched to a server build.
# An f-string replaces `"%s" % (value)`: those parentheses do not build a
# tuple, so %-formatting would raise or misformat if the version were ever
# reported as a tuple or mapping instead of a plain string.
print("")
print(f"Client server version: {my_client.server_version}")




Client server version: 0.8.0+20230301T170702Z.5ee413de9.dev


In [2]:

from config import hyperparams

# Load the input configuration and echo it after one blank separator line.
input_config = hyperparams.load_input_config()
print("", input_config, sep="\n")



DGL backend not selected or invalid.  Assuming PyTorch for now.


Setting the default backend to "pytorch". You can change it in the ~/.dgl/config.json file or export the DGLBACKEND environment variable.  Valid options are: pytorch, mxnet, tensorflow (all lowercase)

InputConfig:
    num_partitions = 3,
    use_train_rdg = False,
    trained_rdg_path = gs://hls-dataset-bucket/DeepCDR_trained,
    cell_lines_path = gs://hls-dataset-bucket/DeepCDR_data/cell_lines.csv,
    drugs_path = gs://hls-dataset-bucket/DeepCDR_data/drugs.csv,
    gdsc_path = gs://hls-dataset-bucket/DeepCDR_data/gdsc.csv,
    genes_path = gs://hls-dataset-bucket/DeepCDR_data/genes.csv,
    gdsc_cell_line_path = gs://hls-dataset-bucket/DeepCDR_data/gdsc_cell_line_edges.csv,
    gdsc_drug_path = gs://hls-dataset-bucket/DeepCDR_data/gdsc_drug_edges.csv,
    cell_line_gene_expression_path = gs://hls-dataset-bucket/DeepCDR_data/cell_line_gene_expression_edges.csv,
    cell_line_gene_methylation_path = gs://hls-dataset-bucket/DeepCDR_data/cell_line_gene_methylation_edges.csv,
    cell_l

In [3]:

from timeit import default_timer

# Import the module that uses Dask to import drug data
from src import dask_ingestion


# Create the graph with the partition count taken from the input config.
my_graph = my_client.create_graph(num_partitions=input_config.num_partitions)

# Populate the graph: either build it from the source data through the Dask
# ingestion module, or import a previously saved RDG.
if not input_config.use_train_rdg:
    print("Generate the graph with data from source")
    dask_ingestion.generate_deepcdr_graph(my_graph, input_config)
else:
    print(f"Import pretrained graph from: {input_config.trained_rdg_path}")
    import_data.rdg(my_graph, input_config.trained_rdg_path)

print("--")



Generate the graph with data from source
***Loading nodes dataframe***


OSError: Forbidden: b/hls-dataset-bucket/o
799544334556-compute@developer.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket. Permission 'storage.objects.list' denied on resource (or it may not exist).

In [None]:
from src import katana_pipeline

# Wrap the graph in the recipe pipeline object used by the remaining cells.
rec_pipeline = katana_pipeline.RecipePipeline(my_graph)
# Last expression of the cell: the graph's schema view becomes the cell output.
rec_pipeline.my_graph.schema().view()

In [None]:
# Run the pipeline's feature generation and report how long it took.
start_time = default_timer()
rec_pipeline.feature_generator()
elapsed = default_timer() - start_time
print(f"***Took {elapsed} seconds to generate the features.***")

In [None]:
# Collect the pipeline statistics and keep them in `stats` for later inspection.
stats = rec_pipeline.stats()
# Bare last expression: display the statistics as the cell output.
stats

In [None]:
# Generate the data split from the input config and report how long it took.
start_time = default_timer()
rec_pipeline.split_generator(input_config)
elapsed = default_timer() - start_time
print(f"***Took {elapsed} seconds to generate the split.***")

# Show the graph schema view again as the cell output, now that the split step has run.
rec_pipeline.my_graph.schema().view()

In [None]:
# Load the model configuration; it is passed to train() and infer_embeddings() below.
model_config = hyperparams.load_model_config()
# Bare last expression: display the loaded configuration as the cell output.
model_config

In [None]:
# Load the training configuration; it is passed to train(), test() and infer() below.
training_config = hyperparams.load_training_config()
# Bare last expression: display the loaded configuration as the cell output.
training_config

In [None]:
# Train the model with the loaded configs, keeping the returned validation
# metric, and report how long training took.
start_time = default_timer()
validation_metric = rec_pipeline.train(model_config, training_config)
elapsed = default_timer() - start_time
print(f"***Took {elapsed} seconds to train the model.***")
print("Validation metric: ", validation_metric)

In [None]:
# Evaluate the model; test() returns three values, unpacked here as the test
# result plus the predicted and true values that the plotting cell consumes.
start_time = default_timer()
test_res, ypred, ytrue = rec_pipeline.test(training_config)
elapsed = default_timer() - start_time
print(f"***Took {elapsed} seconds to test the model.***")

# Bare last expression: display the test result as the cell output.
test_res

In [None]:
# Plot predicted against true values from the test step and report the time taken.
start_time = default_timer()
rec_pipeline.plot(ypred, ytrue)
elapsed = default_timer() - start_time
print(f"***Took {elapsed} seconds to plot figures.***")

In [None]:
# Run inference with only the training config (no explicit drug or cell line);
# the call's return value, if any, is displayed as the cell output.
rec_pipeline.infer(training_config)

In [None]:
# Run inference for one explicit drug / cell-line pair.
# NOTE(review): the drug string looks like a SMILES structure and the cell-line
# value like a cell-line identifier; confirm the formats infer() expects.
bortezomib = "B(C(CC(C)C)NC(=O)C(CC1=CC=CC=C1)NC(=O)C2=NC=CN=C2)(O)O"
cell_line = "ACH-000001"
rec_pipeline.infer(
    training_config,
    drug=bortezomib,
    cell_line=cell_line,
)

In [None]:
# Run the pipeline's embedding inference with the model config and report the
# time taken.
start_time = default_timer()
rec_pipeline.infer_embeddings(model_config)
elapsed = default_timer() - start_time
print(f"***Took {elapsed} seconds to save node embeddings.***")

# Show the graph schema view once more as the cell output.
rec_pipeline.my_graph.schema().view()

In [None]:


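#  Optional (disabled): export the graph once the run is complete.
#  Uncomment the two lines below to enable.
#  NOTE(review): confirm that input_config defines save_graph_path first.
#
#  if input_config.save_graph_path:
#      export_data.rdg(my_graph, input_config.save_graph_path)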

    