## Data Engineering

In [1]:

import dask.dataframe as dd
from katana import remote
from katana.remote import analytics, import_data
from katana.remote.import_data import DataFrameImporter

# Open a client handle to the Katana remote service using its default settings.
client = remote.Client()
# Bare attribute access: it is not the last expression of the cell, so its value
# is not displayed. NOTE(review): presumably kept as a connectivity check — confirm.
client.server_version

print("--")



--


## Dask Data Ingestion

In [None]:
%%time

input_dir_path = "gs://katana-providence/deep-cdr/"

node_dd = {}

node_type = "CELL_LINE"
cell_line = input_dir_path + "cell_lines.csv"
node_dd[node_type] = dd.read_csv(
    cell_line, sep=",", dtype={"CCLE_ID": "object", "Doubling.Time.from.Vendor": "object", "Life_Stage": "object"}
)
node_dd[node_type] = node_dd[node_type].rename(columns={"depMapID": "id"})
node_dd[node_type] = node_dd[node_type].drop(node_dd[node_type].columns[0], axis=1)
node_dd[node_type] = node_dd[node_type].drop(node_dd[node_type].columns[13:27], axis=1)

node_type = "GDSC"
cell_line = input_dir_path + "gdsc.csv"
node_dd[node_type] = dd.read_csv(cell_line, sep=",", dtype={"id": "object"})
node_dd[node_type] = node_dd[node_type].drop(node_dd[node_type].columns[0], axis=1)

node_type = "GENE"
cell_line = input_dir_path + "genes.csv"
node_dd[node_type] = dd.read_csv(cell_line, sep=",", dtype={"ID": "object"})
node_dd[node_type] = node_dd[node_type].rename(columns={"ID": "id"})
node_dd[node_type] = node_dd[node_type].drop(node_dd[node_type].columns[0], axis=1)

node_type = "DRUG"
cell_line = input_dir_path + "merged_drug_smiles.csv"
node_dd[node_type] = dd.read_csv(cell_line, sep=",", dtype={"pub_chem_id": "object"})
node_dd[node_type] = node_dd[node_type].rename(columns={"pub_chem_id": "id"})
node_dd[node_type] = node_dd[node_type].drop(node_dd[node_type].columns[0], axis=1)


node_dd["CELL_LINE"].head()

In [None]:
# CSV base name -> (edge type, source node type, destination node type).
edges = {
    "gdsc_cl_edges": ("HAS_CELL_LINE", "GDSC", "CELL_LINE"),
    "genomics_edges": ("HAS_GENE_OBSERVATION", "CELL_LINE", "GENE"),
    "gdsc_drug_edges": ("FOR_DRUG", "GDSC", "DRUG"),
}

# Edge frames keyed by the full (type, source, destination) tuple.
edge_dd = {}
for csv_stem, edge_spec in edges.items():
    print(edge_spec)
    csv_path = input_dir_path + csv_stem + ".csv"
    frame = dd.read_csv(csv_path, sep=",", dtype={"START_ID": "object", "END_ID": "object"}).dropna()
    # Discard rows whose destination is the literal placeholder "none".
    frame = frame[frame.END_ID != "none"]
    # Remove the first column by position.
    edge_dd[edge_spec] = frame.drop(frame.columns[0], axis=1)

In [None]:
# Number of partitions requested for the new graph.
partitions = 4
# A client is also created in the first cell; creating it again here lets this
# cell run on its own. NOTE(review): confirm a second client is intended.
client = remote.Client()
# Empty graph that the DataFrameImporter cell below populates.
graph = client.create_graph(num_partitions=partitions)

## Create Graph using Dask DataFrame

In [None]:
%%time
print("Importing graph from dataframe files into graph...")
with DataFrameImporter(graph) as df_importer:
    for node_type, dd in node_dd.items():
        df_importer.nodes_dataframe(dd, id_column="id", id_space=node_type)
    for tup, dd in edge_dd.items():
        source_col = "START_ID"
        destination_col = "END_ID"
        df_importer.edges_dataframe(
            dd,
            source_id_space=tup[1],
            destination_id_space=tup[2],
            source_column=source_col,
            destination_column=destination_col,
            type=tup[0],
        )
    df_importer.insert()
print("{} nodes, {} edges".format(graph.num_nodes(), graph.num_edges()))

In [None]:
# Show the schema of the imported graph as the cell output.
graph.schema().view()

## Preprocessing: Prepare the graph for downstream ML training

In [None]:
# For each omics source, gather every cell line's HAS_GENE_OBSERVATION values
# into one list and store it on the CELL_LINE node under the source's name.
genomics_sources = ("genomics_expression", "genomics_mutation", "genomics_methylation")

# One template replaces three copy-pasted queries; `{source}` is the only part
# that differs between them.
# NOTE(review): the GENE frame's `ID` column is renamed to `id` before import;
# confirm the graph still exposes `g.ID`, otherwise this ORDER BY has nothing to
# sort on and the list order is not guaranteed.
aggregate_query_template = """
    MATCH (c:CELL_LINE)-[r:HAS_GENE_OBSERVATION]->(g:GENE)
    WHERE (r.source = "{source}")
    WITH c, r, g
    ORDER by g.ID
    WITH c, collect(r.observation) as {source}
    SET c.{source} = {source}
    """

for source in genomics_sources:
    graph.query(aggregate_query_template.format(source=source))

In [None]:
# Select cell lines that carry all three aggregated genomics properties and
# return their CCLE_ID together with those properties.
complete_profiles_query = """
    MATCH (c:CELL_LINE)
    WITH c.CCLE_ID as CCLE_ID,
        c.genomics_expression as genomics_expression,
        c.genomics_mutation as genomics_mutation,
        c.genomics_methylation as genomics_methylation
    WHERE genomics_expression IS NOT NULL
        AND genomics_mutation IS NOT NULL
        AND genomics_methylation IS NOT NULL
    RETURN CCLE_ID, genomics_expression, genomics_mutation, genomics_methylation
    """
df_cell_lines = graph.query(complete_profiles_query)
df_cell_lines.head()

In [None]:
# Remove (with their edges) all CELL_LINE nodes that lack at least one of the
# three aggregated genomics properties.
incomplete_cell_lines_query = """
    MATCH (c:CELL_LINE)
    WHERE c.genomics_methylation IS NULL
    OR c.genomics_expression IS NULL
    OR c.genomics_mutation IS NULL
    DETACH DELETE c
    """
# The delete runs against the object returned by create_snapshot_at_version().
preprocessed_graph = graph.create_snapshot_at_version()
preprocessed_graph.query(incomplete_cell_lines_query);

In [None]:
# Print some basic statistics: distinct drugs, GDSC records and cell lines
# reachable through GDSC nodes, plus the number of matched drug/cell-line paths.
basic_stats_query = """MATCH (a:DRUG)<-[:FOR_DRUG]-(g:GDSC)-[:HAS_CELL_LINE]->(c:CELL_LINE)
RETURN COUNT(Distinct a) as DRUG, COUNT(Distinct g) as GDSC, COUNT(Distinct c) as CELL_LINE,
COUNT(a) as DRUG_CELL_LINE_PAIRS"""
preprocessed_graph.query(basic_stats_query).head()

In [None]:
# Show the schema of the preprocessed graph as the cell output.
preprocessed_graph.schema().view()