In [10]:
import pandas as pd
import requests
import json
import os
import collections


# GraphQL endpoint that the enrichment helpers in this notebook POST their queries to.
# NOTE(review): this is plain http — confirm whether the service's https endpoint can be used instead.
url = "http://l2s2.maayanlab.cloud/graphql"


def _first_fda_count(fda_nodes, key, default):
    """Return ``key`` from the first FDA-count record, or ``default`` if there is none."""
    # A missing (non-list) cell is treated like an empty list instead of failing on len().
    if isinstance(fda_nodes, list) and fda_nodes:
        return fda_nodes[0][key]
    return default


def _parse_l2s2_term(term):
    """Derive the per-term columns from one gene-set term string.

    The term is split once on ``'_'`` and once on ``' '``. Fields that are
    absent are reported as ``"N/A"`` (the placeholder already used for a
    missing concentration) instead of raising ``IndexError``.
    """
    fields = term.split('_')
    words = term.split(' ')

    def field(position):
        return fields[position] if len(fields) > position else "N/A"

    # When the fifth field still carries a space-separated second word
    # (i.e. no concentration field follows), label the perturbation "<name> KO".
    perturbation = field(4)
    perturbation_words = perturbation.split(' ')
    if len(perturbation_words) == 2:
        perturbation = perturbation_words[0] + " KO"

    return {
        "batch": field(0),
        "timepoint": field(1),
        "cellLine": field(2),
        "batch2": field(3),
        "perturbation": perturbation,
        "concentration": fields[5].split(' ')[0] if len(fields) > 5 else "N/A",
        "direction": words[1] if len(words) > 1 else "N/A",
    }


def enrich_l2s2_single_set(geneset: list, first=1000, timeout=120):
    """Run the ``EnrichmentQuery`` GraphQL operation for one gene set.

    The query is POSTed to the module-level ``url``.

    Parameters
    ----------
    geneset : list
        Gene identifiers sent as the ``genes`` query variable.
    first : int, default 1000
        Value sent as the ``first`` query variable.
    timeout : float, default 120
        Seconds passed to ``requests.post`` so a stalled server cannot hang
        the notebook indefinitely.

    Returns
    -------
    (df_enrichment, df_consensus) : tuple of pandas.DataFrame
        ``df_enrichment`` has one row per gene set returned under
        ``enrich.nodes[].geneSets.nodes`` together with the node-level
        statistics, the first FDA-count record (``approved``/``count``) and
        the columns parsed from ``term``. It is an empty frame when no gene
        set was returned. ``df_consensus`` is the ``consensus`` list with
        ``drug`` renamed to ``perturbation``.

    Raises
    ------
    requests.HTTPError
        If the HTTP status indicates an error (``raise_for_status``).
    RuntimeError
        If the response carries no ``data.currentBackground.enrich`` object,
        e.g. because the server answered with a GraphQL ``errors`` payload.
    """
    query = {
        "operationName": "EnrichmentQuery",
        "variables": {
            "filterTerm": " ",
            "offset": 0,
            "first": first,
            "filterFda": False,
            "sortBy": "pvalue_up",
            "filterKo": False,
            "genes": geneset,
        },
        "query": """query EnrichmentQuery(
                    $genes: [String]!
                    $filterTerm: String = ""
                    $offset: Int = 0
                    $first: Int = 10
                    $filterFda: Boolean = false
                    $sortBy: String = ""
                    $filterKo: Boolean = false
                    ) {
                    currentBackground {
                        enrich(
                        genes: $genes
                        filterTerm: $filterTerm
                        offset: $offset
                        first: $first
                        filterFda: $filterFda
                        sortby: $sortBy
                        filterKo: $filterKo
                        ) {
                        nodes {
                            geneSetHash
                            pvalue
                            adjPvalue
                            oddsRatio
                            nOverlap
                            geneSets {
                            nodes {
                                term
                                id
                                nGeneIds
                                geneSetFdaCountsById {
                                nodes {
                                    approved
                                    count
                                }
                                }
                            }
                            totalCount
                            }
                        }
                        totalCount
                        consensusCount
                        consensus {
                            drug
                            oddsRatio
                            pvalue
                            adjPvalue
                            approved
                            countSignificant
                            countInsignificant
                            countUpSignificant
                            pvalueUp
                            adjPvalueUp
                            oddsRatioUp
                            pvalueDown
                            adjPvalueDown
                            oddsRatioDown
                        }
                        }
                    }
                    }
                    """,
    }

    headers = {
        "Accept": "application/json",
        "Content-Type": "application/json"
    }

    response = requests.post(url, data=json.dumps(query), headers=headers, timeout=timeout)
    response.raise_for_status()
    res = response.json()

    # GraphQL servers can answer HTTP 200 with an "errors" payload and no data;
    # report that explicitly instead of failing later on a None subscript.
    data = res.get("data") if isinstance(res, dict) else None
    enrich = ((data or {}).get("currentBackground") or {}).get("enrich")
    if enrich is None:
        errors = res.get("errors") if isinstance(res, dict) else res
        raise RuntimeError(f"L2S2 EnrichmentQuery returned no enrichment data: {errors}")

    df_consensus = pd.DataFrame(enrich['consensus']).rename(columns={'drug': 'perturbation'})

    nodes = enrich['nodes'] or []
    if not nodes:
        # Nothing enriched: keep whatever consensus the server sent rather than discarding it.
        return pd.DataFrame(), df_consensus

    df_enrichment = pd.json_normalize(
        nodes,
        record_path=['geneSets', 'nodes'],
        meta=['geneSetHash', 'pvalue', 'adjPvalue', 'oddsRatio', 'nOverlap']
    )
    if df_enrichment.empty:
        return pd.DataFrame(), df_consensus

    # Only the first FDA-count record of each gene set is kept.
    fda_nodes = df_enrichment["geneSetFdaCountsById.nodes"]
    df_enrichment["approved"] = fda_nodes.map(lambda recs: _first_fda_count(recs, 'approved', False))
    df_enrichment["count"] = fda_nodes.map(lambda recs: _first_fda_count(recs, 'count', 0))
    df_enrichment = df_enrichment.drop(columns=['geneSetFdaCountsById.nodes'])

    # Parse every term once, then append the derived columns in a fixed order.
    parsed_terms = [_parse_l2s2_term(term) for term in df_enrichment["term"]]
    for column in ("batch", "timepoint", "cellLine", "batch2",
                   "perturbation", "concentration", "direction"):
        df_enrichment[column] = [parsed[column] for parsed in parsed_terms]

    return df_enrichment, df_consensus


In [23]:
def _pair_node_value(gene_set, direction, key):
    """Return ``key`` from the first gene-set node whose term contains ``' <direction>'``.

    ``None`` is returned when no node matches.
    """
    marker = f" {direction}"
    return next((node[key] for node in gene_set['nodes'] if marker in node['term']), None)


def _pair_first_fda_count(gene_set, key, default):
    """Return ``key`` from the first FDA-count record of the first node, or ``default``."""
    fda_nodes = gene_set['nodes'][0]['geneSetFdaCountsById']['nodes']
    return fda_nodes[0][key] if fda_nodes else default


def enrich_l2s2_up_down(genes_up: list[str], genes_down: list[str], first=100, timeout=120):
    """Run the ``PairEnrichmentQuery`` GraphQL operation for an up/down gene-set pair.

    The query is POSTed to the module-level ``url``.

    Parameters
    ----------
    genes_up, genes_down : list[str]
        Gene identifiers sent as the ``genesUp`` / ``genesDown`` query variables.
    first : int, default 100
        Value sent as the ``first`` query variable.
    timeout : float, default 120
        Seconds passed to ``requests.post`` so a stalled server cannot hang
        the notebook indefinitely.

    Returns
    -------
    (df_enrichment_pair, df_consensus_pair) : tuple of pandas.DataFrame
        ``df_enrichment_pair`` has one row per entry of ``pairedEnrich.nodes``
        with ``term`` as the first column, the statistics returned by the
        server, and columns derived from the nested ``geneSet`` (which is
        dropped). It is an empty frame when no node was returned.
        ``df_consensus_pair`` is the ``consensus`` list with ``drug`` renamed
        to ``perturbation`` and the ``*Up`` / ``*Down`` columns renamed to
        ``*Mimic`` / ``*Reverse``.

    Raises
    ------
    requests.HTTPError
        If the HTTP status indicates an error (``raise_for_status``).
    RuntimeError
        If the response carries no ``data.currentBackground.pairedEnrich``
        object, e.g. because the server answered with a GraphQL ``errors``
        payload.
    IndexError
        If a returned ``geneSet`` has an empty ``nodes`` list.
    """
    query = {
        "operationName": "PairEnrichmentQuery",
        "variables": {
            "filterTerm": " ",
            "offset": 0,
            "first": first,
            "filterFda": False,
            "sortBy": "pvalue_mimic",
            "filterKo": False,
            "topN": 1000,
            "pvalueLe": 0.05,
            "genesUp": genes_up,
            "genesDown": genes_down
        },
        "query": """query PairEnrichmentQuery($genesUp: [String]!, $genesDown: [String]!, $filterTerm: String = "", $offset: Int = 0, $first: Int = 10, $filterFda: Boolean = false, $sortBy: String = "", $filterKo: Boolean = false, $topN: Int = 10000, $pvalueLe: Float = 0.05) {
      currentBackground {
        pairedEnrich(
          filterTerm: $filterTerm
          offset: $offset
          first: $first
          filterFda: $filterFda
          sortby: $sortBy
          filterKo: $filterKo
          topN: $topN
          pvalueLe: $pvalueLe
          genesDown: $genesDown
          genesUp: $genesUp
          ) {
            totalCount
            consensusCount
            consensus {
              drug
              oddsRatio
              pvalue
              adjPvalue
              approved
              countSignificant
              countInsignificant
              countUpSignificant
              pvalueUp
              adjPvalueUp
              oddsRatioUp
              pvalueDown
              adjPvalueDown
              oddsRatioDown
              }
              nodes {
                adjPvalueMimic
                adjPvalueReverse
                mimickerOverlap
                oddsRatioMimic
                oddsRatioReverse
                pvalueMimic
                pvalueReverse
                reverserOverlap
                geneSet {
                  nodes {
                    id
                    nGeneIds
                    term
                    geneSetFdaCountsById {
                      nodes {
                        count
                        approved
                        }
                      }
                    }
                  }
                }
              }
            }
          }
    """
    }

    headers = {
        "Accept": "application/json",
        "Content-Type": "application/json"
    }

    response = requests.post(url, data=json.dumps(query), headers=headers, timeout=timeout)
    response.raise_for_status()
    res = response.json()

    # GraphQL servers can answer HTTP 200 with an "errors" payload and no data;
    # report that explicitly instead of failing later on a None subscript.
    data = res.get("data") if isinstance(res, dict) else None
    paired = ((data or {}).get("currentBackground") or {}).get("pairedEnrich")
    if paired is None:
        errors = res.get("errors") if isinstance(res, dict) else res
        raise RuntimeError(f"L2S2 PairEnrichmentQuery returned no enrichment data: {errors}")

    # NOTE(review): 'pvalueMimick' is spelled differently from 'adjPvalueMimic' /
    # 'oddsRatioMimic'. It is kept as-is because the column name is part of the
    # returned (and exported) schema; confirm with downstream consumers before renaming.
    df_consensus_pair = pd.DataFrame(paired['consensus']).rename(columns={
        'drug': 'perturbation',
        'pvalueUp': 'pvalueMimick',
        'pvalueDown': 'pvalueReverse',
        'adjPvalueUp': 'adjPvalueMimic',
        'adjPvalueDown': 'adjPvalueReverse',
        'oddsRatioUp': 'oddsRatioMimic',
        'oddsRatioDown': 'oddsRatioReverse'
    })
    df_enrichment_pair = pd.DataFrame(paired['nodes'])

    if df_enrichment_pair.empty:
        # Nothing enriched: keep whatever consensus the server sent rather than discarding it.
        return pd.DataFrame(), df_consensus_pair

    gene_sets = df_enrichment_pair['geneSet']

    # The shared term is the first node's term without its trailing direction word.
    df_enrichment_pair['term'] = gene_sets.map(lambda gs: gs['nodes'][0]['term'].split(' ')[0])
    # An empty FDA-count list falls back to False / 0 instead of raising IndexError.
    df_enrichment_pair['approved'] = gene_sets.map(lambda gs: _pair_first_fda_count(gs, 'approved', False))
    df_enrichment_pair['count'] = gene_sets.map(lambda gs: _pair_first_fda_count(gs, 'count', 0))
    # Sizes are taken from the node matching each direction (both used to read nodes[0]).
    df_enrichment_pair['nGeneIdsUp'] = gene_sets.map(lambda gs: _pair_node_value(gs, 'up', 'nGeneIds'))
    df_enrichment_pair['nGeneIdsDown'] = gene_sets.map(lambda gs: _pair_node_value(gs, 'down', 'nGeneIds'))

    # Split each term once; absent fields become "N/A" instead of raising IndexError.
    term_fields = df_enrichment_pair['term'].map(lambda term: term.split('_'))
    for position, column in enumerate(("perturbation_id", "timepoint", "cellLine", "batch")):
        df_enrichment_pair[column] = term_fields.map(
            lambda fields, position=position: fields[position] if len(fields) > position else "N/A"
        )

    df_enrichment_pair["geneSetIdUp"] = gene_sets.map(lambda gs: _pair_node_value(gs, 'up', 'id'))
    df_enrichment_pair["geneSetIdDown"] = gene_sets.map(lambda gs: _pair_node_value(gs, 'down', 'id'))

    # Drop the raw nested payload and move 'term' to the front; other columns keep their order.
    ordered_columns = ['term'] + [
        column for column in df_enrichment_pair.columns if column not in ('term', 'geneSet')
    ]
    df_enrichment_pair = df_enrichment_pair[ordered_columns]

    return df_enrichment_pair, df_consensus_pair


In [26]:
def read_genes(ds_name, query_dir="../data/processed/02_l2s2_queries"):
    """Read the gene list stored in ``<query_dir>/<ds_name>.txt``.

    Parameters
    ----------
    ds_name : str
        Dataset name, i.e. the file name without its ``.txt`` extension.
    query_dir : str, default "../data/processed/02_l2s2_queries"
        Directory holding the gene-list files.

    Returns
    -------
    list of str
        One entry per non-blank line, with surrounding whitespace stripped.
        Blank lines are skipped so no empty gene name is sent to the API.
    """
    path = os.path.join(query_dir, f"{ds_name}.txt")
    with open(path) as f:
        stripped_lines = (line.strip() for line in f)
        return [gene for gene in stripped_lines if gene]


QUERY_DIR = "../data/processed/02_l2s2_queries"
RUNS_DIR = "../data/processed/03_l2s2_runs"
# File-name suffix -> key under which the dataset is stored in `updw`.
PAIR_SUFFIXES = {"_up": "up", "_dw": "dw"}

# One dataset per *.txt gene list. splitext removes only the extension,
# whereas str.replace would also strip ".txt" occurring inside the name.
dataset_names = sorted(
    os.path.splitext(fn)[0] for fn in os.listdir(QUERY_DIR) if fn.endswith(".txt")
)

# to_csv does not create missing directories.
os.makedirs(RUNS_DIR, exist_ok=True)

# Single-set enrichment for every dataset.
for ds in dataset_names:
    genes = read_genes(ds)
    print("Working on dataset:", ds)
    de, dc = enrich_l2s2_single_set(geneset=genes)
    de.to_csv(os.path.join(RUNS_DIR, f"{ds}_enrichment.csv"), index=False)
    dc.to_csv(os.path.join(RUNS_DIR, f"{ds}_consensus.csv"), index=False)

# Group "<base>_up" / "<base>_dw" datasets by base name. The marker must be a
# real suffix: a substring test would misfile names that merely contain "_up"
# or "_dw", and str.replace would strip the marker from the middle of a name.
updw = collections.defaultdict(dict)
for ds in dataset_names:
    if "test" in ds:
        continue
    for suffix, direction in PAIR_SUFFIXES.items():
        if ds.endswith(suffix):
            base_name = ds[: -len(suffix)]
            updw[base_name][direction] = ds
            break

# Paired enrichment only where both directions are present. Base names are
# prefixes of datasets that already passed the "test" filter above, so no
# second filter is needed here.
for k, v in updw.items():
    if 'up' in v and 'dw' in v:
        print("Working on paired dataset:", k)
        genes_up = read_genes(v['up'])
        genes_down = read_genes(v['dw'])
        print("  # up genes:", len(genes_up))
        print("  # down genes:", len(genes_down))
        de_pair, dc_pair = enrich_l2s2_up_down(genes_up=genes_up, genes_down=genes_down)
        de_pair.to_csv(os.path.join(RUNS_DIR, f"{k}_updw_enrichment.csv"), index=False)
        dc_pair.to_csv(os.path.join(RUNS_DIR, f"{k}_updw_consensus.csv"), index=False)


Working on dataset: all
Working on dataset: all_dw
Working on dataset: all_up
Working on dataset: ibhi_dseq2_dw
Working on dataset: ibhi_dseq2_up
Working on dataset: katie_ebseq_dw
Working on dataset: katie_ebseq_up
Working on dataset: severin_edger_dw
Working on dataset: severin_edger_up
Working on dataset: test_dw
Working on dataset: test_up
Working on dataset: yutaka_genmat
Working on paired dataset: all
  # up genes: 37
  # down genes: 19
Working on paired dataset: ibhi_dseq2
  # up genes: 13
  # down genes: 6
Working on paired dataset: katie_ebseq
  # up genes: 5
  # down genes: 11
Working on paired dataset: severin_edger
  # up genes: 31
  # down genes: 3


In [14]:
# Display the base-name -> {'up'/'dw': dataset name} pairing built from dataset_names.
updw

defaultdict(dict,
            {'all': {'dw': 'all_dw', 'up': 'all_up'},
             'ibhi_dseq2': {'dw': 'ibhi_dseq2_dw', 'up': 'ibhi_dseq2_up'},
             'katie_ebseq': {'dw': 'katie_ebseq_dw', 'up': 'katie_ebseq_up'},
             'severin_edger': {'dw': 'severin_edger_dw',
              'up': 'severin_edger_up'}})