## BioGRID REST Sevices:  ORCS

- for context BioGRID has two primary APIs that differ slightly in focus
- examples in this notebook use the ORCS API
- Use the API base url to request an API key and capture it in an .env file

### Protein, Genetic and Chemical Interactions
- Site URL:  https://thebiogrid.org/
- API docs:  https://wiki.thebiogrid.org/doku.php/biogridrest
- API base url:  https://webservice.thebiogrid.org
- API key:  https://webservice.thebiogrid.org

### Open Repository of CRISPR Screens (ORCS)
- Site URL:  https://orcs.thebiogrid.org/
- API docs:  https://wiki.thebiogrid.org/doku.php/orcs:webservice
- API base url:  https://orcsws.thebiogrid.org
- API key:  https://orcsws.thebiogrid.org

CITATION:
- original examples were based on
  - [BIOGRID-REST-EXAMPLES](https://github.com/BioGRID/BIOGRID-REST-EXAMPLES) GitHub repository
  - [ORCS-REST-EXAMPLES](https://github.com/BioGRID/ORCS-REST-EXAMPLES) GitHub repository

In [1]:
# Import necessary libraries
import os
from dotenv import load_dotenv
import requests
from pprint import pprint

# Load environment variables from .env file
load_dotenv()

# Fetch the API keys and base URLs from the .env file
BG_INT_ACCESS_KEY = os.getenv("BG_INT_ACCESS_KEY")
BG_INT_BASE_URL = os.getenv("BG_INT_BASE_URL")
BG_ORCS_ACCESS_KEY = os.getenv("BG_ORCS_ACCESS_KEY")
BG_ORCS_BASE_URL = os.getenv("BG_ORCS_BASE_URL")

# Validate the environment variables
if not BG_INT_ACCESS_KEY or not BG_INT_BASE_URL:
    raise ValueError("BG_INT_ACCESS_KEY or BG_INT_BASE_URL is missing from the .env file.")

if not BG_ORCS_ACCESS_KEY or not BG_ORCS_BASE_URL:
    raise ValueError("BG_ORCS_ACCESS_KEY or BG_ORCS_BASE_URL is missing from the .env file.")

## organisms endpoint

In [2]:
"""
Fetch the set of currently supported organism IDs
"""

request_url = BG_ORCS_BASE_URL + "/organisms"

params = {
    "accesskey": BG_ORCS_ACCESS_KEY,
    "format": "json"
}

r = requests.get( request_url, params = params )
organisms = r.json( )

pprint( organisms )

{'10090': 'Mus musculus',
 '559292': 'Saccharomyces cerevisiae (S288C)',
 '60711': 'Chlorocebus sabaeus',
 '7227': 'Drosophila melanogaster',
 '9606': 'Homo sapiens'}


## vocabs endpoint

In [3]:
"""
Fetch the set of controlled vocabularies and also fetch
a set of terms for two of the vocabulary categories
"""

request_url = BG_ORCS_BASE_URL + "/vocabs"
params = {
    "accesskey": BG_ORCS_ACCESS_KEY,
    "format": "json"
}

r = requests.get( request_url, params = params )
vocabs = r.json( )

pprint( vocabs )

{'1': 'Throughput',
 '10': 'Cell Line',
 '11': 'Cell Type',
 '12': 'Phenotype',
 '15': 'Statistical Analysis',
 '2': 'Screen Type',
 '3': 'Experimental Setup',
 '4': 'Condition Name',
 '5': 'Library',
 '6': 'Library Type',
 '7': 'Library Methodology',
 '8': 'Screen Format',
 '9': 'Enzyme'}


## vocab endpoint

In [4]:
# request actual terms in controlled vocabularies

request_url = BG_ORCS_BASE_URL + "/vocab/"
params = {
    "accesskey": BG_ORCS_ACCESS_KEY,
    "format": "json"
}

# Fetch list of available phenotypes (vocab: 12)
r = requests.get( request_url + '12', params = params )
vocab = r.json( )

pprint( vocab )

{'1084': 'autophagy',
 '1093': 'protein/peptide distribution',
 '1107': 'response to bacteria',
 '1129': 'senescence',
 '1141': 'phagocytosis',
 '1146': 'pyroptosis',
 '1151': 'response to oxygen concentration',
 '1165': 'viability',
 '1218': 'lysosome homeostasis',
 '1242': 'response to radiation',
 '1342': 'vesicle distribution',
 '141': 'response to toxin',
 '1539': 'cell migration',
 '1561': 'regulation of lipid localization',
 '1582': 'protein binding',
 '1775': 'rna accumulation',
 '1779': 'regulation of viral programmed -1 ribosomal frameshifting (-1 prf)',
 '1833': 'viral programmed ribosomal frameshifting (prf)',
 '1834': 'regulation of nonsense-mediated decay (nmd)',
 '1918': 'syncytium formation',
 '261': 'cell cycle progression',
 '309': 'regulation of signal transduction phenotype',
 '314': 'protein transport',
 '325': 'protein/peptide accumulation',
 '50': 'cell proliferation',
 '51': 'response to chemicals',
 '53': 'response to virus',
 '55': 'tumorigenicity'}


In [5]:
# Fetch list of cell lines (vocab: 11)
r = requests.get( request_url + '11', params = params )
vocab = r.json( )

vocab

{'1758': 'acute lymphoblastic leukemia cell line',
 '224': 'acute myeloid leukemia cell line',
 '1058': 'adrenal gland neuroblastoma',
 '1350': 'african green monkey kidney cell line',
 '457': 'anaplastic large cell lymphoma cell line',
 '319': 'anaplastic thyroid cancer cell line',
 '1057': 'askin tumor',
 '590': 'astrocytoma cell line',
 '744': 'b-cell non-hodgkin lymphoma cell line',
 '898': 'b-lymphoblastoid cell line',
 '740': 'b-lymphoma cell line',
 '1053': 'bladder carcinoma',
 '381': 'bladder transitional cell carcinoma cell line',
 '1636': 'bone marrow cell line',
 '377': 'breast adenocarcinoma cell line',
 '167': 'breast cancer cell line',
 '1443': 'breast epithelium',
 '202': 'burkitt lymphoma cell line',
 '161': 'cancer cell line',
 '1551': 'cardiac muscle cell line',
 '519': 'cecum cancer cell line',
 '147': 'cervical adenocarcinoma cell line',
 '1797': 'cervical squamous cell carcinoma',
 '1054': 'cholangiocarcinoma cell',
 '1051': 'chondrosarcoma',
 '1499': 'chronic mye

## genes endpoint

In [6]:
"""
Fetch scores for several genes across the entire set
of screens in the database
"""

request_url = BG_ORCS_BASE_URL + "/genes/"
params = {
    "accesskey": BG_ORCS_ACCESS_KEY,
    "geneID": '66050|66056|66044',
    "format": "json"
}

r = requests.get( request_url, params = params )
scores = r.json( )

for score in scores :
    pprint( score )

{'ALIASES': '-',
 'HIT': 'NO',
 'IDENTIFIER_ID': '66050',
 'IDENTIFIER_TYPE': 'gene',
 'OFFICIAL_SYMBOL': '0610009B22Rik',
 'ORGANISM_ID': '10090',
 'ORGANISM_OFFICIAL': 'Mus musculus',
 'SCORE.1': '0.71157',
 'SCORE.2': '1',
 'SCORE.3': '-',
 'SCORE.4': '-',
 'SCORE.5': '-',
 'SCREEN_ID': '578',
 'SOURCE': 'BioGRID ORCS'}
{'ALIASES': '-',
 'HIT': 'NO',
 'IDENTIFIER_ID': '66050',
 'IDENTIFIER_TYPE': 'gene',
 'OFFICIAL_SYMBOL': '0610009B22Rik',
 'ORGANISM_ID': '10090',
 'ORGANISM_OFFICIAL': 'Mus musculus',
 'SCORE.1': '0',
 'SCORE.2': '0',
 'SCORE.3': '0.955',
 'SCORE.4': '-',
 'SCORE.5': '-',
 'SCREEN_ID': '345',
 'SOURCE': 'BioGRID ORCS'}
{'ALIASES': '-',
 'HIT': 'NO',
 'IDENTIFIER_ID': '66050',
 'IDENTIFIER_TYPE': 'gene',
 'OFFICIAL_SYMBOL': '0610009B22Rik',
 'ORGANISM_ID': '10090',
 'ORGANISM_OFFICIAL': 'Mus musculus',
 'SCORE.1': '0.67341',
 'SCORE.2': '0.67301',
 'SCORE.3': '0.999999',
 'SCORE.4': '-',
 'SCORE.5': '-',
 'SCREEN_ID': '1077',
 'SOURCE': 'BioGRID ORCS'}
{'ALIASES': '

In [7]:
# Specify the gene name (e.g., "TP53")
gene_name = "TP53"

# Construct the request URL for querying genes
request_url = f"{BG_ORCS_BASE_URL}/genes/"

# Set query parameters
params = {
    "accesskey": BG_ORCS_ACCESS_KEY,
    "name": gene_name,  # Search by official gene symbol
    "organismID": "9606",  # Search for human genes    
    "format": "json",
}

# Make the API request
response = requests.get(request_url, params=params)

# Check the response status
if response.status_code == 200:
    # Parse the JSON response
    gene_data = response.json()
    if gene_data:
        print(f"Results for gene: {gene_name}\n")
        print(f"Number of results: {len(gene_data)}")
        pprint(gene_data)
    else:
        print(f"No results found for gene: {gene_name}")
else:
    print(f"Failed to fetch data: {response.status_code}")
    print(response.text)


Results for gene: TP53

Number of results: 1400
[{'ALIASES': 'BCC7|LFS1|P53|TRP53',
  'HIT': 'NO',
  'IDENTIFIER_ID': '7157',
  'IDENTIFIER_TYPE': 'gene',
  'OFFICIAL_SYMBOL': 'TP53',
  'ORGANISM_ID': '9606',
  'ORGANISM_OFFICIAL': 'Homo sapiens',
  'SCORE.1': '-167.775',
  'SCORE.2': '-',
  'SCORE.3': '-',
  'SCORE.4': '-',
  'SCORE.5': '-',
  'SCREEN_ID': '16',
  'SOURCE': 'BioGRID ORCS'},
 {'ALIASES': 'BCC7|LFS1|P53|TRP53',
  'HIT': 'NO',
  'IDENTIFIER_ID': '7157',
  'IDENTIFIER_TYPE': 'gene',
  'OFFICIAL_SYMBOL': 'TP53',
  'ORGANISM_ID': '9606',
  'ORGANISM_OFFICIAL': 'Homo sapiens',
  'SCORE.1': '-89.95',
  'SCORE.2': '-',
  'SCORE.3': '-',
  'SCORE.4': '-',
  'SCORE.5': '-',
  'SCREEN_ID': '17',
  'SOURCE': 'BioGRID ORCS'},
 {'ALIASES': 'BCC7|LFS1|P53|TRP53',
  'HIT': 'YES',
  'IDENTIFIER_ID': '7157',
  'IDENTIFIER_TYPE': 'gene',
  'OFFICIAL_SYMBOL': 'TP53',
  'ORGANISM_ID': '9606',
  'ORGANISM_OFFICIAL': 'Homo sapiens',
  'SCORE.1': '12.71',
  'SCORE.2': '-',
  'SCORE.3': '-',
 

In [8]:
# Specify the gene name (e.g., "RB1")
gene_name = "RB1"

# Construct the request URL for querying genes
request_url = f"{BG_ORCS_BASE_URL}/genes/"

# Set query parameters
params = {
    "accesskey": BG_ORCS_ACCESS_KEY,
    "name": gene_name,  # Search by official gene symbol
    "organismID": "9606",  # Search for human genes    
    "format": "json",
}

# Make the API request
response = requests.get(request_url, params=params)

# Check the response status
if response.status_code == 200:
    # Parse the JSON response
    gene_data = response.json()
    if gene_data:
        print(f"Results for gene: {gene_name}\n")
        print(f"Number of results: {len(gene_data)}")
        pprint(gene_data)
    else:
        print(f"No results found for gene: {gene_name}")
else:
    print(f"Failed to fetch data: {response.status_code}")
    print(response.text)


Results for gene: RB1

Number of results: 1355
[{'ALIASES': 'OSRC|PPP1R130|RB|p105-Rb|pRb|pp110',
  'HIT': 'NO',
  'IDENTIFIER_ID': '5925',
  'IDENTIFIER_TYPE': 'gene',
  'OFFICIAL_SYMBOL': 'RB1',
  'ORGANISM_ID': '9606',
  'ORGANISM_OFFICIAL': 'Homo sapiens',
  'SCORE.1': '-87.377',
  'SCORE.2': '-',
  'SCORE.3': '-',
  'SCORE.4': '-',
  'SCORE.5': '-',
  'SCREEN_ID': '16',
  'SOURCE': 'BioGRID ORCS'},
 {'ALIASES': 'OSRC|PPP1R130|RB|p105-Rb|pRb|pp110',
  'HIT': 'NO',
  'IDENTIFIER_ID': '5925',
  'IDENTIFIER_TYPE': 'gene',
  'OFFICIAL_SYMBOL': 'RB1',
  'ORGANISM_ID': '9606',
  'ORGANISM_OFFICIAL': 'Homo sapiens',
  'SCORE.1': '-62.359',
  'SCORE.2': '-',
  'SCORE.3': '-',
  'SCORE.4': '-',
  'SCORE.5': '-',
  'SCREEN_ID': '17',
  'SOURCE': 'BioGRID ORCS'},
 {'ALIASES': 'OSRC|PPP1R130|RB|p105-Rb|pRb|pp110',
  'HIT': 'NO',
  'IDENTIFIER_ID': '5925',
  'IDENTIFIER_TYPE': 'gene',
  'OFFICIAL_SYMBOL': 'RB1',
  'ORGANISM_ID': '9606',
  'ORGANISM_OFFICIAL': 'Homo sapiens',
  'SCORE.1': '-27.3

## gene endpoint

In [9]:
"""
Fetch scores across all screens for a single gene
using customizable filtering options
"""

gene_id = 7023
request_url = BG_ORCS_BASE_URL + "/gene/" + str(gene_id)

params = {
    "accesskey": BG_ORCS_ACCESS_KEY,
    "format": "json",
    "hit": "yes"
}

r = requests.get( request_url, params = params )
scores = r.json( )

data = {}
for row in scores :
    # Create a hash of results by gene identifier
    data[row['SCREEN_ID']] = row

# Print out data about the gene from several specific screens
pprint( data['549'] )
pprint( data['170'] )
pprint( data['197'] )

{'ALIASES': 'AP-4|bHLHc41',
 'HIT': 'YES',
 'IDENTIFIER_ID': '7023',
 'IDENTIFIER_TYPE': 'gene',
 'OFFICIAL_SYMBOL': 'TFAP4',
 'ORGANISM_ID': '9606',
 'ORGANISM_OFFICIAL': 'Homo sapiens',
 'SCORE.1': '-0.6694077',
 'SCORE.2': '0.0108',
 'SCORE.3': '-',
 'SCORE.4': '-',
 'SCORE.5': '-',
 'SCREEN_ID': '549',
 'SOURCE': 'BioGRID ORCS'}
{'ALIASES': 'AP-4|bHLHc41',
 'HIT': 'YES',
 'IDENTIFIER_ID': '7023',
 'IDENTIFIER_TYPE': 'gene',
 'OFFICIAL_SYMBOL': 'TFAP4',
 'ORGANISM_ID': '9606',
 'ORGANISM_OFFICIAL': 'Homo sapiens',
 'SCORE.1': '1.384515796',
 'SCORE.2': '0.831183816',
 'SCORE.3': '0.963569948',
 'SCORE.4': '0.960101193',
 'SCORE.5': '-',
 'SCREEN_ID': '170',
 'SOURCE': 'BioGRID ORCS'}
{'ALIASES': 'AP-4|bHLHc41',
 'HIT': 'YES',
 'IDENTIFIER_ID': '7023',
 'IDENTIFIER_TYPE': 'gene',
 'OFFICIAL_SYMBOL': 'TFAP4',
 'ORGANISM_ID': '9606',
 'ORGANISM_OFFICIAL': 'Homo sapiens',
 'SCORE.1': '-0.595238539',
 'SCORE.2': '0.00408',
 'SCORE.3': '-',
 'SCORE.4': '-',
 'SCORE.5': '-',
 'SCREEN_ID': 

## File: get_screens.py

In [10]:
"""
Fetch screen annotation with customizable search criteria
that can be tailored to match your own requirements
"""

request_url = BG_ORCS_BASE_URL + "/screens/"

# library methodology of "knockout"
params = {
    "accesskey": BG_ORCS_ACCESS_KEY,
    "cellLine": "hela",
    "libraryMethodology": "knockout",
    "format": "json"
}

r = requests.get( request_url, params = params )
screens = r.json( )

for screen in screens :
    pprint( screen )

{'ANALYSIS': 'BAGEL',
 'AUTHOR': 'Hart T (2015)',
 'CELL_LINE': 'HeLa',
 'CELL_TYPE': 'Cervical Adenocarcinoma Cell Line',
 'CONDITION_DOSAGE': '-',
 'CONDITION_NAME': '-',
 'DURATION': '18 Days',
 'ENZYME': 'Cas9',
 'EXPERIMENTAL_SETUP': 'Timecourse',
 'FULL_SIZE': '17648',
 'FULL_SIZE_AVAILABLE': 'Yes',
 'LIBRARY': 'TKO (Toronto Knockout) v1',
 'LIBRARY_TYPE': 'CRISPRn',
 'METHODOLOGY': 'Knockout',
 'MOI': '~ 0.3',
 'NOTES': 'Genes with a Bayes Factor (BF) above the threshold of 15.47 at an '
          'FDR of 5% (FDR < 0.05) were identified as fitness genes (hits) for '
          'this cell line in this CRISPR screen.',
 'NUMBER_OF_HITS': '1696',
 'ORGANISM_ID': '9606',
 'ORGANISM_OFFICIAL': 'Homo sapiens',
 'PHENOTYPE': 'cell proliferation',
 'SCORE.1_TYPE': 'Bayes Factor',
 'SCORE.2_TYPE': '-',
 'SCORE.3_TYPE': '-',
 'SCORE.4_TYPE': '-',
 'SCORE.5_TYPE': '-',
 'SCORES_SIZE': '17648',
 'SCORE_COL_COUNT': '1',
 'SCREEN_FORMAT': 'Pool',
 'SCREEN_ID': '17',
 'SCREEN_NAME': '2-PMID2662

In [11]:
"""
Fetch screen annotation with customizable search criteria
that can be tailored to match your own requirements
"""

request_url = BG_ORCS_BASE_URL + "/screens/"
params = {
    "accesskey": BG_ORCS_ACCESS_KEY,
    "experimentalSetup": "Synthetic Lethal",
    "name": gene_name,  # Search by official gene symbol
    "organismID": "9606",  # Search for human genes    
    "format": "json",
}

r = requests.get( request_url, params = params )
screens = r.json( )

## pprint the number of screens
print( len(screens) )

for screen in screens :
    pprint( screen )

1735
{'ANALYSIS': 'Kolmogorov-Smirnov',
 'AUTHOR': 'Wang T (2014)',
 'CELL_LINE': 'KBM-7',
 'CELL_TYPE': 'Chronic Myeloid Leukemia Cell Line',
 'CONDITION_DOSAGE': '130.0 nM',
 'CONDITION_NAME': 'Etoposide',
 'DURATION': '12 Days',
 'ENZYME': 'Cas9',
 'EXPERIMENTAL_SETUP': 'Drug Exposure',
 'FULL_SIZE': '7114',
 'FULL_SIZE_AVAILABLE': 'Yes',
 'LIBRARY': 'Human CRISPR Knockout Pooled Libraries (Enriched Sub-pools)',
 'LIBRARY_TYPE': 'CRISPRn',
 'METHODOLOGY': 'Knockout',
 'MOI': '-',
 'NOTES': 'Phenotypic readout: cell proliferation\n'
          'The threshold used to determine significant genes is '
          '-log10(corrected p-value)>1.3',
 'NUMBER_OF_HITS': '5',
 'ORGANISM_ID': '9606',
 'ORGANISM_OFFICIAL': 'Homo sapiens',
 'PHENOTYPE': 'response to chemicals',
 'SCORE.1_TYPE': 'Log10 (Corrected p-Value)',
 'SCORE.2_TYPE': '-',
 'SCORE.3_TYPE': '-',
 'SCORE.4_TYPE': '-',
 'SCORE.5_TYPE': '-',
 'SCORES_SIZE': '7114',
 'SCORE_COL_COUNT': '1',
 'SCREEN_FORMAT': 'Pool',
 'SCREEN_ID': '1

## File: get_screen_json.py

In [12]:
"""
Fetch screen scores with customizable search criteria
that can be tailored to match your own requirements
in json format
"""

screen_id = 178
request_url = BG_ORCS_BASE_URL + "/screen/" + str(screen_id)
params = {
    "accesskey": BG_ORCS_ACCESS_KEY,
    "format": "json",
    "score1min": 0.9,
    "score1max": 0.98
}

r = requests.get( request_url, params = params )
screen = r.json( )

data = {}
for row in screen :
    # Create a hash of results by gene identifier
    data[row['IDENTIFIER_ID']] = row

# Print out data about the genes BRIX1, ASB4, and NOB1
pprint( data['55299'] )
pprint( data['51666'] )
pprint( data['28987'] )

{'ALIASES': 'BRIX|BXDC2|FLJ11100',
 'HIT': 'NO',
 'IDENTIFIER_ID': '55299',
 'IDENTIFIER_TYPE': 'gene',
 'OFFICIAL_SYMBOL': 'BRIX1',
 'ORGANISM_ID': '9606',
 'ORGANISM_OFFICIAL': 'Homo sapiens',
 'SCORE.1': '0.94239',
 'SCORE.2': '0.999965',
 'SCORE.3': '-',
 'SCORE.4': '-',
 'SCORE.5': '-',
 'SCREEN_ID': '178',
 'SOURCE': 'BioGRID ORCS'}
{'ALIASES': 'ASB-4',
 'HIT': 'NO',
 'IDENTIFIER_ID': '51666',
 'IDENTIFIER_TYPE': 'gene',
 'OFFICIAL_SYMBOL': 'ASB4',
 'ORGANISM_ID': '9606',
 'ORGANISM_OFFICIAL': 'Homo sapiens',
 'SCORE.1': '0.97613',
 'SCORE.2': '0.999965',
 'SCORE.3': '-',
 'SCORE.4': '-',
 'SCORE.5': '-',
 'SCREEN_ID': '178',
 'SOURCE': 'BioGRID ORCS'}
{'ALIASES': 'ART-4|MST158|MSTP158|NOB1P|PSMD8BP1',
 'HIT': 'NO',
 'IDENTIFIER_ID': '28987',
 'IDENTIFIER_TYPE': 'gene',
 'OFFICIAL_SYMBOL': 'NOB1',
 'ORGANISM_ID': '9606',
 'ORGANISM_OFFICIAL': 'Homo sapiens',
 'SCORE.1': '0.96316',
 'SCORE.2': '0.999965',
 'SCORE.3': '-',
 'SCORE.4': '-',
 'SCORE.5': '-',
 'SCREEN_ID': '178',
 'SO

## File: get_screen_pandas.py

In [13]:
"""
Fetch screen scores in json format, and load the results
into a pandas dataframe. Pandas is a convenient library for
loading tabular datasets and provides the ability to perform
subsequent queries on the loaded dataframe after tbe fact.
"""

import pandas as pd

screen_id = 205
request_url = BG_ORCS_BASE_URL + "/screen/" + str(screen_id)
params = {
    "accesskey": BG_ORCS_ACCESS_KEY,
    "format": "json"
}

r = requests.get( request_url, params = params )
screen = r.json( )

data = {}
for row in screen :
    # create a hash of results by gene identifier
    data[row['IDENTIFIER_ID']] = row

# Load dataset into pandas dataframe
dataset = pd.DataFrame.from_dict( data, orient='index' )

In [14]:
# Re-order the columns to remove the un-needed columns
columns = ['IDENTIFIER_TYPE', 'OFFICIAL_SYMBOL', 'ALIASES', 'ORGANISM_ID', 'ORGANISM_OFFICIAL', 'SCORE.1', 'SCORE.2', 'HIT', 'SOURCE']
dataset = dataset[columns]

In [15]:
dataset.head()

Unnamed: 0,IDENTIFIER_TYPE,OFFICIAL_SYMBOL,ALIASES,ORGANISM_ID,ORGANISM_OFFICIAL,SCORE.1,SCORE.2,HIT,SOURCE
1,unknown,ERCC6-PGBD3,-,9606,Homo sapiens,-0.128413752,0.367,NO,BioGRID ORCS
29974,gene,A1CF,ACF|ACF64|ACF65|APOBEC1CF|ASP,9606,Homo sapiens,0.258916286,0.382,NO,BioGRID ORCS
2,gene,A2M,A2MD|CPAMD5|FWP007|S863-7,9606,Homo sapiens,-0.015887049,0.382,NO,BioGRID ORCS
144568,gene,A2ML1,CPAMD9|p170|FLJ25179,9606,Homo sapiens,0.16742891,0.382,NO,BioGRID ORCS
127550,gene,A3GALT2,A3GALT2P|IGB3S|IGBS3S,9606,Homo sapiens,-0.243185319,0.259,NO,BioGRID ORCS


In [16]:
# Convert numeric columns into floats
dataset[['SCORE.1','SCORE.2']] = dataset[['SCORE.1','SCORE.2']].apply( pd.to_numeric )

In [17]:
dataset.head()

Unnamed: 0,IDENTIFIER_TYPE,OFFICIAL_SYMBOL,ALIASES,ORGANISM_ID,ORGANISM_OFFICIAL,SCORE.1,SCORE.2,HIT,SOURCE
1,unknown,ERCC6-PGBD3,-,9606,Homo sapiens,-0.128414,0.367,NO,BioGRID ORCS
29974,gene,A1CF,ACF|ACF64|ACF65|APOBEC1CF|ASP,9606,Homo sapiens,0.258916,0.382,NO,BioGRID ORCS
2,gene,A2M,A2MD|CPAMD5|FWP007|S863-7,9606,Homo sapiens,-0.015887,0.382,NO,BioGRID ORCS
144568,gene,A2ML1,CPAMD9|p170|FLJ25179,9606,Homo sapiens,0.167429,0.382,NO,BioGRID ORCS
127550,gene,A3GALT2,A3GALT2P|IGB3S|IGBS3S,9606,Homo sapiens,-0.243185,0.259,NO,BioGRID ORCS


In [18]:
# Print all rows with SCORE.1 > 1
dataset.loc[dataset['SCORE.1'] > 1]

Unnamed: 0,IDENTIFIER_TYPE,OFFICIAL_SYMBOL,ALIASES,ORGANISM_ID,ORGANISM_OFFICIAL,SCORE.1,SCORE.2,HIT,SOURCE
154791,gene,FMC1,C7orf55|HSPC268,9606,Homo sapiens,1.138414,0.382,NO,BioGRID ORCS
11235,gene,PDCD10,CCM3|TFAR15,9606,Homo sapiens,1.007427,0.382,NO,BioGRID ORCS
729873,gene,TBC1D3,PRC17|TBC1D3A|TBC1D3F|DKFZp434P2235,9606,Homo sapiens,2.968792,0.382,NO,BioGRID ORCS
100302736,gene,TMED7-TICAM2,-,9606,Homo sapiens,1.007427,0.382,NO,BioGRID ORCS


## File: get_genes_and_screens.py

In [19]:
"""
Fetch genes with customizable search criteria
that can be tailored to match your own requirements. Then fetch 
the screen annotation associated with those gene scores.
"""

import requests


request_url = BG_ORCS_BASE_URL + "/genes/"
params = {
    "accesskey": BG_ORCS_ACCESS_KEY,
    "name": "DPF2|SULT1E1|UBQLN4",
    "organismID": "9606",
    "hit": "yes",
    "format": "json"
}

r = requests.get( request_url, params = params )
scores = r.json( )
pprint( "Number of Scores Found: " + str(len(scores)) )

# Step through all the scores and build a unique set of screens
# as well as build a two dimentional structure for storing results
# where the identifier ID is the outer level and the screen ID is the
# inner level. This will allow us to lookup genes and then screens for that
# gene later on.
screen_ids = set( )
genes = {}
for score in scores :
    screen_ids.add( score['SCREEN_ID'] )
    
    if score['IDENTIFIER_ID'] not in genes :
        genes[score['IDENTIFIER_ID']] = {}

    genes[score['IDENTIFIER_ID']][score['SCREEN_ID']] = score

pprint( "Number of Unique Screen IDs Found: " + str(len(screen_ids)) )
pprint( "Number of Genes Found: " + str(len(genes)) )

'Number of Scores Found: 471'
'Number of Unique Screen IDs Found: 377'
'Number of Genes Found: 3'


In [20]:
# Make a new request for annotation about all the new screens

request_url = BG_ORCS_BASE_URL + "/screens/"
params = {
    "accesskey": BG_ORCS_ACCESS_KEY,
    "screenID": "|".join( screen_ids ),
    "format": "json"
}

r = requests.get( request_url, params = params )
screens = r.json( )
pprint( "Number of Screen Details Retrieved: " + str(len(screens)) )

'Number of Screen Details Retrieved: 377'


In [21]:
# Step through each screen and build a nice index of screens by screen_id
screen_lookup = {}
for screen in screens :
    screen_lookup[screen['SCREEN_ID']] = screen

# Print out a score and its associated screen annotation
# for gene 5977 and screen 201
pprint( genes['5977']['201'] )

{'ALIASES': 'REQ|UBID4|ubi-d4|BAF45d',
 'HIT': 'YES',
 'IDENTIFIER_ID': '5977',
 'IDENTIFIER_TYPE': 'gene',
 'OFFICIAL_SYMBOL': 'DPF2',
 'ORGANISM_ID': '9606',
 'ORGANISM_OFFICIAL': 'Homo sapiens',
 'SCORE.1': '-0.492155932',
 'SCORE.2': '0.028',
 'SCORE.3': '-',
 'SCORE.4': '-',
 'SCORE.5': '-',
 'SCREEN_ID': '201',
 'SOURCE': 'BioGRID ORCS'}


In [22]:
pprint( screen_lookup['201'] )

{'ANALYSIS': 'CERES',
 'AUTHOR': 'Meyers RM (2017)',
 'CELL_LINE': 'U-343MGa',
 'CELL_TYPE': 'Glioma Cell Line',
 'CONDITION_DOSAGE': '-',
 'CONDITION_NAME': '-',
 'DURATION': '21 Days',
 'ENZYME': 'Cas9',
 'EXPERIMENTAL_SETUP': 'Timecourse',
 'FULL_SIZE': '17670',
 'FULL_SIZE_AVAILABLE': 'Yes',
 'LIBRARY': 'Avana',
 'LIBRARY_TYPE': 'CRISPRn',
 'METHODOLOGY': 'Knockout',
 'MOI': '< 1',
 'NOTES': 'To identify gene hits at an FDR < 0.05, ORCS estimated FDRs for the '
          'CERES Avana dataset using customized gold-standard positive and '
          'negative sets of genes. The custom positive set included genes '
          'essential in at least 90% of cell lines in both the CERES Avana '
          'dataset (PMID:29083409) and the 10 lines included in Bertomeu et '
          'al. (PMID:29038160). The custom negative set included genes '
          'essential in zero cell lines from one of the two datasets and in '
          '<=10% of the other dataset.',
 'NUMBER_OF_HITS': '1948',
 'O