## BioGRID REST Services:  ORCS

- for context BioGRID has two primary APIs that differ slightly in focus
- examples in this notebook use the ORCS API
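- Use the API base url to request an API key and capture it in a `.env` file (this notebook reads `BG_INT_ACCESS_KEY`, `BG_INT_BASE_URL`, `BG_ORCS_ACCESS_KEY` and `BG_ORCS_BASE_URL`)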

### Protein, Genetic and Chemical Interactions
- Site URL:  https://thebiogrid.org/
- API docs:  https://wiki.thebiogrid.org/doku.php/biogridrest
- API base url:  https://webservice.thebiogrid.org
- API key:  https://webservice.thebiogrid.org

### Open Repository of CRISPR Screens (ORCS)
- Site URL:  https://orcs.thebiogrid.org/
- API docs:  https://wiki.thebiogrid.org/doku.php/orcs:webservice
- API base url:  https://orcsws.thebiogrid.org
- API key:  https://orcsws.thebiogrid.org

CITATION:
- original examples were based on
  - [BIOGRID-REST-EXAMPLES](https://github.com/BioGRID/BIOGRID-REST-EXAMPLES) GitHub repository
  - [ORCS-REST-EXAMPLES](https://github.com/BioGRID/ORCS-REST-EXAMPLES) GitHub repository

In [1]:
# Import necessary libraries
import os
from dotenv import load_dotenv
import requests
from pprint import pprint

# Read the .env file so its entries can be looked up through os.environ
load_dotenv()

# Access key and base URL for the interactions service
BG_INT_ACCESS_KEY = os.environ.get("BG_INT_ACCESS_KEY")
BG_INT_BASE_URL = os.environ.get("BG_INT_BASE_URL")

# Access key and base URL for the ORCS service
BG_ORCS_ACCESS_KEY = os.environ.get("BG_ORCS_ACCESS_KEY")
BG_ORCS_BASE_URL = os.environ.get("BG_ORCS_BASE_URL")

# Stop early when either value of a pair is unset or empty, so the request
# cells never run with an incomplete configuration
if not (BG_INT_ACCESS_KEY and BG_INT_BASE_URL):
    raise ValueError("BG_INT_ACCESS_KEY or BG_INT_BASE_URL is missing from the .env file.")

if not (BG_ORCS_ACCESS_KEY and BG_ORCS_BASE_URL):
    raise ValueError("BG_ORCS_ACCESS_KEY or BG_ORCS_BASE_URL is missing from the .env file.")

## organisms endpoint

In [None]:
"""
Fetch the set of currently supported organism IDs
"""

request_url = BG_ORCS_BASE_URL + "/organisms"

params = {
    "accesskey": BG_ORCS_ACCESS_KEY,
    "format": "json",
}

# Bound the wait with a timeout so an unreachable service cannot hang the cell
r = requests.get(request_url, params=params, timeout=30)

# Stop on HTTP error statuses (4xx/5xx) instead of trying to parse an
# error response as if it were the organism list
r.raise_for_status()
organisms = r.json()

pprint(organisms)

## vocabs endpoint

In [None]:
"""
Fetch the set of controlled vocabularies; the terms for two of
the vocabulary categories are fetched by the vocab endpoint cells below
"""

# Build the request for the /vocabs endpoint, asking for a JSON response
request_url = f"{BG_ORCS_BASE_URL}/vocabs"
params = dict(accesskey=BG_ORCS_ACCESS_KEY, format="json")

# Send the request and decode the JSON body
r = requests.get(request_url, params=params)
vocabs = r.json()

pprint(vocabs)

## vocab endpoint

In [None]:
# Shared URL prefix and query parameters for requesting the terms of a
# controlled vocabulary; the vocabulary ID is appended to the URL per request
request_url = f"{BG_ORCS_BASE_URL}/vocab/"
params = dict(accesskey=BG_ORCS_ACCESS_KEY, format="json")

# Vocabulary 12: list of available phenotypes
r = requests.get(f"{request_url}12", params=params)
vocab = r.json()

pprint(vocab)

In [None]:
# Vocabulary 11: list of cell lines. This cell reuses the request_url and
# params values assigned by an earlier cell for the /vocab/ endpoint.
r = requests.get(f"{request_url}11", params=params)
vocab = r.json()

# Left as the final expression so the notebook displays the value
vocab

## genes endpoint

In [None]:
"""
Fetch scores for several genes across the entire set
of screens in the database
"""

# Query the /genes/ endpoint for three genes at once; the IDs are sent as
# a single pipe-separated geneID value
request_url = f"{BG_ORCS_BASE_URL}/genes/"
params = dict(
    accesskey=BG_ORCS_ACCESS_KEY,
    geneID="|".join(["66050", "66056", "66044"]),
    format="json",
)

r = requests.get(request_url, params=params)
scores = r.json()

# Pretty-print every returned score record, one after another
for score in scores:
    pprint(score)

In [None]:
# Official gene symbol to search for
gene_name = "TP53"

# Endpoint used for gene queries
request_url = BG_ORCS_BASE_URL + "/genes/"

# Query parameters: search by symbol ("name"), with organismID 9606
# restricting the search to human genes
params = dict(
    accesskey=BG_ORCS_ACCESS_KEY,
    name=gene_name,
    organismID="9606",
    format="json",
)

response = requests.get(request_url, params=params)

if response.status_code != 200:
    # Any status other than 200: report it together with the raw body
    print(f"Failed to fetch data: {response.status_code}")
    print(response.text)
else:
    # Successful response: decode it and report what came back
    gene_data = response.json()
    if not gene_data:
        print(f"No results found for gene: {gene_name}")
    else:
        print(f"Results for gene: {gene_name}\n")
        print(f"Number of results: {len(gene_data)}")
        pprint(gene_data)


In [None]:
# Official gene symbol to search for; gene_name stays set to this value
# for any later cell that reads it
gene_name = "RB1"

# Endpoint used for gene queries
request_url = BG_ORCS_BASE_URL + "/genes/"

# Query parameters: search by symbol ("name"), with organismID 9606
# restricting the search to human genes
params = dict(
    accesskey=BG_ORCS_ACCESS_KEY,
    name=gene_name,
    organismID="9606",
    format="json",
)

response = requests.get(request_url, params=params)

if response.status_code != 200:
    # Any status other than 200: report it together with the raw body
    print(f"Failed to fetch data: {response.status_code}")
    print(response.text)
else:
    # Successful response: decode it and report what came back
    gene_data = response.json()
    if not gene_data:
        print(f"No results found for gene: {gene_name}")
    else:
        print(f"Results for gene: {gene_name}\n")
        print(f"Number of results: {len(gene_data)}")
        pprint(gene_data)


## gene endpoint

In [None]:
"""
Fetch scores across all screens for a single gene
using customizable filtering options
"""

gene_id = 7023
request_url = BG_ORCS_BASE_URL + "/gene/" + str(gene_id)

params = {
    "accesskey": BG_ORCS_ACCESS_KEY,
    "format": "json",
    "hit": "yes",
}

# Bound the wait with a timeout so an unreachable service cannot hang the cell
r = requests.get(request_url, params=params, timeout=30)

# Stop on HTTP error statuses (4xx/5xx) instead of trying to index an
# error response as if it were a list of score rows
r.raise_for_status()
scores = r.json()

# Index the returned rows by screen ID. Every row belongs to the same
# gene, so the screen is what tells the rows apart.
data = {}
for row in scores:
    data[row['SCREEN_ID']] = row

# Print out data about the gene from several specific screens. A screen
# can be absent from the filtered result, so check before indexing rather
# than failing with a KeyError.
for wanted_screen in ('549', '170', '197'):
    if wanted_screen in data:
        pprint(data[wanted_screen])
    else:
        print(f"No row returned for gene {gene_id} in screen {wanted_screen}")

## File: get_screens.py

In [None]:
"""
Fetch screen annotation with customizable search criteria
that can be tailored to match your own requirements
"""

# Search the /screens/ endpoint for screens whose cell line is "hela" and
# whose library methodology is "knockout"
request_url = f"{BG_ORCS_BASE_URL}/screens/"
params = dict(
    accesskey=BG_ORCS_ACCESS_KEY,
    cellLine="hela",
    libraryMethodology="knockout",
    format="json",
)

r = requests.get(request_url, params=params)
screens = r.json()

# Pretty-print each matching screen annotation
for screen in screens:
    pprint(screen)

In [None]:
"""
Fetch screen annotation with customizable search criteria
that can be tailored to match your own requirements
"""

# Search the /screens/ endpoint for "Synthetic Lethal" experimental setups.
# NOTE: gene_name is not assigned in this cell; it holds whatever value an
# earlier gene-query cell left behind, so run one of those cells first.
request_url = f"{BG_ORCS_BASE_URL}/screens/"
params = dict(
    accesskey=BG_ORCS_ACCESS_KEY,
    experimentalSetup="Synthetic Lethal",
    name=gene_name,
    organismID="9606",
    format="json",
)

r = requests.get(request_url, params=params)
screens = r.json()

# Print the number of screens returned, then each screen annotation
print(len(screens))

for screen in screens:
    pprint(screen)

## File: get_screen_json.py

In [None]:
"""
Fetch screen scores with customizable search criteria
that can be tailored to match your own requirements
in json format
"""

# Request the scores of one screen, limited to rows whose first score
# lies between the score1min and score1max bounds
screen_id = 178
request_url = f"{BG_ORCS_BASE_URL}/screen/{screen_id}"
params = dict(
    accesskey=BG_ORCS_ACCESS_KEY,
    format="json",
    score1min=0.9,
    score1max=0.98,
)

r = requests.get(request_url, params=params)
screen = r.json()

# Index the score rows by gene identifier for direct lookup
data = {row['IDENTIFIER_ID']: row for row in screen}

# Print out data about the genes BRIX1, ASB4, and NOB1
for identifier in ('55299', '51666', '28987'):
    pprint(data[identifier])

## File: get_screen_pandas.py

In [13]:
"""
Fetch screen scores in json format, and load the results
into a pandas dataframe. Pandas is a convenient library for
loading tabular datasets and provides the ability to perform
subsequent queries on the loaded dataframe after the fact.
"""

import pandas as pd

# Request every score row of one screen as JSON
screen_id = 205
request_url = f"{BG_ORCS_BASE_URL}/screen/{screen_id}"
params = dict(accesskey=BG_ORCS_ACCESS_KEY, format="json")

r = requests.get(request_url, params=params)
screen = r.json()

# Index the score rows by gene identifier
data = {row['IDENTIFIER_ID']: row for row in screen}

# Build the dataframe with one row per dictionary key (orient='index'),
# so the gene identifiers become the dataframe index
dataset = pd.DataFrame.from_dict(data, orient='index')

In [14]:
# Keep only the columns of interest, in this display order; any dataframe
# column that is not listed here is left out of the result
columns = [
    'IDENTIFIER_TYPE',
    'OFFICIAL_SYMBOL',
    'ALIASES',
    'ORGANISM_ID',
    'ORGANISM_OFFICIAL',
    'SCORE.1',
    'SCORE.2',
    'HIT',
    'SOURCE',
]
dataset = dataset[columns]

In [None]:
# Preview the first few rows of the dataframe; as the final expression of
# the cell, the result is displayed by the notebook
dataset.head()

In [16]:
# Parse the two score columns with pd.to_numeric so that they hold numbers
# and can be compared numerically
score_columns = ['SCORE.1', 'SCORE.2']
dataset[score_columns] = dataset[score_columns].apply(pd.to_numeric)

In [None]:
# Preview the first few rows of the dataframe again; as the final
# expression of the cell, the result is displayed by the notebook
dataset.head()

In [None]:
# Build a boolean mask for the rows whose SCORE.1 is greater than 1, then
# display exactly those rows
high_score = dataset['SCORE.1'] > 1
dataset.loc[high_score]

## File: get_genes_and_screens.py

In [None]:
"""
Fetch genes with customizable search criteria
that can be tailored to match your own requirements. Then fetch 
the screen annotation associated with those gene scores.
"""

import requests


request_url = BG_ORCS_BASE_URL + "/genes/"
params = {
    "accesskey": BG_ORCS_ACCESS_KEY,
    "name": "DPF2|SULT1E1|UBQLN4",
    "organismID": "9606",
    "hit": "yes",
    "format": "json",
}

# Bound the wait with a timeout so an unreachable service cannot hang the cell
r = requests.get(request_url, params=params, timeout=30)

# Stop on HTTP error statuses (4xx/5xx) instead of trying to iterate an
# error response as if it were a list of scores
r.raise_for_status()
scores = r.json()

# print() rather than pprint(): pprint shows the quoted repr of a string
print(f"Number of Scores Found: {len(scores)}")

# Step through all the scores and build a unique set of screens
# as well as build a two dimensional structure for storing results
# where the identifier ID is the outer level and the screen ID is the
# inner level. This will allow us to lookup genes and then screens for that
# gene later on.
screen_ids = set()
genes = {}
for score in scores:
    screen_ids.add(score['SCREEN_ID'])

    # setdefault creates the inner per-gene dictionary on first sight of
    # an identifier and returns the existing one afterwards
    genes.setdefault(score['IDENTIFIER_ID'], {})[score['SCREEN_ID']] = score

print(f"Number of Unique Screen IDs Found: {len(screen_ids)}")
print(f"Number of Genes Found: {len(genes)}")

In [None]:
# Make a new request for annotation about all the screens collected in
# screen_ids by the earlier gene-score cell

request_url = BG_ORCS_BASE_URL + "/screens/"
params = {
    "accesskey": BG_ORCS_ACCESS_KEY,
    # screen_ids is a set, so its iteration order is arbitrary: convert each
    # ID to text and sort to get the same pipe-separated value on every run
    "screenID": "|".join(sorted(map(str, screen_ids))),
    "format": "json",
}

# Bound the wait with a timeout so an unreachable service cannot hang the cell
r = requests.get(request_url, params=params, timeout=30)

# Stop on HTTP error statuses (4xx/5xx) instead of trying to parse an
# error response as if it were a list of screens
r.raise_for_status()
screens = r.json()

# print() rather than pprint(): pprint shows the quoted repr of a string
print(f"Number of Screen Details Retrieved: {len(screens)}")

In [None]:
# Index the screen annotations by their screen ID for direct lookup
screen_lookup = {screen['SCREEN_ID']: screen for screen in screens}

# Show the score stored for gene 5977 in screen 201; the matching screen
# annotation can be read from screen_lookup under the same screen ID
pprint(genes['5977']['201'])

In [None]:
# Print the annotation stored in screen_lookup for screen 201
pprint( screen_lookup['201'] )