# Multiple Sclerosis Drug Repurposing

## Introduction

https://github.com/biothings/biothings_explorer/issues/134  
https://www.nature.com/articles/s41586-018-0360-3

## Step 0: Load BTE modules, notebook functions

In [None]:
## for Google Colab
%%capture
!pip install git+https://github.com/colleenXu/biothings_explorer@issues_134_135#egg=biothings_explorer

In [1]:
## CX: allows multiple lines of code to print from one code block
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# import modules from biothings_explorer
from biothings_explorer.hint import Hint
from biothings_explorer.user_query_dispatcher import FindConnection

## show time that this notebook was executed 
from datetime import datetime

## packages to work with objects 
import re

## to get around bugs
import nest_asyncio
nest_asyncio.apply()

In [2]:
## functions to add to modules?
def hint_display(query, hint_result):
    """
    Print a one-line summary (type, name, number of IDs) for every object a hint query returned

    :param: query: string used in hint query (only used in the printed header)
    :param: hint_result: object returned from hint query, a dictionary mapping each
        type to a list of dictionaries (one dictionary per matched object)

    Returns: None (everything is printed)
    """
    ## note: the position of each object within its type's list is not shown
    ## number of identifiers per object: number of keys - 4 (name, primary, display, type)
    summary_rows = [
        (hit['type'], hit['name'], str(len(hit) - 4))
        for hits in hint_result.values()
        if hits  ## skip types with no matches
        for hit in hits
    ]

    print('There are {total} BioThings objects returned for {ht}:'.format(
        total=len(summary_rows), ht=query))
    for obj_type, obj_name, num_ids in summary_rows:
        print('{0}, {1}, num of IDs: {2}'.format(obj_type, obj_name, num_ids))

In [3]:
def filter_table(df):
    """
    use _source and _method columns to remove rows (paths) from the dataframe
    :param: pandas dataframe containing results from BTE FindConnection module, in table form

    A row is removed when any column whose name contains a key of filter_out
    holds a string that CONTAINS one of that key's values (literal, case-sensitive
    substring match). Cells that are not strings (NaN / None / numbers) never
    match, so columns that are entirely empty are handled without error.

    Returns: filtered dataframe (a new object; the input is not modified)
    """
    ## note: still needs checking with EXPLAIN queries
    ## key is the string to match to column names, value is a list of strings to match to column values
    filter_out = {
        '_source': ['SEMMED', 'CTD', 'ctd', 'omia'],
        ## '_method': [],  ## currently no method stuff I want to filter out
    }
    ## SEMMED: text mining results wrong for PhenotypicFeature -> Gene
    ## CTD/ctd: results odd for MSUD -> ChemicalSubstance
    ## omia: results wrong or discontinued gene IDs for PhenotypicFeature -> Gene

    df_temp = df.copy()  ## so the original df isn't modified in-place
    for key, unwanted in filter_out.items():
        ## find columns whose name contains the key string
        matching_cols = [col for col in df_temp.columns
                         if isinstance(col, str) and key in col]
        for col in matching_cols:
            ## one pass per column: flag cells containing ANY unwanted string.
            ## Done cell-by-cell (not with .str) so a column that is not
            ## string-typed, e.g. all-NaN and therefore float, doesn't raise
            flagged = df_temp[col].map(
                lambda cell: isinstance(cell, str) and any(bad in cell for bad in unwanted)
            ).astype(bool)
            ## only keep rows that don't contain any of the values
            df_temp = df_temp[~flagged]
    return df_temp

In [4]:
def scoring_output(df, q_type):
    """
    score results based on whether query was Predict or Explain type, number of
        intermediate nodes
    :param: pandas dataframe containing results from BTE FindConnection module
    :param: string describing type of query; 'Explain' selects Explain scoring,
        any other value is treated as Predict

    May flatten some edges, because score only counts one edge per
        unique predicate / API / method (columns whose name contains
        '_source' or '_pubmed' are ignored)

    Predict queries: score each output node by counting # of paths
        from input nodes to it. Normalize by dividing by maximum
        possible # of paths
    Explain two-hop (one intermediate) queries: score each intermediate node by
        counting # of paths (between input and output nodes) that include it.
        Normalize by dividing by maximum possible # of paths

    Explain one-hop (direct) queries: no need to score, prints message
    Other Explain queries (many-hops): currently not able to score, prints message

    Returns: pandas dataframe indexed by the scored node's name, with columns
             'score' (normalized path count) and 'rank' (dense rank, 1 = highest score)
             or None (one-hop or many-hop Explain query)
    """
    original_columns = set(df.columns)

    ## ignore source and pubmed col in looking at unique edges.
    ## drop() / drop_duplicates() return new objects, so the input df is never mutated
    provenance_cols = [col for col in df.columns
                       if ('_source' in col) or ('_pubmed' in col)]
    paths = df.drop(columns = provenance_cols).drop_duplicates()

    if q_type == 'Explain':
        ## node1_name is the name column for the first intermediate node layer
        if "node1_name" not in original_columns:   # one hop / no intermediates
            print('No valid node scoring for one-hop (direct) Explain queries.')
            return None
        ## node2_name is the name column for a 2nd intermediate node layer
        if "node2_name" in original_columns:
            print('Cannot currently score many-hop Explain queries.')
            return None
        ## two-hop / 1 intermediate layer: score the intermediate nodes
        scored_name_col, scored_prefix = 'node1_name', 'node1'
    else:
        ## Predict type query: score the output nodes
        scored_name_col, scored_prefix = 'output_name', 'output'

    ## count multi-edges to each scored node
    path_counts = paths[scored_name_col].value_counts()

    ## maximum possible number of paths: unique combos of everything EXCEPT the
    ## scored node's columns. Use the columns still present in `paths`; the
    ## pre-drop column set may name provenance columns that are already gone
    scored_cols = [col for col in paths.columns if scored_prefix in col]
    max_paths = paths.drop(columns = scored_cols).drop_duplicates().shape[0]

    ## normalize scores by dividing each by max number of paths, then add rank
    scores = (path_counts / max_paths).to_frame(name = 'score')
    scores['rank'] = scores['score'].rank(method = 'dense', ascending = False)
    return scores

In [5]:
## record when cell blocks are executed
## NOTE: datetime.now() is the naive local time of the machine running the kernel,
## so the hard-coded "PST" label is only right when the kernel runs in that timezone
print('The time that this notebook was executed is...')
print('Local time (PST, West Coast USA): ')
print(datetime.now())
print('UTC time: ')
## NOTE: datetime.utcnow() is deprecated since Python 3.12;
## datetime.now(timezone.utc) is the timezone-aware replacement
print(datetime.utcnow())

The time that this notebook was executed is...
Local time (PST, West Coast USA): 
2020-11-30 22:54:14.449904
UTC time: 
2020-12-01 06:54:14.450071


## Step 1: Find representation of the starting chemical (CHEBI:86570) in BTE

In [70]:
ht = Hint()  ## neater way to call this BTE module

## the human user gives this input (here an identifier for the starting chemical)
starting_str1 = "CHEBI:86570"

## look up the objects matching the input, then print type / name / number of IDs for each
start_hint1 = ht.query(starting_str1)
hint_display(starting_str1, start_hint1)

There are 1 BioThings objects returned for CHEBI:86570:
ChemicalSubstance, DIHYDROCHOLESTEROL, num of IDs: 9


In [71]:
## the human user picks which hint result to use: its type (a key of the hint
## dictionary) and its position in that type's list of results
choice_type1 = 'ChemicalSubstance'
choice_idx1 = 0

start_hint_obj1 = start_hint1[choice_type1][choice_idx1]  
## show the chosen object so it can be checked by eye
start_hint_obj1

{'CHEMBL.COMPOUND': 'CHEMBL1289436',
 'PUBCHEM': 6665,
 'CHEBI': 'CHEBI:86570',
 'UNII': '8M308U816E',
 'INCHIKEY': 'QYIXCDOBOSTCEI-QCYZZNICSA-N',
 'INCHI': 'InChI=1S/C27H48O/c1-18(2)7-6-8-19(3)23-11-12-24-22-10-9-20-17-21(28)13-15-26(20,4)25(22)14-16-27(23,24)5/h18-25,28H,6-17H2,1-5H3/t19-,20+,21+,22+,23-,24+,25+,26+,27-/m1/s1',
 'name': 'DIHYDROCHOLESTEROL',
 'CAS': '80-97-7',
 'IUPAC': '(3S,5S,8R,9S,10S,13R,14S,17R)-17-[(1R)-1,5-dimethylhexyl]-10,13-dimethyl-2,3,4,5,6,7,8,9,11,12,14,15,16,17-tetradecahydro-1H-cyclopenta[a]phenanthren-3-ol',
 'formula': 'C27H48O',
 'primary': {'identifier': 'CHEBI',
  'cls': 'ChemicalSubstance',
  'value': 'CHEBI:86570'},
 'display': 'CHEBI(CHEBI:86570) CHEMBL.COMPOUND(CHEMBL1289436) PUBCHEM(6665) UNII(8M308U816E) name(DIHYDROCHOLESTEROL) CAS(80-97-7) IUPAC((3S,5S,8R,9S,10S,13R,14S,17R)-17-[(1R)-1,5-dimethylhexyl]-10,13-dimethyl-2,3,4,5,6,7,8,9,11,12,14,15,16,17-tetradecahydro-1H-cyclopenta[a]phenanthren-3-ol) formula(C27H48O)',
 'type': 'ChemicalSub

## query

In [72]:
## the human user chooses the type of node the query should end on
q1_output_type = 'Gene'
# q1_intermediate = 'PhenotypicFeature'

## no intermediate nodes: look for direct connections from the chosen
## input object to nodes of the output type
q1 = FindConnection(
    input_obj=start_hint_obj1,
    output_obj=q1_output_type,
    intermediate_nodes=None,
)
q1.connect(verbose=True)


BTE will find paths that join 'DIHYDROCHOLESTEROL' and 'Gene'. Paths will have 0 intermediate node.




==== Step #1: Query path planning ====

Because DIHYDROCHOLESTEROL is of type 'ChemicalSubstance', BTE will query our meta-KG for APIs that can take 'ChemicalSubstance' as input and 'Gene' as output

BTE found 8 apis:

API 1. hmdb(1 API call)
API 2. dgidb(1 API call)
API 3. chembio(1 API call)
API 4. scibite(1 API call)
API 5. pharos(1 API call)
API 6. scigraph(1 API call)
API 7. mychem(1 API call)
API 8. cord_chemical(1 API call)


==== Step #2: Query path execution ====
NOTE: API requests are dispatched in parallel, so the list of APIs below is ordered by query time.

API 8.1: https://biothings.ncats.io/cord_chemical/query?fields=associated_with (POST -d q=CHEBI:86570&scopes=chebi)
API 2.1: http://dgidb.genome.wustl.edu/api/v2/interactions.json?drugs=CHEMBL1289436
API 3.1: https://automat.renci.org/chembio/chemical_substance/gene/CHEBI:86570
API 4.1: https://automat.renci.org/cord

In [64]:
## results of the query in table form (later cells count its rows as paths)
q1_r_paths_table = q1.display_table_view()

## pull the class name of q1.fc out of its type string: the letters that follow
## "dispatcher." up to the closing quote
## NOTE(review): presumably this yields 'Predict' or 'Explain' for use as
## scoring_output's q_type -- confirm against biothings_explorer
q1_type = re.findall("dispatcher.([a-zA-Z]+)'", str(type(q1.fc)))
q1_type = "".join(q1_type)  ## convert to string

q1 = None  ## clear memory

In [65]:
## how many distinct output nodes the query reached
print("There are {0} unique {1}.".format(
    q1_r_paths_table['output_name'].nunique(), q1_output_type))

## how many paths (table rows) lead from the input chemical to the output nodes
print("There are {0} unique paths.".format(len(q1_r_paths_table)))

There are 12 unique Gene.
There are 13 unique paths.


In [44]:
## spot-check whether specific gene symbols appear among the output nodes
## (substring match on output_name; an empty table means no match).
## All four tables are displayed because ast_node_interactivity is set to "all"
q1_r_paths_table[q1_r_paths_table['output_name'].str.contains('LSS')]
q1_r_paths_table[q1_r_paths_table['output_name'].str.contains('EBP')]
q1_r_paths_table[q1_r_paths_table['output_name'].str.contains('CYP51')]
q1_r_paths_table[q1_r_paths_table['output_name'].str.contains('TM7SF2')]

Unnamed: 0,input,input_type,pred1,pred1_source,pred1_api,pred1_pubmed,output_type,output_name,output_id


Unnamed: 0,input,input_type,pred1,pred1_source,pred1_api,pred1_pubmed,output_type,output_name,output_id
0,5ALPHA-CHOLEST-8-EN-3BETA-OL,ChemicalSubstance,related_to,hmdb,Automat HMDB API,,Gene,EBP,NCBIGene:10682


Unnamed: 0,input,input_type,pred1,pred1_source,pred1_api,pred1_pubmed,output_type,output_name,output_id


Unnamed: 0,input,input_type,pred1,pred1_source,pred1_api,pred1_pubmed,output_type,output_name,output_id


In [66]:
## show the full table of paths returned by the query
q1_r_paths_table

Unnamed: 0,input,input_type,pred1,pred1_source,pred1_api,pred1_pubmed,output_type,output_name,output_id
0,24-DEHYDROCHOLESTEROL,ChemicalSubstance,related_to,Translator Text Mining Provider,CORD Chemical API,,Gene,7314,HGNC:7314
1,24-DEHYDROCHOLESTEROL,ChemicalSubstance,related_to,scibite,Automat CORD19 Scibite API,,Gene,NR1H3,NCBIGene:10062
2,24-DEHYDROCHOLESTEROL,ChemicalSubstance,related_to,pharos,Automat PHAROS API,,Gene,NR1H3,NCBIGene:10062
3,24-DEHYDROCHOLESTEROL,ChemicalSubstance,related_to,pharos,Automat PHAROS API,,Gene,NR1H2,NCBIGene:7376
4,24-DEHYDROCHOLESTEROL,ChemicalSubstance,related_to,hmdb,Automat HMDB API,,Gene,DHCR24,NCBIGene:1718
5,24-DEHYDROCHOLESTEROL,ChemicalSubstance,related_to,Translator Text Mining Provider,CORD Chemical API,,Gene,CYP27A1,NCBIGene:1593
6,24-DEHYDROCHOLESTEROL,ChemicalSubstance,related_to,Translator Text Mining Provider,CORD Chemical API,,Gene,CYP7B1,NCBIGene:9420
7,24-DEHYDROCHOLESTEROL,ChemicalSubstance,related_to,Translator Text Mining Provider,CORD Chemical API,,Gene,NDUFB6,NCBIGene:4712
8,24-DEHYDROCHOLESTEROL,ChemicalSubstance,related_to,Translator Text Mining Provider,CORD Chemical API,,Gene,DHDDS,NCBIGene:79947
9,24-DEHYDROCHOLESTEROL,ChemicalSubstance,related_to,Translator Text Mining Provider,CORD Chemical API,,Gene,PSMB7,NCBIGene:5695


In [None]:
## drop the paths whose source is on filter_table's exclusion list
## (note: this overwrites the unfiltered table)
q1_r_paths_table = filter_table(q1_r_paths_table)

## how many paths (table rows) from the input chemical to genes remain
print("There are {0} unique paths.".format(len(q1_r_paths_table)))

In [None]:
## show the table of paths as it stands at this point in the notebook
q1_r_paths_table

## Step 2: Find representation of "sterol biosynthesis" in BTE

In [None]:
ht = Hint()  ## neater way to call this BTE module

## the human user gives this input (here a free-text name rather than an identifier)
starting_str = "cholesterol biosynthesis"

## look up the objects matching the input, then print type / name / number of IDs for each
start_hint = ht.query(starting_str)
hint_display(starting_str, start_hint)

In [None]:
## the human user picks which hint result to use: its type (a key of the hint
## dictionary) and its position in that type's list of results
choice_type = 'BiologicalProcess'
choice_idx = 0

start_hint_obj = start_hint[choice_type][choice_idx]  
## show the chosen object so it can be checked by eye
start_hint_obj