In [13]:
import pandas as pd
import os
import requests
import json
import csv
import io
from collections import defaultdict
from time import time
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
#from flair.data import Sentence
#from flair.nn import Classifier
from chembl_webresource_client.new_client import new_client

# 1. Get targets from disease

In [28]:
"""
insert code here that
1. Takes an EFO as input
2. displays a dataframe of targets for that disease
3. uploads that dataframe to a table in planetscale called "disease_to_target"

"""
from sqlalchemy import create_engine, text
from dotenv import dotenv_values

# The connection string is read from a local .env file so that credentials
# never appear in the notebook itself.
config = dotenv_values('database_url.env')
url = config['DATABASE_URL']

engine = create_engine(url, echo=False)

## Set disease_id variable for desired disease
disease_id = "EFO_0005537"

# Bind disease_id as a query parameter instead of formatting it into the SQL
# text: the driver handles quoting, so an id containing a quote (or hostile
# input) cannot change the statement that is executed.
disease_query = text("SELECT * FROM disease_to_target WHERE disease_id = :disease_id")
disease_df = pd.read_sql(disease_query, con=engine, params={"disease_id": disease_id})
display(disease_df)




Unnamed: 0,level_0,index,disease_id,target_ensemble_id,target_chembl_id,association_score
0,0,0,EFO_0005537,ENSG00000184292,CHEMBL3856163,0.431490
1,1,1,EFO_0005537,ENSG00000198900,CHEMBL1781,0.424207
2,2,2,EFO_0005537,ENSG00000120217,CHEMBL3580522,0.405569
3,3,3,EFO_0005537,ENSG00000258947,CHEMBL2597,0.402827
4,4,4,EFO_0005537,ENSG00000137267,CHEMBL3797012,0.399973
...,...,...,...,...,...,...
5012,5012,5012,EFO_0005537,ENSG00000135164,CHEMBL3707469,0.001478
5013,5013,5013,EFO_0005537,ENSG00000115665,CHEMBL4507,0.001478
5014,5014,5014,EFO_0005537,ENSG00000146109,CHEMBL4507,0.001478
5015,5015,5015,EFO_0005537,ENSG00000225830,CHEMBL4507,0.001478


# 2. Get compounds and assays

In [31]:
"""
insert code here that
1. takes a target as input
2. searches the database for compounds and assays for that target
3. displays a dataframe with at least these columns: target id, compound id, assay id
"""
# Imported here so this cell works on its own; `create_engine` and `url`
# come from the disease-to-target cell above.
from sqlalchemy import text

engine = create_engine(url, echo=False)
target_id = 'ENSG00000120217'

# Bind the target id as a query parameter rather than formatting it into the
# SQL string, so quoting is handled by the driver and the statement cannot be
# altered by the value of target_id.
target_query = text("SELECT * FROM target_to_compounds WHERE target_ensemble_id = :target")
target_to_compounds_df = pd.read_sql(target_query, con=engine, params={"target": target_id})

# Show only the columns the exercise asks for: target id, compound id, assay id.
display(target_to_compounds_df[['target_ensemble_id', 'compound_id', 'assay_id']])





Unnamed: 0,target_ensemble_id,compound_id,assay_id
0,ENSG00000120217,CHEMBL4776444,CHEMBL4775268
1,ENSG00000120217,CHEMBL4081869,CHEMBL4775269
2,ENSG00000120217,CHEMBL4776700,CHEMBL4775268
3,ENSG00000120217,CHEMBL4785255,CHEMBL4775268
4,ENSG00000120217,CHEMBL5171101,CHEMBL5096762
5,ENSG00000120217,CHEMBL4071326,CHEMBL4017391
6,ENSG00000120217,CHEMBL4084368,CHEMBL4017391
7,ENSG00000120217,CHEMBL4288470,CHEMBL4263391
8,ENSG00000120217,CHEMBL3582254,CHEMBL3583018
9,ENSG00000120217,CHEMBL4061613,CHEMBL4017391


# 3. Propose a new compound

In [None]:
"""
insert code here that
1. trains a model to predict ic50 values for a compound on a given target
2. downloads ALL the compounds in the database (independent of target)
3. runs the model to predict ic50 values for each compound in the database
4. displays a dataframe that has the following two columns : compound, predicted ic50
5. prints out the compound that has the highest ic50 value that has NOT been tested on the target already.
"""

# 4. Propose an assay

In [42]:
"""
insert code here that
1. retrieves the terms for all the assays that are relevant to the target the user picked.
2. clusters the assays according to their descriptive terms
3. plots the clusters (set n_clusters = 10)
4. prints out the title of one assay from each cluster.
"""

#!pip install flair
print("Finished Installing flair")

# Set disease_id variable for desired disease
disease_id = "EFO_0005537"

# GraphQL query for the first page (3 rows) of targets associated with a
# disease. The EFO id is declared as the $efoId variable and supplied through
# `variables` below, instead of being spliced into the query text.
query_string = """
query AssociatedTargets($efoId: String!) {
  disease(efoId: $efoId) {
    id
    name
    associatedTargets(page: { size: 3, index: 0 }) {
      rows {
        target {
          id
          approvedName
          dbXrefs{
              id
            }
        }
        score
      }
    }
  }
}
"""

# Set variables object of arguments to be passed to endpoint
variables = {"efoId": disease_id}

# Set base URL of GraphQL API endpoint
base_url = "https://api.platform.opentargets.org/api/v4/graphql"

# Perform POST request; the timeout keeps the cell from hanging forever if the
# endpoint is unreachable.
r = requests.post(base_url, json={"query": query_string, "variables": variables}, timeout=60)
print(r.status_code)

# Stop here on an HTTP error instead of failing later with a confusing
# KeyError/TypeError while digging through an error payload.
r.raise_for_status()

# Transform API response from JSON into Python dictionary
api_response = r.json()
print('done')
#print(api_response)








# Rows of the Open Targets response: one entry per associated target, each
# holding the target record (id, dbXrefs) and its association score.
target_info = api_response['data']['disease']['associatedTargets']['rows']
#print(target_info)

# List of (target id, ChEMBL id, association score) tuples, one per target
# that has a ChEMBL cross-reference.
target_id_list = []
for row in target_info:
    target_record = row['target']

    # Keep only the cross-references whose id contains "CHEMBL".
    chembl_xrefs = [
        xref['id']
        for xref in (target_record.get('dbXrefs') or [])
        if xref.get('id') and "CHEMBL" in xref['id']
    ]

    # A target without a ChEMBL id must be skipped explicitly. Previously the
    # id found for the *previous* target was silently reused here (or a
    # NameError was raised when this happened on the first row).
    if not chembl_xrefs:
        print("Skipping {}: no ChEMBL cross-reference".format(target_record.get('id')))
        continue

    # When several ChEMBL ids are listed, the last one is used, as before.
    chembl_id = chembl_xrefs[-1]
    target_id_list.append((target_record.get('id'), chembl_id, row['score']))
#print(target_id_list)







#from Disease_to_Everything
# ChEMBL web-resource handles. The loop below deliberately does not reuse the
# name `target`, so these handles stay usable after the cell has run.
target = new_client.target
activity = new_client.activity
assay = new_client.assay
document = new_client.document

# Column order of the per-target tables and of the combined table.
ACTIVITY_COLUMNS = [
    "target_ENSG", "compound_id", "smiles", "type", "standard_value",
    "assay_id", "assay_type", "assay_description", "document_ID", "abstract",
]

# Activity fields copied into the per-compound summary stored in targets_dict.
SLIM_KEYS = ('type', 'canonical_smiles', 'standard_value')

# targets_dict maps each target id to {compound id: slim activity dict}.
targets_dict = {}

# target_data_list holds one {'target_id': ..., 'data': DataFrame} per target.
target_data_list = []

# Per-target frames are collected here and concatenated once after the loop.
per_target_frames = []

# Abstracts already fetched, keyed by (document id, assay id), so each distinct
# pair costs one request instead of one request per activity row.
abstract_cache = {}

total_rows = 0

# For each target pulled from OpenTargets:
for target_entry in target_id_list:
    target_ensembl_id = target_entry[0]
    target_chembl_id = target_entry[1]
    print(target_ensembl_id)

    # Get compound activity data from ChEMBL for this target, including only
    # exact IC50 values. Sort by IC50 value.
    activities = activity.filter(
        target_chembl_id=target_chembl_id, standard_type="IC50", relation='='
    ).order_by('standard_value')

    compound_dictionary = {}
    records = []

    print(len(activities))

    for compound in activities:
        doc_id = compound['document_chembl_id']
        assay_id = compound['assay_chembl_id']

        cache_key = (doc_id, assay_id)
        if cache_key not in abstract_cache:
            abstract_hits = document.filter(
                document_chembl_id=doc_id, assay_chembl_id=assay_id
            ).only('abstract')
            # An empty result is stored as None rather than failing on [0].
            abstract_cache[cache_key] = abstract_hits[0]['abstract'] if len(abstract_hits) else None

        records.append({
            "target_ENSG": target_ensembl_id,
            "compound_id": compound['molecule_chembl_id'],
            "smiles": compound['canonical_smiles'],
            "type": compound['type'],
            "standard_value": compound['standard_value'],
            "assay_id": assay_id,
            "assay_type": compound['assay_type'],
            "assay_description": compound['assay_description'],
            "document_ID": doc_id,
            "abstract": abstract_cache[cache_key],
        })

        # A later activity for the same compound overwrites the earlier one.
        compound_dictionary[compound['molecule_chembl_id']] = {
            key: val for key, val in compound.items() if key in SLIM_KEYS
        }

    targets_dict[target_ensembl_id] = compound_dictionary

    # Passing `columns` keeps the schema even when a target has no activities.
    pandas_df = pd.DataFrame(records, columns=ACTIVITY_COLUMNS)
    per_target_frames.append(pandas_df)

    # Running total of rows gathered so far (the length of the combined table).
    total_rows += len(pandas_df)
    print(total_rows)

    #display(pandas_df)
    target_data_list.append({'target_id': target_ensembl_id, 'data': pandas_df})

# Concatenate once instead of growing a DataFrame inside the loop; fall back to
# an empty frame with the expected columns when there were no targets at all.
if per_target_frames:
    full_disease_df = pd.concat(per_target_frames, ignore_index=True)
else:
    full_disease_df = pd.DataFrame(columns=ACTIVITY_COLUMNS)




"""
import pickle

for target in target_data_list:
    print(target['target_id'])

    display(target['data'])
    f = open(target['target_id']+"dataframe.pickle", "wb")
    pickle.dump(target['data'], f)
    f.close()
    #display(target['data'])
"""
#end of stuff from Disease_to_Everything

df = full_disease_df
display(df)
chembl_aids = full_disease_df['assay_id']



#tagger = Classifier.load('bioner')
print("Finished setting tagger")

Finished Installing flair
200
done
ENSG00000184292
0
0
ENSG00000198900
456
456
ENSG00000120217
34
490


Unnamed: 0,target_ENSG,compound_id,smiles,type,standard_value,assay_id,assay_type,assay_description,document_ID,abstract
0,ENSG00000198900,CHEMBL3968023,C[n+]1ccc2c3c(nc4cc5c(cc42)OCO5)-c2ccccc2-c31....,IC50,0.45,CHEMBL3867357,B,Inhibition of human DNA topoisomerase 1 using ...,CHEMBL3865847,"Eupolauridine, an indenonaphthyridine alkaloid..."
1,ENSG00000198900,CHEMBL3914791,C[n+]1c2c3c(nc(-c4ccccc4)cc3c3ccccc31)-c1ccccc...,IC50,0.475,CHEMBL3867357,B,Inhibition of human DNA topoisomerase 1 using ...,CHEMBL3865847,"Eupolauridine, an indenonaphthyridine alkaloid..."
2,ENSG00000198900,CHEMBL3896867,[Br-].c1ccc(C[n+]2ccc3c4c(nc5cc6c(cc53)OCO6)-c...,IC50,0.52,CHEMBL3867357,B,Inhibition of human DNA topoisomerase 1 using ...,CHEMBL3865847,"Eupolauridine, an indenonaphthyridine alkaloid..."
3,ENSG00000198900,CHEMBL3889816,[Br-].c1ccc(C[n+]2ccc3c4c(nc5ccccc53)-c3ccccc3...,IC50,0.57,CHEMBL3867357,B,Inhibition of human DNA topoisomerase 1 using ...,CHEMBL3865847,"Eupolauridine, an indenonaphthyridine alkaloid..."
4,ENSG00000198900,CHEMBL3984603,Cc1cc2c3c(nc4ccccc42)-c2ccccc2-c3[n+]1C.[I-],IC50,0.7,CHEMBL3867357,B,Inhibition of human DNA topoisomerase 1 using ...,CHEMBL3865847,"Eupolauridine, an indenonaphthyridine alkaloid..."
...,...,...,...,...,...,...,...,...,...,...
485,ENSG00000120217,CHEMBL4791550,CC[C@H](C)[C@H](NC(=O)[C@H](C)NC(=O)CN)C(=O)N[...,IC50,5600.0,CHEMBL4688873,B,Antagonist activity at human PDL1 assessed as ...,CHEMBL4680328,Blocking the interaction of programmed cell de...
486,ENSG00000120217,CHEMBL4798338,CC[C@H](C)[C@H](NC(=O)[C@H](C)NC(=O)CNC(=O)[C@...,IC50,19200.0,CHEMBL4688873,B,Antagonist activity at human PDL1 assessed as ...,CHEMBL4680328,Blocking the interaction of programmed cell de...
487,ENSG00000120217,CHEMBL366760,O=C(/C=C/c1ccc(O)c(O)c1)O[C@H]1C[C@@](O)(C(=O)...,IC50,36560.0,CHEMBL4613347,B,Antagonist activity against PDL1 (unknown origin),CHEMBL4613205,Blockade the interaction of the programmed cel...
488,ENSG00000120217,CHEMBL4790749,CC[C@H](C)[C@H](NC(=O)[C@H](C)NC(=O)CNC(=O)[C@...,IC50,80700.0,CHEMBL4688873,B,Antagonist activity at human PDL1 assessed as ...,CHEMBL4680328,Blocking the interaction of programmed cell de...


Finished setting tagger


In [63]:
# Distinct assay ids from the combined activity table. Despite its name this
# is an array of ids, not a DataFrame.
# NOTE(review): a later cell iterates `unique_aids_list`, which is not defined
# in the visible cells — confirm whether this array is what it should use.
new_df = df['assay_id'].unique()

# Independent copy restricted to the assay id and its two free-text fields;
# rows are not de-duplicated, so there is still one row per activity.
df2 = df.loc[:, ['assay_id', 'assay_description', 'abstract']].copy()

display(df2)




Unnamed: 0,assay_id,assay_description,abstract
0,CHEMBL3867357,Inhibition of human DNA topoisomerase 1 using ...,"Eupolauridine, an indenonaphthyridine alkaloid..."
1,CHEMBL3867357,Inhibition of human DNA topoisomerase 1 using ...,"Eupolauridine, an indenonaphthyridine alkaloid..."
2,CHEMBL3867357,Inhibition of human DNA topoisomerase 1 using ...,"Eupolauridine, an indenonaphthyridine alkaloid..."
3,CHEMBL3867357,Inhibition of human DNA topoisomerase 1 using ...,"Eupolauridine, an indenonaphthyridine alkaloid..."
4,CHEMBL3867357,Inhibition of human DNA topoisomerase 1 using ...,"Eupolauridine, an indenonaphthyridine alkaloid..."
...,...,...,...
485,CHEMBL4688873,Antagonist activity at human PDL1 assessed as ...,Blocking the interaction of programmed cell de...
486,CHEMBL4688873,Antagonist activity at human PDL1 assessed as ...,Blocking the interaction of programmed cell de...
487,CHEMBL4613347,Antagonist activity against PDL1 (unknown origin),Blockade the interaction of the programmed cel...
488,CHEMBL4688873,Antagonist activity at human PDL1 assessed as ...,Blocking the interaction of programmed cell de...


assay_id
assay_description
abstract


In [None]:
for aid in unique_aids_list:
    descr = 