In [1]:
import pandas as pd
import numpy as np 
import re
import os
import requests
from bs4 import BeautifulSoup as bs
import pubchempy
from tqdm import tqdm
import spacy
nlp = spacy.load('en_core_web_sm')

# Question 1

### Load GPT-J outputs into a DataFrame

In [2]:
d = {'file': [],
     'prompt': [], 
     'response': []}

o_path = 'data4/GPT-J-Run-Output'
for output in os.listdir(o_path):
    f_path = os.path.join(o_path, output)
    with open(f_path) as o:
        o_str = o.read()
        for segment in o_str.split('Prompt: ')[1:]:
            prompt, response = segment.split('Response: ')
            prompt, response = prompt.strip(), response.strip()
            d['file'].append(output)
            d['prompt'].append(prompt)
            d['response'].append(response)
df = pd.DataFrame(d)

### Number of outputs in each file:

In [3]:
df.value_counts('file')

file
gene_product_output.txt                42767
names_patents_output.txt                4863
drug_inhibits_output.txt                2835
drug_mechanism_of_action_output.txt     2835
drug_prompt_output.txt                  2835
drug_targets_output.txt                 2835
names_aliases_output.txt                 409
dtype: int64

### Take a look at two prompts for each output type:

In [4]:
for file in df['file'].unique():
    df_p = df[df['file']==file]
    idx = np.random.randint(0, df_p.shape[0])
    print(f"{file}:")
    print("Prompt:", df_p.iat[idx, 1])
    print(df_p.iat[idx, 2])
    print('\n')


names_patents_output.txt:
Prompt: S3757 is a drug mentioned in patent
S3757 is a drug mentioned in patent document 2. It has been reported that S3757 is effective for the treatment of a neurological disorder, and is useful for the treatment of depression, attention deficit hyperactivity disorder, or neuropathic pain.
The drug S3757 is disclosed in, for example, patent document 1.

 
Enter prompt or quit: Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


names_aliases_output.txt:
Prompt: 16-Dehydroprogesterone is a drug also known as
16-Dehydroprogesterone is a drug also known as levonorgestrel and 17α-ethinyl-17α-hydroxyprogesterone. It is a synthetic progestin used in combination with an estrogen to prevent pregnancy. It is used for birth control and in the treatment of endometriosis.

It is available in the form of tablets under the brand name Depo-Provera and in injectable form under the brand name Depo-SubQ Provera.

Side effects

The most common side effect

### Patents
Check responses to patent related prompts with the USPTO results

In [5]:
def get_pto_results(term):
    cookies = {
        '_gid': 'GA1.2.385592734.1638654260',
        '_ga': 'GA1.1.1881375993.1638654260',
        '_4c_': 'lZLbjtowEIZfZeVrEnIwTswdPahCaiuk3a7Uq8ixB2IR4sgxSVnEu3cMCay6V81NZj7P%2FD7MfyZDBQ1ZxizN2YKxKM%2FjaEb2cOrI8kysVv7XkyURPIpyRkWgFnkZ0DyLA54JCChL5TaLE5YqIDPyx2tRRtNkwXmyyC8z0utJA7IyK2MOQbIQKqC8lEG5VTxQglNJWSITmk4a%2Fjx5etWYEedqZHnkP1SU7ah4JtLgtksS8zCOwxib3RumQcoijKHxu3Zuh%2FHRogSpnGu75Xw%2BDEN47Fpnwp3p561w0Lhu3oGwssLib6vi1%2FqL18X3SDO8ShrejkSTq3JrjTpKV7hT67cfoHzq1B4XFPRaQjFo5Srfz%2FLoQSvQu8p5HC2uuLWYJBgNulFm%2BLdtpPe2POdIN7fTYv4bsxcrFByE3XvwE8F6U3wXQ7ExtZanYt1szbjwCqhm35NVL3RdfK1BOmsaLYtP%2Bq14Pt2VGlcXK%2Bl0r52Gib5A5%2FTBNKfiuQWQ1X2htGbowN%2Foc2XNAZ44Q2r88g8hMbSwBWuvFe%2FnsDNmV0MozWGORZ12%2FkXvwxkROnKiwY22frZ%2B4rWRovY9aOTL3T0ZTTi6J4tH96B3J%2Fu09Wif%2BOG1LPLVjE1eo4%2Fq%2FkP1qM2yj9q3Rwig%2BZ%2B2y%2BUv',
        '_ga_CD30TTEK1F': 'GS1.1.1638656598.1.1.1638656609.0',
    }

    headers = {
        'Connection': 'keep-alive',
        'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="96", "Google Chrome";v="96"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"macOS"',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.55 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Dest': 'document',
        'Referer': 'https://patft.uspto.gov/netacgi/nph-Parser?Sect1=PTO2&Sect2=HITOFF&p=1&u=%2Fnetahtml%2FPTO%2Fsearch-adv.htm&r=0&f=S&l=50&d=PTXT&Query={}'.format(term),
        'Accept-Language': 'en-US,en;q=0.9,tr;q=0.8',
    }

    params = (
        ('Sect1', 'PTO2'),
        ('Sect2', 'HITOFF'),
        ('p', '1'),
        ('u', '/netahtml/PTO/search-adv.htm'),
        ('r', '1'),
        ('f', 'G'),
        ('l', '50'),
        ('d', 'PTXT'),
        ('S1', term),
        ('OS', term),
        ('RS', term),
    )

    response = requests.get('https://patft.uspto.gov/netacgi/nph-Parser', headers=headers, params=params, cookies=cookies)
    return response.text


In [6]:
patent_df = df[df['file']=='names_patents_output.txt']

In [8]:
small_patent_df = patent_df.sample(n=100, random_state=1)
iterrows = small_patent_df.iterrows()
preds = []
for i, row in tqdm(iterrows, total=100):
    try:
        drug = row[1].split(' ')[0]
        if len(drug) < 4:
            continue
        results = get_pto_results(drug)
        search_term = re.findall('is a drug mentioned in patent (\S+\s\S+)', row[2])[0].replace(' ', '').strip(' ./,')
        preds.append(search_term in results)
    except:
        continue

100%|██████████| 100/100 [02:53<00:00,  1.73s/it]


In [9]:
print(f"Accuracy of patent outputs: {sum(preds)/len(preds)}")

Accuracy of patent outputs: 0.0


### Name Aliases
Check PubChem synonyms

In [14]:
names_df = df[df.file=='names_aliases_output.txt']

Get drug synonyms from PubChemPy, count prediction as true if any of those synonyms appear in the response text:

In [17]:
preds = []
small_names_df = names_df.sample(frac=.5, random_state=1)
for i, row in tqdm(small_names_df.iterrows(), total=small_names_df.shape[0]): 
    drug = row[1].split(' ')[0]
    try:
        synonyms = pubchempy.get_synonyms(drug, 'name')[0]['Synonym']
    except IndexError:
        continue
    pred = False
    for s in synonyms:
        if s in row[2]:
            pred = True 
            break
    preds.append(pred)


100%|██████████| 204/204 [01:09<00:00,  2.95it/s]


In [19]:
print(f"Accuracy of name alias outputs: {sum(preds)/len(preds)}")

Accuracy of name alias outputs: 0.6299212598425197


### Rest of the prompts

In [20]:
already_looked_at = ['names_aliases_output.txt', 'names_patents_output.txt']
rest_df = df[~df.file.isin(already_looked_at)]

#### We can get PubChem descriptions of drugs as follows:

In [21]:
def get_description(drug_name): 
    cid = pubchempy.get_substances(identifier=drug_name, namespace='name')[0].cids[0]
    response = requests.get(f'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/{cid}/JSON')
    d = response.json()
    description = d['Record']['Section'][2]['Section'][0]['Information'][1]['Value']['StringWithMarkup'][0]['String']
    return description

For instance:

In [22]:
get_description('Zucapsaicin')

'Zucapsaicin, the cis-isomer of capsaicin, is a topical analgesic used to treat osteoarthritis of the knee and other neuropathic pain. It is a modulator of transient receptor potential cation channel subfamily V member 1 (TRPV-1), also known as the vanilloid or capsaicin receptor 1, that reduces pain and improves articular functions. Zucapsaicin has also been evaluated for the management of several conditions manifested by chronic nerve pain. These conditions include herpes simplex (HSV) infections, cluster headaches, migraine, and osteoarthritis of the knee. Zucapsaicin was approved by the Health Canada in 2010 as topical cream marketed under the brand name Zuacta but currently not FDA-approved.'

In [23]:
get_description('ZOCOR')

'Simvastatin, also known as the brand name product Zocor, is a lipid-lowering drug derived synthetically from a fermentation product of Aspergillus terreus. It belongs to the statin class of medications, which are used to lower the risk of cardiovascular disease and manage abnormal lipid levels by inhibiting the endogenous production of cholesterol in the liver. More specifically, statin medications competitively inhibit the enzyme hydroxymethylglutaryl-coenzyme A (HMG-CoA) Reductase, which catalyzes the conversion of HMG-CoA to mevalonic acid and is the third step in a sequence of metabolic reactions involved in the production of several compounds involved in lipid metabolism and transport including cholesterol, low-density lipoprotein (LDL) (sometimes referred to as "bad cholesterol"), and very low-density lipoprotein (VLDL). Prescribing of statin medications is considered standard practice following any cardiovascular events and for people with a moderate to high risk of development

In [24]:
get_description('VALSARTAN')

"Valsartan belongs to the angiotensin II receptor blocker (ARB) family of drugs, which also includes [telmisartan], [candesartan], [losartan], [olmesartan], and [irbesartan]. ARBs selectively bind to angiotensin receptor 1 (AT1) and prevent the protein angiotensin II from binding and exerting its hypertensive effects, which include vasoconstriction, stimulation and synthesis of aldosterone and ADH, cardiac stimulation, and renal reabsorption of sodium, among others. Overall, valsartan's physiologic effects lead to reduced blood pressure, lower aldosterone levels, reduced cardiac activity, and increased excretion of sodium.   Valsartan also affects the renin-angiotensin aldosterone system (RAAS), which plays an important role in hemostasis and regulation of kidney, vascular, and cardiac functions. Pharmacological blockade of RAAS via  AT1 receptor blockade inhibits negative regulatory feedback within RAAS, which is a contributing factor to the pathogenesis and progression of cardiovascu

I am going to compare responses with actual PubChem descriptions. I will use spaCy to find cosine similarities between two text. 

In [29]:
n_drugs = 30
targets_df = df[df.file=='drug_targets_output.txt']
drugs = np.array(list(set(targets_df.prompt.apply(lambda x: x.split(' ')[0]))))
idx = np.random.randint(0, len(drugs), n_drugs)
some_drugs = drugs[idx]
files = set(rest_df.file.unique())

similarities = {}
for drug in tqdm(some_drugs):
    try:
        description = nlp(get_description(drug))
    except (IndexError, KeyError) as e:
        continue
    drug_df = rest_df[rest_df.prompt.str.contains(drug)]
    for i, row in drug_df.iterrows():
        response = nlp(row[2])
        score = description.similarity(response)
        similarities[row[0]] = similarities.get(row[0], []) + [score]


  score = description.similarity(response)
100%|██████████| 30/30 [01:15<00:00,  2.51s/it]


In [39]:
for k, v in similarities.items():
    avg = sum(v) / len(v)
    print(f'mean prediction score for {k} is {avg}')

mean prediction score for drug_mechanism_of_action_output.txt is 0.7183346132702794
mean prediction score for drug_inhibits_output.txt is 0.7019262031595568
mean prediction score for drug_prompt_output.txt is 0.6867630164744416
mean prediction score for drug_targets_output.txt is 0.7003673931935903
mean prediction score for gene_product_output.txt is 0.723272126684177


# Question 2

In [57]:
n_drugs = 100
outputs_to_keep = ["drug_mechanism_of_action_output.txt", "drug_targets_output.txt", "drug_inhibits_output.txt"]
q2df = df[df.file.isin(outputs_to_keep)]
idx = np.random.randint(0, len(drugs), n_drugs)
some_drugs = drugs[idx]
print("Does the model predict same targets?")
for drug in some_drugs:
    try:
        drug_df = rest_df[rest_df.prompt.str.contains(drug)]
        try: 
            target_response = nlp(drug_df.loc[drug_df.file==outputs_to_keep[1], 'response'].iloc[0])
        except IndexError:
            continue
        similarities = []
        for r in drug_df.response:
            similarities.append(target_response.similarity(nlp(r)))
        similarities = np.array(similarities) > .8 # setting a similarity threshold of .8
        print(f"For {drug}: {all(similarities)}")
    except: 
        continue



Does the model predict same targets?
For Saccharin: True


  similarities.append(target_response.similarity(nlp(r)))


For Roxithromycin: False
For Linezolid: False
For Canagliflozin: False
For OC000459: True
For S3719: False
For S5385: False
For S9383: False
For Anisodamine: True
For L-Adrenaline: True
For S5159: False
For S5052: True
For Gestodene: False
For EXJADE: False
For S5034: True
For AMG319: False
For ENALAPRILAT: False
For Iloperidone: False
For S4686: False
For S9032: False
For 2-Naphthol: True


  drug_df = rest_df[rest_df.prompt.str.contains(drug)]


For Carbamyl-beta-methylcholine: False
For Ropinirole: True
For S4935: False
For Nizatidine: True
For Clevudine: False
For S4637: True
For Suxibuzone: False
For Diroximel: False
For proadifen: True
For Methenamine: True
For Clarithromycin: False
For S3948: False
For S4656: False
For pasiniazid: False
For XL-184: True
For Ketorolac: False
For LM-3093: False
For Dolasetron: True
For S4986: False
For Midodrine: False
For pimozide: False
For FELDENE: True
For Tinoridine: False
For AESCULIN: True
For S3657: True
For Methacycline: True
For Ornidazole: False
For pentetate: False
For Micafungin: False
For RANITIDINE: False
For S5457: False
For S5583: False
For Dexlansoprazole: False
For Abacavir: False
For CX-4945: False
For ACILAC: False
For Cinitapride: False
For Oxymatrine: False
For Uracil: False
For VX-745: True
For S5450: False
For Floxuridine: True
For MLN9708: False
For Raloxifene: False
For S5492: False
For S4680: False
For Emtricitabine: False
For S5007: False
For S3781: False
For Tr

# Question 3

In [62]:
gene_df = df[df.file=='gene_product_output.txt']

In [77]:
genes = gene_df.prompt.apply(lambda x: re.findall(r'The gene symbol\s(\S+)', x)[0])

Extract gene products from response text.

In [112]:
def get_gene_product(s):
    gp = re.findall(r'The gene product of \S+ is  ([^\.,]*)', s)
    if len(gp) > 0:
        return gp[0]
    else:
        return None

gene_products = gene_df.response.apply(get_gene_product)

In [115]:
gene_products_df = pd.DataFrame()
gene_products_df['gene'] = genes
gene_products_df['gene_product'] = gene_products

In [116]:
gene_products_df

Unnamed: 0,gene,gene_product
16612,symbol,
16613,A1BG,alpha-1-B glycoprotein
16614,A1BG-AS1,an antisense RNA that may regulate A1BG (alph...
16615,A1CF,a novel cytidine deaminase
16616,A2M,
...,...,...
59374,ZYG11B,a cytosolic protein that belongs to the GRAS f...
59375,ZYX,a ...
59376,ZYXP1,zyxin
59377,ZZEF1,a zinc finger protein with zinc finger ZZ-typ...


Take a look at few:

In [133]:
gene_products_df.iloc[:10, 0:2]

Unnamed: 0,gene,gene_product
16612,symbol,
16613,A1BG,alpha-1-B glycoprotein
16614,A1BG-AS1,an antisense RNA that may regulate A1BG (alph...
16615,A1CF,a novel cytidine deaminase
16616,A2M,
16617,A2M-AS1,
16618,A2ML1,
16619,A2ML1-AS1,A2ML1 antisense RNA 1
16620,A2ML1-AS2,
16621,A2MP1,alpha-2-macroglobulin


Check whether there are matching drugs for gene targets in the DataFrame. If so, add those drugs to the DataFrame under the column 'drug'.

In [161]:
targets_df['drug'] = targets_df.prompt.apply(lambda x: x.split(' ')[0])
associated_drugs = []
for gene in tqdm(genes): 
    gene_target_idx = targets_df.response.str.contains(gene)
    if gene_target_idx.sum() > 0:
        gene_target_df = targets_df[gene_target_idx]
        associated_drug = gene_target_df.reset_index(drop=True).iat[0, 3]
    else: 
        associated_drug = None
    associated_drugs.append(associated_drug)
gene_products_df['drug'] = associated_drugs

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  targets_df['drug'] = targets_df.prompt.apply(lambda x: x.split(' ')[0])
100%|██████████| 42767/42767 [02:12<00:00, 323.25it/s]


In [162]:
gene_products_df[(~gene_products_df.drug.isna()) & (~gene_products_df.gene_product.isna())]

Unnamed: 0,gene,gene_product,drug
16672,ABCB1,P-glycoprotein,AZD4547
16684,ABCC1,ABCC1 (ATP-binding cassette subfamily C member 1),AZD4547
16696,ABCC11,a membrane protein that functions as a drug tr...,S9142
16760,ABL1,a tyrosine kinase that is encoded by a retrovi...,Evobrutinib
16768,ABRA,a protein of the ADP-ribosylation factor family,S3811
...,...,...,...
57633,VIT,the human vitrin protein,Bepridil
58133,XCL1,the chemokine CXCL1,CX-4945
58137,XG,a high-molecular-weight glycoprotein found in ...,S3717
58140,XIAP,X-linked inhibitor of apoptosis protein (XIAP),MK-1775


# Question 4

In [197]:
found_drugs = gene_products_df[(~gene_products_df.drug.isna()) & (~gene_products_df.gene_product.isna())].drug

Checking the PTO to find the patents associated with drugs related to gene products. 

In [214]:
associated_patents = []
for drug in tqdm(found_drugs):
    try:
        patent_result = bs(get_pto_results(drug)).find('title').string
    except:
       patent_result = None
    associated_patents.append(patent_result)

100%|██████████| 477/477 [14:51<00:00,  1.87s/it]


In [216]:
associated_patents[:15]

['United States Patent: 11185592',
 'United States Patent: 11185592',
 'United States Patent: 8450246',
 'United States Patent: 11179397',
 'United States Patent: 11153401',
 'United States Patent: 11169145',
 'United States Patent: 11116737',
 'United States Patent: 10966935',
 'United States Patent: 11186584',
 'United States Patent: 11180471',
 'United States Patent: 11116737',
 'United States Patent: 11185552',
 'United States Patent: 11148856',
 'United States Patent: 6645480',
 'United States Patent: 11185584']

In [218]:
gene_products_df.loc[(~gene_products_df.drug.isna()) & (~gene_products_df.gene_product.isna()), 'patent'] = associated_patents

In [221]:
gene_products_df[~gene_products_df.patent.isna()].tail(10)

Unnamed: 0,gene,gene_product,drug,patent
57273,UNG,uracil DNA glycosylase,Uracil,United States Patent: 11189361
57501,UST,an enzyme which catalyzes the transfer of a su...,Ticagrelor,United States Patent: 11180471
57571,VDAC1,voltage dependent anion channel 1 (VDAC1),Isosorbide,United States Patent: 11186842
57593,VDR,vitamin D receptor (VDR),Doxercalciferol,United States Patent: 11185584
57621,VIM,a member of the intermediate filament (IF) fam...,AZACTAM,United States Patent: 11186829
57633,VIT,the human vitrin protein,Bepridil,United States Patent: 11185584
58133,XCL1,the chemokine CXCL1,CX-4945,United States Patent: 11174464
58137,XG,a high-molecular-weight glycoprotein found in ...,S3717,United States Patent: 11036585
58140,XIAP,X-linked inhibitor of apoptosis protein (XIAP),MK-1775,United States Patent: 11179412
58183,XRCC1,a protein involved in the repair of damaged DNA,S3721,United States Patent: 11046724
