In [None]:
import pandas as pd
import numpy as np 
import re
import os
import requests
from bs4 import BeautifulSoup as bs
import pubchempy
from tqdm import tqdm
import spacy
# nlp = spacy.load('en_core_web_sm')

### Load GPT-J outputs into a DataFrame

In [6]:
d = {'file': [],
     'prompt': [], 
     'response': []}

o_path = 'data/GPT-J-Run-Output'
for output in os.listdir(o_path):
    f_path = os.path.join(o_path, output)
    with open(f_path) as o:
        o_str = o.read()
        for segment in o_str.split('Prompt: ')[1:]:
            prompt, response = segment.split('Response: ')
            prompt, response = prompt.strip(), response.strip()
            d['file'].append(output)
            d['prompt'].append(prompt)
            d['response'].append(response)
df = pd.DataFrame(d)

### Number of outputs in each file:

In [7]:
df.value_counts('file')

file
gene_product_output.txt                42767
names_patents_output.txt                4863
drug_inhibits_output.txt                2835
drug_mechanism_of_action_output.txt     2835
drug_prompt_output.txt                  2835
drug_targets_output.txt                 2835
names_aliases_output.txt                 409
dtype: int64

### Take a look at two prompts for each output type:

In [8]:
for file in df['file'].unique():
    df_p = df[df['file']==file]
    idx = np.random.randint(0, df_p.shape[0])
    print(f"{file}:")
    print("Prompt:", df_p.iat[idx, 1])
    print(df_p.iat[idx, 2])
    print('\n')


names_patents_output.txt:
Prompt: GI-532967 is a drug mentioned in patent
GI-532967 is a drug mentioned in patent applications WO2008065190, WO2008065192, WO2008065195, WO2008065198, WO2008065199, WO2008065200, WO2008065201, WO2008065202, WO2008065203, WO2008065204, WO2008065205, WO2008065206, WO2008065207, WO2008065208, WO2008065209, WO2008065210, WO2008065211, WO2008065212, WO2008065213, WO2008065214, WO2008065215, WO2008065216, WO2008065217, WO2008065218, WO2008065219, WO2008065220, WO2008065221
 
Enter prompt or quit: Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


names_aliases_output.txt:
Prompt: 2-Ethyl-6-methylaniline is a drug also known as
2-Ethyl-6-methylaniline is a drug also known as N-ethyl-o-toluidine (NEM) and o-toluidine (OTL). It is used as a color developer in photography and as a herbicide.

References

Category:Anilines
Category:Anilides
 
Enter prompt or quit:


drug_mechanism_of_action_output.txt:
Prompt: The mechanism of action of Aprep

### Patents
Check responses to patent related prompts with the USPTO results

In [13]:
def get_pto_results(term):
    cookies = {
        '_gid': 'GA1.2.385592734.1638654260',
        '_ga': 'GA1.1.1881375993.1638654260',
        '_4c_': 'lZLbjtowEIZfZeVrEnIwTswdPahCaiuk3a7Uq8ixB2IR4sgxSVnEu3cMCay6V81NZj7P%2FD7MfyZDBQ1ZxizN2YKxKM%2FjaEb2cOrI8kysVv7XkyURPIpyRkWgFnkZ0DyLA54JCChL5TaLE5YqIDPyx2tRRtNkwXmyyC8z0utJA7IyK2MOQbIQKqC8lEG5VTxQglNJWSITmk4a%2Fjx5etWYEedqZHnkP1SU7ah4JtLgtksS8zCOwxib3RumQcoijKHxu3Zuh%2FHRogSpnGu75Xw%2BDEN47Fpnwp3p561w0Lhu3oGwssLib6vi1%2FqL18X3SDO8ShrejkSTq3JrjTpKV7hT67cfoHzq1B4XFPRaQjFo5Srfz%2FLoQSvQu8p5HC2uuLWYJBgNulFm%2BLdtpPe2POdIN7fTYv4bsxcrFByE3XvwE8F6U3wXQ7ExtZanYt1szbjwCqhm35NVL3RdfK1BOmsaLYtP%2Bq14Pt2VGlcXK%2Bl0r52Gib5A5%2FTBNKfiuQWQ1X2htGbowN%2Foc2XNAZ44Q2r88g8hMbSwBWuvFe%2FnsDNmV0MozWGORZ12%2FkXvwxkROnKiwY22frZ%2B4rWRovY9aOTL3T0ZTTi6J4tH96B3J%2Fu09Wif%2BOG1LPLVjE1eo4%2Fq%2FkP1qM2yj9q3Rwig%2BZ%2B2y%2BUv',
        '_ga_CD30TTEK1F': 'GS1.1.1638656598.1.1.1638656609.0',
    }

    headers = {
        'Connection': 'keep-alive',
        'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="96", "Google Chrome";v="96"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"macOS"',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.55 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Dest': 'document',
        'Referer': 'https://patft.uspto.gov/netacgi/nph-Parser?Sect1=PTO2&Sect2=HITOFF&p=1&u=%2Fnetahtml%2FPTO%2Fsearch-adv.htm&r=0&f=S&l=50&d=PTXT&Query={}'.format(term),
        'Accept-Language': 'en-US,en;q=0.9,tr;q=0.8',
    }

    params = (
        ('Sect1', 'PTO2'),
        ('Sect2', 'HITOFF'),
        ('p', '1'),
        ('u', '/netahtml/PTO/search-adv.htm'),
        ('r', '1'),
        ('f', 'G'),
        ('l', '50'),
        ('d', 'PTXT'),
        ('S1', term),
        ('OS', term),
        ('RS', term),
    )

    response = requests.get('https://patft.uspto.gov/netacgi/nph-Parser', headers=headers, params=params, cookies=cookies)
    return response.text


In [14]:
patent_df = df[df['file']=='names_patents_output.txt']

In [15]:
small_patent_df = patent_df.sample(n=100)
iterrows = small_patent_df.iterrows()
preds = []
for i, row in tqdm(iterrows, total=100):
    drug = row[1].split(' ')[0]
    results = get_pto_results(drug)
    search_term = re.findall('is a drug mentioned in patent (\S+\s\S+)', row[2])[0].replace(' ', '').strip(' ./,')
    preds.append(search_term in results)

  2%|▏         | 2/100 [00:02<02:24,  1.48s/it]


KeyboardInterrupt: 

In [16]:
print(f"Accuracy of patent outputs: {sum(preds)/len(preds)}")

Accuracy of patent outputs: 0.0


### Name Aliases
Check PubChem synonyms

In [17]:
names_df = df[df.file=='names_aliases_output.txt']

Get drug synonyms from PubChemPy, count prediction as true if any of those synonyms appear in the response text:

In [128]:
preds = []
small_names_df = names_df.sample(frac=.5, random_state=1)
for i, row in tqdm(small_names_df.iterrows(), total=small_names_df.shape[0]): 
    drug = row[1].split(' ')[0]
    try:
        synonyms = pubchempy.get_synonyms(drug, 'name')[0]['Synonym']
    except IndexError:
        continue
    pred = False
    for s in synonyms:
        if s in row[2]:
            pred = True 
            break
    preds.append(pred)


100%|██████████| 204/204 [01:07<00:00,  3.04it/s]


In [129]:
print(f"Accuracy of name alias outputs: {sum(preds)/len(preds)}")

Accuracy of name alias outputs: 0.6299212598425197


### Rest of the prompts

In [18]:
already_looked_at = ['names_aliases_output.txt', 'names_patents_output.txt']
rest_df = df[~df.file.isin(already_looked_at)]

#### We can get PubChem descriptions of drugs as follows:

In [19]:
def get_description(drug_name): 
    cid = pubchempy.get_substances(identifier=drug_name, namespace='name')[0].cids[0]
    response = requests.get(f'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/{cid}/JSON')
    d = response.json()
    description = d['Record']['Section'][2]['Section'][0]['Information'][1]['Value']['StringWithMarkup'][0]['String']
    return description

For instance:

In [20]:
get_description('Zucapsaicin')

'Zucapsaicin, the cis-isomer of capsaicin, is a topical analgesic used to treat osteoarthritis of the knee and other neuropathic pain. It is a modulator of transient receptor potential cation channel subfamily V member 1 (TRPV-1), also known as the vanilloid or capsaicin receptor 1, that reduces pain and improves articular functions. Zucapsaicin has also been evaluated for the management of several conditions manifested by chronic nerve pain. These conditions include herpes simplex (HSV) infections, cluster headaches, migraine, and osteoarthritis of the knee. Zucapsaicin was approved by the Health Canada in 2010 as topical cream marketed under the brand name Zuacta but currently not FDA-approved.'

In [21]:
get_description('ZOCOR')

'Simvastatin, also known as the brand name product Zocor, is a lipid-lowering drug derived synthetically from a fermentation product of Aspergillus terreus. It belongs to the statin class of medications, which are used to lower the risk of cardiovascular disease and manage abnormal lipid levels by inhibiting the endogenous production of cholesterol in the liver. More specifically, statin medications competitively inhibit the enzyme hydroxymethylglutaryl-coenzyme A (HMG-CoA) Reductase, which catalyzes the conversion of HMG-CoA to mevalonic acid and is the third step in a sequence of metabolic reactions involved in the production of several compounds involved in lipid metabolism and transport including cholesterol, low-density lipoprotein (LDL) (sometimes referred to as "bad cholesterol"), and very low-density lipoprotein (VLDL). Prescribing of statin medications is considered standard practice following any cardiovascular events and for people with a moderate to high risk of development

In [22]:
get_description('VALSARTAN')

"Valsartan belongs to the angiotensin II receptor blocker (ARB) family of drugs, which also includes [telmisartan], [candesartan], [losartan], [olmesartan], and [irbesartan]. ARBs selectively bind to angiotensin receptor 1 (AT1) and prevent the protein angiotensin II from binding and exerting its hypertensive effects, which include vasoconstriction, stimulation and synthesis of aldosterone and ADH, cardiac stimulation, and renal reabsorption of sodium, among others. Overall, valsartan's physiologic effects lead to reduced blood pressure, lower aldosterone levels, reduced cardiac activity, and increased excretion of sodium.   Valsartan also affects the renin-angiotensin aldosterone system (RAAS), which plays an important role in hemostasis and regulation of kidney, vascular, and cardiac functions. Pharmacological blockade of RAAS via  AT1 receptor blockade inhibits negative regulatory feedback within RAAS, which is a contributing factor to the pathogenesis and progression of cardiovascu

I am going to compare responses with actual PubChem descriptions. I will use spaCy to find cosine similarities between two text. 

In [3]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [1]:
targets_df = df[df.file=='drug_targets_output.txt']
drugs = np.array(list(set(targets_df.prompt.apply(lambda x: x.split(' ')[0]))))
idx = np.random.randint(0, len(drugs), 30)
some_drugs = drugs[idx]

similarities = {}
for drug in drugs:
    description = nlp(get_description(drug))
    drug_df = rest_df[rest_df.prompt.str.contains(drug)]
    for i, row in drug_df.iterrows():
        response = nlp(row[2])
        score = description.similarity(response)
        similarities[row[0]] = similarities.get(row[0], []) + [score]


NameError: name 'df' is not defined

In [55]:
dfs[0]

Unnamed: 0,file,prompt,response
6287,drug_mechanism_of_action_output.txt,The mechanism of action of Ivabradine is,The mechanism of action of Ivabradine is not f...
9122,drug_inhibits_output.txt,Ivabradineis a drug that inhibits,Ivabradineis a drug that inhibits sinus node f...
11957,drug_prompt_output.txt,Ivabradine is a drug that,Ivabradine is a drug that has been used in the...
14792,drug_targets_output.txt,Ivabradine is a drug that targets,Ivabradine is a drug that targets the heart an...
