In [19]:
import pandas as pd
import numpy as np
from Bio import Entrez
from Bio import Medline
from Bio import SeqIO
import random
import requests
import re
from tqdm import tqdm
import nltk
import time
from itertools import cycle, islice
from xml.etree import ElementTree
import xmltodict
import openai

In [20]:
def parse_document(document_ID):
    query = str(document_ID)
    def search(TERM):
        Entrez.email = 'random@example.com'
        handle = Entrez.esearch(db='pubmed', sort='relevance', retmax=1, retmode='text', term=TERM)
        result = Entrez.read(handle)
        ids = result['IdList']
        handle = Entrez.efetch(db='pubmed', sort='relevance', retmode='text', rettype='medline', id=ids)
        records = Medline.parse(handle)
        return records


    for record in search(query):
        title = record.get('TI','')
        abstract = record.get('AB','')
        ID = record.get('PMID','')
        mesh = record.get('MH','')
        other_terms = record.get('OT','')

    document = title + ' ' + abstract
    with open('document.txt','w') as f:
        f.write(document)
    return ID, title, abstract, document

In [21]:
def pubtator(document_ID):
    FORMAT = 'biocxml'
    TYPE = "pmids"
    BIOCONCEPTS = 'gene,disease'
    url = f"https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/{FORMAT}?{TYPE}={document_ID}&concepts={BIOCONCEPTS}"
    response = requests.get(url)
    time.sleep(1.5)
    doc  = ElementTree.fromstring(response.content)
    tree = ElementTree.ElementTree(doc)
    tree.write('content.xml',encoding = 'utf-8')
    root = tree.getroot()
    doc = root[3]
    passage = doc[1:]
    TEXT = []
    ELEMENT = []
    IDENTIFIER = []
    TYPE = []
    for i in passage:
        for text in i.iterfind('text'):
            text = text.text
            TEXT.append(text)
        for annotation in i.iterfind('annotation'):
            for text in annotation.iterfind('text'):
                element = text.text
                ELEMENT.append(element)
            infos = annotation.findall('infon')
            try: 
                identifier = infos[0].text
            except:
                identifier = ''
            IDENTIFIER.append(identifier.replace('MESH:',''))
            try:
                typex = infos[1].text
            except:
                typex = infos[0].text
            TYPE.append(typex)
    df = pd.DataFrame(data=zip(ELEMENT,IDENTIFIER,TYPE), columns=['element','identifier','type'])
    df.identifier = df.identifier.replace('Disease',np.nan)
    df = df.drop_duplicates('identifier')
    disease_df = df[df.type == 'Disease']
    gene_df = df[df.type == 'Gene']
    return gene_df, disease_df

In [22]:
def pubtator_offline(document_ID):
    with open('../DATI ABSTRACT/filtered_data/anns.txt','r') as f:
        data = f.read().split('\n')
        data = [i for i in data if i]
        data = [i for i in data if document_ID in i]
        ELEMENT = []
        IDENTIFIER = []
        TYPE = []
        for i in range(len(data)):
            x = data[i].split('\t')
            ELEMENT.append(x[3])
            IDENTIFIER.append(x[5])
            TYPE.append(x[4])
    df = pd.DataFrame(data=zip(ELEMENT,IDENTIFIER,TYPE), columns=['element','identifier','type'])
    df.identifier = df.identifier.replace('Disease',np.nan)
    df = df.drop_duplicates('identifier')
    disease_df = df[df.type == 'Disease']
    gene_df = df[df.type == 'Gene']
    return gene_df, disease_df

In [23]:
def get_pairs(gene_df, disease_df):
    a = list(gene_df[['element', 'identifier']].itertuples(index=False, name=None))
    b = list(disease_df[['element','identifier']].itertuples(index=False, name=None))
    pairs = list(set([(i, j) for i in a for j in b]))
    return pairs

In [24]:
def check_associations_openai(document, document_id, pairs):
    auth_params = {"email": "salvatoredanilopalumbo@gmail.com","password" : "PVFXYXpkV1p1VkdkeUVETTFrak4wVm1ibGQyY3BSR1JCNVVTTTkwVE1sa1RCUkVacE4zWmw1V1owWlRPMUFUTXlRWFp1VjJaemxHWg=="}
    api_host = "https://www.disgenet.org/api"
    VOCABULARY = 'mesh'
    SOURCE = 'CURATED'
    
    openai.api_key = "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"
    
    # engine = "text-davinci-001"
    engine = 'text-curie-001'

    
    temperature = 0
    max_tokens = 500
    top_p = 1
    frequency_penalty = 0
    presence_penalty = 0
    
    
    DATA = pd.DataFrame(columns = ['geneId','diseaseId'])
    gene_id = []
    disease_id = []
    disease_umls = []
    
    for j in pairs:
        prompt = f"According to this text:\n\n{document}\n\nIs {j[0][0]} associated with {j[1][0]}?"
        response = openai.Completion.create(engine=engine,prompt=prompt,temperature=temperature,max_tokens=max_tokens,top_p=top_p,frequency_penalty=frequency_penalty,
                    presence_penalty=presence_penalty)
        time.sleep(1.5)
        print(prompt)
        for i in response['choices']:
            p = str(i['text'])
            p = p.replace('\n','')
            print(p)
            if p.startswith('Yes'):
                gene_id.append(j[0][1])
                disease_id.append(j[1][1])
                # print(f'{j[0][0]} - {j[0][1]}')
                # print(f'{j[1][0]} - {j[1][1]}')
    DATA.geneId = gene_id
    DATA.diseaseId = disease_id
    return DATA

In [30]:
x = '11578815'
RESULT = pd.DataFrame(columns = ['geneId','diseaseId'])
ID, title, abstract, document = parse_document(x)
gene_df, disease_df = pubtator(ID)
pairs = get_pairs(gene_df, disease_df)
result = check_associations_openai(document, ID, pairs)
RESULT = RESULT.append(result, ignore_index = True)

According to this text:

Strong association of a novel Tau promoter haplotype in progressive supranuclear palsy. The microtubule associated protein, tau, is found in fibrillar lesions that characterise progressive supranuclear palsy (PSP) and related tauopathies. Mutations in the tau gene in frontotemporal dementia with parkinsonism linked to chromosome 17 (FTDP-17) and genetic association of the H1 haplotype of the tau gene with PSP has firmly established a direct role for tau in disease pathogenesis. However, the functional significance of the tau genetic association in PSP is unknown. We analysed the tau gene promoter sequence and identified two novel single nucleotide polymorphisms. Here we report the genetic association of a novel tau promoter haplotype with PSP which may influence tau transcription.

Is microtubule associated protein associated with supranuclear palsy?
Yes, the microtubule associated protein, tau, is found in fibrillar lesions that characterise progressive supran

According to this text:

Strong association of a novel Tau promoter haplotype in progressive supranuclear palsy. The microtubule associated protein, tau, is found in fibrillar lesions that characterise progressive supranuclear palsy (PSP) and related tauopathies. Mutations in the tau gene in frontotemporal dementia with parkinsonism linked to chromosome 17 (FTDP-17) and genetic association of the H1 haplotype of the tau gene with PSP has firmly established a direct role for tau in disease pathogenesis. However, the functional significance of the tau genetic association in PSP is unknown. We analysed the tau gene promoter sequence and identified two novel single nucleotide polymorphisms. Here we report the genetic association of a novel tau promoter haplotype with PSP which may influence tau transcription.

Is microtubule associated protein associated with tauopathies?
Yes, the microtubule associated protein, tau, is found in fibrillar lesions that characterise progressive supranuclear 

In [31]:
gene_df

Unnamed: 0,element,identifier,type
0,Tau,4137,Gene
2,microtubule associated protein,51115,Gene


In [37]:
print(gene_df.to_latex(index=False))

\begin{tabular}{lll}
\toprule
                       element & identifier & type \\
\midrule
                           Tau &       4137 & Gene \\
microtubule associated protein &      51115 & Gene \\
\bottomrule
\end{tabular}



In [38]:
disease_df

Unnamed: 0,element,identifier,type
1,supranuclear palsy,D013494,Disease
5,PSP,D011030,Disease
6,tauopathies,D024801,Disease
8,dementia,D003704,Disease
9,parkinsonism,D010302,Disease


In [39]:
print(disease_df.to_latex(index=False))

\begin{tabular}{lll}
\toprule
           element & identifier &    type \\
\midrule
supranuclear palsy &    D013494 & Disease \\
               PSP &    D011030 & Disease \\
       tauopathies &    D024801 & Disease \\
          dementia &    D003704 & Disease \\
      parkinsonism &    D010302 & Disease \\
\bottomrule
\end{tabular}



In [33]:
for i in pairs:
    print(i)

(('microtubule associated protein', '51115'), ('supranuclear palsy', 'D013494'))
(('Tau', '4137'), ('supranuclear palsy', 'D013494'))
(('microtubule associated protein', '51115'), ('dementia', 'D003704'))
(('Tau', '4137'), ('dementia', 'D003704'))
(('microtubule associated protein', '51115'), ('parkinsonism', 'D010302'))
(('Tau', '4137'), ('parkinsonism', 'D010302'))
(('microtubule associated protein', '51115'), ('PSP', 'D011030'))
(('Tau', '4137'), ('PSP', 'D011030'))
(('Tau', '4137'), ('tauopathies', 'D024801'))
(('microtubule associated protein', '51115'), ('tauopathies', 'D024801'))


In [40]:
RESULT

Unnamed: 0,geneId,diseaseId
0,51115,D013494
1,4137,D013494
2,51115,D003704
3,4137,D003704
4,51115,D010302
5,4137,D010302
6,51115,D011030
7,4137,D011030
8,4137,D024801
9,51115,D024801


In [41]:
print(RESULT.to_latex(index=False))

\begin{tabular}{ll}
\toprule
geneId & diseaseId \\
\midrule
 51115 &   D013494 \\
  4137 &   D013494 \\
 51115 &   D003704 \\
  4137 &   D003704 \\
 51115 &   D010302 \\
  4137 &   D010302 \\
 51115 &   D011030 \\
  4137 &   D011030 \\
  4137 &   D024801 \\
 51115 &   D024801 \\
\bottomrule
\end{tabular}



In [19]:
input_document = 'documents_in_filtered_data'
documents_input = pd.read_csv(f'../{input_document}.csv')
LIST_ID = list(set([str(i) for i in documents_input.pmid.tolist()]))
len(LIST_ID)

100

In [20]:
COUNTER = 4
RESULT = pd.DataFrame(columns = ['geneId','diseaseId','diseaseUMLS'])
for x in tqdm(LIST_ID):
    ID, title, abstract, document = parse_document(x)
    # gene_df, disease_df = pubtator(ID)
    gene_df, disease_df = pubtator_offline(ID)
    pairs = get_pairs(gene_df, disease_df)
    result = check_associations_openai(document, ID, pairs)
    RESULT = RESULT.append(result, ignore_index = True)
RESULT.geneId = RESULT.geneId.astype(np.int64)

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [21:18<00:00, 12.79s/it]


ValueError: invalid literal for int() with base 10: '4681|100532736'

In [21]:
RESULT.to_csv(f'openai_results_no_conversion_{COUNTER}_{input_document}.csv',index=False)

In [None]:
auth_params = {"email": "salvatoredanilopalumbo@gmail.com","password" : "PVFXYXpkV1p1VkdkeUVETTFrak4wVm1ibGQyY3BSR1JCNVVTTTkwVE1sa1RCUkVacE4zWmw1V1owWlRPMUFUTXlRWFp1VjJaemxHWg=="}
api_host = "https://www.disgenet.org/api"
VOCABULARY = 'mesh'
DISEASE_UMLS = []

s = requests.Session()
r = s.post(api_host+'/auth/', data=auth_params)
json_response = r.json()
api_key = json_response.get("token")


for i in tqdm(RESULT.diseaseId):
    if api_key:
        s.headers.update({"Authorization": "Bearer %s" % api_key})
        try:
            gda_response = s.get(f"{api_host}/gda/disease/{VOCABULARY}/{i}").json()
            DISEASE_UMLS.append(list(set([i['diseaseid'] for i in gda_response])))
        except:
            DISEASE_UMLS.append(np.nan)
if s:
    s.close()
        
RESULT['diseaseUMLS'] = DISEASE_UMLS
RESULT.dropna(inplace=True)

In [None]:
RESULT

In [None]:
RESULT.to_csv(f'openai_results_{COUNTER}_unexploded_{input_document}.csv',index=False)

In [None]:
RESULT = RESULT.explode('diseaseUMLS')

In [None]:
RESULT

In [None]:
RESULT.to_csv(f'openai_results_{COUNTER}_{input_document}.csv',index=False)