In [1]:
import pandas as pd
import numpy as np
from Bio import Entrez
from Bio import Medline
from Bio import SeqIO
import random
import requests
import re
from tqdm import tqdm
import nltk
import time
from itertools import cycle, islice
from xml.etree import ElementTree
import xmltodict

In [2]:
def parse_document(document_ID):
    query = str(document_ID)
    def search(TERM):
        Entrez.email = 'random@example.com'
        handle = Entrez.esearch(db='pubmed', sort='relevance', retmax=1, retmode='text', term=TERM)
        result = Entrez.read(handle)
        ids = result['IdList']
        handle = Entrez.efetch(db='pubmed', sort='relevance', retmode='text', rettype='medline', id=ids)
        records = Medline.parse(handle)
        return records


    for record in search(query):
        title = record.get('TI','')
        abstract = record.get('AB','')
        ID = record.get('PMID','')
        mesh = record.get('MH','')
        other_terms = record.get('OT','')

    document = title + ' ' + abstract
    with open('document.txt','w') as f:
        f.write(document)
    return ID, title, abstract, document

In [3]:
def pubtator(document_ID):
    FORMAT = 'biocxml'
    TYPE = "pmids"
    BIOCONCEPTS = 'gene,disease'
    url = f"https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/{FORMAT}?{TYPE}={document_ID}&concepts={BIOCONCEPTS}"
    response = requests.get(url)
    time.sleep(1.5)
    doc  = ElementTree.fromstring(response.content)
    tree = ElementTree.ElementTree(doc)
    tree.write('content.xml',encoding = 'utf-8')
    root = tree.getroot()
    doc = root[3]
    passage = doc[1:]
    TEXT = []
    ELEMENT = []
    IDENTIFIER = []
    TYPE = []
    for i in passage:
        for text in i.iterfind('text'):
            text = text.text
            TEXT.append(text)
        for annotation in i.iterfind('annotation'):
            for text in annotation.iterfind('text'):
                element = text.text
                ELEMENT.append(element)
            infos = annotation.findall('infon')
            try: 
                identifier = infos[0].text
            except:
                identifier = ''
            IDENTIFIER.append(identifier.replace('MESH:',''))
            try:
                typex = infos[1].text
            except:
                typex = infos[0].text
            TYPE.append(typex)
    df = pd.DataFrame(data=zip(ELEMENT,IDENTIFIER,TYPE), columns=['element','identifier','type'])
    df.identifier = df.identifier.replace('Disease',np.nan)
    df = df.drop_duplicates('identifier')
    disease_df = df[df.type == 'Disease']
    gene_df = df[df.type == 'Gene']
    return gene_df, disease_df

In [4]:
def pubtator_offline(document_ID):
    with open('../DATI ABSTRACT/filtered_data/anns.txt','r') as f:
        data = f.read().split('\n')
        data = [i for i in data if i]
        data = [i for i in data if document_ID in i]
        ELEMENT = []
        IDENTIFIER = []
        TYPE = []
        for i in range(len(data)):
            x = data[i].split('\t')
            ELEMENT.append(x[3])
            IDENTIFIER.append(x[5])
            TYPE.append(x[4])
    df = pd.DataFrame(data=zip(ELEMENT,IDENTIFIER,TYPE), columns=['element','identifier','type'])
    df.identifier = df.identifier.replace('Disease',np.nan)
    df = df.drop_duplicates('identifier')
    disease_df = df[df.type == 'Disease']
    gene_df = df[df.type == 'Gene']
    return gene_df, disease_df

In [5]:
def process_text(document, gene_df, disease_df):
    document = re.sub(r'[()]', '', document)
    document = re.sub(r'[\[\]]', '', document)
    document = document.replace('-->','')
    document = document.replace('<--','')
    document = document.replace('->','')
    document = document.replace('<-','')

    sentence = nltk.sent_tokenize(document)
    genes_discovered = gene_df.element.tolist()
    diseases_discovered = disease_df.element.tolist()

    genes_id = gene_df.identifier.tolist()
    diseases_id = disease_df.identifier.tolist()


    phrases = []
    for i in sentence:
        for j in genes_discovered:
            for k in diseases_discovered:
                if (str(j) in i and str(k) in i) and ('association' in i or 'is associated' in i or 'is related' in i):
                    phrases.append(i)
    phrases = list(set(phrases))
  
    DATA = pd.DataFrame(columns = ['geneId','diseaseId'])
    for i in phrases:
        genes = [j for j in genes_discovered if j in i]
        diseases = [j for j in diseases_discovered if j in i]
        gene_id = [gene_df['identifier'][gene_df['element'] == i].values[0] for i in genes]
        disease_id = [disease_df['identifier'][disease_df['element'] == i].values[0] for i in diseases]
        max_value = max(len(gene_id),len(disease_id))
        DATA = DATA.append({
            'geneId' :  list(islice(cycle(gene_id), max_value)),
            'diseaseId'  : list(islice(cycle(disease_id), max_value)),
        },ignore_index = True)
    DATA = DATA.explode(['geneId','diseaseId'])
    DATA = DATA.drop_duplicates()
    return DATA

In [6]:
x = '11578815'
RESULT = pd.DataFrame(columns = ['geneId','diseaseId'])
ID, title, abstract, document = parse_document(x)
gene_df, disease_df = pubtator(ID)
result = process_text(document, gene_df, disease_df)
RESULT = RESULT.append(result, ignore_index = True)

In [7]:
gene_df

Unnamed: 0,element,identifier,type
0,Tau,4137,Gene
2,microtubule associated protein,51115,Gene


In [8]:
print(gene_df.to_latex(index=False))

\begin{tabular}{lll}
\toprule
                       element & identifier & type \\
\midrule
                           Tau &       4137 & Gene \\
microtubule associated protein &      51115 & Gene \\
\bottomrule
\end{tabular}



In [9]:
disease_df

Unnamed: 0,element,identifier,type
1,supranuclear palsy,D013494,Disease
6,tauopathies,D024801,Disease
8,dementia,D003704,Disease
9,parkinsonism,D010302,Disease


In [10]:
print(disease_df.to_latex(index=False))

\begin{tabular}{lll}
\toprule
           element & identifier &    type \\
\midrule
supranuclear palsy &    D013494 & Disease \\
       tauopathies &    D024801 & Disease \\
          dementia &    D003704 & Disease \\
      parkinsonism &    D010302 & Disease \\
\bottomrule
\end{tabular}



In [11]:
RESULT

Unnamed: 0,geneId,diseaseId
0,4137,D013494


In [12]:
print(RESULT.to_latex(index=False))

\begin{tabular}{ll}
\toprule
geneId & diseaseId \\
\midrule
  4137 &   D013494 \\
\bottomrule
\end{tabular}



In [13]:
input_document = 'documents_in_filtered_data'
documents_input = pd.read_csv(f'../{input_document}.csv')
LIST_ID = list(set([str(i) for i in documents_input.pmid.tolist()]))
len(LIST_ID)

500

In [14]:
RESULT = pd.DataFrame(columns = ['geneId','diseaseId','diseaseUMLS'])
for x in tqdm(LIST_ID):
    ID, title, abstract, document = parse_document(x)
    # gene_df, disease_df = pubtator(ID)
    gene_df, disease_df = pubtator_offline(ID)
    result = process_text(document, gene_df, disease_df)
    RESULT = RESULT.append(result, ignore_index = True)
RESULT.geneId = RESULT.geneId.astype(np.int64)

100%|████████████████████████████████████████████████████████████████████████████████| 500/500 [13:55<00:00,  1.67s/it]


In [15]:
RESULT.to_csv(f'naive_results_no_conversion_{input_document}.csv',index=False)

In [16]:
auth_params = {"email": "salvatoredanilopalumbo@gmail.com","password" : "PVFXYXpkV1p1VkdkeUVETTFrak4wVm1ibGQyY3BSR1JCNVVTTTkwVE1sa1RCUkVacE4zWmw1V1owWlRPMUFUTXlRWFp1VjJaemxHWg=="}
api_host = "https://www.disgenet.org/api"
VOCABULARY = 'mesh'
DISEASE_UMLS = []

s = requests.Session()
r = s.post(api_host+'/auth/', data=auth_params)
json_response = r.json()
api_key = json_response.get("token")


for i in tqdm(RESULT.diseaseId):
    if api_key:
        s.headers.update({"Authorization": "Bearer %s" % api_key})
        try:
            gda_response = s.get(f"{api_host}/gda/disease/{VOCABULARY}/{i}").json()
            DISEASE_UMLS.append(list(set([i['diseaseid'] for i in gda_response])))
        except:
            DISEASE_UMLS.append(np.nan)
if s:
    s.close()
        
RESULT['diseaseUMLS'] = DISEASE_UMLS
RESULT.dropna(inplace=True)

100%|████████████████████████████████████████████████████████████████████████████████| 227/227 [37:35<00:00,  9.94s/it]


In [17]:
RESULT

Unnamed: 0,geneId,diseaseId,diseaseUMLS
0,4313,D054160,[C0232257]
1,2100,D014647,[C0042344]
2,2099,D014647,[C0042344]
3,2153,D054556,[C1861172]
4,2900,D012559,[C0036341]
...,...,...,...
222,2944,D011471,"[C0033578, C0376358]"
223,1543,D009369,"[C0086692, C0027651, C0006826]"
224,5309,D010300,[C0030567]
225,3586,D002006,"[C2231324, C0006309]"


In [18]:
RESULT.to_csv(f'naive_results_unexploded_{input_document}.csv',index=False)

In [19]:
RESULT = RESULT.explode('diseaseUMLS')

In [20]:
RESULT

Unnamed: 0,geneId,diseaseId,diseaseUMLS
0,4313,D054160,C0232257
1,2100,D014647,C0042344
2,2099,D014647,C0042344
3,2153,D054556,C1861172
4,2900,D012559,C0036341
...,...,...,...
224,5309,D010300,C0030567
225,3586,D002006,C2231324
225,3586,D002006,C0006309
226,3569,D002006,C2231324


In [21]:
RESULT.to_csv(f'naive_results_{input_document}.csv',index=False)