# Extract Demographic data from AACT

In [None]:
# Mount Google Drive at /content/gdrive; force_remount=True re-mounts even if a
# mount is already present.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [None]:
import pandas as pd
import numpy as np
import sys
import gc
import time
import itertools
import functools
import operator
import json
import psycopg2
import pickle
import os
import requests

In [None]:
# Open a connection and cursor on the AACT PostgreSQL database.
# Credentials are read from the AACT_USER / AACT_PASSWORD environment variables
# so real values never need to be typed into (or saved with) the notebook; the
# original placeholders remain as the fallback.
conn_acct = psycopg2.connect(
    host="aact-db.ctti-clinicaltrials.org",
    database="aact",
    user=os.environ.get("AACT_USER", "xxxx"),
    password=os.environ.get("AACT_PASSWORD", "xxxx"))
cur_acct = conn_acct.cursor()

## Get the age segmentation for each NCT ID

In [None]:
# Age query: per trial (nct_id) and age category, sum the reported baseline
# values (param_value_num).
#   - inner query: baseline_measurements joined to result_groups on trial and
#     group code, excluding groups whose title is 'total'
#     (presumably to avoid double counting -- confirm);
#     category is lower(category || classification).
#   - outer filter: studies that are 'Completed', first submitted before
#     2022-01-01, param_type 'Count of Participants' or 'Number', and a title
#     matching 'Age, Categorical' or 'Age, Customized' (case-insensitive regex).
sql = """
select b.nct_id, b.category as cat, sum(b.param_value_num) as total_participants
from
(select bm.nct_id, lower(concat(bm.category, bm.classification)) as category, bm.param_value_num, bm.title, bm.param_type, bm.ctgov_group_code
from baseline_measurements as bm join result_groups as r
on bm.nct_id = r.nct_id and bm.ctgov_group_code = r.ctgov_group_code
and lower(r.title) != 'total') as b
join studies as s on b.nct_id = s.nct_id
where s.overall_status = 'Completed' 
and s.study_first_submitted_date < '2022-01-01'
and (b.param_type = 'Count of Participants' or b.param_type = 'Number')
and (b.title ~* 'Age, Categorical' or b.title ~* 'Age, Customized')
-- and nct_id = 'NCT03086369'
group by b.nct_id, cat
order by b.nct_id;
"""

In [None]:
# Run the age query and load the rows into a DataFrame.
# Column names are assigned here, at construction, so that the frame can be
# used by name straight away (the distinct-count cell below reads `.nctid`)
# and so that an empty result set still yields the three expected columns.
cur_acct.execute(sql)
trial_with_age_info = cur_acct.fetchall()
table_trial_age = pd.DataFrame(trial_with_age_info, columns=["nctid", "age", "count"])

In [None]:
# Number of distinct trials with age data. The NCT id is the first selected
# column; positional access is used because the column names are only assigned
# in the next cell, so `.nctid` would fail on a fresh kernel.
len(table_trial_age.iloc[:, 0].unique())

17972

In [None]:
# Name the three result columns (trial id, age category, summed count) and
# preview the first ten rows.
table_trial_age.columns = ["nctid", "age", "count"]
table_trial_age.head(10)

Unnamed: 0,nctid,age,count
0,NCT00000134,<=18 years,0.0
1,NCT00000134,>=65 years,0.0
2,NCT00000134,between 18 and 65 years,279.0
3,NCT00000135,<=18 years,0.0
4,NCT00000135,>=65 years,0.0
5,NCT00000135,between 18 and 65 years,209.0
6,NCT00000136,<=18 years,0.0
7,NCT00000136,>=65 years,0.0
8,NCT00000136,between 18 and 65 years,234.0
9,NCT00000142,<=18 years,0.0


In [None]:
# Write the age table as a tab-separated file in the working directory, then
# copy it into the Drive project folder with a shell command.
file_name = 'age_segment_by_nctid.tsv'
table_trial_age.to_csv(file_name, sep='\t', encoding='utf-8', index=False)

!cp age_segment_by_nctid.tsv  /content/gdrive/MyDrive/Demographic\ Analysis\ Project/

## Get the race segmentation for each NCT ID

In [None]:
# Race query: same shape as the age query above, but keeps measurements whose
# title matches 'race' or 'ethnicity' and whose category matches 'africa',
# 'black' or 'white' (all case-insensitive regex matches).
# NOTE(review): ~* 'white' is a substring match, so categories such as
# 'nonwhite' are selected too (one appears in the preview output below) --
# confirm that this is intended before aggregating by race.
sql = """
select b.nct_id, b.category as cat, sum(b.param_value_num) as total_participants
from
(select bm.nct_id, lower(concat(bm.category, bm.classification)) as category, bm.param_value_num, bm.title, bm.param_type, bm.ctgov_group_code
from baseline_measurements as bm join result_groups as r
on bm.nct_id = r.nct_id and bm.ctgov_group_code = r.ctgov_group_code
and lower(r.title) != 'total') as b
join studies as s on b.nct_id = s.nct_id
where s.overall_status = 'Completed' 
and s.study_first_submitted_date < '2022-01-01'
and (b.param_type = 'Count of Participants' or b.param_type = 'Number')
and (b.title ~* 'race' or b.title ~* 'ethnicity') 
and (b.category ~* 'africa' or b.category ~* 'black' or b.category ~*'white')
-- and nct_id = 'NCT03086369'
group by b.nct_id, cat
order by b.nct_id;
"""

In [None]:
# Run the race query and load the rows into a DataFrame.
# Column names are assigned at construction (parallel to the age table) so the
# frame is usable by name immediately and an empty result set still carries
# the three expected columns instead of breaking the later rename.
cur_acct.execute(sql)
trial_with_race_info = cur_acct.fetchall()
table_trial_race = pd.DataFrame(trial_with_race_info, columns=["nctid", "race", "count"])

In [None]:
# Name the three result columns (trial id, race category, summed count) and
# preview the first ten rows.
table_trial_race.columns = ["nctid", "race", "count"]
table_trial_race.head(10)

Unnamed: 0,nctid,race,count
0,NCT00000125,african american,408.0
1,NCT00000125,white,1137.0
2,NCT00000392,black,10.0
3,NCT00000392,white,177.0
4,NCT00000620,nonwhite,3647.0
5,NCT00000620,white,6604.0
6,NCT00001262,black or african american,10.0
7,NCT00001262,white,44.0
8,NCT00001277,black or african american,101.0
9,NCT00001277,white,735.0


In [None]:
# Number of distinct trials that have at least one race row.
table_trial_race.nctid.nunique()

19940

In [None]:
# All trial ids from the race table (one entry per row, so ids repeat);
# the printed figure is the number of distinct ids.
nctid_race_list= table_trial_race.nctid.to_list()
print(f'nctid race list len: {len(set(nctid_race_list))}')

nctid race list len: 19940


In [None]:
# Write the race table as a tab-separated file in the working directory, then
# copy it into the Drive project folder with a shell command.
file_name = 'race_segment_by_nctid.tsv'
table_trial_race.to_csv(file_name, sep='\t', encoding='utf-8', index=False)

!cp race_segment_by_nctid.tsv  /content/gdrive/MyDrive/Demographic\ Analysis\ Project/

## Get the union of NCT IDs that have age or race data

In [None]:
# Collect the trial ids present in either table.
nctid_age_list= table_trial_age.nctid.to_list()
nctid_race_list= table_trial_race.nctid.to_list()
print(f'nctid age list len: {len(set(nctid_age_list))}')
print(f'nctid race list len: {len(set(nctid_race_list))}')

# sorted() instead of list(): set iteration order for strings changes between
# kernel sessions, which made the exported list (and the loop indices later
# used to name checkpoints) differ from run to run.
nctid_list = sorted(set(nctid_age_list).union(nctid_race_list))

print(f'nctid total list len: {len(nctid_list)}')

nctid age list len: 18037
nctid race list len: 18417
nctid total list len: 30151


Old results (from an earlier run, kept for comparison):
```
nctid age list len: 18029
nctid race list len: 22480
nctid total list len: 34210
```

In [None]:
# One-column frame holding the combined trial ids: preview the first ten rows
# and write the full list as a tab-separated file.
df = pd.DataFrame(nctid_list, columns=['nctid'])
print(df.head(10))
file_name = 'nctid_list.tsv'
df.to_csv(file_name, sep='\t', encoding='utf-8', index=False)

         nctid
0  NCT03633929
1  NCT01822678
2  NCT00787761
3  NCT02272413
4  NCT03720470
5  NCT02703259
6  NCT02783170
7  NCT00723190
8  NCT01033747
9  NCT01431638


In [None]:
# Copy the exported id list into the Drive project folder.
!cp nctid_list.tsv  /content/gdrive/MyDrive/Demographic\ Analysis\ Project/


In [None]:
# List the Drive project folder to check what is there.
!ls /content/gdrive/MyDrive/Demographic\ Analysis\ Project/

'Demographic Bias Project Proposal.gdoc'   nctid_to_pmid_dict.pickle
 nctid_list.tsv				  'Script v1.ipynb'


## match nctid to pubmed id

In [None]:
from xml.etree import ElementTree
import requests
import time
import datetime

In [None]:
def get_pubmed_linked_articles(nct_id, completion_date, query_type):
    '''
    Search PubMed for articles linked to an NCT ID and return their PMIDs,
    leaving out any article whose title marks it as a study protocol.

    The search URL is built from nct_id, completion_date and query_type; each
    hit's title is then fetched individually, pausing 0.5 s after every title
    lookup.
    '''
    search_url = get_pubmed_linked_articles_url(nct_id, completion_date,
                                                query_type)
    payload = get_response(search_url).json()
    pmids = extract_pubmed_ids_from_json(payload)
    # Walk a snapshot so entries can be removed from the live list.
    for pmid in list(pmids):
        if is_study_protocol(get_pubmed_title(pmid)):
            pmids.remove(pmid)
        time.sleep(0.5)
    return pmids

def get_pubmed_linked_articles_url(nct_id, completion_date,
                                   query_type):
    '''
    Build the E-utilities esearch URL that looks up PubMed articles for a trial.

    nct_id          -- trial identifier, searched in the [si] and
                       [Title/Abstract] fields.
    completion_date -- object with strftime(); only articles published on or
                       after this date (up to "3000") are requested.
    query_type      -- 'broad' or 'narrow' appends the matching study-type
                       filter; any other value adds no filter.

    Returns the URL as a string. The term is not percent-encoded here; that is
    left to the HTTP client. The https scheme is used, matching the efetch
    URLs built later in this notebook.
    '''
    url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
    url += 'esearch.fcgi?db=pubmed&retmode=json&term='
    url += '(%s[si] OR %s[Title/Abstract]) ' % (nct_id, nct_id)
    url += 'AND ("%s"[pdat] : ' % completion_date.strftime('%Y/%m/%d')
    url += '"3000"[pdat]) '
    if query_type == 'broad':
        url += "AND ((clinical[Title/Abstract] AND trial[Title/Abstract]) "
        url += "OR clinical trials as topic[MeSH Terms] "
        url += "OR clinical trial[Publication Type] "
        url += "OR random*[Title/Abstract] "
        url += "OR random allocation[MeSH Terms] "
        url += "OR therapeutic use[MeSH Subheading])"
    elif query_type == 'narrow':
        url += "AND (randomized controlled trial[Publication Type] OR "
        url += "(randomized[Title/Abstract] "
        url += "AND controlled[Title/Abstract] AND trial[Title/Abstract]))"
    return url

def get_response(url, timeout=30):
    '''
    Issue an HTTP GET for url and return the requests response object.

    timeout -- seconds passed to requests.get. Without a timeout a single
               stalled connection would block the long mapping loop forever;
               with it, requests raises a timeout exception instead.
    '''
    return requests.get(url, timeout=timeout)

def extract_pubmed_ids_from_json(data):
    '''
    Return the 'idlist' found under data['esearchresult'], or an empty list
    when that key is absent.
    '''
    return data['esearchresult'].get('idlist', [])

def get_pubmed_title(pmid):
    '''
    Retrieve the title of a PubMed article, from its PMID.

    Fetches the record with efetch and passes the raw response body to
    extract_title_from_pubmed_data. The https scheme is used, matching the
    other efetch URLs in this notebook.
    '''
    url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?'
    url += 'db=pubmed&rettype=abstract&id=%s' % pmid
    resp = get_response(url)
    title = extract_title_from_pubmed_data(resp.content)
    return title

def is_study_protocol(title):
    '''
    Tell whether a bytes title contains b'study protocol' (case-insensitive).

    A falsy title (e.g. '' or b'') is handed back unchanged, so the result is
    falsy but not necessarily a bool.
    '''
    if not title:
        return title
    return b'study protocol' in title.lower()

def extract_title_from_pubmed_data(text):
    '''
    Extract the article title from efetch XML.

    text -- XML payload (bytes or str).

    Returns the title encoded as UTF-8 bytes, or '' when the payload cannot be
    parsed or holds no non-empty ArticleTitle.

    The title is assembled with itertext() rather than .text: .text stops at
    the first inline child element (<i>, <sup>, ...), which truncated titles
    containing markup and could hide a trailing "study protocol".
    '''
    try:
        tree = ElementTree.fromstring(text)
    except ElementTree.ParseError:
        print('ParseError', text)
        return ''
    title = tree.find('.//Article/ArticleTitle')
    if title is None:
        return ''
    full_title = ''.join(title.itertext())
    # Keep the original contract: bytes for a real title, '' otherwise.
    return full_title.encode('utf8') if full_title else ''

In [None]:
# Reload the exported id list from the Drive project folder and display it.
nctid_input_file = '/content/gdrive/MyDrive/Demographic Analysis Project/nctid_list.tsv'
nctid_df = pd.read_csv(nctid_input_file, sep='\t')
display(nctid_df)

Unnamed: 0,nctid
0,NCT03633929
1,NCT01822678
2,NCT00787761
3,NCT02272413
4,NCT03720470
...,...
30146,NCT01377194
30147,NCT01629589
30148,NCT01967940
30149,NCT02517905


In [None]:
# Smoke test: look up three known trials with the narrow filter.
d = datetime.datetime(2000, 1, 1)
for sample_nctid in ('NCT01020916', 'NCT03086369', 'NCT00007644'):
    print(get_pubmed_linked_articles(sample_nctid, d, 'narrow'))

['28099439', '27887653', '27523954', '27438111', '26525271', '26433116', '25975474', '25844993', '25789868', '25756419', '25365723', '25270900', '25267568', '24237006', '24044764', '22520518']
['28838379']
['32089359', '28700844', '23988464', '22808955']


In [None]:
from tqdm.notebook import tqdm

In [None]:
# nctid_list_to_map = nctid_df.nctid.to_list()
# d = datetime.datetime(2000, 1, 1)
# results_dict ={}
# for i, nctid in enumerate(nctid_list_to_map):
#   print(f'processing {i}')
#   pubmed_id_list = get_pubmed_linked_articles(nctid, d, 'narrow')
#   print(f'get: {nctid}\t{pubmed_id_list}')
#   results_dict[nctid] = pubmed_id_list
#   break

In [None]:
import pickle
import os

filename = 'nctid_to_pmid_dict.pickle'
def save_pickle(a):
  """Pickle `a` to the path held in the module-level `filename`.

  The data is written to a sibling '<filename>.tmp' file first and then moved
  into place with os.replace, so an interruption in the middle of a write
  (kernel stop, Colab disconnect) cannot leave a truncated checkpoint behind.
  """
  tmp_name = filename + '.tmp'
  with open(tmp_name, 'wb') as handle:
      pickle.dump(a, handle, protocol=pickle.HIGHEST_PROTOCOL)
  os.replace(tmp_name, filename)


def save_pickle_and_cp_to_cloud(a, i):
  """Pickle `a` to 'nctid_to_pmid_dict_<i>.pickle' and copy it to Drive.

  a -- object to pickle.
  i -- value embedded in the checkpoint file name.

  The copy uses shutil.copy instead of a shell `cp` string: the shell form
  relied on the invalid '\\ ' escape inside a non-raw f-string and discarded
  the exit status, so a failed copy went unnoticed. The copy stays
  best-effort: a failure is reported but not raised, and the local file is
  kept either way.
  """
  import shutil  # local import so the function does not depend on another cell

  file_name = f'nctid_to_pmid_dict_{i}.pickle'
  with open(file_name, 'wb') as handle:
      pickle.dump(a, handle, protocol=pickle.HIGHEST_PROTOCOL)
  try:
      shutil.copy(file_name, '/content/gdrive/MyDrive/Demographic Analysis Project/')
  except OSError as e:
      print(f'could not copy {file_name} to Drive: {e}')

In [None]:
# nctid_list_to_map = nctid_df.nctid.to_list()
# d = datetime.datetime(2000, 1, 1)
# results_dict ={}
# for i, nctid in enumerate(tqdm(nctid_list_to_map)):
#   pubmed_id_list = get_pubmed_linked_articles(nctid, d, 'narrow')
#   print(f'get: {nctid}\t{pubmed_id_list}')
#   results_dict[nctid] = pubmed_id_list
#   time.sleep(0.5)
#   if i%100==0:
#     save_pickle(results_dict)



In [None]:
# Map every trial id to its PubMed ids, resuming from the local checkpoint.
nctid_list_to_map = nctid_df.nctid.to_list()
d = datetime.datetime(2000, 1, 1)

# Start from the checkpoint when there is one; otherwise start empty so the
# cell also works on a first run.
# NOTE: pickle.load can execute arbitrary code -- only load checkpoints
# written by this notebook.
if os.path.exists(filename):
  with open(filename, 'rb') as handle:
    results_dict = pickle.load(handle)
else:
  results_dict = {}

print(len(results_dict.keys()))

# Checkpoints are driven by the number of ids actually fetched in this run.
# Keying them on the enumerate index skipped a checkpoint whenever that exact
# index was already cached (hence the gaps in the numbered Drive files).
fetched = 0
for i, nctid in enumerate(tqdm(nctid_list_to_map)):
  if nctid in results_dict: continue
  pubmed_id_list = get_pubmed_linked_articles(nctid, d, 'narrow')
  print(f'get: {nctid}\t{pubmed_id_list}')
  results_dict[nctid] = pubmed_id_list
  time.sleep(0.5)
  fetched += 1
  if fetched%100==0:
    save_pickle(results_dict)
  if fetched%1000==0:
    save_pickle_and_cp_to_cloud(results_dict, i)

# Persist whatever was fetched after the last periodic checkpoint.
save_pickle(results_dict)
    

25983


  0%|          | 0/30151 [00:00<?, ?it/s]

get: NCT00546377	[]
get: NCT00920829	[]
get: NCT02661178	[]
get: NCT03078946	[]
get: NCT02057406	['30098961']
get: NCT01399099	['24804638']
get: NCT01185171	[]
get: NCT01105975	['27206939', '26564598', '22089718']
get: NCT01827163	[]
get: NCT02641561	['27816497']
get: NCT03448224	['33353412']
get: NCT02214121	['30972696']
get: NCT01329562	[]
get: NCT04109703	[]
get: NCT03007953	[]
get: NCT00237666	[]
get: NCT02413034	[]
get: NCT02466646	[]
get: NCT02188459	['33493703']
get: NCT01296152	[]
get: NCT03288714	[]
get: NCT00259090	['23497452']
get: NCT00313729	[]
get: NCT01006369	[]
get: NCT01828593	[]
get: NCT00849472	[]
get: NCT02552121	[]
get: NCT02611362	[]
get: NCT02250651	[]
get: NCT02909153	[]
get: NCT03362944	[]
get: NCT00606931	[]
get: NCT02888093	[]
get: NCT00030147	['33470755']
get: NCT01375140	[]
get: NCT01120236	['25847934']
get: NCT00422058	['23942319', '19853906']
get: NCT01223196	['23811853']
get: NCT03557034	[]
get: NCT04083404	[]
get: NCT00945945	[]
get: NCT03041038	[]
get:

In [None]:
# Keep only the mappings whose trial id is in the current id list (the
# checkpoint may hold ids from an earlier, larger list).
print(f'size of result dict: {len(results_dict)}')
# Set membership is O(1); testing against the list made this step quadratic.
nctids_to_keep = set(nctid_list_to_map)
results_dict_updated = {k: v for k, v in results_dict.items() if k in nctids_to_keep}
print(f'size of result dict updated: {len(results_dict_updated)}')

size of result dict: 30604
size of result dict updated: 30151


In [None]:
# Persist the filtered mapping with save_pickle, then copy the pickle into the
# Drive project folder.
save_pickle(results_dict_updated)
!cp nctid_to_pmid_dict.pickle  /content/gdrive/MyDrive/Demographic\ Analysis\ Project/

In [None]:
# Summarise the mapping: trials with at least one article, total article
# links, and distinct articles. Every non-empty mapping is printed.
print(len(results_dict_updated.keys()))
nctid_count = 0
pmid_count = 0
pmid_set = set()
for nct, linked_pmids in results_dict_updated.items():
  if not linked_pmids:
    continue
  nctid_count += 1
  print(f'{nct}:\t{linked_pmids}')
  pmid_count += len(linked_pmids)
  pmid_set.update(linked_pmids)

print(f'number of nctid: {nctid_count}')
print(f'number of pmid: {pmid_count}')
print(f'number of unique pmid: {len(pmid_set)}')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
NCT00970944:	['22375973']
NCT01844115:	['27486544', '27382289', '27232052']
NCT02136069:	['32445184']
NCT01640951:	['28580579']
NCT03091920:	['31858186']
NCT02597127:	['31630870', '31553410', '30487231', '29735484', '28306389']
NCT02425098:	['31843269']
NCT00711009:	['22730929', '22180523']
NCT03228433:	['33990969']
NCT02974153:	['34823467', '33781209', '33314079', '33023473', '32209650']
NCT02542410:	['34934987']
NCT01969708:	['33415354', '31600368', '29476687', '28492860']
NCT03428750:	['33840781']
NCT00332202:	['27217449']
NCT02394730:	['30257802']
NCT02863328:	['32827435', '31530666']
NCT02104739:	['32228379']
NCT00444080:	['24210765']
NCT02820038:	['32799397']
NCT02092220:	['28007348']
NCT01130844:	['26893546']
NCT00307684:	['21798108']
NCT00735644:	['24656175', '22777096']
NCT00794196:	['23950967', '19656386']
NCT01170533:	['21521834']
NCT03058991:	['31918169']
NCT01475734:	['25263215']
NCT03247322:	['34197699']
NCT

- number of nctid mapped with pmid: 8162
- number of pmid: 14066
- number of unique pmid: 12864

## Extract abstract by pmid


In [None]:
import sys
import regex as re
import urllib.request as ur
import os,codecs
from time import sleep
from xml.dom import minidom 
import xml.etree.ElementTree as xml_parser

import urllib.parse
import requests, json             
import pickle

def get_abstract_bypmid(pmid):
    """Download one PubMed record with efetch and extract title, abstract and metadata.

    Args:
        pmid: PubMed identifier (rendered with ``str()``).

    Returns:
        ``(title_text, abstract_text, meta_data)``, or ``None`` when the record
        could not be downloaded. ``abstract_text`` holds one line per abstract
        segment, prefixed with ``NlmCategory`` or ``Label`` when present.
        ``meta_data`` is a dict with keys ``pubdate``, ``source``, ``volume``,
        ``pages``, ``pubtype``, ``authors`` and ``title``.

    Raises:
        xml.etree.ElementTree.ParseError: if the payload is not well-formed XML.
    """
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id="+str(pmid)+"&retmode=XML&rettype=abstract"

    try:
        # Context manager closes the HTTP response even if read/decode fails.
        with ur.urlopen(url) as fetch:
            datam = fetch.read().decode('utf-8')  # xml of one article
    except Exception as e:  # not a bare except: Ctrl-C must still interrupt
        print(f'{e} at {pmid}')
        return None

    # Drop common inline formatting tags (<i>, <sup>, <sub>, <em>, <strong>, <b>).
    datam = re.sub(r"</?(?:i|sup|sub|em|strong|b)>", "", datam)

    xmldoc = xml_parser.fromstring(datam)

    abstract_text = ""
    title_text = ""
    meta_data = {
        'pubdate': "",
        'source': "",
        'volume': "",
        'pages': "",
        'pubtype': [],
        'authors': [],
    }

    PubmedArticle = xmldoc.find("PubmedArticle")
    if PubmedArticle is not None:
        article = PubmedArticle.find("MedlineCitation/Article")
        if article is not None:
            title = article.find("ArticleTitle")
            if title is not None:
                # itertext() also keeps text that follows any remaining inline tag.
                title_text = "".join(title.itertext())

            abstract = article.find("Abstract")
            if abstract is not None:
                for seg in abstract:
                    seg_text = "".join(seg.itertext())
                    label = seg.attrib.get('Label', '')
                    if seg_text:
                        if 'NlmCategory' in seg.attrib:
                            abstract_text += "\n" + seg.attrib['NlmCategory'] + " : " + seg_text
                        elif 'Label' in seg.attrib:
                            abstract_text += "\n" + label + " : " + seg_text
                        else:
                            abstract_text += "\n" + seg_text
                    elif len(label.split(':')) > 1:
                        # Empty segment whose label itself carries "HEAD:body".
                        head, body = label.split(':')[:2]
                        abstract_text += "\n" + head + " : " + body

            # Path lookups return None when any level is missing, so records
            # without JournalIssue / MedlinePgn no longer raise AttributeError.
            iso_abbrev = article.find("Journal/ISOAbbreviation")
            if iso_abbrev is not None:
                meta_data['source'] = iso_abbrev.text
            volume = article.find("Journal/JournalIssue/Volume")
            if volume is not None:
                meta_data['volume'] = volume.text
            pages = article.find("Pagination/MedlinePgn")
            if pages is not None:
                meta_data['pages'] = pages.text

            AuthorList = article.find('AuthorList')
            if AuthorList is not None:
                for author in AuthorList.findall('Author'):
                    initials = author.findtext('Initials')
                    last_name = author.findtext('LastName')
                    if initials and last_name:
                        meta_data['authors'].append(initials + '. ' + last_name)

            PublicationTypeList = article.find('PublicationTypeList')
            if PublicationTypeList is not None:
                for PublicationType in PublicationTypeList.findall('PublicationType'):
                    meta_data['pubtype'].append(PublicationType.text)

        # First PubMedPubDate in the history; None if PubmedData/History is absent.
        date = PubmedArticle.find("PubmedData/History/PubMedPubDate")
        if date is not None:
            year = date.findtext('Year') or ""
            month = date.findtext('Month')
            day = date.findtext('Day')
            meta_data['pubdate'] = (
                (month + '/' if month else "") + (day + '/' if day else "") + year
            )

    meta_data['title'] = title_text

    abstract_text = re.sub(r"^\s+", "", abstract_text)

    return title_text, abstract_text, meta_data

In [None]:
# test
# Named test_pmid rather than `id` so the builtin id() is not shadowed for
# the rest of the session.
test_pmid = 31950977
title, ab, meta_data = get_abstract_bypmid(test_pmid)
print(ab)

Importance : Proton pump inhibitors (PPIs) or histamine-2 receptor blockers (H2RBs) are often prescribed for patients as stress ulcer prophylaxis drugs in the intensive care unit (ICU). The comparative effect of these drugs on mortality is unknown.
Objective : To compare in-hospital mortality rates using PPIs vs H2RBs for stress ulcer prophylaxis.
Design, Setting, and Participants : Cluster crossover randomized clinical trial conducted at 50 ICUs in 5 countries between August 2016 and January 2019. Patients requiring invasive mechanical ventilation within 24 hours of ICU admission were followed up for 90 days at the hospital.
Interventions : Two stress ulcer prophylaxis strategies were compared (preferential use with PPIs vs preferential use with H2RBs). Each ICU used each strategy sequentially for 6 months in random order; 25 ICUs were randomized to the sequence with use of PPIs and then use of H2RBs and 25 ICUs were randomized to the sequence with use of H2RBs and then use of PPIs (1

In [None]:
# Mount Drive again (force_remount=True) and list the project folder contents.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

!ls /content/gdrive/MyDrive/Demographic\ Analysis\ Project/

Mounted at /content/gdrive
 aact_primary_outcome_with_cui.tsv
 aact_primary_outcome_with_cui_v2.tsv
'age_segment_by_nctid (1).tsv'
 age_segment_by_nctid.tsv
'AMIA 2022 paper.gsheet'
 asian_embedding.model
 asian_fasttext.model
 baseline_model2
 black_embedding.model
 black_fasttext.model
 cui_voc.pickle
'Demographic Bias Project Proposal.gdoc'
'Demographic-sensitive concept embeddings for bias investigation in clinical trials.gdoc'
 embeddings
 final_dataframe_demographic_analysis.csv
 hao_test
 hispanic_embedding.model
 hispanic_fasttext.model
 icd_diagnoses_with_cui.gsheet
 icd_diagnoses_with_cui.tsv
'july 12th'
'june 17th'
'june 23rd'
'june 27th'
'may 16th'
 mimic-iv-1.0
 native_embedding.model
 native_fasttext.model
 nctid_list.tsv
 nctid_to_pmid_dict_25000.pickle
 nctid_to_pmid_dict_26000.pickle
 nctid_to_pmid_dict_27000.pickle
 nctid_to_pmid_dict_29000.pickle
 nctid_to_pmid_dict_30000.pickle
 nctid_to_pmid_dict.pickle
 neutral_embedding.model
 neutral_fasttext.model
 neutral_fast

In [None]:
import pickle
# Load the final NCT-id -> PMID mapping from Drive.
# A dedicated name is used instead of rebinding the global `filename`, which
# save_pickle() reads: rebinding it here silently redirected every later
# save_pickle() call to the Drive copy.
pmid_pickle_path = '/content/gdrive/MyDrive/Demographic Analysis Project/nctid_to_pmid_dict.pickle'

# NOTE: pickle.load can execute arbitrary code -- only load files written by
# this notebook.
with open(pmid_pickle_path, 'rb') as handle:
    results_dict = pickle.load(handle)

print(len(results_dict.keys()))


30151


In [None]:
# Flatten the mapping into the distinct PubMed ids to download.
pmid_set = set()
for linked_pmids in results_dict.values():
  if linked_pmids:
    pmid_set.update(linked_pmids)
pmidlist = list(pmid_set)

print(f'size of pmid list: {len(pmidlist)}')

size of pmid list: 12864


In [None]:
import os
import codecs
import re
from time import sleep
# Folder on Drive that receives the downloaded PubMed XML files.
pm_path = '/content/gdrive/MyDrive/Demographic Analysis Project/pmid_abstracts/'
# makedirs(exist_ok=True) matches the call used in save_abstracts, also creates
# missing parent folders, and has no check-then-create gap.
os.makedirs(pm_path, exist_ok=True)

In [None]:
def download_abstract_by_pmid(pmid, save_path):
    """Download the efetch XML for one PMID and save it as '<pmid>.xml'.

    Args:
        pmid: PubMed identifier (rendered with ``str()``).
        save_path: existing directory that receives the file.

    Raises:
        Exception: any download, decode or write error is logged and then
            re-raised. The previous version swallowed it, so the caller's
            retry loop and failure list could never be triggered.
    """
    test_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id='+str(pmid)+'&retmode=XML'

    try:
        # Fetch everything before opening the output file so a failed download
        # never leaves an empty '<pmid>.xml' that would later count as done.
        with ur.urlopen(test_url) as fetch:
            datam = fetch.read().decode('utf-8')  # xml of one article
        file_dir = os.path.join(save_path, str(pmid) + '.xml')
        with open(file_dir, "w", encoding='utf-8') as xml:
            xml.write(datam)
    except Exception as e:
        print(f'{e} at {pmid}')
        raise

In [None]:
# Peek at the first ten PubMed ids.
print(pmidlist[:10])

['31766977', '32665528', '31362741', '32199723', '27328486', '25693783', '25485670', '27062245', '28607708', '29754812']


In [None]:
from random import randint
from tqdm.auto import tqdm
def save_xmls(pmidlist, save_path):
    """Download the XML record of every PMID in `pmidlist` into `save_path`.

    Args:
        pmidlist: iterable of PMIDs. Entries may be raw list-file lines:
            lines starting with '#' or 'pmid' and blank lines are skipped, and
            only the first tab-separated field is used.
        save_path: target directory, created if missing.

    Ids whose '<pmid>.xml' already exists are skipped. Each download is tried
    up to five times; ids that still fail are written to
    '<save_path>/exceptionlist' (the file is rewritten on every call).
    """
    pm_path = save_path
    print("dir for xml saved in", pm_path)
    os.makedirs(pm_path, exist_ok=True)

    # `with` guarantees the failure list is flushed and closed.
    with codecs.open(os.path.join(pm_path, "exceptionlist"), 'w') as exceptionlistfile:
        for index, raw_id in enumerate(tqdm(pmidlist, total=len(pmidlist), position=0, leave=True)):
            raw_id = str(raw_id).rstrip()
            if re.search(r"^#", raw_id) or re.search(r"^pmid", raw_id) or re.search(r"^\s*$", raw_id):
                continue
            # Normalise first, so the skip check below looks at the same file
            # name that the download writes.
            pmid = raw_id.split("\t")[0]
            target = os.path.join(pm_path, pmid + ".xml")

            print(f'processing id: {pmid}')

            if os.path.exists(target):
                print("existed, skipped")
                continue

            if index % 1000 == 0:
                print(index, "finished...")
            sleep(0.3)

            for i in range(5):
                try:
                    download_abstract_by_pmid(pmid, pm_path)
                    # The downloader may report an error without raising, so
                    # success is judged by the file actually being there.
                    if not os.path.exists(target):
                        raise IOError(f'no file written for {pmid}')
                    break
                except Exception as e:
                    print(f"Retrying: {i}")
                    sleep(randint(1, 10))
            else:
                print(f'extraction failed at {pmid}')
                exceptionlistfile.write(pmid + "\n")

    print("Retrieved xml saved in", pm_path)

# Download the XML for every collected PMID into the Drive folder.
save_xmls(pmidlist, pm_path)

dir for xml saved in /content/gdrive/MyDrive/Demographic Analysis Project/pmid_abstracts/


  0%|          | 0/12864 [00:00<?, ?it/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
processing id: 27093976
processing id: 30962681
processing id: 27133551
processing id: 27720142
processing id: 22658389
processing id: 30376130
processing id: 30857647
processing id: 32721009
processing id: 29374054
processing id: 30976782
processing id: 23802840
processing id: 22569355
processing id: 19935992
processing id: 28374645
processing id: 31635976
processing id: 28103841
processing id: 31276502
processing id: 33883254
processing id: 33653824
processing id: 26493248
processing id: 24415559
processing id: 32644148
processing id: 29789238
processing id: 32816874
processing id: 26283236
processing id: 24798072
processing id: 32653447
processing id: 32164888
processing id: 22704099
processing id: 21791356
processing id: 34197181
processing id: 29207976
processing id: 32248638
processing id: 34412104
processing id: 29461870
processing id: 25941016
processing id: 18789766
processing id: 27811204
processing id: 33516082

In [None]:
import xml.etree.ElementTree as xml_parser
import re
import tarfile
from tqdm.auto import tqdm
import json
import os,codecs
from time import sleep
import pandas as pd
import urllib.request as ur
import pickle
from random import randint

# must run

# Folder holding the downloaded '<pmid>.xml' files; the parsed text files are
# written to a 'txt/' sub-folder of it by save_abstracts.
path = '/content/gdrive/MyDrive/Demographic Analysis Project/pmid_abstracts/'


def parse_abstract_bypmid(pmid, pm_from_path):
    """Parse a previously downloaded '<pmid>.xml' into title, abstract and metadata.

    Args:
        pmid: PubMed identifier; the file read is ``<pm_from_path>/<pmid>.xml``.
        pm_from_path: directory containing the downloaded XML files.

    Returns:
        ``(title_text, abstract_text, meta_data)``, or ``None`` when the file
        cannot be read. ``abstract_text`` holds one line per abstract segment,
        prefixed with ``NlmCategory`` or ``Label`` when present. ``meta_data``
        has keys ``pubdate``, ``source``, ``volume``, ``pages``, ``pubtype``,
        ``authors``, ``title`` and, when the journal has a Title element,
        ``source_title``.

    Raises:
        xml.etree.ElementTree.ParseError: if the file is not well-formed XML.
    """
    one_file = os.path.join(pm_from_path, str(pmid)+'.xml')
    try:
        # The records are UTF-8 XML; do not depend on the locale default.
        with open(one_file, 'r', encoding='utf-8') as f:
            datam = f.read()
    except Exception as e:
        print(e)
        return None

    # Drop common inline formatting tags (<i>, <sup>, <sub>, <em>, <strong>, <b>).
    datam = re.sub(r"</?(?:i|sup|sub|em|strong|b)>", "", datam)

    xmldoc = xml_parser.fromstring(datam)

    abstract_text = ""
    title_text = ""
    meta_data = {
        'pubdate': "",
        'source': "",
        'volume': "",
        'pages': "",
        'pubtype': [],
        'authors': [],
    }

    PubmedArticle = xmldoc.find("PubmedArticle")
    if PubmedArticle is not None:
        article = PubmedArticle.find("MedlineCitation/Article")
        if article is not None:
            title = article.find("ArticleTitle")
            if title is not None:
                # itertext() also keeps text that follows any remaining inline tag.
                title_text = "".join(title.itertext())

            abstract = article.find("Abstract")
            if abstract is not None:
                for seg in abstract:
                    seg_text = "".join(seg.itertext())
                    label = seg.attrib.get('Label', '')
                    if seg_text:
                        if 'NlmCategory' in seg.attrib:
                            abstract_text += "\n" + seg.attrib['NlmCategory'] + " : " + seg_text
                        elif 'Label' in seg.attrib:
                            abstract_text += "\n" + label + " : " + seg_text
                        else:
                            abstract_text += "\n" + seg_text
                    elif len(label.split(':')) > 1:
                        # Empty segment whose label itself carries "HEAD:body".
                        head, body = label.split(':')[:2]
                        abstract_text += "\n" + head + " : " + body

            # Path lookups return None when any level is missing, so records
            # without JournalIssue / MedlinePgn no longer raise AttributeError.
            journal_title = article.find("Journal/Title")
            if journal_title is not None:
                meta_data['source_title'] = journal_title.text
            iso_abbrev = article.find("Journal/ISOAbbreviation")
            if iso_abbrev is not None:
                meta_data['source'] = iso_abbrev.text
            volume = article.find("Journal/JournalIssue/Volume")
            if volume is not None:
                meta_data['volume'] = volume.text
            pages = article.find("Pagination/MedlinePgn")
            if pages is not None:
                meta_data['pages'] = pages.text

            AuthorList = article.find('AuthorList')
            if AuthorList is not None:
                for author in AuthorList.findall('Author'):
                    initials = author.findtext('Initials')
                    last_name = author.findtext('LastName')
                    if initials and last_name:
                        meta_data['authors'].append(initials + '. ' + last_name)

            PublicationTypeList = article.find('PublicationTypeList')
            if PublicationTypeList is not None:
                for PublicationType in PublicationTypeList.findall('PublicationType'):
                    meta_data['pubtype'].append(PublicationType.text)

        # First PubMedPubDate in the history; None if PubmedData/History is absent.
        date = PubmedArticle.find("PubmedData/History/PubMedPubDate")
        if date is not None:
            year = date.findtext('Year') or ""
            month = date.findtext('Month')
            day = date.findtext('Day')
            meta_data['pubdate'] = (
                (month + '/' if month else "") + (day + '/' if day else "") + year
            )

    meta_data['title'] = title_text

    abstract_text = re.sub(r"^\s+", "", abstract_text)

    return title_text, abstract_text, meta_data


def get_all_files(rootdir):
    """Return the paths of all '.xml' files found under `rootdir`, recursively."""
    return [
        os.path.join(dirpath, fname)
        for dirpath, _subdirs, fnames in os.walk(rootdir)
        for fname in fnames
        if fname.endswith(".xml")
    ]


def save_abstracts(pmidlist, path):
    """Parse the record of each PMID and save its abstract and metadata to disk.

    For every usable entry of ``pmidlist`` the function calls
    ``parse_abstract_bypmid(pmid, path)`` and writes two files into
    ``<path>/txt/``:

    * ``<pmid>.txt``      -- a ``TITLE : ...`` line followed by the non-empty
      lines of the abstract;
    * ``<pmid>_meta.pkl`` -- the pickled metadata object.

    PMIDs with an empty abstract, or whose processing raises, are appended
    to ``<path>/exceptionlist`` (the file is rewritten on every call).
    PMIDs whose ``.txt`` file already exists are skipped.

    Args:
        pmidlist: Iterable of strings. Each entry is stripped of trailing
            whitespace; entries starting with ``#`` or ``pmid`` and blank
            entries are ignored. Only the first tab-separated field of an
            entry is used as the PMID.
        path: Directory handed to ``parse_abstract_bypmid`` and used as the
            parent of the ``txt/`` output directory and the exception list.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    pm_from_path = path
    pm_path = os.path.join(path, 'txt/')
    os.makedirs(pm_path, exist_ok=True)

    # The context manager guarantees the exception list is flushed and closed
    # even when the loop is interrupted part-way through.
    with codecs.open(os.path.join(pm_from_path, "exceptionlist"), 'w') as exceptionlistfile:
        for line in tqdm(pmidlist):
            line = line.rstrip()

            # Drop comment lines, a "pmid" header line and blank lines before
            # touching the filesystem.
            if re.search(r"^#", line) or re.search(r"^pmid", line) or re.search(r"^\s*$", line):
                continue

            # Only the first tab-separated field is the PMID. File names are
            # derived from it (not from the whole line) so that tab-separated
            # input does not produce file names containing tabs.
            pmid = line.split("\t")[0]
            filename = pmid + ".txt"
            file_meta_pkl = pmid + "_meta" + ".pkl"

            if os.path.exists(os.path.join(pm_path, filename)):
                print ("existed, skipped")
                continue

            try:
                title, ab, meta_data = parse_abstract_bypmid(pmid, pm_from_path)
                if ab == "":
                    print(pmid, " no abstract")
                    exceptionlistfile.write(str(pmid)+"\n")
                    continue

                # `with` closes the text file as soon as it is written instead
                # of leaving the handle to the garbage collector.
                with codecs.open(os.path.join(pm_path, filename), 'w') as outfile:
                    outfile.write("TITLE : "+str(title)+"\n")
                    for section in ab.split("\n"):
                        if section == "":
                            continue
                        outfile.write(section+"\n")

                with open(os.path.join(pm_path, file_meta_pkl), 'wb') as fout:
                    pickle.dump(meta_data, fout)

            except Exception:
                # Best effort: record the failing PMID and carry on with the rest.
                print("Exection at ", pmid,"!!!")
                exceptionlistfile.write(str(pmid)+"\n")
                continue

    print ("Retrieved abstracts saved in", pm_path)
        

# Run the extraction defined above for every entry of `pmidlist`, using `path`
# as the working directory (outputs go to <path>/txt/ and <path>/exceptionlist).
save_abstracts(pmidlist, path)


  0%|          | 0/12864 [00:00<?, ?it/s]

34190999  no abstract
22012178  no abstract
20040671  no abstract
28306371  no abstract
24898665  no abstract
27387422  no abstract
24930531  no abstract
28704847  no abstract
26844757  no abstract
32950849  no abstract
25631362  no abstract
28760541  no abstract
26011755  no abstract
28532759  no abstract
30139779  no abstract
33483617  no abstract
28389391  no abstract
31192894  no abstract
28389393  no abstract
26971692  no abstract
20732794  no abstract
24966061  no abstract
26977746  no abstract
32515118  no abstract
28668546  no abstract
27007007  no abstract
27466629  no abstract
24612976  no abstract
25456692  no abstract
31707833  no abstract
25010170  no abstract
30991823  no abstract
29046319  no abstract
24046475  no abstract
24467979  no abstract
26663235  no abstract
30255316  no abstract
25840600  no abstract
23018312  no abstract
30117142  no abstract
28687231  no abstract
30003479  no abstract
32701367  no abstract
35006163  no abstract
32077361  no abstract
28679611  

In [None]:
ls /content/gdrive/MyDrive/Demographic\ Analysis\ Project/pmid_abstracts/txt/ | wc -l

ls: cannot open directory '/content/gdrive/MyDrive/Demographic Analysis Project/pmid_abstracts/txt/': Input/output error
0


In [None]:
import os
import codecs
import re
from time import sleep
# Fetch title, abstract and metadata for every PMID in `pmidlist` with
# get_abstract_bypmid and store them under pm_path as <pmid>.txt and
# <pmid>_meta.pkl. PMIDs without an abstract, or whose processing raises,
# are written to the "exceptionlist" file in the same directory.
pm_path = '/content/gdrive/MyDrive/Demographic Analysis Project/pmid_abstracts/'
# makedirs(exist_ok=True) also creates missing parent directories and avoids
# the check-then-create race of exists() followed by mkdir().
os.makedirs(pm_path, exist_ok=True)

# The context manager guarantees the exception list is flushed and closed when
# the loop finishes or is interrupted; a module-level handle would otherwise
# stay open for the lifetime of the kernel.
with codecs.open(os.path.join(pm_path, "exceptionlist"), 'w') as exceptionlistfile:
    # `line`/`pmid` are used instead of `id` so the builtin id() is not
    # shadowed for the rest of the notebook session.
    for (index, line) in enumerate(pmidlist):
        line = line.rstrip()

        # Drop comment lines, a "pmid" header line and blank lines.
        if re.search(r"^#", line) or re.search(r"^pmid", line) or re.search(r"^\s*$", line):
            continue

        # Only the first tab-separated field is the PMID. File names are
        # derived from it (not from the whole line) so that tab-separated
        # input does not produce file names containing tabs.
        pmid = line.split("\t")[0]
        filename = pmid + ".txt"
        file_meta_pkl = pmid + "_meta" + ".pkl"

        if os.path.exists(os.path.join(pm_path, filename)):
            print ("existed, skipped")
            continue

        if (index) % 1000 == 0:
            print(index,"finished...")
        # Pause between lookups -- presumably to throttle calls made by
        # get_abstract_bypmid; confirm against that function.
        sleep(0.5)

        try:
            title, ab, meta_data = get_abstract_bypmid(pmid)
            if ab == "":
                print (pmid, " no abstract")
                exceptionlistfile.write(str(pmid)+"\n")
                continue

            # `with` closes each text file as soon as it has been written.
            with codecs.open(os.path.join(pm_path, filename), 'w') as outfile:
                outfile.write("TITLE : "+str(title)+"\n")
                for section in ab.split("\n"):
                    if section == "":
                        continue
                    outfile.write(section+"\n")

            with open(os.path.join(pm_path, file_meta_pkl), 'wb') as fout:
                pickle.dump(meta_data, fout)

        except Exception:
            # `except Exception` (rather than a bare `except:`) keeps the
            # best-effort behaviour for ordinary errors but lets
            # KeyboardInterrupt / kernel interrupts stop the loop.
            print("Expection at ", pmid,"!!!")
            exceptionlistfile.write(str(pmid)+"\n")
            continue

print ("Retrieved abstracts saved in", pm_path)

0 finished...
29348179  no abstract
30003479  no abstract
28668546  no abstract
23018312  no abstract
28679611  no abstract
25695181  no abstract
22012178  no abstract
26977746  no abstract
1000 finished...
26971692  no abstract
31707833  no abstract
30882240  no abstract
25456692  no abstract
2000 finished...
27306615  no abstract
32077361  no abstract
25840600  no abstract
34293527  no abstract
24018545  no abstract
27007007  no abstract
3000 finished...
33483617  no abstract
26011755  no abstract
29729809  no abstract
28687231  no abstract
24046475  no abstract
26278063  no abstract
32031690  no abstract
4000 finished...
29653640  no abstract
28389391  no abstract
29412688  no abstract
5000 finished...
29046319  no abstract
31192894  no abstract
25010170  no abstract
22991405  no abstract
25818754  no abstract
30991823  no abstract
6000 finished...
28389393  no abstract
28532759  no abstract
25631362  no abstract
26663235  no abstract
20040671  no abstract
24467979  no abstract
7000

## Multiprocess version (Skip)
Not quite working yet: it runs into errors from exceeding the request limit.

In [None]:
from multiprocessing import Process, Manager, Semaphore
import pprint

# Passed as the second positional argument of get_pubmed_linked_articles for
# every lookup -- presumably a date cutoff; confirm against that function.
da = datetime.datetime(2000, 1, 1)
def dict_all(d, nctid, sema):
    """Worker: store the lookup result for ``nctid`` in ``d`` and free a slot.

    Calls ``get_pubmed_linked_articles(nctid, da, 'narrow')``, stores the
    result as ``d[nctid]``, sleeps one second, and releases ``sema``.

    Args:
        d: Mapping that receives the result under the key ``nctid``.
        nctid: Identifier passed to ``get_pubmed_linked_articles`` and used
            as the key in ``d``.
        sema: Semaphore released exactly once before the function exits,
            whether or not the lookup succeeded.

    Raises:
        Whatever ``get_pubmed_linked_articles`` or the assignment raises; the
        semaphore is still released in that case.
    """
    try:
        d[nctid] = get_pubmed_linked_articles(nctid, da, 'narrow')
        time.sleep(1)
    finally:
        # Always hand the slot back. If a failed lookup skipped the release,
        # every failure would permanently consume one slot and the code that
        # acquires `sema` before starting workers would eventually block forever.
        sema.release()



In [None]:

concurrency = 20

# Counting semaphore that caps the number of workers in flight: the parent
# takes a slot before starting a worker and the worker (dict_all) gives it
# back by calling sema.release().
sema = Semaphore(concurrency)
with Manager() as manager:
    # Manager-backed dict proxy handed to every worker process.
    d = manager.dict()
    all_processes = []
    for nctid in tqdm(nctid_list_to_map):
        # Blocks while `concurrency` slots are taken; continues as soon as a
        # running worker releases one.
        sema.acquire()
        worker = Process(target=dict_all, args=(d, nctid, sema))
        all_processes.append(worker)
        worker.start()

    # Wait in the main process until every worker has finished.
    for worker in all_processes:
        worker.join()

    # `d` is a DictProxy; convert it to a plain dict for printing.
    pprint.pprint(dict(d))