In [1]:
import os
import sys
from pathlib import Path

In [2]:
project_name = 'scosy'
# The project root is taken to be the parent of the current working directory,
# i.e. the notebook is expected to be started from a direct sub-folder of the project.
project_path = Path(os.getcwd()).parent
data_path = Path(project_path, 'dataset')

# including the project folder and the utils folder
# Each folder is added only when that exact entry is missing from sys.path. A substring
# test on the joined path string would be fooled by any unrelated entry that merely
# contains the project name (for instance the notebook folder itself), and the project
# modules would then never become importable.
for extra_path in (str(project_path), str(Path(project_path, 'utils'))):
    if extra_path not in sys.path:
        sys.path.append(extra_path)

print('project path = {0}'.format(project_path))
print('data path = {0}'.format(data_path))
print('sys.path =')
sys.path

project path = /Users/guerramarj/github/scosy
data path = /Users/guerramarj/github/scosy/dataset
sys.path =


['',
 '/Users/guerramarj/anaconda3/envs/nlp/lib/python36.zip',
 '/Users/guerramarj/anaconda3/envs/nlp/lib/python3.6',
 '/Users/guerramarj/anaconda3/envs/nlp/lib/python3.6/lib-dynload',
 '/Users/guerramarj/.local/lib/python3.6/site-packages',
 '/Users/guerramarj/anaconda3/envs/nlp/lib/python3.6/site-packages',
 '/Users/guerramarj/anaconda3/envs/nlp/lib/python3.6/site-packages/cycler-0.10.0-py3.6.egg',
 '/Users/guerramarj/anaconda3/envs/nlp/lib/python3.6/site-packages/stanfordcorenlp-3.9.1.1-py3.6.egg',
 '/Users/guerramarj/anaconda3/envs/nlp/lib/python3.6/site-packages/IPython/extensions',
 '/Users/guerramarj/.ipython',
 '/Users/guerramarj/github/scosy',
 '/Users/guerramarj/github/scosy/utils']

In [3]:
from Bio import Entrez
import csv
import argparse
import pandas as pd
from utils.parse import parse
import traceback

In [4]:
def assign_roles(author_list):
    """
    assign the chief author, ordinary author or principal investigator role to each author

    The role depends only on the position of the author in the list:
    the last author is the Principal Investigator ('PI'), the first two authors
    (unless one of them is also the last) are Chief Authors ('CA') and everybody in
    between is an Ordinary Author ('OA').

    :param author_list: a list of all the authors in the paper; None or an empty list is
                        treated as "no authors"
    :return: role_list: the authors' respective roles, one entry per author
    """
    # a record without an author field yields None; there is nothing to assign then
    if not author_list:
        return list()

    last_index = len(author_list) - 1
    role_list = list()

    for author_index in range(len(author_list)):
        if author_index == last_index:
            # the last author always wins, even when the paper has only one or two authors
            role_list.append('PI')
        elif author_index <= 1:
            # the first two authors are considered "Chief Authors"
            role_list.append('CA')
        else:
            # anybody after the first two who is not the last is an "Ordinary Author"
            role_list.append('OA')

    return role_list

In [34]:
def assign_organization(affiliation_list):
    """
    check and assign whether the authors belong to the CHOP or PENN organization.

    Every affiliation string is split on ';' and its parts are inspected in order; the
    first part that matches decides the organization and the remaining parts are ignored,
    so an author is flagged for at most one of the two organizations.

    :param affiliation_list: a list of all the affiliations of the authors; None or an
                             empty list is treated as "no affiliations"
    :return: chop_list, penn_list: lists with whether the author belong to the CHOP or PENN organization
             (1 = belongs, 0 = does not), aligned with affiliation_list
    """
    # a record without an affiliation field yields None
    if not affiliation_list:
        return list(), list()

    penn_keywords = ('perelman', 'school of medicine', 'pennsylvania')

    # initialize CHOP and PENN authors' organization to None = 0
    chop_list = [0] * len(affiliation_list)
    penn_list = [0] * len(affiliation_list)

    for affiliation_index, affiliation in enumerate(affiliation_list):

        for sa in affiliation.split(';'):
            sa_lower = sa.lower()
            # Assign the author organization
            if 'children' in sa_lower:
                chop_list[affiliation_index] = 1
                break
            # Every keyword is looked up in the current part only. Testing 'pennsylvania'
            # against the whole affiliation would flag PENN on the first unrelated part
            # and never reach a later "Children's Hospital ..., Pennsylvania" part.
            if any(keyword in sa_lower for keyword in penn_keywords):
                penn_list[affiliation_index] = 1
                break

    return chop_list, penn_list


In [5]:
def obtain_descriptions(mesh_tree_path=None):
    """
    get the description, related to the MESH, in the 2017MeshTree.csv File

    Each row of the file is expected to hold: row[0] = Number, row[1] = Description and
    row[2] = MESH. Rows with fewer than three columns (e.g. blank lines) are skipped.

    :param mesh_tree_path: optional path of the CSV file to read; when omitted the file
                           'template/2017MeshTree.csv' under the project folder is used
    :return: mesh_description_dict: a dictionary mapping each MESH value to its description
    """
    if mesh_tree_path is None:
        mesh_tree_path = Path(project_path, 'template/2017MeshTree.csv')

    mesh_description_dict = dict()

    # the context manager closes the file even when a row cannot be read;
    # newline='' lets the csv module handle line endings inside quoted fields itself
    with Path(mesh_tree_path).open(newline='') as mesh_tree_file_object:
        file_reader = csv.reader(mesh_tree_file_object, delimiter=',')

        for line in file_reader:
            if len(line) < 3:
                continue
            # line[0] = Number, line[1] = Description and line[2] = MESH
            mesh_description_dict[line[2]] = line[1]

    return mesh_description_dict

In [16]:
Entrez.email = "guerramarj@email.chop.edu"     # Always tell NCBI who you are
# Search PubMed for papers whose affiliation field matches one of the four institution
# spellings below, restricted to the mindate/maxdate window.
# NOTE(review): usehistory="y" is requested but the visible cells never use the returned
# history keys (the ids are passed to efetch directly) - confirm whether it is needed.
# NOTE(review): NCBI documents an upper bound on how many ids one esearch call returns;
# confirm that retmax=50000 is honoured for PubMed.
handle = Entrez.esearch(db="pubmed", retmax=50000, idtype="esearch", mindate="2014/01/01", maxdate="2020/08/21",
                        term="Perelman School of Medicine[Affiliation] OR Children's Hospital of "
                             "Philadelphia[Affiliation] OR University of Pennsylvania School of "
                             "Medicine[Affiliation] OR School of Medicine University of "
                             "Pennsylvania[Affiliation]",
                        usehistory="y")
# close the search handle even when the response cannot be parsed
try:
    search_results = Entrez.read(handle)
finally:
    handle.close()
# obtaining the list of relevant PMIDs
id_list = search_results["IdList"]

# get all the record based on the PMIDs; the records are requested as MEDLINE-formatted text.
# The returned handle is left open on purpose: it is read and closed by the cell that
# stores the records on disk.
fetch_records_handle = Entrez.efetch(db="pubmed", id=id_list, retmode="text", rettype="medline")

In [17]:
# Store the fetched records on disk so later cells can re-read them without another download.
# NOTE(review): the records were requested as MEDLINE text, so the ".xml" extension does
# not describe the content; the name is kept because later cells read this same file.
out_filename = "results.xml"
# close the network handle even when the output file cannot be opened or written
try:
    with open(out_filename, "w") as out_handle:
        out_handle.write(fetch_records_handle.read())
finally:
    fetch_records_handle.close()

In [39]:
# Re-open the records file written earlier and hand it to the project's parser.
# NOTE(review): records_handle is never closed in the visible cells. Whether it can be
# closed right after parse() depends on whether utils.parse.parse reads the file eagerly
# or lazily - confirm before adding a close()/with block.
records_handle = open(out_filename)
fetch_records = parse(handle=records_handle)

In [40]:
# initializing variables
mesh_description_dict = obtain_descriptions()

# Author-level metadata: PubMed unique Identifier number (PMID), the author, whether the
# author belongs to CHOP and/or PENN, the author's role - Chief Author (CA), Ordinary
# Author (OA) or Principal Investigator (PI) - and the author's affiliation
AUTHOR_RECORD_COLUMNS = ['PMID', 'Author', 'author_chop', 'author_penn', 'Role', 'AffiliationInfo']
author_record_df = pd.DataFrame(columns=AUTHOR_RECORD_COLUMNS)

# Paper-level metadata: PubMed unique Identifier number (PMID), Title, Abstract, Year,
# Month, the author list, the subject list and the date
PAPER_RECORD_COLUMNS = ['PMID', 'Title', 'Abstract', 'Year', 'Month', 'author_list', 'subject_list', 'date']
paper_record_df = pd.DataFrame(columns=PAPER_RECORD_COLUMNS)

# Medical metadata: PubMed unique Identifier number (PMID), the description and the
# Primary Medical Subject Header (MESH)
MEDICAL_RECORD_COLUMNS = ['PMID', 'Desc', 'Primary_MeSH']
medical_record_df = pd.DataFrame(columns=MEDICAL_RECORD_COLUMNS)

In [41]:
title_list = list()
abstract_list = list()
total_records = len(fetch_records)

author_columns = ['PMID', 'Author', 'author_chop', 'author_penn', 'Role', 'AffiliationInfo']
paper_columns = ['PMID', 'Title', 'Abstract', 'Year', 'Month', 'author_list', 'subject_list', 'date']
medical_columns = ['PMID', 'Desc', 'Primary_MeSH']

# Rows are collected in plain lists and turned into DataFrames once, after the loop:
# growing a DataFrame row by row copies the whole frame on every iteration, and
# DataFrame.append no longer exists in recent pandas releases.
author_rows = list()
paper_rows = list()
medical_rows = list()
# PMIDs (or None when even the PMID could not be read) of the records that were skipped
failed_pmids = list()

for record_index, record in enumerate(fetch_records):

    print('processing {0} out of {1}'.format(record_index, total_records))

    pmid = None
    # Errors are handled per record: a malformed record is reported and skipped instead
    # of silently aborting the processing of every record that follows it.
    try:
        pmid = record.get('PMID')
        if pmid is None:
            continue

        title = record.get('TI')
        abstract = record.get('AB')
        # a record may come without authors or affiliations; treat that as "none"
        authors = record.get('FAU') or []
        affiliations = record.get('AD') or []
        mesh_term = record.get('MH')
        date = record.get('EDAT')
        # the first two '/'-separated parts of the date are used as year and month;
        # missing parts (or a missing date) become None
        year, month = (date.split('/') + [None, None])[:2] if date else (None, None)

        # assign the chief author, ordinary author or principal investigator role to each author
        roles = assign_roles(authors)
        # check and assign whether the authors belong to the CHOP or PENN organization
        chop_organization, penn_organization = assign_organization(affiliations)

        mesh_description = ''
        term = ''
        if mesh_term is None:
            mesh_term = ''
        else:
            # NOTE(review): convert_mesh_description is neither defined nor imported in the
            # visible cells - confirm it is available before running this cell.
            mesh_description, term = convert_mesh_description(mesh_description_dict, mesh_term)
            mesh_term = ';'.join(mesh_term)

        # Authors, roles and affiliations are paired by position. zip stops at the shortest
        # list, so a record with a different number of authors and affiliations cannot
        # index past the end. Only authors flagged as CHOP or PENN are kept.
        record_author_rows = [
            [pmid, author, in_chop, in_penn, role, affiliation]
            for author, role, affiliation, in_chop, in_penn
            in zip(authors, roles, affiliations, chop_organization, penn_organization)
            if 1 in (in_chop, in_penn)
        ]

        author_names = ';'.join(authors)
    except Exception:
        failed_pmids.append(pmid)
        traceback.print_exc()
        continue

    # output information - reached only when the whole record was processed, so a failing
    # record never leaves partial rows behind
    if mesh_description:
        medical_rows.append([pmid, mesh_description, term])

    author_rows.extend(record_author_rows)
    paper_rows.append([pmid, title, abstract, year, month, author_names, mesh_term, date])

    if title and abstract:
        title_list.append(title)
        abstract_list.append(abstract)

# The frames are rebuilt from scratch, which keeps this cell idempotent when it is re-run.
medical_record_df = pd.DataFrame(medical_rows, columns=medical_columns)
author_record_df = pd.DataFrame(author_rows, columns=author_columns)
paper_record_df = pd.DataFrame(paper_rows, columns=paper_columns)

if failed_pmids:
    print('{0} record(s) could not be processed: {1}'.format(len(failed_pmids), failed_pmids))

processing 0 out of 0
processing 1 out of 1
processing 2 out of 2
processing 3 out of 3
processing 4 out of 4
processing 5 out of 5
processing 6 out of 6
processing 7 out of 7
processing 8 out of 8
processing 9 out of 9
processing 10 out of 10
processing 11 out of 11
processing 12 out of 12
processing 13 out of 13
processing 14 out of 14
processing 15 out of 15
processing 16 out of 16
processing 17 out of 17
processing 18 out of 18
processing 19 out of 19
processing 20 out of 20
processing 21 out of 21
processing 22 out of 22
processing 23 out of 23
processing 24 out of 24
processing 25 out of 25
processing 26 out of 26
processing 27 out of 27
processing 28 out of 28
processing 29 out of 29
processing 30 out of 30
processing 31 out of 31
processing 32 out of 32
processing 33 out of 33
processing 34 out of 34
processing 35 out of 35
processing 36 out of 36
processing 37 out of 37
processing 38 out of 38
processing 39 out of 39
processing 40 out of 40
processing 41 out of 41
processing 4

In [29]:
# preview the first rows of the paper-level table
paper_record_df.head()

Unnamed: 0,PMID,Title,Abstract,Year,Month,author_list,subject_list,date


In [30]:
# preview the first rows of the author-level table
author_record_df.head()

Unnamed: 0,PMID,Author,author_chop,author_penn,Role,AffiliationInfo


In [31]:
# preview the first rows of the MeSH/description table
medical_record_df.head()

Unnamed: 0,PMID,Desc,Primary_MeSH
