In [1]:
import pandas as pd
import xml.etree.ElementTree as ET
import statistics as s

In [2]:
from functions import log

In [3]:
numFiles = 4

fileList_list = []

for i in range (0, numFiles):
    fileList_list.append('files/file_list_'+str(i)+'.txt')

In [4]:
all_df = []
for file in fileList_list:
    df = pd.read_csv(file, delimiter= '\t')
    all_df.append(df)

df = pd.concat(all_df)
    

In [5]:
df = df[df["Article Citation"].str.startswith("PLoS")]
df = df[["Article File", "PMID"]]
df = df[df["PMID"] != 0]
df.head()

Unnamed: 0,Article File,PMID
0,PMC000xxxxxx/PMC176545.xml,12929205
1,PMC000xxxxxx/PMC176546.xml,12929206
4,PMC000xxxxxx/PMC193604.xml,12975658
5,PMC000xxxxxx/PMC193605.xml,12975657
9,PMC000xxxxxx/PMC212687.xml,14551903


In [6]:
# initialize empty lists
titles = []
abstracts = []
refs = []

In [7]:
def main(article_file):
    tree = ET.parse('./files/'+article_file) # parse file
    root = tree.getroot() # get root
    
    # get title
    title_element = root.findall('front')[0].findall('article-meta')[0].findall('title-group')[0].findall('article-title')[0]
    title = "".join(title_element.itertext())
    title = title.replace('\n','')
    
    # get abstract
    try:
        abstract_element = root.findall('front')[0].findall('article-meta')[0].findall('abstract')[0]
        abstract = "".join(abstract_element.itertext())
        abstract = abstract.replace('\n','')
    except: # no abstract, take body?
        abstract_element = root.findall('body')[0]
        abstract = "".join(abstract_element.itertext())
        abstract = abstract.replace('\n','')
    
    # get references
    pmid_list = []
    try:
        reference_element = root.findall('back')[0].findall('ref-list')[0].findall('ref')
        counter = 0
        for ref in reference_element:
            try:
                pmid = "".join(ref.findall('element-citation')[0].findall('pub-id')[0].itertext())
                if len(pmid) < 10:
                    pmid_list.append(pmid)
            except: # no pmid
                pass
    except:
        pmid_list = "NA"
        
    titles.append(title)
    abstracts.append(abstract)
    refs.append(pmid_list)
    
    log(f'Ran select_files.ipynb succesfully')
        

In [8]:
for i in df["Article File"]:
    main(i)

In [9]:
df["title"] = titles
df = df[["PMID", "title"]]
df["abstract"] = abstracts
df["references"] = refs

In [10]:
df = df[df["abstract"] != 'xx']
df = df[df["references"] != 'NA']
df = df.dropna()

In [11]:
df.head()

Unnamed: 0,PMID,title,abstract,references
0,12929205,The Transcriptome of the Intraerythrocytic Dev...,Plasmodium falciparum is the causative agent o...,"[12427465, 12351791, 12519984, 11123685, 11475..."
1,12929206,DNA Analysis Indicates That Asian Elephants Ar...,The origin of Borneo's elephants is controvers...,"[10739137, 8978083, 10762406, 12721223, 116817..."
4,12975658,Drosophila Free-Running Rhythms Require Interc...,Robust self-sustained oscillations are a ubiqu...,"[9630223, 11520929, 12839998, 10684876, 963542..."
5,12975657,From Gene Trees to Organismal Phylogeny in Pro...,The rapid increase in published genomic sequen...,"[12219091, 9254694, 11752243, 9278503, 1110269..."
9,14551903,A Functional Analysis of the Spacer of V(D)J R...,"During lymphocyte development, V(D)J recombina...","[9671477, 3120312, 9665841, 11983152, 7594539,..."


In [12]:
lens = df['abstract'].str.split().apply(len)
print(max(lens))
print(min(lens))
print(s.mean(lens))
print(s.stdev(lens))

22935
1
245.52868614295525
234.8821178619131


In [13]:
df.to_csv(r'files/select_files.txt', sep='\t', index = False)

In [14]:
df

Unnamed: 0,PMID,title,abstract,references
0,12929205,The Transcriptome of the Intraerythrocytic Dev...,Plasmodium falciparum is the causative agent o...,"[12427465, 12351791, 12519984, 11123685, 11475..."
1,12929206,DNA Analysis Indicates That Asian Elephants Ar...,The origin of Borneo's elephants is controvers...,"[10739137, 8978083, 10762406, 12721223, 116817..."
4,12975658,Drosophila Free-Running Rhythms Require Interc...,Robust self-sustained oscillations are a ubiqu...,"[9630223, 11520929, 12839998, 10684876, 963542..."
5,12975657,From Gene Trees to Organismal Phylogeny in Pro...,The rapid increase in published genomic sequen...,"[12219091, 9254694, 11752243, 9278503, 1110269..."
9,14551903,A Functional Analysis of the Spacer of V(D)J R...,"During lymphocyte development, V(D)J recombina...","[9671477, 3120312, 9665841, 11983152, 7594539,..."
13,14551908,The Roles of APC and Axin Derived from Experim...,Wnt signaling plays an important role in both ...,"[9923680, 12554650, 10508976, 4830198, 1204973..."
14,14551910,Genome-Wide RNAi of C. elegans Using the Hyper...,RNA-mediated interference (RNAi) is a method t...,"[12529643, 12471266, 9486653, 11099033, 110990..."
15,14551911,Heterochromatin Dynamics,Heterochromatin is usually thought of as a sta...,"[12560555, 10753776, 12560554, 11893491, 11498..."
16,14551912,Microarray Analysis,Microarrays can survey genome-wide expression ...,"[10676951, 11340206, 12042820, 11823860, 10610..."
17,14551913,V(D)J Recombination and the Evolution of the A...,In order for the immune system to generate its...,"[3120312, 11983152, 3416632, 10837067, 9768756..."
