In [1]:
import pandas as pd
import datetime
import calendar
from lxml import etree
import glob

In [2]:
# Sort the matches: glob.glob() returns paths in arbitrary (OS-dependent) order,
# and create_df() uses each file's position in this list as its row label, so
# an unsorted list makes row numbers differ from one machine/run to the next.
abstractlist = sorted(glob.glob("abstracts/*"))

# Tags collected from every article file; also used as the DataFrame columns.
xml_elements = ['datecreated', 'title', 'articletitle', 'abstracttext', 'language', 'country', 'publicationstatus']

In [3]:
def parse_date(time):
    """Convert a date element into a ``datetime.date``.

    The first three child elements of ``time`` are read, in order, as year,
    month and day.  The month may be numeric (``"03"``) or an abbreviated
    month name from ``calendar.month_abbr`` (``"Mar"``); name matching
    ignores case and surrounding whitespace.

    Parameters
    ----------
    time : element
        An ElementTree-style element (iterable over its children, each child
        exposing ``.text``).

    Returns
    -------
    datetime.date

    Raises
    ------
    IndexError
        If the element has fewer than three children.
    KeyError
        If the month is neither an integer nor a known abbreviation.
    ValueError
        If year/day are not integers or the resulting date is invalid.
    """
    # list(element) is the supported replacement for the deprecated
    # getchildren(); it works for both lxml and xml.etree elements.
    fields = list(time)

    year = int(fields[0].text)
    day = int(fields[2].text)

    month_text = fields[1].text.strip()
    try:
        month = int(month_text)
    except ValueError:
        # Not numeric: treat it as a month abbreviation.  The lookup is built
        # only on this path; the empty entry at index 0 is skipped.
        month_numbers = {
            abbr.lower(): number
            for number, abbr in enumerate(calendar.month_abbr)
            if abbr
        }
        month = month_numbers[month_text.lower()]

    return datetime.date(year, month, day)

def create_df(articlelist):
    """Build a DataFrame with one row per article file.

    Each entry of ``articlelist`` is parsed with ``etree.iterparse``,
    restricted to the tags listed in the module-level ``xml_elements``.
    For every tag except ``datecreated`` the stripped text of all matching
    elements is joined with single spaces.  Country values are additionally
    lower-cased and de-duplicated.  ``datecreated`` is converted with
    ``parse_date``; when that fails for any reason the placeholder string
    ``'not available'`` is stored instead.

    Parameters
    ----------
    articlelist : sequence
        Sources accepted by ``etree.iterparse`` (e.g. file paths).

    Returns
    -------
    pandas.DataFrame
        Columns are ``xml_elements``; row ``i`` corresponds to
        ``articlelist[i]``.
    """
    def elements(parsed, tag):
        # All parsed elements carrying the given tag, in document order.
        return [elem for elem_tag, elem in parsed if elem_tag == tag]

    def texts(parsed, tag):
        # Stripped text of each matching element.  An element without text
        # (``.text is None``) contributes '' instead of raising.
        return [(elem.text or '').strip() for elem in elements(parsed, tag)]

    plain_text_tags = ('abstracttext', 'title', 'articletitle',
                       'language', 'publicationstatus')

    records = []
    for source in articlelist:
        parsed = [
            (elem.tag, elem)
            for _event, elem in etree.iterparse(source, events=("end",), tag=xml_elements)
        ]

        record = {tag: ' '.join(texts(parsed, tag)) for tag in plain_text_tags}

        # Deliberate best-effort: a missing or malformed date must not abort
        # the whole run, so any failure is recorded as a placeholder.
        try:
            record['datecreated'] = parse_date(elements(parsed, 'datecreated')[0])
        except Exception:
            record['datecreated'] = 'not available'

        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # joined string is deterministic (a set's iteration order is not).
        countries = dict.fromkeys(name.lower() for name in texts(parsed, 'country'))
        record['country'] = ' '.join(countries)

        records.append(record)

    # Build the frame once instead of growing it cell by cell.
    return pd.DataFrame(records, columns=xml_elements)

In [4]:
# Parse every file listed in abstractlist into one DataFrame row each.
df = create_df(abstractlist)

In [5]:
# Display the parsed articles (one row per file, columns from xml_elements).
df

Unnamed: 0,datecreated,title,articletitle,abstracttext,language,country,publicationstatus
0,2001-03-06,Diabetes care,Pioglitazone hydrochloride monotherapy improve...,To evaluate the efficacy and safety of four do...,eng,united states,ppublish
1,1996-05-28,Archives of general psychiatry,Six-month follow-up of naltrexone and psychoth...,The goal of this study was to examine the pers...,eng,united states,ppublish
2,2003-10-27,Diabetes care,The treat-to-target trial: randomized addition...,To compare the abilities and associated hypogl...,eng,united states,ppublish
3,2001-05-04,Clinical pediatrics,Are follow-up throat cultures necessary when r...,The frequency of obtaining false-negative Grou...,eng,united states,ppublish
4,2006-11-08,Acta obstetricia et gynecologica Scandinavica,Prevention of postpartum hemorrhage by uteroto...,To determine the efficacy of intravenous oxyto...,eng,denmark,ppublish
5,1995-10-17,JAMA : the journal of the American Medical Ass...,From the Centers for Disease Control and Preve...,,eng,united states,ppublish
6,2004-02-19,Cochrane database of systematic reviews (Online),Acupuncture for induction of labour.,This is one of a series of reviews of methods ...,eng,england,ppublish
7,2001-05-01,Drug safety : an international journal of medi...,Should celecoxib be contraindicated in patient...,"Celecoxib, a selective cyclo-oxygenase-2 inhib...",eng,new zealand,ppublish
8,2000-09-12,Journal of the American Academy of Dermatology,Pulsed-dye laser versus conventional therapy i...,The clinical management of verrucae vulgaris i...,eng,united states,ppublish
9,1999-09-02,Lancet,Dietary supplementation with n-3 polyunsaturat...,There is conflicting evidence on the benefits ...,eng,england,ppublish


In [7]:
# Rows whose creation date could not be parsed: create_df stores the
# placeholder string 'not available' in 'datecreated' for those.
df.loc[df['datecreated'].eq('not available')]

Unnamed: 0,datecreated,title,articletitle,abstracttext,language,country,publicationstatus
907,not available,,,PURPOSE: Both the US Preventive Services Task ...,eng,,ppublish
2071,not available,,,This report represents a departure from the ch...,eng,,ppublish
2518,not available,,,The aim of this guideline is to offer best pra...,eng,,ppublish
