# Extracting data from XML using regular expressions

## Regular expression - first contact

In [11]:
# Sample line of XML markup used to try out the regular expressions below.
line = "<Title>Nucleic acids research</Title>"

In [12]:
# Regular Expressions
import re

In [13]:
# A pattern without special characters matches that literal text;
# findall returns every non-overlapping occurrence as a list.
re.findall("acid", line)

['acid']

In [14]:
# "." matches any character except a newline and "*" repeats it as often as
# possible, so the match runs from the first "a" to the end of the line.
re.findall("a.*", line)

['acids research</Title>']

In [15]:
# "{3}" repeats the preceding "." exactly three times: an "a" followed by
# any three characters.
re.findall("a.{3}", line)

['acid', 'arch']

In [16]:
# Any two characters, then an "a", then any three characters.
re.findall(".{2}a.{3}", line)

['c acid', 'search']

In [17]:
# Literal opening and closing tags around ".*" match the whole element,
# tags included.
re.findall("<Title>.*</Title>", line)

['<Title>Nucleic acids research</Title>']

In [18]:
# With a group "( )" in the pattern, findall returns only the text captured
# by the group, i.e. the element content without the surrounding tags.
re.findall("<Title>(.*)</Title>", line)

['Nucleic acids research']

## Downloading an XML example file

In [19]:
import urllib.request


# Location of the example PubMed export on GitHub.
xml_url = "https://raw.githubusercontent.com/foerstner-lab/Bits_and_pieces_for_the_carpentries_workshops/master/text_file_examples/PubMed.xml"
# The local copy is named after the last path component of the URL.
xml_file = xml_url.rpartition("/")[2]

# Fetch the file under that relative name; the cell displays the
# (filename, headers) tuple that urlretrieve returns.
urllib.request.urlretrieve(url=xml_url, filename=xml_file)

('PubMed.xml', <http.client.HTTPMessage at 0x7f50d86190f0>)

In [20]:
# Show the local file name that was derived from the URL.
xml_file

'PubMed.xml'

## Extracting patterns from the downloaded file using regular expressions

In [21]:
# Print the content of every <Year> element, one line of the file at a time.
# - The "with" block closes the file handle once the loop is done.
# - The XML declaration names no encoding, which means UTF-8 for XML, so the
#   file is decoded as UTF-8 regardless of the platform default.
# - The loop variable is not called "line", so the sample string assigned to
#   "line" at the top of the notebook keeps its value if earlier cells are
#   re-run after this one.
with open(xml_file, encoding="utf-8") as xml_handle:
    for xml_line in xml_handle:
        matches = re.findall("<Year>(.*)</Year>", xml_line)
        if matches:  # an empty list means no <Year> element on this line
            print(matches)

['2013']
['2018']
['2013']
['2012']
['2012']
['2012']
['2013']


In [22]:
# Read the full file into a variable. The "with" block closes the file handle
# again, and the content is decoded as UTF-8 (the XML declaration names no
# encoding, which means UTF-8 for XML) regardless of the platform default.
with open(xml_file, encoding="utf-8") as xml_handle:
    article_meta_data = xml_handle.read()

In [23]:
# Preview only the first 1000 characters of the document; printing the whole
# file floods the notebook with output.
print(article_meta_data[:1000])

<?xml version="1.0"?>
<!DOCTYPE PubmedArticleSet PUBLIC "-//NLM//DTD PubMedArticle, 1st January 2019//EN" "https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_190101.dtd">
<PubmedArticleSet>

<PubmedArticle>
    <MedlineCitation Status="MEDLINE" Owner="NLM">
        <PMID Version="1">23203889</PMID>
        <DateCompleted>
            <Year>2013</Year>
            <Month>05</Month>
            <Day>13</Day>
        </DateCompleted>
        <DateRevised>
            <Year>2018</Year>
            <Month>11</Month>
            <Day>13</Day>
        </DateRevised>
        <Article PubModel="Print-Electronic">
            <Journal>
                <ISSN IssnType="Electronic">1362-4962</ISSN>
                <JournalIssue CitedMedium="Internet">
                    <Volume>41</Volume>
                    <Issue>Database issue</Issue>
                    <PubDate>
                        <Year>2013</Year>
                        <Month>Jan</Month>
                    </PubDate>
                </J

In [24]:
# Raw string (r"..."): the backslash in "\d" reaches the regex engine unchanged
# and Python does not warn about an invalid escape sequence.
# "\d{4}" accepts exactly four digits between the <Year> tags.
re.findall(r"<Year>(\d{4})</Year>", article_meta_data)

['2013', '2018', '2013', '2012', '2012', '2012', '2013']

In [25]:
# Raw string so that "\d" is passed to the regex engine without an
# invalid-escape warning. The pattern finds every run of five consecutive
# digits; longer numbers are cut into five-digit chunks from the left, e.g.
# '23203' comes from the PMID 23203889.
re.findall(r"\d{5}", article_meta_data)

['19010',
 '23203',
 '20892',
 '01642',
 '05206',
 '04110',
 '00169',
 '01620',
 '02040',
 '00931',
 '01201',
 '01201',
 '01851',
 '01448',
 '23203',
 '35312',
 '17971',
 '21062',
 '16467',
 '16888',
 '22140']