# Data in More Complex Formats

## Parsing XML

In [1]:
import xml.etree.ElementTree as ET
import pprint

# Parse the sample research article once; `root` is the document's top-level
# element and is the starting point for the XPath-style lookups in the cells below.
tree = ET.parse('data/exampleResearchArticle.xml')
root = tree.getroot()

In [4]:
# Children of root: iterating an Element yields its direct children,
# so this prints the tag name of each top-level section of the article.
for child in root:
    print(child.tag)

ui
ji
fm
bdy
bm


In [5]:
# Locate the <title> element of the bibliographic front matter.
title = root.find('./fm/bibl/title')
# The title text lives in the child elements of <title>, so concatenate the
# text of each child. Element.text is None for a child without text, hence
# the `or ""` guard (plain `+=` would raise TypeError on None).
title_text = "".join(p.text or "" for p in title)
print("Title: ", title_text)

Title:  Standardization of the functional syndesmosis widening by dynamic U.S examination


In [7]:
# Author email addresses
# Visit every <au> element under fm/bibl/aug and print the text of its <email>
# child; find() returns None when there is no such child, so those authors are skipped.
for a in root.findall('./fm/bibl/aug/au'):
    email = a.find('email')
    if email is not None:
        print(email.text)

omer@extremegate.com
mcarmont@hotmail.com
laver17@gmail.com
nyska@internet-zahav.net
kammarh@gmail.com
gideon.mann.md@gmail.com
barns.nz@gmail.com
eukots@gmail.com


In [14]:
# Your task here is to extract data from xml on authors of an article
# and add it to a list, one item for an author.
# See the provided data structure for the expected format.
# The tags for first name, surname and email should map directly
# to the dictionary keys
import xml.etree.ElementTree as ET

# Relative path of the XML article that get_root() is called with below.
article_file = "data/exampleResearchArticle.xml"


def get_root(fname):
    """Parse the XML file `fname` and return the root element of its tree."""
    return ET.parse(fname).getroot()


def get_authors(root):
    """Collect first name, surname and e-mail for every author in the article.

    Args:
        root: element whose ``./fm/bibl/aug/au`` descendants are the authors.

    Returns:
        A list with one dict per ``<au>`` element, in document order. Each
        dict has the keys ``"fnm"``, ``"snm"`` and ``"email"``; a key holds
        the text of the child element with the same tag, or ``None`` when
        the author has no such child.
    """
    fields = ('fnm', 'snm', 'email')

    def extract(author):
        # Start with every field set to None, then fill in the ones present.
        record = dict.fromkeys(fields)
        for name in fields:
            node = author.find(name)
            if node is not None:
                record[name] = node.text
        return record

    return [extract(au) for au in root.findall('./fm/bibl/aug/au')]


# Parse the article and build the list of author dicts.
root = get_root(article_file)
data = get_authors(root)

# Bare expression as the last line of the cell so the notebook displays the list.
data


[{'email': 'omer@extremegate.com', 'fnm': 'Omer', 'snm': 'Mei-Dan'},
 {'email': 'mcarmont@hotmail.com', 'fnm': 'Mike', 'snm': 'Carmont'},
 {'email': 'laver17@gmail.com', 'fnm': 'Lior', 'snm': 'Laver'},
 {'email': 'nyska@internet-zahav.net', 'fnm': 'Meir', 'snm': 'Nyska'},
 {'email': 'kammarh@gmail.com', 'fnm': 'Hagay', 'snm': 'Kammar'},
 {'email': 'gideon.mann.md@gmail.com', 'fnm': 'Gideon', 'snm': 'Mann'},
 {'email': 'barns.nz@gmail.com', 'fnm': 'Barnaby', 'snm': 'Clarck'},
 {'email': 'eukots@gmail.com', 'fnm': 'Eugene', 'snm': 'Kots'}]

### Handling Attributes

In [18]:
# Your task here is to extract data from xml on authors of an article
# and add it to a list, one item for an author.
# See the provided data structure for the expected format.
# The tags for first name, surname and email should map directly
# to the dictionary keys, but you have to extract the attributes from the "insr" tag
# and add them to the list for the dictionary key "insr"
import xml.etree.ElementTree as ET

# Relative path of the XML article that get_root() is called with below.
article_file = "data/exampleResearchArticle.xml"


def get_root(fname):
    """Return the root element of the XML document stored in file `fname`."""
    document = ET.parse(fname)
    root_element = document.getroot()
    return root_element


def get_authors(root):
    """Collect name, e-mail and institution references for every author.

    Args:
        root: element whose ``./fm/bibl/aug/au`` descendants are the authors.

    Returns:
        A list with one dict per ``<au>`` element, in document order, with keys:
          * ``"fnm"``, ``"snm"``, ``"email"`` - text of the child element with
            that tag, or ``None`` when the author has no such child;
          * ``"insr"`` - list of the ``iid`` attribute of every ``<insr>``
            child (empty list when there are none).

    Raises:
        KeyError: if an ``<insr>`` child has no ``iid`` attribute.
    """
    def text_of(parent, tag):
        # find() returns None for a missing child; calling .text on that
        # would raise AttributeError, so map a missing child to None instead.
        node = parent.find(tag)
        return node.text if node is not None else None

    authors = []
    for author in root.findall('./fm/bibl/aug/au'):
        data = {
                "fnm": text_of(author, 'fnm'),
                "snm": text_of(author, 'snm'),
                "email": text_of(author, 'email'),
                "insr": [x.attrib['iid'] for x in author.findall('insr')]
        }

        authors.append(data)

    return authors

# Parse the article and build the list of author dicts (now including "insr").
root = get_root(article_file)
data = get_authors(root)

# Bare expression as the last line of the cell so the notebook displays the list.
data

[{'email': 'omer@extremegate.com',
  'fnm': 'Omer',
  'insr': ['I1'],
  'snm': 'Mei-Dan'},
 {'email': 'mcarmont@hotmail.com',
  'fnm': 'Mike',
  'insr': ['I2'],
  'snm': 'Carmont'},
 {'email': 'laver17@gmail.com',
  'fnm': 'Lior',
  'insr': ['I3', 'I4'],
  'snm': 'Laver'},
 {'email': 'nyska@internet-zahav.net',
  'fnm': 'Meir',
  'insr': ['I3'],
  'snm': 'Nyska'},
 {'email': 'kammarh@gmail.com',
  'fnm': 'Hagay',
  'insr': ['I8'],
  'snm': 'Kammar'},
 {'email': 'gideon.mann.md@gmail.com',
  'fnm': 'Gideon',
  'insr': ['I3', 'I5'],
  'snm': 'Mann'},
 {'email': 'barns.nz@gmail.com',
  'fnm': 'Barnaby',
  'insr': ['I6'],
  'snm': 'Clarck'},
 {'email': 'eukots@gmail.com', 'fnm': 'Eugene', 'insr': ['I7'], 'snm': 'Kots'}]

## Parsing HTML

In [37]:
from bs4 import BeautifulSoup

def options(soup, id):
    """Return the ``value`` of every ``<option>`` inside the element with the given id.

    Args:
        soup: parsed document; must support ``find(id=...)``.
        id: value of the ``id`` attribute of the element to look up
            (the parameter name shadows the builtin but is kept for callers).

    Returns:
        List of the ``value`` entries of the ``option`` elements found under
        that element, in the order ``find_all`` yields them.
    """
    container = soup.find(id=id)
    return [entry['value'] for entry in container.find_all('option')]

def print_list(label, codes):
    """Print a blank line, then `label` followed by a colon, then each item of `codes` on its own line."""
    header = '\n%s:' % label
    print(header)
    for code in codes:
        print(code)
        
# Parse the saved airport page. The file is opened in a `with` block so the
# handle is closed as soon as BeautifulSoup has been constructed from it,
# instead of being left open for the lifetime of the kernel.
with open('data/airport.html') as airport_html:
    soup = BeautifulSoup(airport_html, 'lxml')

# Option values of the carrier drop-down.
codes = options(soup, 'CarrierList')
print_list('Carriers', codes)

# Option values of the airport drop-down; printing is left disabled.
codes = options(soup, 'AirportList')
#print_list('Airports', codes)


Carriers:
All
AllUS
AllForeign
AS
G4
AA
5Y
DL
MQ
EV
F9
HA
B6
OO
WN
NK
UA
VX


In [40]:
# Please note that the function 'make_request' is provided for your reference only.
# You will not be able to actually use it from within the Udacity web UI.
# Your task is to process the HTML using BeautifulSoup, extract the hidden
# form field values for "__EVENTVALIDATION" and "__VIEWSTATE" and set the appropriate
# values in the data dictionary.
# All your changes should be in the 'extract_data' function
from bs4 import BeautifulSoup
import requests
import json

# Relative path of the saved HTML page that extract_data() is called with below.
html_page = 'data/airport.html'


def extract_data(page):
    """Read the hidden form field values from a saved HTML page.

    Args:
        page: path of the HTML file to open and parse.

    Returns:
        Dict with keys ``"eventvalidation"`` and ``"viewstate"`` holding the
        ``value`` attribute of the elements whose ids are
        ``__EVENTVALIDATION`` and ``__VIEWSTATE`` respectively.
    """
    # Result key -> id of the form field that supplies its value.
    field_ids = (
        ("eventvalidation", "__EVENTVALIDATION"),
        ("viewstate", "__VIEWSTATE"),
    )
    with open(page, "r") as html:
        soup = BeautifulSoup(html, 'lxml')
        return {key: soup.find(id=field_id)['value'] for key, field_id in field_ids}


def make_request(data, timeout=30):
    """POST the carrier/airport form and return the response body as text.

    Args:
        data: dict with the keys ``"eventvalidation"`` and ``"viewstate"``
            (the shape returned by ``extract_data``).
        timeout: seconds passed to ``requests.post`` as its ``timeout``
            argument, so the call cannot block indefinitely.

    Returns:
        The text of the HTTP response.
    """
    eventvalidation = data["eventvalidation"]
    viewstate = data["viewstate"]

    # Use the https endpoint, the same URL the session-based scraping cell
    # of this notebook requests.
    r = requests.post("https://www.transtats.bts.gov/Data_Elements.aspx?Data=2",
                    data={'AirportList': "BOS",
                          'CarrierList': "VX",
                          'Submit': 'Submit',
                          "__EVENTTARGET": "",
                          "__EVENTARGUMENT": "",
                          "__EVENTVALIDATION": eventvalidation,
                          "__VIEWSTATE": viewstate
                    },
                    timeout=timeout)

    return r.text



# Pull the two hidden form field values out of the saved page.
data = extract_data(html_page)


### Scraping 

In [42]:
import requests
from bs4 import BeautifulSoup

# One Session object is used for both the GET and the POST below.
s = requests.Session()

# Step 1: fetch the form page and read the two hidden form fields
# (__VIEWSTATE and __EVENTVALIDATION) that must be sent back with the form.
r = s.get("https://www.transtats.bts.gov/Data_Elements.aspx?Data=2")
soup = BeautifulSoup(r.text, 'lxml')
viewstate = soup.find(id="__VIEWSTATE")['value']
eventvalidation = soup.find(id="__EVENTVALIDATION")['value']

# Step 2: submit the form for carrier "VX" and airport "BOS", echoing the
# hidden field values just read. The form data is given as (name, value) pairs.
r = s.post("https://www.transtats.bts.gov/Data_Elements.aspx?Data=2",
          data = (
                   ("__EVENTTARGET", ""),
                   ("__EVENTARGUMENT", ""),
                   ("__VIEWSTATE", viewstate),
                   ("__EVENTVALIDATION", eventvalidation),
                   ("CarrierList", "VX"),
                   ("AirportList", "BOS"),
                   ("Submit", "Submit")          
          ))

# Save the response body to disk. The `with` block closes the file when the
# write finishes, so the content is flushed instead of depending on the
# handle being garbage-collected.
with open('data/virgin_and_logan_airport.html', 'w') as f:
    f.write(r.text)

337742