In [1]:
# Preparatory notebook for the database scraper

In [27]:
from bs4 import BeautifulSoup
import requests



In [2]:
%%file Scrapers.py
import requests
from bs4 import BeautifulSoup
import pytest 

class Scraper_7_coeffs:
    """
    Scraper for the 7-degree reaction coefficients from the BURCAT Database.
    
    Requires following imports:
    from bs4 import BeautifulSoup
    import requests
    
    The default url to scrape from is defaulted in initialization:
    http://garfield.chem.elte.hu/Burcat/BURCAT.THR
    
    INPUTS
    =======
    The url where the database file resides.
    
    RETURNS
    ========
    String with the database table, to be passed to the parser to create XML or SQL object
    
    EXAMPLES
    =========
    Showing of the main scraper functionality. Returns true if everything runs properly.
    
    >>> test_scraper = Scraper_7_coeffs(url='http://garfield.chem.elte.hu/Burcat/BURCAT.THR')
    >>> table_of_coeffs = test_scraper.scrape_coeffs()
    >>> table_of_coeffs.__class__.__name__=='str'
    True
    """
    
    def __init__(self,url='http://garfield.chem.elte.hu/Burcat/BURCAT.THR'):
        
        self.url = url
        self.result = None
        self.text_body = None
        self.webpage = None
        self.soup = None
    
    def scrape_url(self):
        self.web_page = requests.get(self.url)
        
    def create_soup_object(self):
        self.soup = BeautifulSoup(self.web_page.content, 'lxml')
        
    def scrape_coeffs(self):
        self.scrape_url()
        self.create_soup_object()
        
        my_tag = self.soup.find('egil.jahnsen')

        #Identify the tag with the 7-coeffs polynomial
        for item in my_tag.contents:
            if item.__class__.__name__ == "Tag":
                text_body = item
        
        # Find the exact element with the body text
        for child in text_body.contents:
            if child.__class__.__name__ == "Tag":
                raw_coeffs_body = child.get_text()
        
        #Separate the table
        for idx,line in enumerate(raw_coeffs_body.split('\n')):
            if 'THE NUMBER PRECEDING EACH SPECIES IS THE CHEMICAL ABSTRACT' in line:
                self.result = raw_coeffs_body.split('THE NUMBER PRECEDING EACH SPECIES IS THE CHEMICAL ABSTRACT (CAS) IDENTIFICATION.')[1]

        return self.result

if __name__ == "__main__":
    import doctest
    from bs4 import BeautifulSoup
    import requests
    import numpy as np
    doctest.testmod(verbose=True)


Overwriting Scrapers.py


In [170]:
test_scraper = Scraper_7_coeffs(url='http://garfield.chem.elte.hu/Burcat/BURCAT.THR')
table_of_coeffs = test_scraper.scrape_coeffs()

In [171]:
%%bash
python Scrapers.py -v

Trying:
    test_scraper = Scraper_7_coeffs(url='http://garfield.chem.elte.hu/Burcat/BURCAT.THR')
Expecting nothing
ok
Trying:
    table_of_coeffs = test_scraper.scrape_coeffs()
Expecting nothing
ok
Trying:
    table_of_coeffs.__class__.__name__=='str'
Expecting:
    True
ok
5 items had no tests:
    __main__
    __main__.Scraper_7_coeffs.__init__
    __main__.Scraper_7_coeffs.create_soup_object
    __main__.Scraper_7_coeffs.scrape_coeffs
    __main__.Scraper_7_coeffs.scrape_url
1 items passed all tests:
   3 tests in __main__.Scraper_7_coeffs
3 tests in 6 items.
3 passed and 0 failed.
Test passed.


In [3]:
%%file ../../tests/test_Scrapers.py

#Import the desired module
from pychemkin.thermo_database.Scrapers import *

class Test_Scrapers():
    """
    Class for testing the parsers basic functionality.
    """
    
    #def __init__(self):
    #    self.success = True
    #    self.fail = False
    
    @staticmethod
    def test_Scraper_7():
        """
        Tests for Parser of 7-coeff NASA Polynomials
        """
        test7 = Scraper_7_coeffs()
        # The following method scrape_coeffs() implicitly calls scrape_url() and create_soup_object()
        with pytest.warns(DeprecationWarning):
            table = test7.scrape_coeffs()
        
        #Test that the server response to the parser was 200.
        assert str(test7.web_page.__repr__) == "<bound method Response.__repr__ of <Response [200]>>"
        
        # Test soup object creation
        assert str(test7.soup.__class__.__name__) == "BeautifulSoup"
        
        # Test isolation of table of interest by testing type and length
        
        assert type(table) == str and len(table)>1400000
              

Overwriting ../../tests/test_Scrapers.py


In [168]:
%%bash
python -m pytest -v test_Scrapers.py

platform darwin -- Python 3.6.4, pytest-3.3.2, py-1.5.2, pluggy-0.6.0 -- /Users/filipmichalsky/anaconda/bin/python
cachedir: ../../.cache
rootdir: /Users/filipmichalsky/Harvard_Spring_18/pychemkin, inifile:
plugins: cov-2.5.1
collecting ... collected 1 item

test_Scrapers.py::Test_Scrapers::test_Scraper_7 FAILED                   [100%]

_________________________ Test_Scrapers.test_Scraper_7 _________________________

    @staticmethod
    def test_Scraper_7():
        """
            Tests for Parser of 7-coeff NASA Polynomials
            """
        test7 = Scraper_7_coeffs()
        # The following method scrape_coeffs() implicitly calls scrape_url() and create_soup_object()
>       table = test7.scrape_coeffs()

test_Scrapers.py:21: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

self = <Scrapers.Scraper_7_coeffs object at 0x10d114208>

    def scrape_coeffs(self):
        self.scrape_url()
        self.create_soup_object()
    
        my_tag = 

In [117]:
Test_Parsers.test_Parser_7()

In [87]:
test_scraper = Scraper_7_coeffs(url='http://garfield.chem.elte.hu/Burcat/BURCAT.THR')
table_of_coeffs = test_scraper.scrape_coeffs()
print(table_of_coeffs.__class__.__name__)

str


In [89]:
print(table_of_coeffs.__class__.__name__=='str')

True


In [13]:
# Download 7-coeff polynomials
wiki_url = "http://garfield.chem.elte.hu/Burcat/BURCAT.THR"

requests.get(wiki_url)

# Create BeatifulSoup object
soup = BeautifulSoup(web_page.content, 'lxml')
#table_classes = {"class": ["sortable", "plainrowheaders"]}

mytag = soup.find('egil.jahnsen')

#Identify the tag with the 7-coeffs polynomial
for item in mytag.contents:
    if item.__class__.__name__ == "Tag":
        text_body = item

# Find the exact element with the body text
for child in text_body.contents:
    if child.__class__.__name__ == "Tag":
        raw_coeffs_body = child.get_text()

pre_cleaned_coeffs = ''
#print(raw_coeffs_body)

#parse just the coeffs table

start_idx = None
end_idx = True

for idx,line in enumerate(raw_coeffs_body.split('\n')):
    if 'THE NUMBER PRECEDING EACH SPECIES IS THE CHEMICAL ABSTRACT' in line:
        table = raw_coeffs_body.split('THE NUMBER PRECEDING EACH SPECIES IS THE CHEMICAL ABSTRACT (CAS) IDENTIFICATION.')[1]






7440-22-4
Ag cr,liq  REFERENCE ELEMENT  Calculated by S. Gordon  HF298=0. kJ  REF=CODATA
Cox Wagman 1984  Max Lst Sq Error Cp @ 200 K 0.58%.
Ag (solid)        T 6/12AG 1.   0.   0.   0.S   200.000  1235.080  A 107.86820 1
 2.07216824E+00 2.46393729E-03-1.34351116E-06 3.69321107E-10 0.00000000E+00    2
-6.37725170E+02-7.18810718E+00 2.25225065E+00 5.43263008E-03-1.32153990E-05    3
 1.50423505E-08-5.94991675E-12-8.23132027E+02-8.86835190E+00 0.00000000E+00    4
Ag (liquid)       T 6/12AG 1.   0.   0.   0.L  1235.080  6000.000  A 107.86820 1
 4.04091552E+00-3.49297186E-05 1.60169701E-08-2.96225835E-12 1.92332513E-16    2
-4.77718035E+02-1.78491707E+01 0.00000000E+00 0.00000000E+00 0.00000000E+00    3
 0.00000000E+00 0.00000000E+00 0.00000000E+00 0.00000000E+00 0.00000000E+00    4

7440-22-4
Ag (gas)  Calculated by B McBride  HF298=284.9+/-0.8  REF=CODATA Cox Wagman 1984
Max Lst Sq Error Cp @ 5400 K 0.16%
Ag                g10/97AG 1.   0.   0.   0.G   200.000  6000.000