This notebook develops the functionality for querying an external database, extracting data from that database, and storing the data in a NumPy array.

In [1]:
#import the needed packages
import numpy as np
from bs4 import BeautifulSoup
from bs4.dammit import EncodingDetector
import requests
import re

In [35]:
#define the needed functions

def make_web_address(dspacing1, dspacing2, tolerance='.001'):
    '''
    make_web_address builds the address of a two-value d-spacing
    search on the AMS database hosted at rruff.geo.arizona.edu.

    For example, a search for rutile might use this address:

    http://rruff.geo.arizona.edu/AMS/result.php?diff=vals(3.2435,2.4836),opt(),type(d-spacing),tolerance(.001)

    where 3.2435 and 2.4836 are two d spacings in the rutile structure.

    Parameters
    ----------
    dspacing1, dspacing2 : float or str
        The two d-spacing values to search for. Each one is converted
        with str() and placed inside vals(...).
    tolerance : float or str, optional
        Value placed inside tolerance(...). The default is the string
        '.001' (not the float 0.001) so that the default address is
        character-for-character the one this function has always built.

    Returns
    -------
    str
        The complete search address.
    '''

    template = 'http://rruff.geo.arizona.edu/AMS/result.php?diff=vals({},{}),opt(),type(d-spacing),tolerance({})'

    return template.format(str(dspacing1), str(dspacing2), str(tolerance))

def find_diffraction_files(href):
    '''
    find_diffraction_files is the href filter that compile_links hands
    to BeautifulSoup: it accepts an href that contains "txt" and does
    not contain "dif".

    A missing or empty href is handed back unchanged (so the result is
    falsy), an href without "txt" gives None, and any other href gives
    True or False.
    '''
    # no href (None) or an empty one: nothing to test, return it as-is
    if not href:
        return href

    # the href has to mention "txt" somewhere
    if re.search("txt", href) is None:
        return None

    # ... and must not mention "dif"
    return re.search("dif", href) is None

def compile_links(web_address, timeout=30):
    '''
    compile_links downloads the webpage at web_address, finds every
    tag whose href is accepted by find_diffraction_files, and returns
    those href values as a list.

    Parameters
    ----------
    web_address : str
        Address of the page to download.
    timeout : float, optional
        Seconds requests will wait on the server before giving up.
        30 is an arbitrary but generous default; without any timeout
        an unresponsive server would block the cell forever.

    Returns
    -------
    list of str
        The href values exactly as they are written in the page. They
        are not resolved against web_address, so site-relative links
        stay relative.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails, times out, or the server answers with an
        HTTP error status.
    '''

    html_page = requests.get(web_address, timeout=timeout)
    # An error page would otherwise be parsed like a result page and
    # produce a misleading empty list.
    html_page.raise_for_status()

    # Prefer the encoding declared inside the document itself; fall back
    # to the HTTP header only when that header explicitly names a charset.
    content_type = html_page.headers.get('content-type', '').lower()
    http_encoding = html_page.encoding if 'charset' in content_type else None
    html_encoding = EncodingDetector.find_declared_encoding(html_page.content, is_html=True)
    encoding = html_encoding or http_encoding

    # Name the parser explicitly: when it is left out, BeautifulSoup picks
    # whichever parser happens to be installed and warns about the guess.
    soup = BeautifulSoup(html_page.content, 'html.parser', from_encoding=encoding)

    return [link['href'] for link in soup.find_all(href=find_diffraction_files)]

Generate a web search for minerals with matching d-spacings.

In [36]:
# the two d-spacing values to search for
dspacing1, dspacing2 = 3.3, 3.5

make_web_address(dspacing1, dspacing2)


'http://rruff.geo.arizona.edu/AMS/result.php?diff=vals(3.3,3.5),opt(),type(d-spacing),tolerance(.001)'

Create a list of links to the diffraction text files for the matching mineral structures.

In [37]:
# Build the address from the d-spacings set in the previous cell instead of
# repeating the URL by hand, so the search cannot drift out of step with them.
web_address = make_web_address(dspacing1, dspacing2)
compile_links(web_address)

['/AMS/download.php?id=01755.txt&down=text',
 '/AMS/download.php?id=16200.txt&down=text',
 '/AMS/download.php?id=17097.txt&down=text',
 '/AMS/download.php?id=18445.txt&down=text',
 '/AMS/download.php?id=20133.txt&down=text']