This notebook builds the functionality for querying an external database, extracting data from it, and storing that data in a NumPy array.

In [5]:
#import the needed packages
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from bs4.dammit import EncodingDetector
import requests
import re
import csv

In [6]:
# Define the needed functions.
# TODO: take a single list as input so that a variable number of d-spacings can be passed.
def make_web_address(dspacing1, dspacing2):
    '''
    Build the search address for a two-value d-spacing query against the
    AMS database pages hosted at rruff.geo.arizona.edu.

    Both d-spacings are converted with str() and placed, comma separated,
    inside the vals(...) term of the query. The search type is fixed to
    d-spacing and the tolerance to .001.

    For example, 3.2435 and 2.4836 (two d spacings in the rutile
    structure) produce:

    http://rruff.geo.arizona.edu/AMS/result.php?diff=vals(3.2435,2.4836),opt(),type(d-spacing),tolerance(.001)

    Returns the assembled address as a string.
    '''
    address_head = 'http://rruff.geo.arizona.edu/AMS/result.php?diff=vals('
    address_tail = '),opt(),type(d-spacing),tolerance(.001)'
    joined_values = ','.join([str(dspacing1), str(dspacing2)])

    return address_head + joined_values + address_tail

def find_diffraction_files(href):
    '''
    Filter on link targets, meant to be passed to compile_links as the
    href argument of soup.find_all.

    A falsy href (None or an empty string) is handed back unchanged.
    Otherwise the result of searching href for the text "dif" is
    returned: a match object when it is present, None when it is not.
    '''
    # To pick up the plain text-file links instead, require "txt" in the
    # href and reject any href that contains "dif".
    if not href:
        return href
    return re.search("dif", href)

def compile_links(web_address, timeout=30):
    '''
    Fetch a page and return the diffraction-file links found on it.

    The page at web_address is downloaded and parsed, every tag whose
    href passes find_diffraction_files is collected, and each href is
    prefixed with 'http://rruff.geo.arizona.edu' to form a full address.

    Parameters
    ----------
    web_address : str
        Address of the page to scan, e.g. the value returned by
        make_web_address.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    list of str
        The full link addresses, in the order find_all yields them.
        Empty when no link on the page matches.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.RequestException
        For connection problems, including a timeout.
    '''
    # Without a timeout a stalled server would hang the notebook forever.
    html_page = requests.get(web_address, timeout=timeout)
    # Surface server errors instead of parsing an error page into [].
    html_page.raise_for_status()

    # Prefer the encoding declared inside the document; fall back to the
    # HTTP header only when that header really carries a charset.
    http_encoding = html_page.encoding if 'charset' in html_page.headers.get('content-type', '').lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(html_page.content, is_html=True)
    encoding = html_encoding or http_encoding

    # Name the parser explicitly: letting BeautifulSoup guess makes the
    # result depend on which parsers are installed and emits a warning.
    soup = BeautifulSoup(html_page.content, 'html.parser', from_encoding=encoding)

    return ['http://rruff.geo.arizona.edu' + link['href']
            for link in soup.find_all(href=find_diffraction_files)]

def split_diffraction_data(link, timeout=30):
    '''
    Download one diffraction file and split it into metadata and data.

    The file at link is fetched, decoded as UTF-8 and divided at two
    marker lines: the column header containing '2-THETA' and the
    separator containing a run of thirty '=' characters.

    Parameters
    ----------
    link : str
        Web address of the diffraction file, e.g. one entry of the list
        returned by compile_links.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    (metadata_list, data_list) : tuple of two lists of str
        metadata_list holds every non-blank line before the '2-THETA'
        header. data_list holds the header line and every non-blank
        line after it, up to but excluding the '=' separator. Lines are
        returned whole and unstripped.

    Raises
    ------
    ValueError
        If either marker line is missing from the file.
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    '''
    with requests.Session() as session:
        download = session.get(link, timeout=timeout)
        download.raise_for_status()
        decoded_content = download.content.decode('utf-8')

    return _split_diffraction_lines(decoded_content)


def _split_diffraction_lines(text):
    '''Split diffraction-file text into (metadata lines, data lines).'''
    kept_lines = []
    header_index = None
    separator_index = None

    # Plain line iteration, not csv.reader: these files are not CSV and
    # taking the first CSV field cut every line off at its first comma.
    for line in text.splitlines():
        if not line.strip():
            # Blank lines carry no information and are dropped.
            continue
        # Marker positions are taken in kept_lines itself, so dropped
        # blank lines can never shift the slices below. As before, the
        # last occurrence of each marker wins.
        if '2-THETA' in line:
            header_index = len(kept_lines)
        elif '==============================' in line:
            separator_index = len(kept_lines)
        kept_lines.append(line)

    if header_index is None or separator_index is None:
        raise ValueError(
            "diffraction file is missing the '2-THETA' header line "
            "or the '=' separator line"
        )

    metadata_list = kept_lines[:header_index]
    data_list = kept_lines[header_index:separator_index]
    return metadata_list, data_list

def query_wrapper(links=None):
    '''
    Download and split every diffraction file in a list of links.

    Parameters
    ----------
    links : iterable of str, optional
        Addresses to pass to split_diffraction_data. When omitted, the
        notebook-level variable links_list is used, which is what this
        function always read before the parameter existed.

    Returns
    -------
    list of tuple
        One (metadata_list, data_list) pair per link, in input order.
    '''
    if links is None:
        # Backward-compatible fallback to notebook-global state; raises
        # NameError if the cell that assigns links_list has not been run.
        links = links_list

    # Keep every result instead of discarding it, so callers can use them.
    return [split_diffraction_data(link) for link in links]

Generate a web search for minerals with matching d-spacings.

In [7]:
# The two d-spacing values to search for.
dspacing1 = 3.3
dspacing2 = 3.5
# TODO: this step may need to move into a wrapper function;
# create a skeleton for that wrapper.
web_address = make_web_address(dspacing1, dspacing2)
# Bare last expression so the notebook displays the generated address.
web_address

'http://rruff.geo.arizona.edu/AMS/result.php?diff=vals(3.3,3.5),opt(),type(d-spacing),tolerance(.001)'

Create a list of links to the diffraction text files for the matching mineral structures.

In [8]:
# Collect the diffraction-file links found on the page at web_address.
links_list = compile_links(web_address)
# Bare last expression so the notebook displays the collected links.
links_list

['http://rruff.geo.arizona.edu/AMS/download.php?id=01755.txt&down=dif',
 'http://rruff.geo.arizona.edu/AMS/download.php?id=16200.txt&down=dif',
 'http://rruff.geo.arizona.edu/AMS/download.php?id=17097.txt&down=dif',
 'http://rruff.geo.arizona.edu/AMS/download.php?id=18445.txt&down=dif',
 'http://rruff.geo.arizona.edu/AMS/download.php?id=20133.txt&down=dif']

In [9]:
# Same d-spacing values as assigned in the earlier search cell.
dspacing1 = 3.3
dspacing2 = 3.5

# Empty DataFrame with one column per field to be stored.
# NOTE(review): "symetry" and "gama" look like misspellings of
# "symmetry" and "gamma". They are runtime column labels, so they are
# left unchanged here; confirm what reads them before renaming.
column_names = ["mineral", "symetry", "a", "b", "c", "alpha", "beta", "gama", "dspacing", "h", "k", "l"]
pot_planes_df = pd.DataFrame(columns = column_names)

# NOTE(review): get_diffraction_data is not defined in any cell visible
# in this notebook, so this call presumably relies on a definition left
# in the kernel and would raise NameError on a fresh run; confirm.
get_diffraction_data(links_list, dspacing1, dspacing2, pot_planes_df)

      Gillulyite
      Foit F F
      American Mineralogist 80 (1995) 394-399
      The crystal structure of gillulyite
      deposit
      _database_code_amcsd 0001733
      CELL PARAMETERS:   9.5840  5.6790 21.5010   90.000  100.070   90.000
      SPACE GROUP: P2/n      
      X-RAY WAVELENGTH:     1.541838
      Cell Volume:   1152.219
      Density (g/cm3):      4.136
      MAX. ABS. INTENSITY / VOLUME**2:      15.64160708    
      RIR:      1.231
      RIR based on corundum from Acta Crystallographica A38 (1982) 733-739
               2-THETA      INTENSITY    D-SPACING   H   K   L   Multiplicity
                10.91          4.16        8.1078    1   0   1         2
                14.30         16.99        6.1945   -1   0   3         2
                16.16         12.17        5.4851    0   1   1         4
                16.75         32.42        5.2924    0   0   4         2
                16.95          4.90        5.2296    1   0   3         2
                17.72    

      Hydrate - tetrahydrofuran
      Jones C Y
      Journal of Physical Chemistry B107 (2003) 6026-6031
      Structure and thermal expansivity of tetrahydrofuran deuterate determined by
      neutron powder diffraction
      Sample: T = 140 K
      _database_code_amcsd 0013157
      CELL PARAMETERS:  17.1500 17.1500 17.1500   90.000   90.000   90.000
      SPACE GROUP: Fd3m      
      X-RAY WAVELENGTH:     1.541838
      Cell Volume:   5044.201
      Density (g/cm3):      0.806
      MAX. ABS. INTENSITY / VOLUME**2:      2.246937120    
      RIR:      0.907
      RIR based on corundum from Acta Crystallographica A38 (1982) 733-739
               2-THETA      INTENSITY    D-SPACING   H   K   L   Multiplicity
                 8.93         30.81        9.9016    1   1   1         8
                14.61         60.97        6.0634    2   2   0        12
                17.15        100.00        5.1709    3   1   1        24
                17.92         60.52        4.9508    2   2 

      Dervillite
      Bindi L
      Mineralogical Magazine 77 (2013) 3105-3112
      Dervillite
      occurrence and crystal structure
      Locality: Lengenbach quarry
      _database_code_amcsd 0019987
      CELL PARAMETERS:   9.6155 12.9331  6.8616   90.000   99.352   90.000
      SPACE GROUP: Pc        
      X-RAY WAVELENGTH:     1.541838
      Cell Volume:    841.955
      Density (g/cm3):      5.597
      MAX. ABS. INTENSITY / VOLUME**2:      27.10095834    
      RIR:      1.577
      RIR based on corundum from Acta Crystallographica A38 (1982) 733-739
               2-THETA      INTENSITY    D-SPACING   H   K   L   Multiplicity
                 6.83          1.30       12.9331    0   1   0         2
                 9.32          4.77        9.4877    1   0   0         1
                11.57          5.67        7.6500    1   1   0         2
                13.69          2.33        6.4665    0   2   0         2
                14.77          1.27        5.9982    0   1   1