This notebook is for creating the functionality of querying an external database and extracting data from that database, and storing it into a numpy array

In [2]:
#import the needed packages
#%load_ext pycodestyle_magic
#%pycodestyle_on
#%flake8_on
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from bs4.dammit import EncodingDetector
import requests
import re
import csv

In [3]:
#define the needed functions
#take one input that's a list to be able to have a variable number of dspacings
def make_web_address(dspacing1, dspacing2):
    '''
    make_web_address will compile a string to use for a webpage address
    
    For example, if I want to search for rutile on the American 
    Minerals Society webpage, I might use this address:
    
    http://rruff.geo.arizona.edu/AMS/result.php?diff=vals(3.2435,2.4836),opt(),type(d-spacing),tolerance(.001)
    
    Where 3.2435 and 2.4836 are two d spacings in the rutile structure
    '''
    
    web_address = 'http://rruff.geo.arizona.edu/AMS/result.php?diff=vals(' + str(dspacing1) + ',' + str(dspacing2) +'),opt(),type(d-spacing),tolerance(.001)'
    
    return web_address

def find_diffraction_files(href):
    '''
    find_diffraction_files is a sorting function that can be passed 
    into compile_links to generate a specific set of diffraction text 
    file web addresses from the American Mineral Society database
    
    find_diffraction_files works together with compile_links
    '''
# commented out logic will return the text file links
#     return href and re.compile("txt").search(href) and not re.compile("dif").search(href)        
    return href and re.compile("dif").search(href)        

def compile_links(web_address):
    '''
    compile_links accesses a webpage at a given address,
    finds all of the links on that page, and appends certain links 
    to a list called links_list.
    
    compile links works together with find_diffraction_files to 
    get only the relevant links.
    
    inputs are a web address, and the list for storing links
    '''
    
    html_page = requests.get(web_address)
    http_encoding = html_page.encoding if 'charset' in html_page.headers.get('content-type', '').lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(html_page.content, is_html=True)
    encoding = html_encoding or http_encoding
    soup = BeautifulSoup(html_page.content, from_encoding=encoding)
    links_list = []
    for link in soup.find_all(href=find_diffraction_files):
        links_list.append('http://rruff.geo.arizona.edu'+link['href'])
    
    return links_list

def split_diffraction_data(link):
    '''
    get_diffraction_data will compile crystal structure parameters and 
    indexes of all potential planes into a pandas DataFrame
    
    Inputs are an array of truncated href, two d spacings from the FFT, 
    and the pandas DataFrame into which data will be stored. Returns the
    populated DataFrame.
    '''

    with requests.Session() as s:
        download = s.get(link)
        decoded_content = download.content.decode('utf-8')
        metadata_list = []
        reader = csv.reader(decoded_content.splitlines())
        raw_data_list = []
        for i, row in enumerate(reader):
            #This is for handling [], empty rows
            if len(row) != 0:
                entry = row[0]
                entry.split()
                raw_data_list.append(entry)
                string = row[0]
            if string.find('2-THETA') != -1:
                metadata_end_index = i
            elif string.find('==============================') != -1:
                data_end_index = i
            else:
                pass
                
        metadata_list = raw_data_list[0:metadata_end_index-1]
        data_list = raw_data_list[metadata_end_index-1:data_end_index-1]
    return (metadata_list, data_list)

def sort_lists(link):
    '''
    sort_lists is a function that returns formated data aquired from
    the American Mineral Society for analysis
    
    inputs are a tuple of two lists, one containing metadata and one 
    containing diffraction data.  
    These lists come from the split_diffraction_data function
    
    the output is a tuple of three lists containing our 
    desired metadata, our data labels, and diffraction data 
    
    desired metadata: Mineral Name, Space Group, Cell Parameters 
    '''
    metadata_list, data_list = split_diffraction_data(link)
    structure_list = [metadata_list[0]]
    data_labels = [data_list[0].split()]
    clean_data_list = []
    #this loop works through the metadata and 
    #extracts the values that we want  
    #currently we have strings containing what we want and 
    #they need to be processed further to remove whitespace, 
    #cell paramaters need to be split up, 
    #extra label text needs to be removed, and 
    #strings of numbers need to be converted to floats

    for row in metadata_list:
        string = row
        entry = row
        if string.find('SPACE') != -1:
            structure_list.append(entry)
        if string.find('PARAMETERS') != -1:
            structure_list.append(entry)
    #print(structure_list) 

    #this loop works through the actuall data and 
    #extracts the rows with d spacing close to measured values from fft
    #needs to split the rows,
    #convert the strings of numbers into floats,
    #compile the rows which contain d spacings close to measured values from fft
    #remove extra data that we don't need to only keep d spacing and h k l values
    del data_list[0] #this command is removing the labels, which have been stored in a separate list
    for row in data_list:
        string = row
        entry = string.split()
        clean_row = []
        for item in entry:
            item = float(item)
            clean_row.append(item)
        clean_data_list.append(clean_row)
    return structure_list, data_labels, clean_data_list

    

def lists_to_dfs(link):
    '''
    lists_to_dfs generates pandas dataframes for the data and metadata 
    '''
    structure_list, data_labels, clean_data_list = sort_lists(link)
    mineral = structure_list[0]
    metadata_dict = {'Mineral_Name': mineral.strip()}
    cell_params = structure_list[1]
    cell_params_list = cell_params.split()
    del cell_params_list[0:2]#here i want to remove all list entries with a non digit \D
    cell_params_labels = ['a','b','c','alpha', 'gamma', 'beta']
    i = 0
    for entry in cell_params_labels:
        label = cell_params_labels[i]
        entry = cell_params_list[i]
        metadata_dict.update({label:[entry]})
        i = i + 1
    metadata_df = pd.DataFrame(metadata_dict)
    diffraction_df = pd.DataFrame(clean_data_list, columns=data_labels[0])
    diffraction_df = diffraction_df[['D-SPACING', 'H', 'K', 'L']]
    return metadata_df, diffraction_df

def select_data(link, d_spacing_list):
    '''
    select_data compiles a new dataframe from the diffraction data
    containing only the rows with relevant data to our search 
    '''
    metadata_df, diffraction_df = lists_to_dfs(link)
    structure_df = pd.DataFrame()
    for entry in d_spacing_list:
        d_spacing_df = diffraction_df.loc[np.isclose(diffraction_df['D-SPACING'], entry, atol=.1)]
        structure_df = structure_df.append(d_spacing_df)
#    crystalmaths_df = pd.concat([metadata_df, structure_df], axis=1, ignore_index=True)
    return structure_df, metadata_df


def query_wrapper(links_list, d_spacing_list):
    crystalmaths_master_list = []
    for link in links_list:
        mineral_tuple = (select_data(link, d_spacing_list))
        crystalmaths_master_list.append(mineral_tuple)
    return crystalmaths_master_list



In [4]:
dspacing1 = 3.3
dspacing2 = 3.5
d_spacing_list = [3.3, 3.5]
web_address = make_web_address(dspacing1, dspacing2)
links_list = compile_links(web_address)
query_wrapper(links_list, d_spacing_list)

[(    D-SPACING    H    K    L
  23     3.3944  0.0  1.0  5.0
  24     3.3470 -1.0  1.0  5.0
  25     3.2995  2.0  1.0  2.0
  26     3.2508  2.0  0.0  4.0
  27     3.2005 -2.0  1.0  4.0
  20     3.5840 -2.0  1.0  2.0
  21     3.4994  2.0  1.0  1.0
  22     3.4312  1.0  1.0  4.0,
    Mineral_Name       a       b        c   alpha    gamma    beta
  0   Gillulyite  9.5840  5.6790  21.5010  90.000  100.070  90.000),
 (   D-SPACING    H    K    L
  7     3.3005  5.0  1.0  1.0
  8     3.3005  3.0  3.0  3.0
  6     3.5007  4.0  2.0  2.0,
                  Mineral_Name        a        b        c   alpha   gamma  \
  0  Hydrate - tetrahydrofuran  17.1500  17.1500  17.1500  90.000  90.000   
  
       beta  
  0  90.000  ),
 (    D-SPACING    H    K    L
  19     3.2991  2.0  0.0  2.0
  20     3.2531 -2.0  2.0  1.0
  21     3.2457  3.0  0.0  0.0
  17     3.5000 -1.0  1.0  3.0
  18     3.4381  0.0  1.0  3.0,
      Mineral_Name        a       b        c   alpha    gamma    beta
  0  CsMo2O3(PO4)2 

In [5]:
link = 'http://rruff.geo.arizona.edu/AMS/download.php?id=01755.txt&down=dif'
d_spacing = 3.3
d_spacing_list = [3.3, 3.5]
select_data(link, d_spacing_list)

(    D-SPACING    H    K    L
 23     3.3944  0.0  1.0  5.0
 24     3.3470 -1.0  1.0  5.0
 25     3.2995  2.0  1.0  2.0
 26     3.2508  2.0  0.0  4.0
 27     3.2005 -2.0  1.0  4.0
 20     3.5840 -2.0  1.0  2.0
 21     3.4994  2.0  1.0  1.0
 22     3.4312  1.0  1.0  4.0,
   Mineral_Name       a       b        c   alpha    gamma    beta
 0   Gillulyite  9.5840  5.6790  21.5010  90.000  100.070  90.000)

In [6]:
link = 'http://rruff.geo.arizona.edu/AMS/download.php?id=01755.txt&down=dif'
d_spacing = 3.3
lists_to_dfs(link)

(  Mineral_Name       a       b        c   alpha    gamma    beta
 0   Gillulyite  9.5840  5.6790  21.5010  90.000  100.070  90.000,
      D-SPACING    H    K     L
 0       8.1078  1.0  0.0   1.0
 1       6.1945 -1.0  0.0   3.0
 2       5.4851  0.0  1.0   1.0
 3       5.2924  0.0  0.0   4.0
 4       5.2296  1.0  0.0   3.0
 ..         ...  ...  ...   ...
 234     1.1108  8.0  0.0   4.0
 235     1.0949  4.0  4.0   7.0
 236     1.0942  3.0  0.0  17.0
 237     1.0936 -7.0  3.0   6.0
 238     1.0925 -6.0  3.0  11.0
 
 [239 rows x 4 columns])

generate a web search of the minerals with matching d spacings

In [None]:
dspacing1 = 3.3
dspacing2 = 3.5
#may need put this into a wrapper function
#create a skeleton for a wrapper function
web_address = make_web_address(dspacing1, dspacing2)
link = web_address
link

create a list of links to diffraction text files for matching mineral structures

In [None]:
links_list = compile_links(web_address)
links_list

Testing that the split_diffraction_data function generates a tuple of two lists, one for metadata and one for diffraction data

In [8]:
link = 'http://rruff.geo.arizona.edu/AMS/download.php?id=01755.txt&down=dif'
split_diffraction_data(link)

(['      Gillulyite',
  '      Foit F F',
  '      American Mineralogist 80 (1995) 394-399',
  '      The crystal structure of gillulyite',
  '      deposit',
  '      _database_code_amcsd 0001733',
  '      CELL PARAMETERS:   9.5840  5.6790 21.5010   90.000  100.070   90.000',
  '      SPACE GROUP: P2/n      ',
  '      X-RAY WAVELENGTH:     1.541838',
  '      Cell Volume:   1152.219',
  '      Density (g/cm3):      4.136',
  '      MAX. ABS. INTENSITY / VOLUME**2:      15.64160708    ',
  '      RIR:      1.231',
  '      RIR based on corundum from Acta Crystallographica A38 (1982) 733-739'],
 ['               2-THETA      INTENSITY    D-SPACING   H   K   L   Multiplicity',
  '                10.91          4.16        8.1078    1   0   1         2',
  '                14.30         16.99        6.1945   -1   0   3         2',
  '                16.16         12.17        5.4851    0   1   1         4',
  '                16.75         32.42        5.2924    0   0   4         2',
  

In [9]:
dspacing1 = 3.3
dspacing2 = 3.5

column_names = ["mineral", "symetry", "a", "b", "c", "alpha", "beta", "gama", "dspacing", "h", "k", "l"]
pot_planes_df = pd.DataFrame(columns = column_names)

#get_diffraction_data(links_list, dspacing1, dspacing2, pot_planes_df)

testing the sort_lists function

In [10]:
link = 'http://rruff.geo.arizona.edu/AMS/download.php?id=01755.txt&down=dif'
sort_lists(link)

(['      Gillulyite',
  '      CELL PARAMETERS:   9.5840  5.6790 21.5010   90.000  100.070   90.000',
  '      SPACE GROUP: P2/n      '],
 [['2-THETA', 'INTENSITY', 'D-SPACING', 'H', 'K', 'L', 'Multiplicity']],
 [[10.91, 4.16, 8.1078, 1.0, 0.0, 1.0, 2.0],
  [14.3, 16.99, 6.1945, -1.0, 0.0, 3.0, 2.0],
  [16.16, 12.17, 5.4851, 0.0, 1.0, 1.0, 4.0],
  [16.75, 32.42, 5.2924, 0.0, 0.0, 4.0, 2.0],
  [16.95, 4.9, 5.2296, 1.0, 0.0, 3.0, 2.0],
  [17.72, 19.73, 5.0042, 0.0, 1.0, 2.0, 4.0],
  [18.23, 20.07, 4.8658, 1.0, 1.0, 0.0, 4.0],
  [18.81, 46.36, 4.7182, 2.0, 0.0, 0.0, 2.0],
  [19.08, 4.31, 4.6515, 1.0, 1.0, 1.0, 4.0],
  [20.07, 3.63, 4.4242, 0.0, 1.0, 3.0, 4.0],
  [20.77, 1.24, 4.2771, 1.0, 1.0, 2.0, 4.0],
  [21.22, 34.3, 4.1861, -1.0, 1.0, 3.0, 4.0],
  [21.45, 100.0, 4.143, -1.0, 0.0, 5.0, 2.0],
  [21.93, 16.8, 4.0539, 2.0, 0.0, 2.0, 2.0],
  [22.97, 33.52, 3.8718, 0.0, 1.0, 4.0, 4.0],
  [23.12, 6.2, 3.847, 1.0, 1.0, 3.0, 4.0],
  [23.7, 31.2, 3.7546, -1.0, 1.0, 4.0, 4.0],
  [24.32, 3.52, 3.