This notebook is for creating the functionality of querying an external database and extracting data from that database, and storing it into a numpy array

In [29]:
#import the needed packages
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from bs4.dammit import EncodingDetector
import requests
import re
import csv

In [169]:
#define the needed functions
#take one input that's a list to be able to have a variable number of dspacings
def make_web_address(dspacing1, dspacing2):
    '''
    make_web_address will compile a string to use for a webpage address
    
    For example, if I want to search for rutile on the American 
    Minerals Society webpage, I might use this address:
    
    http://rruff.geo.arizona.edu/AMS/result.php?diff=vals(3.2435,2.4836),opt(),type(d-spacing),tolerance(.001)
    
    Where 3.2435 and 2.4836 are two d spacings in the rutile structure
    '''
    
    web_address = 'http://rruff.geo.arizona.edu/AMS/result.php?diff=vals(' + str(dspacing1) + ',' + str(dspacing2) +'),opt(),type(d-spacing),tolerance(.001)'
    
    return web_address

def find_diffraction_files(href):
    '''
    find_diffraction_files is a sorting function that can be passed 
    into compile_links to generate a specific set of diffraction text 
    file web addresses from the American Mineral Society database
    '''
# commented out logic will return the text file links
#     return href and re.compile("txt").search(href) and not re.compile("dif").search(href)        
    return href and re.compile("dif").search(href)        

def compile_links(web_address):
    '''
    access_web accesses a webpage at a given address,
    finds all of the links on that page, and appends those links 
    to a list called links.
    
    inputs are a web address, and the array for storing links
    '''
    
    html_page = requests.get(web_address)
    http_encoding = html_page.encoding if 'charset' in html_page.headers.get('content-type', '').lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(html_page.content, is_html=True)
    encoding = html_encoding or http_encoding
    soup = BeautifulSoup(html_page.content, from_encoding=encoding)
    links_list = []
    for link in soup.find_all(href=find_diffraction_files):
        links_list.append('http://rruff.geo.arizona.edu'+link['href'])
    
    return links_list

def get_diffraction_data(links_list, dspacing1, dspacing2, pot_planes_df):
    '''
    get_diffraction_data will compile crystal structure parameters and 
    indexes of all potential planes into a pandas DataFrame
    
    Inputs are an array of truncated href, two d spacings from the FFT, 
    and the pandas DataFrame into which data will be stored. Returns the
    populated DataFrame.
    '''
    
#     with requests.Session() as s:
#     download = s.get(CSV_URL)

#     decoded_content = download.content.decode('utf-8')

#     cr = csv.reader(decoded_content.splitlines(), delimiter=',')
#     my_list = list(cr)
#     for row in my_list:
#         print(row)
    
    for link in links_list:
        with requests.Session() as s:
            download = s.get(link)
            decoded_content = download.content.decode('utf-8')
            metadata_list = []
            reader = csv.reader(decoded_content.splitlines())
            raw_data_list = []
            for i, row in enumerate(reader):
                #This is for handling [], empty rows
                if len(row) != 0:
                    raw_data_list.append(row)
                    string = row[0]
                    if string.find('2-THETA') != -1:
                        metadata_end_index = i
                    elif string.find('==============================') != -1:
                        data_end_index = i
                    else:
                        pass
                
            metadata_list = raw_data_list[0:metadata_end_index-1]
            data_list = raw_data_list[metadata_end_index-1:data_end_index-1]
            data_df = pd.DataFrame(data_list, delimiter =' ')
#             print(metadata_list,'\n\n\n\n', data_list)
    
        
#         dif_text_url = requests.get(entry)
#         dif_text = dif_text_url.csv
        
#         temp_df = pd.read_csv(entry)
 
    # 
    return data_df
  

generate a web search of the minerals with matching d spacings

In [68]:
dspacing1 = 3.3
dspacing2 = 3.5
#may need put this into a wrapper function
#create a skeleton for a wrapper function
web_address = make_web_address(dspacing1, dspacing2)
web_address

'http://rruff.geo.arizona.edu/AMS/result.php?diff=vals(3.3,3.5),opt(),type(d-spacing),tolerance(.001)'

create a list of links to diffraction text files for matching mineral structures

In [69]:
links_list = compile_links(web_address)
links_list

['http://rruff.geo.arizona.edu/AMS/download.php?id=01755.txt&down=text',
 'http://rruff.geo.arizona.edu/AMS/download.php?id=16200.txt&down=text',
 'http://rruff.geo.arizona.edu/AMS/download.php?id=17097.txt&down=text',
 'http://rruff.geo.arizona.edu/AMS/download.php?id=18445.txt&down=text',
 'http://rruff.geo.arizona.edu/AMS/download.php?id=20133.txt&down=text']

In [170]:
dspacing1 = 3.3
dspacing2 = 3.5

column_names = ["mineral", "symetry", "a", "b", "c", "alpha", "beta", "gama", "dspacing", "h", "k", "l"]
pot_planes_df = pd.DataFrame(columns = column_names)

get_diffraction_data(links_list, dspacing1, dspacing2, pot_planes_df)

Unnamed: 0,0
0,2-THETA INTENSITY D-SPA...
1,6.31 5.36 14.0...
2,8.93 33.15 9.8...
3,12.65 13.77 7.0...
4,14.15 2.42 6.2...
...,...
151,88.92 3.56 1.1...
152,88.99 2.06 1.0...
153,89.29 1.18 1.0...
154,89.38 3.50 1.0...
