In [1]:
import numpy as np
import pandas as pd
from matminer.data_retrieval.retrieve_MP import MPDataRetrieval
# Retrieval client used by the data query in the following cell.
# An API key can also be passed explicitly: mpdr = MPDataRetrieval(api_key='YOUR_KEY')
# NOTE(review): presumably the key is read from local configuration when omitted - confirm against the matminer docs.
mpdr = MPDataRetrieval()

In [2]:
# Query MPD through the retrieval client and keep the result as a dataframe
# holding the XRD pattern, band gap and Fermi energy of every matching material.
# NOTE: 'Si' is only a placeholder criterion used for testing; it will be changed later on.
MPD_data = mpdr.get_dataframe(
    criteria='Si',
    properties=['xrd', 'band_gap', 'efermi'],
)

In [3]:
def extract_data(MPD_data_row, n_peaks=10, radiation='Cu'):
    """
    Extracts the strongest XRD peaks from the data obtained from MPD for one material

    Parameters:
    ----------
    MPD_data_row : Pandas Series (or any dict-like object)
        A row of data for a single material from the full MPD dataframe.
        MPD_data_row['xrd'][radiation] must hold a 'pattern' entry (one record
        per peak) and a 'meta' entry (the column names of those records,
        which must include 'amplitude', 'hkl' and 'd_spacing').
    n_peaks : int, optional
        Maximum number of peaks to keep (default 10).
    radiation : str, optional
        Key of the radiation source to read inside the 'xrd' entry (default 'Cu').

    Returns:
    ----------
    clean_df : Pandas dataframe
        The pattern columns other than 'hkl' and 'd_spacing' for the n_peaks
        peaks with the highest amplitude, sorted from the highest to the
        smallest amplitude and indexed 0, 1, 2, ...
        Fewer rows are returned when the pattern has fewer than n_peaks peaks.

    Raises:
    ----------
    KeyError
        If the expected keys or the 'hkl' / 'd_spacing' / 'amplitude' columns are missing.
    """

    xrd_entry = MPD_data_row['xrd'][radiation]

    # One row per peak, with the column names supplied by the 'meta' entry
    peaks_df = pd.DataFrame(xrd_entry['pattern'], columns=xrd_entry['meta'])

    clean_df = (
        peaks_df
        .drop(['hkl', 'd_spacing'], axis=1)  # Disposes of the hkl and d-spacing data
        # A stable sort keeps peaks of equal amplitude in their original order,
        # so the selection at the cut-off does not depend on the sort implementation
        .sort_values('amplitude', ascending=False, kind='mergesort')
        .reset_index(drop=True)  # Index now equals the rank of the peak
        .head(n_peaks)  # Drops all peaks below the requested number
    )

    return clean_df

In [4]:
# Function to reformat the data after cleaning
# Takes the dataframe and turns it into a dictionary where every data point has a unique key
def reformat_data(MPD_data_row):
    """
    Reformats the cleaned data obtained from the extract_data function into a flat dictionary

    Parameters:
    ----------
    MPD_data_row : Pandas Series
         A row of data for a single material from the full MPD dataframe

    Returns:
    ----------
    mat_dict : dict
        One entry per data point of the cleaned peaks. The keys are
        'amplitude_0', 'amplitude_1', ... followed by 'two_theta_0',
        'two_theta_1', ...; the number in the key is the row position of the
        peak in the dataframe returned by extract_data.
    """

    # Cleaning data and creating empty dictionary
    clean_df = extract_data(MPD_data_row)
    mat_dict = {}

    # All amplitudes are stored first and all two theta values second, so the
    # key order (and therefore the column order of any dataframe built from
    # the dictionary) stays the same as before.
    # Iterating over the rows actually present avoids a KeyError when
    # extract_data returns fewer than 10 peaks.
    for column in ('amplitude', 'two_theta'):
        values = clean_df[column]
        for rank in range(len(values)):
            mat_dict[column + '_' + str(rank)] = values.iloc[rank]

    return mat_dict

In [5]:
# Function that assembles the final dataframe of XRD peaks, band gap and Fermi energy for all materials
def _cu_peak_count(xrd_entry):
    """Returns the number of peaks in xrd_entry['Cu']['pattern'], or 0 when that pattern is not available"""
    try:
        return len(xrd_entry['Cu']['pattern'])
    except (KeyError, TypeError):
        # Missing 'Cu' / 'pattern' key, or an entry that is not a dictionary (e.g. None or NaN)
        return 0


def produce_data(MPD_data):
    """
    Produces the XRD and DOS data for all the materials passed to the function

    Parameters:
    ----------
    MPD_data : Pandas dataframe
      The dataframe filled with data obtained from MPD. It must contain an
      'xrd' column and an 'efermi' column; every other column is carried over
      unchanged into the result.

    Returns:
    ----------
    full_df: Pandas dataframe
        One row per retained material (same index labels and order as in
        MPD_data): the columns produced by reformat_data followed by the
        remaining columns of MPD_data (everything except 'xrd').
        Materials with less than 10 Cu XRD peaks (or no usable Cu pattern)
        or with a missing fermi energy are left out.
    """

    # Containers for the XRD data and for the positions of the rows that are kept
    xrd_data = {}
    kept_positions = []

    # Loop to run through each row of the dataframe
    for position in range(len(MPD_data)):
        row = MPD_data.iloc[position]

        # Conditional to skip over materials with less than 10 XRD peaks
        # or no fermi energies (pd.notna also copes with None, unlike np.isnan)
        if _cu_peak_count(row['xrd']) >= 10 and pd.notna(row['efermi']):

            # Obtaining and storing the XRD data for a material into a dictionary
            xrd_data[MPD_data.index[position]] = reformat_data(row)
            kept_positions.append(position)

    # Selecting exactly the rows that passed the conditional keeps both halves
    # aligned. Marking rejected rows with NaN and calling dropna() also removed
    # accepted rows that had a NaN elsewhere (e.g. in band_gap), which then
    # came back from the concat below with their fermi energy lost.
    dos_df = MPD_data.drop(['xrd'], axis=1).iloc[kept_positions]
    xrd_df = pd.DataFrame.from_dict(xrd_data, orient='index')

    # Creating the final dataframe from the obtained XRD and DOS dataframes
    full_df = pd.concat([xrd_df, dos_df], axis=1, sort=False)

    return full_df

In [6]:
# Build the combined XRD / band gap / fermi energy table for the materials downloaded above
produce_data(MPD_data)

Unnamed: 0,amplitude_0,amplitude_1,amplitude_2,amplitude_3,amplitude_4,amplitude_5,amplitude_6,amplitude_7,amplitude_8,amplitude_9,...,two_theta_2,two_theta_3,two_theta_4,two_theta_5,two_theta_6,two_theta_7,two_theta_8,two_theta_9,band_gap,efermi
mp-1001113,100.0,78.288443,45.261343,40.440821,30.707488,26.515207,22.092113,21.73767,16.821364,16.690467,...,38.160738,171.874808,45.443514,176.442253,60.797841,148.648332,87.640251,80.839115,0.0,9.167451
mp-1056579,100.0,91.828984,79.970463,45.423897,30.872127,26.312484,23.760237,15.234896,14.045885,13.052846,...,37.945623,73.006051,140.704562,54.746898,123.597176,85.886608,135.942882,93.269152,0.0,10.142574
mp-1072544,100.0,74.250236,32.8647,32.201109,18.813035,17.05364,15.411324,14.668014,14.527636,13.937115,...,46.743236,32.580631,160.193335,50.741974,54.52423,42.462094,26.480052,98.859603,0.1444,3.791856
mp-109,100.0,71.345225,64.937703,44.646425,31.309877,23.430551,18.238651,16.929197,15.766726,15.187868,...,36.713745,38.289916,54.089719,38.027277,90.802389,155.580355,162.010286,169.996134,0.0,9.233677
mp-149,100.0,66.810009,39.697299,23.446055,19.43133,17.901536,16.361733,13.659079,12.509826,11.039952,...,55.74954,87.35576,126.141124,113.020106,75.826656,94.191997,155.193701,135.15437,0.6119,5.634221
mp-16220,100.0,91.19433,82.558929,64.17861,58.432972,54.837508,39.433914,35.692211,30.283545,28.405679,...,52.695561,43.866856,17.015078,20.877953,170.458021,34.420518,165.871291,153.977258,0.5334,4.097343
mp-165,100.0,76.453355,75.130992,62.759221,57.677657,51.843314,44.605352,32.784208,32.075634,30.306187,...,47.209853,30.25723,28.034159,166.986591,55.806977,39.12487,90.702622,130.816041,0.4517,5.898865
mp-168,100.0,70.590906,66.583676,24.133148,18.174017,17.474408,14.443226,13.804826,11.952374,11.798358,...,170.27677,91.12596,26.789507,55.202796,151.438382,116.678065,78.754528,140.441238,0.0,7.052171
mp-571520,100.0,43.334802,40.005817,32.91898,32.076427,30.139918,29.994833,25.861406,24.983342,21.405271,...,32.854949,51.589783,51.187368,170.098781,55.391229,52.089674,168.027021,51.288184,0.0,7.133047
mp-676011,100.0,63.941827,37.274526,36.587043,34.737404,33.876036,30.127408,28.958163,28.511417,27.622809,...,27.177174,32.059515,45.915358,36.2831,38.803881,51.718617,20.404388,35.42094,0.0,6.38371
