In [1]:
import getpass
import os

import numpy as np
import pandas as pd
from pymatgen.ext.matproj import MPRester

In [24]:
def CIFdownload(api, filepath, lower_bound, higher_bound):
    """
    Download band gap values and crystal structures from the Materials Project.

    Every entry whose band gap is strictly greater than ``lower_bound`` and
    strictly less than ``higher_bound`` is requested together with its
    'pretty_formula', 'cif' and 'band_gap' properties. The CIF text of the
    i-th result (0-based, in the order returned by the query) is written to
    the file "<i>.cif" inside ``filepath``.

    Parameters
    ----------
    api : str
        Materials Project API key handed to ``MPRester``.
    filepath : str
        Directory that receives the CIF files. A trailing path separator is
        optional, and the directory is created when it does not exist yet.
    lower_bound, higher_bound : float
        Exclusive limits of the band gap window ('$gt' / '$lt').

    Returns
    -------
    The query result exactly as returned by ``MPRester.query``; each entry is
    indexable by 'pretty_formula', 'cif' and 'band_gap'.

    References of query syntax and downloadable properties:
    https://docs.mongodb.com/manual/tutorial/query-documents/
    https://pymatgen.org/introduction.html
    """
    # The context manager closes the underlying HTTP session once the query is done.
    with MPRester(api) as mpr:
        data = mpr.query(
            criteria={'$and': [{'band_gap': {'$gt': lower_bound}},
                               {'band_gap': {'$lt': higher_bound}}]},
            properties=['pretty_formula', 'cif', 'band_gap'],
        )

    # An empty string means "current directory", which needs no creation.
    if filepath:
        os.makedirs(filepath, exist_ok=True)

    for i, entry in enumerate(data):
        # os.path.join works with or without a trailing separator on filepath.
        with open(os.path.join(filepath, '%i.cif' % i), 'w') as w:
            w.write(entry['cif'])
    return data

    


def CIFconvert(filepath):
    """
    Collect the unit-cell parameters of indexed CIF files into a DataFrame.

    Every file in ``filepath`` whose name is "<non-negative integer>.cif"
    (e.g. "0.cif", "1.cif", ...) is read in ascending numeric order and
    contributes one row. All other directory entries (notebook checkpoint
    folders, a previously exported CSV, ...) are ignored, so the result does
    not depend on what else is stored next to the CIF files.

    Parameters
    ----------
    filepath : str
        Directory containing the CIF files; a trailing path separator is
        optional.

    Returns
    -------
    pandas.DataFrame
        Columns 'cell_length_a', 'cell_length_b', 'cell_length_c',
        'cell_angle_alpha', 'cell_angle_beta', 'cell_angle_gamma'. Values are
        kept as the strings found in the files (no numeric conversion).

    Raises
    ------
    ValueError
        If a CIF file does not define all six cell parameters.
    """
    col = ['cell_length_a', 'cell_length_b', 'cell_length_c',
           'cell_angle_alpha', 'cell_angle_beta', 'cell_angle_gamma']
    # CIF data names carry a leading underscore, e.g. "_cell_length_a".
    tag_to_col = {'_' + name: name for name in col}

    # Select the indexed CIF files explicitly instead of guessing their count
    # from the number of directory entries.
    cif_names = sorted(
        (name for name in os.listdir(filepath)
         if name.endswith('.cif') and name[:-4].isdecimal()),
        key=lambda name: int(name[:-4]),
    )

    rows = []
    for name in cif_names:
        found = {}
        with open(os.path.join(filepath, name), 'r') as r:
            for line in r:
                tokens = line.split()
                # Match the exact tag so unrelated names that merely contain
                # "length" or "angle" are not mistaken for cell parameters.
                if len(tokens) >= 2 and tokens[0] in tag_to_col:
                    # Keep the first occurrence if a tag is repeated.
                    found.setdefault(tag_to_col[tokens[0]], tokens[1])
        missing = [name_ for name_ in col if name_ not in found]
        if missing:
            raise ValueError('%s is missing cell parameters: %s'
                             % (name, ', '.join(missing)))
        rows.append([found[name_] for name_ in col])

    # Build the frame once rather than concatenating inside the loop.
    return pd.DataFrame(rows, columns=col)


In [26]:
# Ask whether the user will supply their own CIF files or download them from the Materials Project.
# The answer is normalised (surrounding whitespace removed, lower-cased) so that replies such as
# "N" or " y " still match the 'y' / 'n' comparisons made on `response`.
response = input('Do you want to upload the cif file by yourself? (y/n):').strip().lower()

Do you want to upload the cif file by yourself? (y/n): n


In [27]:
# Sequential input prompts that define the band gap range the user is interested in.
# For a fast demo, restrict the range to a narrow window to keep the dataset small.
if response == 'n':
    # getpass does not echo the key, so it is not stored in the notebook output.
    api = getpass.getpass('Please provide with you API here:')
    lower_bound = float(input('Please provide the lower bound of the bandgap:'))
    higher_bound = float(input('Please provide the higher bound of the bandgap:'))
    path = input('Please provide a saving path for the cif files here:')
    data = CIFdownload(api, path, lower_bound, higher_bound)
elif response == 'y':
    path = input('Please provide the path of your cif files here:')
    # User-supplied CIF files come without Materials Project metadata; keep `data`
    # defined (and empty) so that code reading it still runs.
    data = []
else:
    # Fail here with a clear message instead of leaving `path` undefined.
    raise ValueError("Expected 'y' or 'n' but got %r" % (response,))


Please provide with you API here: [REDACTED]
Please provide the lower bound of the bandgap: 1.8
Please provide the higher bound of the bandgap: 1.9
Please provide a saving path for the cif files here: /Users/yifeihe/DIRECT/win22_project/test_folder/


100%|██████████████████████████████████████| 1375/1375 [00:01<00:00, 913.59it/s]


In [30]:
# Compile the cell parameters of the CIF files under `path` into a single dataframe.
cell = CIFconvert(filepath=path)

In [31]:
# Show the cell-parameter dataframe returned by CIFconvert.
cell

Unnamed: 0,cell_length_a,cell_length_b,cell_length_c,cell_angle_alpha,cell_angle_beta,cell_angle_gamma
0,4.64594705,4.64594705,7.81070100,90.00000000,90.00000000,115.94818905
1,3.71078575,3.71078575,3.71078533,85.43448185,85.43448185,85.43449271
2,5.62415800,9.09580400,10.99891300,90.00000000,90.00000000,90.00000000
3,9.13635100,5.63018800,12.17363312,63.87454728,90.00000000,90.00000000
4,8.45961172,8.45961172,8.45961172,120.09771597,120.09771597,89.83083414
...,...,...,...,...,...,...
1370,8.75956000,5.53595900,12.57236963,77.77118115,90.00000000,90.00000000
1371,10.24839978,9.27111548,6.86359102,77.27519116,61.93588181,40.78892703
1372,5.71697900,5.40173600,9.41414666,55.14483997,90.00000000,90.00000000
1373,10.17744512,8.93182692,7.26197280,77.11260130,58.81536799,44.07203071


In [39]:
# Attach the pretty formula and band gap of every queried entry to its cell parameters.
bandgap = [entry['band_gap'] for entry in data]
pretty_formula = [entry['pretty_formula'] for entry in data]

# The sub-dataframe that contains the pretty formula and the band gap.
# (Built from a local mapping rather than a variable called `dict`, which would
# shadow the builtin for the rest of the kernel session.)
formula_gap = {'pretty_formula': pretty_formula, 'bandgap': bandgap}
df1 = pd.DataFrame(data=formula_gap)

# Rows are paired by position: entry i of `data` with row i of `cell`.
df = pd.concat([df1, cell], axis=1)
df.head()

Unnamed: 0,pretty_formula,bandgap,cell_length_a,cell_length_b,cell_length_c,cell_angle_alpha,cell_angle_beta,cell_angle_gamma
0,BrCl,1.8278,4.64594705,4.64594705,7.810701,90.0,90.0,115.94818905
1,SrC2,1.8008,3.71078575,3.71078575,3.71078533,85.43448185,85.43448185,85.43449271
2,Cs2Se,1.8027,5.624158,9.095804,10.998913,90.0,90.0,90.0
3,Cs2Se,1.8141,9.136351,5.630188,12.17363312,63.87454728,90.0,90.0
4,CsPbBr3,1.8178,8.45961172,8.45961172,8.45961172,120.09771597,120.09771597,89.83083414


In [41]:
# Save the final dataframe into the directory that holds the cif files.
# os.path.join works whether or not `path` ends with a path separator.
df.to_csv(os.path.join(path, 'CIF_data.csv'))