In [26]:
import ast
import json
import os
import re
from io import StringIO

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options


In [27]:

# Instantiate a Chrome options object to set the headless preference and window size.
chrome_options = Options()
chrome_options.add_argument("--headless")
# Chrome expects "width,height" for this switch; the "1920x1080" spelling is not parsed.
chrome_options.add_argument("--window-size=1920,1080")

# `options=` replaces the `chrome_options=` keyword, which Selenium deprecated
# and later removed.
# NOTE: call driver.quit() once all scraping is finished to release the browser process.
driver = webdriver.Chrome(options=chrome_options)
driver.get('http://aflowlib.org/CrystalDatabase/prototype_index.html')

html = driver.page_source
# Name the parser explicitly (the same one the per-prototype scraping cell uses)
# so the result does not depend on which optional parsers are installed.
soup = BeautifulSoup(html, 'html.parser')

  


In [28]:
# Parse the first <table> of the index page into a list of DataFrames.
# The markup is wrapped in StringIO because passing literal HTML text to
# pandas.read_html is deprecated in recent pandas releases.
table = pd.read_html(StringIO(str(soup.table)))

In [29]:
# Row position -> replacement text for column 0 of the scraped index table
# (data clean up).
prototype_fixes = {
    24: 'CaCO3',
    25: 'FeB',
    54: 'alpha-Pa',
    60: 'eta-Fe2C',
    62: 'alpha-Ga',
    80: 'HgCl2',
    165: 'SnS',
    187: 'alpha-CO',
    190: 'beta-Po',
    204: 'alpha-Hg',
    235: 'alpha-As',
    253: 'PbCl2',
    261: 'beta-O',
    377: 'SeO2',
    418: 'SrH2',
    491: 'FeAs',
    565: 'PbO',
}

df_source = table[0]
df_source.index.name = 'prototype_index'
for row_position, fixed_name in prototype_fixes.items():
    df_source.iloc[row_position, 0] = fixed_name

df_source.head(10)

Unnamed: 0_level_0,Prototype,Number of Species,Number of Atoms,Pearson Symbol,Strukturbericht Designation,AFLOW Prototype,Space Group Symbol,Space Group Number,Notes
prototype_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,C,1,4,hP4,,A_hP4_194_f,P63/mmcP6_{3}/mmc,194,Lonsdaleite (Hexagonal Diamond) Structure
1,Cr3Si,2,8,cP8,A15A15,A3B_cP8_223_c_a,Pmˉ3nPm\bar{3}n,223,Cr3Si (A15) Structure
2,V4Zn5,2,9,tI18,,A4B5_tI18_139_i_ah,I4/mmmI4/mmm,139,V4Zn5 Structure
3,Mn3O4,2,14,tI28,,A3B4_tI28_141_ad_h,I41/amdI4_{1}/amd,141,Hausmannite (Mn3O4) Structure
4,B5W2,2,14,hP14,D8hD8_{h},A5B2_hP14_194_abdf_f,P63/mmcP6_{3}/mmc,194,W2B5 (D8h) Structure
5,O3Pb(Ti0.48)Zr0.52,3,5,tP5,,A3BC_tP5_99_bc_a_b,P4mmP4mm,99,Tetragonal PZT [Pb(ZrxTi1–x)O3] Structure
6,Ga2Hf,2,12,tI24,,A2B_tI24_141_2e_e,I41/amdI4_{1}/amd,141,Ga2Hf Structure
7,O2Si,2,12,tP12,C30C30,A2B_tP12_92_b_a,P41212P4_{1}2_{1}2,92,"α\alpha–Cristobalite (SiO2, C30C30, low) Struc..."
8,AuCu3,2,4,cP4,L12L1_{2},AB3_cP4_221_a_c,Pmˉ3mPm\bar{3}m,221,Cu3Au (L12) Structure
9,BaS3,2,16,oP16,,AB3_oP16_18_ab_3c,P21212P2_{1}2_{1}2,18,BaS3 Structure


In [30]:
# Save the cleaned-up prototype index table; the relative path means the file
# is written to the current working directory.
df_source.to_csv('Aflow_raw_data.csv')

In [31]:
# Empty frame that fixes the column layout of the processed data set.
processed_columns = [
    'id',
    'Pearson_symbol',
    'space_group_number',
    'Wyckoff_site',
    'lattice_params_list',
    'basis_params_list',
    'lattice_params_value_list',
    'basis_params_value_list',
]
df = pd.DataFrame(columns=processed_columns)
df.head()

Unnamed: 0,id,Pearson_symbol,space_group_number,Wyckoff_site,lattice_params_list,basis_params_list,lattice_params_value_list,basis_params_value_list


In [32]:
def process(labels):
    """Expand count-prefixed characters in a label string.

    A run of decimal digits followed by a character is replaced by that
    character repeated that many times; all other characters are copied
    unchanged. For example ``'j2lm'`` becomes ``'jllm'`` and ``'3c'``
    becomes ``'ccc'``.

    Counts of any length are supported (the previous character-by-character
    scan only understood one- or two-digit counts and mis-expanded longer ones).

    Parameters
    ----------
    labels : str
        Label string such as ``'g2i'``.

    Returns
    -------
    str
        The expanded string.

    Raises
    ------
    ValueError
        If ``labels`` ends with a count that has no character to repeat.
    """
    if re.search(r'\d\Z', labels):
        raise ValueError('label %r ends with a count but no character to repeat' % labels)
    # group(1) is the whole digit run, group(2) the character it multiplies.
    return re.sub(r'(\d+)(\D)', lambda match: int(match.group(1)) * match.group(2), labels)
            
def extract_info(string):
    """Return the expanded site labels encoded in an underscore-separated name.

    The name is split on ``'_'``; the first three fields are skipped and each
    remaining field is expanded with ``process``. For example
    ``'A3B_cP8_223_c_a'`` yields ``['c', 'a']``.
    """
    site_fields = string.split('_')[3:]
    return [process(field) for field in site_fields]

# Build one record per prototype and create the frame in a single call.
# DataFrame.append (used here before) copied the whole frame on every row and
# no longer exists in pandas 2.x; it also made this cell duplicate every row
# when it was run a second time.
records = [
    {
        'id': row['Pearson Symbol'] + '-' + row['Prototype'] + '-' + str(row['Space Group Number']),
        'Pearson_symbol': row['Pearson Symbol'],
        'space_group_number': row['Space Group Number'],
        'Wyckoff_site': extract_info(row['AFLOW Prototype']),
    }
    for _, row in df_source.iterrows()
]
# Reuse the existing column layout so the not-yet-filled columns are kept
# (as NaN); object dtype lets those columns hold one list per cell later on.
df = pd.DataFrame(records, columns=df.columns, dtype=object)

In [33]:
# Inspect the frame: only id, Pearson_symbol, space_group_number and
# Wyckoff_site have been filled in by the code above.
df

Unnamed: 0,id,Pearson_symbol,space_group_number,Wyckoff_site,lattice_params_list,basis_params_list,lattice_params_value_list,basis_params_value_list
0,hP4-C-194,hP4,194,[f],,,,
1,cP8-Cr3Si-223,cP8,223,"[c, a]",,,,
2,tI18-V4Zn5-139,tI18,139,"[i, ah]",,,,
3,tI28-Mn3O4-141,tI28,141,"[ad, h]",,,,
4,hP14-B5W2-194,hP14,194,"[abdf, f]",,,,
...,...,...,...,...,...,...,...,...
585,tP32-Ga5Ir3-118,tP32,118,"[gii, aceh]",,,,
586,cI52-Fe3Zn10-229,cI52,229,"[e, fh]",,,,
587,hP24-AuF3-178,hP24,178,"[b, ac]",,,,
588,tP32-BaSi-130,tP32,130,"[cg, cf]",,,,


In [34]:
def process_parameters(parameters):
    """Split a comma-separated parameter string into two lists of names.

    Tokens that are one of the recognised names (``a``, ``b/a``, ``c/a`` and
    the escaped ``alpha``/``beta``/``gamma`` spellings) go into the first list,
    with the Greek names stored without their backslashes. Every other token is
    reduced to its lowercase letters and digits (``'x_{2}'`` -> ``'x2'``) and
    goes into the second list.

    The raw input string is printed before it is processed.

    Returns
    -------
    tuple of (list, list)
        ``(params1, params2)`` as described above.
    """
    recognised = {
        'a': 'a',
        'b/a': 'b/a',
        'c/a': 'c/a',
        '\\\\alpha': 'alpha',
        '\\\\beta': 'beta',
        '\\\\gamma': 'gamma',
    }
    print(parameters)
    params1, params2 = [], []
    for token in parameters.split(','):
        if token in recognised:
            params1.append(recognised[token])
        else:
            kept_characters = re.findall(r'[0-9a-z]', token)
            params2.append(''.join(kept_characters))
    return params1, params2
# Row cursor that the scraping loop in the following cell reads and advances.
# It is initialised here and not in that cell -- presumably so the loop can be
# re-run to resume where it stopped; re-running this cell starts over at row 0.
i = 0

In [56]:
# get info: visit every prototype page and store its parameter names and values


def _fetch_info_script(name, element):
    """Return the text of the third-from-last <script> tag of a prototype page.

    Three URL spellings are tried in order: ``<name>.html``,
    ``<name>.<element>.html`` and ``<name>-<element>.html``. A spelling is
    abandoned when loading or parsing it raises (for instance because the page
    has fewer than three script tags). If all three fail, the error from the
    last attempt is raised.
    """
    candidate_urls = [
        'http://aflowlib.org/CrystalDatabase/%s.html' % name,
        'http://aflowlib.org/CrystalDatabase/%s.%s.html' % (name, element),
        'http://aflowlib.org/CrystalDatabase/%s-%s.html' % (name, element),
    ]
    last_error = None
    for url in candidate_urls:
        try:
            driver.get(url)
            page = BeautifulSoup(driver.page_source, 'html.parser')
            return str(page.find_all('script')[-3])
        except Exception as error:  # not a bare except: Ctrl-C must still interrupt
            last_error = error
    raise last_error


# The result columns hold one list per cell, which requires object dtype.
for column in ('lattice_params_list', 'basis_params_list',
               'lattice_params_value_list', 'basis_params_value_list'):
    df[column] = df[column].astype(object)

# `i` is not reset here, so re-running this cell continues from the first row
# that has not been processed yet.
while i < len(df_source):
    name = df_source.at[i, 'AFLOW Prototype']
    element = df_source.at[i, 'Prototype']
    print(name, element)
    info = _fetch_info_script(name, element)

    parameters = re.findall(r'var aflow_params_str = "\s*(.*?)";', info)[0]
    values_text = re.findall(r'aflow_parameter_values = \s*(.*?);', info)[0]

    params1, params2 = process_parameters(parameters)
    # Parse the scraped literal as data; it was previously run through exec(),
    # which would execute arbitrary code served by the remote page.
    values = ast.literal_eval(values_text)

    # The values are listed in the same order as the names: lattice first.
    # .at writes into df itself; the chained form df[col][i] = ... may assign
    # to a temporary copy instead.
    df.at[i, 'lattice_params_list'] = params1
    df.at[i, 'basis_params_list'] = params2
    df.at[i, 'lattice_params_value_list'] = values[:len(params1)]
    df.at[i, 'basis_params_value_list'] = values[len(params1):]
    i += 1
    

A4B_tI20_88_f_a Cl4Th
a,c/a,x_{2},y_{2},z_{2}
A2B_hP36_177_j2lm_n O2Si
a,c/a,x_{1},x_{2},x_{3},x_{4},x_{5},y_{5},z_{5}
A2BC_oC16_67_ag_b_g Al2CuIr
a,b/a,c/a,z_{3},z_{4}
ABC2_oP16_53_h_e_gh NiTaTe2
a,b/a,c/a,x_{1},y_{2},y_{3},z_{3},y_{4},z_{4}
A5B5C4_tP28_104_ac_ac_c Ba5Bi5In4
a,c/a,z_{1},z_{2},x_{3},y_{3},z_{3},x_{4},y_{4},z_{4},x_{5},y_{5},z_{5}
A4B2C_tP28_135_gh_h_d O4Sb2Zn
a,c/a,x_{2},x_{3},y_{3},x_{4},y_{4}
A3B3C_cI56_214_g_h_a Ca3I3P
a,y_{2},y_{3}
A5B3_tP32_118_g2i_aceh Ga5Ir3
a,c/a,z_{3},x_{4},z_{5},x_{6},y_{6},z_{6},x_{7},y_{7},z_{7}
A3B10_cI52_229_e_fh Fe3Zn10
a,x_{1},x_{2},y_{3}
AB3_hP24_178_b_ac AuF3
a,c/a,x_{1},x_{2},x_{3},y_{3},z_{3}
A5B3_tP32_130_cg_cf BaSi
a,c/a,z_{1},z_{2},x_{3},x_{4},y_{4},z_{4}
A3B_hP24_153_3c_2b Cl3Cr
a,c/a,x_{1},x_{2},x_{3},y_{3},z_{3},x_{4},y_{4},z_{4},x_{5},y_{5},z_{5}


In [58]:
# Duplication check: report whether every id is unique, then show the rows
# whose id already occurred earlier in the frame.
ids_are_unique = df['id'].is_unique
print(ids_are_unique)
repeated_id_mask = df.duplicated(subset=['id'])
dup = df.loc[repeated_id_mask]
print(dup)

False
            id Pearson_symbol space_group_number Wyckoff_site  \
209  hP4-C-194            hP4                194         [bc]   

    lattice_params_list basis_params_list lattice_params_value_list  \
209            [a, c/a]                []    [2.464, 2.72362012987]   

    basis_params_value_list  
209                      []  


In [59]:
# Sanity check: compare the numeric suffix of the last basis parameter
# (e.g. the 5 in 'y5') with the total number of expanded Wyckoff letters and
# print every row where the two differ, for manual review.
for index, row in df.iterrows():
    number_of_Wyckoff = len(''.join(row['Wyckoff_site']))
    try:
        number_in_para = int(row['basis_params_list'][-1][1:])
    except (IndexError, TypeError, ValueError):
        # Empty list, a cell that is not a list of names, or a name without a
        # numeric suffix: there is nothing to compare, so treat it as matching.
        number_in_para = number_of_Wyckoff
    # Plain comparison instead of assert-in-try: asserts are removed under -O.
    if number_of_Wyckoff != number_in_para:
        print(number_of_Wyckoff, number_in_para, index)
        print(row)

3 2 262
id                                    hP12-O2Si-194
Pearson_symbol                                 hP12
space_group_number                              194
Wyckoff_site                                [cg, f]
lattice_params_list                        [a, c/a]
basis_params_list                              [z2]
lattice_params_value_list    [5.052, 1.63697545527]
basis_params_value_list                     [0.062]
Name: 262, dtype: object


In [60]:
# Manual corrections: row label -> corrected list of basis parameter names.
manual_basis_params = {
    39: ['x2', 'x3', 'x4', 'x5', 'y5'],
    73: ['x1', 'y1', 'x2', 'y2'],
    77: ['x1', 'y1', 'x2', 'y2'],
    132: ['x1', 'x2'],
    208: ['x2', 'x3', 'y3', 'x4', 'y4', 'x5', 'y5', 'x6', 'y6', 'x7', 'y7',
          'x8', 'y8', 'x9', 'y9', 'x10', 'y10', 'x11', 'y11',
          'x12', 'y12', 'z12', 'x13', 'y13', 'z13', 'x14', 'y14', 'z14',
          'x15', 'y15', 'z15'],
    434: ['x1', 'x2', 'x3', 'y3', 'x4', 'y4', 'x5', 'y5', 'x6', 'y6',
          'x7', 'y7', 'x8', 'y8', 'x9', 'y9', 'z9'],
    495: ['x1', 'x2', 'y2'],
}
for row_label, basis_params in manual_basis_params.items():
    # .at stores the list object itself in the cell, so corrected rows hold a
    # real list like every other row (previously the text of a list was stored).
    # The CSV written below is unchanged, because a list is written as that
    # same text.
    df.at[row_label, 'basis_params_list'] = basis_params
df.index.name = 'prototype_index'

df.to_csv('Aflow_processed_data.csv')