# Collected AMSCD cif files 

In [1]:
import polars as pl
import re
import os 
from glob import glob
import shutil

In [2]:
os.chdir('..')

## Downloading cif from AMSCD (skip)

In [3]:
df = pl.read_excel('./data/AMCSD-data/CIF overzicht fases.xlsx')
df = df.fill_null(strategy='forward')

with pl.Config(tbl_rows=100): 
    print(df)

shape: (31, 5)
┌────────────────────┬───────┬──────────┬─────────────┬─────────┐
│ Chemical formula   ┆ amcsd ┆ ICDD     ┆ Space group ┆ CIF     │
│ ---                ┆ ---   ┆ ---      ┆ ---         ┆ ---     │
│ str                ┆ i64   ┆ str      ┆ str         ┆ i64     │
╞════════════════════╪═══════╪══════════╪═════════════╪═════════╡
│ CdS                ┆ 11540 ┆ null     ┆ P 63 m d    ┆ 1010154 │
│ CdS                ┆ 11540 ┆ null     ┆ F4 3 m      ┆ 1011251 │
│ CdS                ┆ 11540 ┆ null     ┆ F4 3 m      ┆ 1011260 │
│ Cd1-xZnxS          ┆ 11540 ┆ null     ┆ F4 3 m      ┆ 1011260 │
│ CdC2O4             ┆ 11540 ┆ 014-0712 ┆ F4 3 m      ┆ 2012182 │
│ CdCO3              ┆ 11540 ┆ 014-0712 ┆ F4 3 m      ┆ 1011341 │
│ CdSO4              ┆ 11540 ┆ 014-0712 ┆ F4 3 m      ┆ 1533737 │
│ CdSO4.H2O          ┆ 11540 ┆ 014-0712 ┆ F4 3 m      ┆ 2241132 │
│ (CdSO4)3.8H2O      ┆ 11540 ┆ 014-0712 ┆ F4 3 m      ┆ 1010534 │
│ (NH4)2Cd2(SO4)3    ┆ 11540 ┆ 014-0712 ┆ F4 3 m      ┆ 10105

In [4]:
phases = df.select('Chemical formula').to_numpy()[:, 0]

In [5]:
print(phases)

['CdS' 'CdS' 'CdS' 'Cd1-xZnxS' 'CdC2O4' 'CdCO3' 'CdSO4' 'CdSO4.H2O'
 '(CdSO4)3.8H2O' '(NH4)2Cd2(SO4)3' 'Cd2(OH)2SO4' '\xa02 CdSO3*3H2O'
 'CdCl2' 'CdCl2*H2O' 'CdCl2*2.5H2O' 'CdOHCl' 'CdOHCl' 'Cd2(OH)3Cl'
 'Cd4SO4(OH)6*1.5H2O' 'PbSO4' 'PbCO3' '2PbCO3.Pb(OH)2' 'ZnS' 'ZnSO4'
 'ZnC2O4.2H2O' 'MgCO3' 'CaMg(CO3)2' 'CaCO3' 'BaSO4' 'BaSO4.ZnS' 'Na2SO4']


In [6]:
elems = ['Cd S', 'Cd Zn S', 'Cd C O', 'Cd S O', 'Cd S H O', 'N H Cd S O', 'Cd O H S', 'Cd Cl', 'Cd Cl H O',  'Cd S O H', 'Pb S O', 
         'Pb C O', 'Pb C O H', 'Zn S', 'Zn S O', 'Zn C O H', 'Mg C O', 'Ca Mg C O', 'Ca C O', 'Ba S O', 'Ba S O Zn S', 'Na S O']

In [4]:
len(elems)

22

In [7]:
queries = []

for e in elems: 
    e = re.sub('\\s+', ',', e)
    e = f'({e})-'
    queries.append(e)
    print(e)
   

(Cd,S)-
(Cd,Zn,S)-
(Cd,C,O)-
(Cd,S,O)-
(Cd,S,H,O)-
(N,H,Cd,S,O)-
(Cd,O,H,S)-
(Cd,Cl)-
(Cd,Cl,H,O)-
(Cd,S,O,H)-
(Pb,S,O)-
(Pb,C,O)-
(Pb,C,O,H)-
(Zn,S)-
(Zn,S,O)-
(Zn,C,O,H)-
(Mg,C,O)-
(Ca,Mg,C,O)-
(Ca,C,O)-
(Ba,S,O)-
(Ba,S,O,Zn,S)-
(Na,S,O)-


In [22]:
zip_cifs = glob('data/AMCSD-data/amscd-zip-files/**/*.cif', recursive=True)
zip_cifs

['data/AMCSD-data/amscd-zip-files/(CdSO4)3(H2O)8_1010534.cif',
 'data/AMCSD-data/amscd-zip-files/cFLJRMJVuNChxoL/Na2 O3 S\r_0017434.cif',
 'data/AMCSD-data/amscd-zip-files/cFLJRMJVuNChxoL/Na2 O4 S\r_0017433.cif',
 'data/AMCSD-data/amscd-zip-files/cFLJRMJVuNChxoL/Thenardite_0005113.cif',
 'data/AMCSD-data/amscd-zip-files/cFLJRMJVuNChxoL/Thenardite_0018059.cif',
 'data/AMCSD-data/amscd-zip-files/cif/CadmiumChlorideHydrated25_2106551.cif',
 'data/AMCSD-data/amscd-zip-files/cif/CadmiumChlorideHydrated_2106702.cif',
 'data/AMCSD-data/amscd-zip-files/cif/cadmiumHydroxyChloride_1010263.cif',
 'data/AMCSD-data/amscd-zip-files/cif/CadmiumHydroxySulphate_2107256.cif',
 'data/AMCSD-data/amscd-zip-files/cif/CadmiumOxalate_2012182.cif',
 'data/AMCSD-data/amscd-zip-files/cif/CadmiumSulphateHydroxyHydrated_1525852.cif',
 'data/AMCSD-data/amscd-zip-files/cif/CadmiumSulphateMonohydrated_2241132.cif',
 'data/AMCSD-data/amscd-zip-files/cif/CadmiumSulphate_1533737.cif',
 'data/AMCSD-data/amscd-zip-files/c

In [23]:
for i, cf in enumerate(zip_cifs): 
    if 'Zincite' in cf: 
        print(i, cf)

17 data/AMCSD-data/amscd-zip-files/ELVwjmUZsrProKj/Zincite_0005203.cif
18 data/AMCSD-data/amscd-zip-files/ELVwjmUZsrProKj/Zincite_0005204.cif
19 data/AMCSD-data/amscd-zip-files/ELVwjmUZsrProKj/Zincite_0005205.cif
20 data/AMCSD-data/amscd-zip-files/ELVwjmUZsrProKj/Zincite_0005206.cif
21 data/AMCSD-data/amscd-zip-files/ELVwjmUZsrProKj/Zincite_0011555.cif
22 data/AMCSD-data/amscd-zip-files/ELVwjmUZsrProKj/Zincite_0015176.cif
23 data/AMCSD-data/amscd-zip-files/ELVwjmUZsrProKj/Zincite_0018119.cif
24 data/AMCSD-data/amscd-zip-files/ELVwjmUZsrProKj/Zincite_0018120.cif


In [24]:
for cf in zip_cifs: 
    basename = os.path.basename(cf)
    shutil.copy(cf, f'data/AMCSD-data/amscd-cif-files/{basename}')

## Collecting all crystal structures 

I downloaded multiple cif files for each query. Now let's pick one cif file for each crystal structure...

In [25]:
cif_paths = glob('data/AMCSD-data/amscd-cif-files/*.cif')
len(cif_paths)

243

In [26]:
cif_files = [os.path.basename(cp) for cp in cif_paths]

names = [re.split('_', cf)[0] for cf in cif_files]
unique_names = list(set(names))

unique_cif_files = [cif_files[names.index(un)] for un in unique_names]

unique_cif_files

['Lanarkite_0010717.cif',
 'Pb3O2(CO3)\r_0014552.cif',
 'Anglesite_0000665.cif',
 'Calcite_0000098.cif',
 'Thenardite_0005113.cif',
 'CadmiumOxalate_2012182.cif',
 'Shannonite_0014550.cif',
 'Glikinite_0020988.cif',
 'Sphalerite_0000110.cif',
 'Vaterite_0004854.cif',
 'CdCl2_0011804.cif',
 'Na2 O3 S\r_0017434.cif',
 'CadmiumChlorideHydrated_2106702.cif',
 'Magnesite_0000099.cif',
 'Sclarite_01317.cif',
 'Hydrocerussite_0009160.cif',
 'Pb3O2(CO3)_0014552.cif',
 'Wurtzite-2H_0010082.cif',
 'Huntite_0001006.cif',
 'cadmiumHydroxyChloride_1010263.cif',
 'Otavite_0000103.cif',
 'Dolomite_0000086.cif',
 'CdCl2\r_0011804.cif',
 'CadmiumHydroxySulphate_2107256.cif',
 'Greenockite_0011540.cif',
 'CadmiumSulphateMonohydrated_2241132.cif',
 'Na2 O4 S\r_0017433.cif',
 'Na2 O3 S_0017434.cif',
 'Hawleyite_0000070.cif',
 'Baryte_0000164.cif',
 'Zincite_0005203.cif',
 'Wurtzite-4H_0005544.cif',
 'Wurtzite-15R_0015414.cif',
 'CadmiumSulphateHydroxyHydrated_1525852.cif',
 'Na2 O4 S_0017433.cif',
 'Cadmi

In [27]:
unique_cif_files = ['Lanarkite_0010717.cif',
 'Pb3O2(CO3)\r_0014552.cif',
 'Anglesite_0000665.cif',
 'Calcite_0000098.cif',
 'Thenardite_0005113.cif',
 'CadmiumOxalate_2012182.cif',
 'Shannonite_0014550.cif',
 'Glikinite_0020988.cif',
 'Sphalerite_0000110.cif',
 'Vaterite_0004854.cif',
 'CdCl2_0011804.cif',
 'Na2 O3 S\r_0017434.cif',
 'CadmiumChlorideHydrated_2106702.cif',
 'Magnesite_0000099.cif',
 'Sclarite_01317.cif',
 'Hydrocerussite_0009160.cif',
 'Pb3O2(CO3)_0014552.cif',
 'Wurtzite-2H_0010082.cif',
 'Huntite_0001006.cif',
 'cadmiumHydroxyChloride_1010263.cif',
 'Otavite_0000103.cif',
 'Dolomite_0000086.cif',
 'CdCl2\r_0011804.cif',
 'CadmiumHydroxySulphate_2107256.cif',
 'Greenockite_0011540.cif',
 'CadmiumSulphateMonohydrated_2241132.cif',
 'Na2 O4 S\r_0017433.cif',
 'Na2 O3 S_0017434.cif',
 'Hawleyite_0000070.cif',
 'Baryte_0000164.cif',
 'Zincite_0005203.cif',
 'Wurtzite-4H_0005544.cif',
 'Wurtzite-15R_0015414.cif',
 'CadmiumSulphateHydroxyHydrated_1525852.cif',
 'Na2 O4 S_0017433.cif',
 'CadmiumSulphiteHydrated_1541695.cif',
 'CadmiumSulphate_1533737.cif',
 'Wurtzite-10H_0000088.cif',
 'Scotlandite_0015703.cif',
 'Wurtzite_0015179.cif',
 'Zinkosite_0009208.cif',
 'Barite_0000163.cif',
 'Wurtzite-6H_0000115.cif',
 'CadmiumChlorideHydrated25_2106551.cif',
 'Cerussite_0006304.cif',
 'Wurtzite-8H_0000089.cif',
 '(CdSO4)3(H2O)8_1010534.cif',
 'Aragonite_0000233.cif']