## This notebook generates the "PredictionSet_3" file called in "Part IV" of the "Iter_n.ipynb" notebooks.

In [1]:
import pandas as pd

from rdkit import Chem as CH
from rdkit.Chem import Descriptors
from rdkit.Chem.rdMolDescriptors import CalcMolFormula

from pymatgen.core.composition import Composition

from matminer.featurizers.composition import ElectronAffinity
from matminer.featurizers.composition import ElementFraction



## 1. Filtered dataset of 2D bilayers
### Note: Four oxides (MO$_2$, M = W, Hf, Ti, Zr) were removed due to major reconstruction in their lattice post intercalation with test molecules

In [2]:
# Load the 2D-bilayer table; the 'MatFormula' column is the join key used
# further down when pairing bilayers with molecules.
MaterialFeatures = pd.read_csv(
    filepath_or_buffer='DataSource/MaterialFeatures2',
)
MaterialFeatures

Unnamed: 0,Name,SeperationE,a,b,c,SG,MatFormula
0,MoS2,1.766763,3.154333,3.154333,27.427,164.0,Mo18S36
1,MoSe2,2.056959,3.285,3.285,28.094,164.0,Mo18Se36
2,MoTe2,2.636494,3.493333,3.493333,28.999,164.0,Mo18Te36
3,MoO2,1.202318,2.828333,2.828333,25.01,164.0,Mo18O36
4,WS2,1.86527,3.162333,3.162333,27.46,164.0,W18S36
5,WSe2,2.142525,3.291667,3.291667,28.154,164.0,W18Se36
6,WTe2,2.694539,3.508333,3.508333,29.028,164.0,W18Te36
7,WO2,1.340843,2.832333,2.832333,25.022,164.0,W18O36
8,HfS2,1.844773,3.531667,3.531667,27.36,164.0,Hf18S36
9,HfSe2,2.075391,3.678,3.678,27.514,164.0,Hf18Se36


## 2. Dataset of planar molecules from PubChem filtered by: 
### (a) PBF < 1E-04
### (b) charge neutrality
### (c) no uncommon isotopes or valencies
### (d) fit within 2D bilayer diagonal

In [3]:
# Load the filtered planar-molecule table (comma-separated, no column used
# as the index); 'CID' and 'SMILES' are the columns consumed further down.
PlanarMol = pd.read_csv(
    'DataSource/filtered_planar_mol_by_fit_with_features',
    sep=',',
    index_col=None,
)
PlanarMol

Unnamed: 0,CID,SMILES,TPSA,HeavyAtoms,AromaticRings,MolWt,LabuteASA,SPAN,GeDi
0,13,C1=CC(=C(C=C1Cl)Cl)Cl,0.00,9,1,181.449,68.341202,3.481711,6.260667
1,66,C1=CC(=C(C=C1Cl)O)Cl,20.23,9,1,163.003,62.832169,3.313690,6.323829
2,101,C1=CC(=CC(=C1)O)C=O,37.30,9,1,122.123,52.752101,3.360840,6.313907
3,240,C1=CC=C(C=C1)C=O,17.07,8,1,106.124,47.957867,3.345250,6.138852
4,241,C1=CC=CC=C1,0.00,6,1,78.114,37.431403,2.494200,4.970533
...,...,...,...,...,...,...,...,...,...
10113,5367417,C=CC=CC=C,0.00,6,0,80.130,38.495153,3.942085,7.841565
10114,137747,C=C=C=C,0.00,4,0,52.076,25.765268,2.701678,5.242299
10115,5463058,CC=CC=CC=C,0.00,7,0,94.157,44.860095,4.174226,8.014709
10116,139187,C=CC=C=C,0.00,5,0,66.103,32.130210,3.220374,6.013381


## 3. Dataset of all possible hybrid materials made using datasets from 1. and 2.

In [4]:
# Build the prediction set: one row per (planar molecule, 2D bilayer) pair,
# carrying the bilayer features, the molecule features and the element
# fractions of the combined bilayer + molecule formula.

# CID is cast to str so the pair table below and PlanarMol share one
# merge-key dtype.
PlanarMol = PlanarMol.astype({'CID': 'str'})

## Making all possible MX2/SMILES combinations
# Molecules vary slowest and bilayers fastest: every bilayer for the first
# CID, then every bilayer for the second CID, and so on. The two keys are
# kept as separate columns from the start instead of being joined into a
# 'CID-MatFormula' string and split apart again, so the pairing does not
# depend on neither key containing '-'.
pair_rows = [
    (mat_formula, cid)
    for cid in PlanarMol['CID']
    for mat_formula in MaterialFeatures['MatFormula']
]
TestSet = pd.DataFrame(pair_rows, columns=['MatFormula', 'CID'])

## Merging with above datasets 1. and 2. to get features for all combinations.
# Left merges keep the pair order built above.
TestSet = TestSet.merge(MaterialFeatures, on=['MatFormula'], how='left')
TestSet = TestSet.merge(PlanarMol, on=['CID'], how='left')

## Molecular formula of every molecule
# Each distinct SMILES is parsed once and the result is reused for all of its
# rows, rather than re-parsing the same molecule for every bilayer.
formula_by_smiles = {}
for smiles in TestSet['SMILES'].unique():
    mol = CH.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; fail here
        # with the offending SMILES instead of inside CalcMolFormula.
        raise ValueError(f'RDKit could not parse SMILES {smiles!r}')
    formula_by_smiles[smiles] = CalcMolFormula(mol)
gas_formula = TestSet['SMILES'].map(formula_by_smiles)

## Element fractions of the combined system
# The bilayer formula and the molecule formula are concatenated into a single
# formula string, which pymatgen parses into a Composition.
compound_formula = TestSet['MatFormula'] + gas_formula
TestSet['Composition'] = [Composition(formula) for formula in compound_formula]
TestSet = ElementFraction().featurize_dataframe(TestSet, 'Composition')

# Remove the helper column and the features that are not kept in the
# prediction set.
TestSet = TestSet.drop(
    columns=['b', 'SG', 'c', 'Composition', 'MolWt', 'MatFormula', 'HeavyAtoms']
)

# The export is left disabled, as in the original cell; uncomment to write
# the 'PredictionSet_3' file.
#TestSet.to_csv('PredictionSet_3', index=False)

TestSet

ElementFraction:   0%|          | 0/212478 [00:00<?, ?it/s]

Unnamed: 0,CID,Name,SeperationE,a,SMILES,TPSA,AromaticRings,LabuteASA,SPAN,GeDi,...,Pu,Am,Cm,Bk,Cf,Es,Fm,Md,No,Lr
0,13,MoS2,1.766763,3.154333,C1=CC(=C(C=C1Cl)Cl)Cl,0.0,1,68.341202,3.481711,6.260667,...,0,0,0,0,0,0,0,0,0,0
1,13,MoSe2,2.056959,3.285000,C1=CC(=C(C=C1Cl)Cl)Cl,0.0,1,68.341202,3.481711,6.260667,...,0,0,0,0,0,0,0,0,0,0
2,13,MoTe2,2.636494,3.493333,C1=CC(=C(C=C1Cl)Cl)Cl,0.0,1,68.341202,3.481711,6.260667,...,0,0,0,0,0,0,0,0,0,0
3,13,MoO2,1.202318,2.828333,C1=CC(=C(C=C1Cl)Cl)Cl,0.0,1,68.341202,3.481711,6.260667,...,0,0,0,0,0,0,0,0,0,0
4,13,WS2,1.865270,3.162333,C1=CC(=C(C=C1Cl)Cl)Cl,0.0,1,68.341202,3.481711,6.260667,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
212473,13593933,CrTe2,2.477311,3.407000,CC=C=C=C,0.0,0,32.130210,3.314275,6.547516,...,0,0,0,0,0,0,0,0,0,0
212474,13593933,CrO2,0.823892,2.622667,CC=C=C=C,0.0,0,32.130210,3.314275,6.547516,...,0,0,0,0,0,0,0,0,0,0
212475,13593933,ZrS2,1.895850,3.567333,CC=C=C=C,0.0,0,32.130210,3.314275,6.547516,...,0,0,0,0,0,0,0,0,0,0
212476,13593933,ZrSe2,2.158777,3.706333,CC=C=C=C,0.0,0,32.130210,3.314275,6.547516,...,0,0,0,0,0,0,0,0,0,0
