# Import the necessary libraries

In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from joblib import dump, load
from matminer.utils.io import load_dataframe_from_json, store_dataframe_as_json

# Define Screening Parameters

In [2]:
# Base name (without extension) of the file the screened materials are written to
outname = 'new_material_direct_band_gap_screening'

# Column name under which the predicted direct band gap class is stored
property_name = 'predicted_direct'

# Label value a material must have to pass the screening.
# The XGB model uses 1/0 labels, whereas the RF model uses True/False labels.
property_threshold = 1

# Load New Formable Perovskites

In [3]:
# Read the table of new perovskite candidates and preview its last rows
new_perovskite_csv = './Data/new_perovskite_filter.csv'
df_new_perovskite_formula = pd.read_csv(new_perovskite_csv)
df_new_perovskite_formula.tail()

Unnamed: 0,formula,H fraction,He fraction,Li fraction,Be fraction,B fraction,C fraction,N fraction,O fraction,F fraction,...,MEGNetElementData minimum embedding 15,MEGNetElementData maximum embedding 15,MEGNetElementData range embedding 15,MEGNetElementData mean embedding 15,MEGNetElementData std_dev embedding 15,MEGNetElementData minimum embedding 16,MEGNetElementData maximum embedding 16,MEGNetElementData range embedding 16,MEGNetElementData mean embedding 16,MEGNetElementData std_dev embedding 16
21016,Nd2IrSbO6,0,0,0.0,0.0,0.0,0.0,0.0,0.6,0.0,...,-0.408075,0.192867,0.600942,0.062033,0.264177,-0.511302,0.312271,0.823573,0.01732,0.291435
21017,Nd2IrScO6,0,0,0.0,0.0,0.0,0.0,0.0,0.6,0.0,...,-0.408075,0.192867,0.600942,0.082775,0.236983,-0.511302,0.330798,0.8421,0.067745,0.302025
21018,Nd2IrSeO6,0,0,0.0,0.0,0.0,0.0,0.0,0.6,0.0,...,-0.408075,0.192867,0.600942,0.07296,0.246873,-0.511302,0.312271,0.823573,0.004185,0.310245
21019,Nd2IrSmO6,0,0,0.0,0.0,0.0,0.0,0.0,0.6,0.0,...,-0.408075,0.192867,0.600942,0.084987,0.235574,-0.511302,0.329468,0.84077,0.067612,0.301825
21020,Ag2AlCuF6,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.6,...,-0.502267,0.50185,1.004118,0.152786,0.57833,-0.347532,0.371508,0.71904,0.12751,0.421119


In [4]:
# Build the model input: every column except the "formula" text column,
# so that only numerical features are passed to the model
X_new_perovskite = df_new_perovskite_formula.drop('formula', axis='columns')
X_new_perovskite.tail()

Unnamed: 0,H fraction,He fraction,Li fraction,Be fraction,B fraction,C fraction,N fraction,O fraction,F fraction,Ne fraction,...,MEGNetElementData minimum embedding 15,MEGNetElementData maximum embedding 15,MEGNetElementData range embedding 15,MEGNetElementData mean embedding 15,MEGNetElementData std_dev embedding 15,MEGNetElementData minimum embedding 16,MEGNetElementData maximum embedding 16,MEGNetElementData range embedding 16,MEGNetElementData mean embedding 16,MEGNetElementData std_dev embedding 16
21016,0,0,0.0,0.0,0.0,0.0,0.0,0.6,0.0,0,...,-0.408075,0.192867,0.600942,0.062033,0.264177,-0.511302,0.312271,0.823573,0.01732,0.291435
21017,0,0,0.0,0.0,0.0,0.0,0.0,0.6,0.0,0,...,-0.408075,0.192867,0.600942,0.082775,0.236983,-0.511302,0.330798,0.8421,0.067745,0.302025
21018,0,0,0.0,0.0,0.0,0.0,0.0,0.6,0.0,0,...,-0.408075,0.192867,0.600942,0.07296,0.246873,-0.511302,0.312271,0.823573,0.004185,0.310245
21019,0,0,0.0,0.0,0.0,0.0,0.0,0.6,0.0,0,...,-0.408075,0.192867,0.600942,0.084987,0.235574,-0.511302,0.329468,0.84077,0.067612,0.301825
21020,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.6,0,...,-0.502267,0.50185,1.004118,0.152786,0.57833,-0.347532,0.371508,0.71904,0.12751,0.421119


# Load Trained Model and Predict Direct Band Gap

In [5]:
# Load the best trained model from disk.
# NOTE: joblib files are pickle-based, so only load model files from a trusted source.
model_path = './Trained model/CS_XGB.joblib'
best_model = load(model_path)

In [6]:
# Predict the direct band gap class for every new perovskite candidate
y_pred_new_perovskite = best_model.predict(X_new_perovskite)

# DataFrame.insert raises ValueError when the column already exists, so remove
# any prediction column left over from a previous run to keep this cell re-runnable.
if property_name in df_new_perovskite_formula.columns:
    df_new_perovskite_formula = df_new_perovskite_formula.drop(columns=[property_name])

# Store the predictions at column position 1 (directly after "formula")
df_new_perovskite_formula.insert(1, property_name, y_pred_new_perovskite)
df_new_perovskite_formula.tail()

Unnamed: 0,formula,predicted_direct,H fraction,He fraction,Li fraction,Be fraction,B fraction,C fraction,N fraction,O fraction,...,MEGNetElementData minimum embedding 15,MEGNetElementData maximum embedding 15,MEGNetElementData range embedding 15,MEGNetElementData mean embedding 15,MEGNetElementData std_dev embedding 15,MEGNetElementData minimum embedding 16,MEGNetElementData maximum embedding 16,MEGNetElementData range embedding 16,MEGNetElementData mean embedding 16,MEGNetElementData std_dev embedding 16
21016,Nd2IrSbO6,0,0,0,0.0,0.0,0.0,0.0,0.0,0.6,...,-0.408075,0.192867,0.600942,0.062033,0.264177,-0.511302,0.312271,0.823573,0.01732,0.291435
21017,Nd2IrScO6,0,0,0,0.0,0.0,0.0,0.0,0.0,0.6,...,-0.408075,0.192867,0.600942,0.082775,0.236983,-0.511302,0.330798,0.8421,0.067745,0.302025
21018,Nd2IrSeO6,0,0,0,0.0,0.0,0.0,0.0,0.0,0.6,...,-0.408075,0.192867,0.600942,0.07296,0.246873,-0.511302,0.312271,0.823573,0.004185,0.310245
21019,Nd2IrSmO6,0,0,0,0.0,0.0,0.0,0.0,0.0,0.6,...,-0.408075,0.192867,0.600942,0.084987,0.235574,-0.511302,0.329468,0.84077,0.067612,0.301825
21020,Ag2AlCuF6,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.502267,0.50185,1.004118,0.152786,0.57833,-0.347532,0.371508,0.71904,0.12751,0.421119


In [7]:
# Work on an independent copy so that later filtering does not alter
# the full table of predictions
df_new_perovskite = df_new_perovskite_formula.copy()

# Summary statistics of the copied table
df_new_perovskite.describe()

Unnamed: 0,predicted_direct,H fraction,He fraction,Li fraction,Be fraction,B fraction,C fraction,N fraction,O fraction,F fraction,...,MEGNetElementData minimum embedding 15,MEGNetElementData maximum embedding 15,MEGNetElementData range embedding 15,MEGNetElementData mean embedding 15,MEGNetElementData std_dev embedding 15,MEGNetElementData minimum embedding 16,MEGNetElementData maximum embedding 16,MEGNetElementData range embedding 16,MEGNetElementData mean embedding 16,MEGNetElementData std_dev embedding 16
count,21021.0,21021.0,21021.0,21021.0,21021.0,21021.0,21021.0,21021.0,21021.0,21021.0,...,21021.0,21021.0,21021.0,21021.0,21021.0,21021.0,21021.0,21021.0,21021.0,21021.0
mean,0.096427,0.0,0.0,0.003696,0.003725,0.003282,0.002236,0.003111,0.452514,0.056115,...,-0.237024,0.304486,0.541511,0.146511,0.225204,-0.236999,0.326249,0.563248,0.067274,0.229121
std,0.295183,0.0,0.0,0.020112,0.019162,0.017818,0.015477,0.017715,0.258231,0.174705,...,0.198998,0.131849,0.217624,0.090195,0.08657,0.167583,0.13863,0.203301,0.080488,0.078242
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.62297,0.171787,0.058509,-0.104609,0.02966,-0.652751,-0.201964,0.013367,-0.307175,0.007088
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6,0.0,...,-0.386133,0.192867,0.380198,0.090752,0.159458,-0.343382,0.249747,0.419349,0.024857,0.177596
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6,0.0,...,-0.241424,0.233502,0.537079,0.128916,0.222336,-0.253415,0.34511,0.557096,0.071892,0.224465
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6,0.0,...,-0.07196,0.395654,0.689222,0.177492,0.279639,-0.136217,0.43061,0.699658,0.111186,0.276926
max,1.0,0.0,0.0,0.2,0.2,0.1,0.2,0.2,0.6,0.6,...,0.431196,0.583951,1.206921,0.511375,0.65038,0.148025,0.552185,1.204936,0.329199,0.520013


# Filter Materials Based on Prediction

In [8]:
# Boolean mask: rows whose predicted class equals the screening threshold
filter_property = df_new_perovskite[property_name].eq(property_threshold)

# Keep only the perovskites predicted to have a direct band gap,
# then renumber the remaining rows from 0
df_new_perovskite = df_new_perovskite.loc[filter_property].reset_index(drop=True)
df_new_perovskite

Unnamed: 0,formula,predicted_direct,H fraction,He fraction,Li fraction,Be fraction,B fraction,C fraction,N fraction,O fraction,...,MEGNetElementData minimum embedding 15,MEGNetElementData maximum embedding 15,MEGNetElementData range embedding 15,MEGNetElementData mean embedding 15,MEGNetElementData std_dev embedding 15,MEGNetElementData minimum embedding 16,MEGNetElementData maximum embedding 16,MEGNetElementData range embedding 16,MEGNetElementData mean embedding 16,MEGNetElementData std_dev embedding 16
0,CsZrCl3,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.148203,0.583951,0.732154,0.190222,0.310859,0.052268,0.148025,0.095757,0.116296,0.053755
1,SbBaI3,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.221856,0.352671,0.574527,0.158301,0.263089,-0.201964,0.275374,0.477338,-0.100793,0.251770
2,SbBaBr3,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.221856,0.352671,0.574527,0.166264,0.266557,-0.173449,0.275374,0.448823,-0.038555,0.213334
3,LuBaCl3,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.054088,0.352671,0.298583,0.184424,0.127858,0.148025,0.442883,0.294858,0.232466,0.155273
4,LuAmI3,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.034489,0.220231,0.185742,0.149854,0.115479,-0.201964,0.442883,0.644847,-0.028290,0.335398
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022,Nd2ILiO6,1,0,0,0.1,0.0,0.0,0.0,0.0,0.6,...,0.046533,0.299678,0.253144,0.177017,0.095115,-0.201964,0.312271,0.514235,0.096234,0.206003
2023,Nd2InYO6,1,0,0,0.0,0.0,0.0,0.0,0.0,0.6,...,-0.241424,0.192867,0.434290,0.105446,0.174541,-0.001872,0.422425,0.424298,0.127850,0.194358
2024,Nd2LiNbO6,1,0,0,0.1,0.0,0.0,0.0,0.0,0.6,...,-0.344212,0.299678,0.643890,0.120573,0.223771,-0.017283,0.312271,0.329554,0.114702,0.169528
2025,Nd2InWO6,1,0,0,0.0,0.0,0.0,0.0,0.0,0.6,...,-0.424430,0.192867,0.617297,0.058441,0.272848,-0.205435,0.312271,0.517705,0.065064,0.187534


# Save Screened Materials

In [9]:
# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before writing.
output_dir = Path('./Output')
output_dir.mkdir(parents=True, exist_ok=True)

# Write the screened materials to "<outname>.csv" without the row index
df_new_perovskite.to_csv(output_dir / f'{outname}.csv', index=False)