# Import the necessary libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, f1_score, make_scorer

from joblib import dump

# Import train & test data

In [2]:
# Load training dataset
# X_train: Feature matrix (input variables)
# y_train: Target labels (output variable)
X_train = pd.read_csv('./Data/X_train.csv')
y_train = pd.read_csv('./Data/y_train.csv')


In [3]:
X_train.head()

Unnamed: 0,mp_id_bg,formula,symmetry,H fraction,He fraction,Li fraction,Be fraction,B fraction,C fraction,N fraction,...,MEGNetElementData minimum embedding 15,MEGNetElementData maximum embedding 15,MEGNetElementData range embedding 15,MEGNetElementData mean embedding 15,MEGNetElementData std_dev embedding 15,MEGNetElementData minimum embedding 16,MEGNetElementData maximum embedding 16,MEGNetElementData range embedding 16,MEGNetElementData mean embedding 16,MEGNetElementData std_dev embedding 16
0,mp-1184117,CuPbO3,Cubic,0.0,0,0.0,0.0,0.0,0.0,0.0,...,-0.386133,0.192867,0.578999,0.049895,0.299698,-0.347532,0.038902,0.386433,-0.102941,0.233697
1,mp-1114061,Rb2InSbCl6,Cubic,0.0,0,0.0,0.0,0.0,0.0,0.0,...,-0.241424,0.356478,0.597902,0.12804,0.2542,-0.173449,0.148025,0.321474,0.061715,0.148584
2,mp-1217607,TbBO3,Orthorhombic,0.0,0,0.0,0.0,0.2,0.0,0.0,...,-0.177121,0.192867,0.369988,0.077839,0.200731,-0.171613,0.552185,0.723798,0.099455,0.321516
3,mp-1113269,Cs2NdAgI6,Cubic,0.0,0,0.0,0.0,0.0,0.0,0.0,...,-0.502267,0.583951,1.086218,0.203355,0.374606,-0.343382,0.312271,0.655653,-0.113836,0.240703
4,mp-1521867,KBaNdWO6,Cubic,0.0,0,0.0,0.0,0.0,0.0,0.0,...,-0.42443,0.352671,0.777101,0.137808,0.258369,-0.205435,0.312271,0.517705,0.0634,0.175485


In [4]:
y_train.head()

Unnamed: 0,is_direct
0,False
1,True
2,False
3,False
4,False


In [5]:
# Drop unnecessary columns that are not relevant for model training
X_train = X_train.drop(columns=['mp_id_bg', 'formula', 'symmetry'])

# Calculate data distribution

In [6]:
# Function to calculate dataset distribution between direct and indirect band gap classes
def distribution(dataset):
    len_direct = len(dataset[dataset['is_direct'] == True])
    len_indirect = len(dataset[dataset['is_direct'] == False])

    ratio_indirect_sample = int(100*(len_indirect/(len_direct+len_indirect)))
    ratio_direct_sample = 100 - ratio_indirect_sample

    return len_direct, len_indirect, ratio_direct_sample, ratio_indirect_sample

In [7]:
# Compute class distribution for imbalanced dataset
len_direct, len_indirect, ratio_direct_sample, ratio_indirect_sample = distribution(y_train)

print('n direct: ', len_direct, ', n indirect: ', len_indirect)
print('ratio: ', ratio_direct_sample, ' : ', ratio_indirect_sample)

n direct:  1095 , n indirect:  3357
ratio:  25  :  75


# Random Forest (RF)

In [8]:
# Cost-Sensitive Random Forest (CS-RF) Pipeline
# Define model pipeline with feature scaling and cost-sensitive RF classifier
pipeline_rf = Pipeline([
    ('scaler', MinMaxScaler(feature_range=(0,1))),
    ('rf', RandomForestClassifier(random_state=42, class_weight='balanced'))
])

# Define hyperparameter search space for Random Forest
param_rscv_rf = {
    'rf__max_depth': np.arange(1, 100 + 1, 1),
    'rf__min_samples_leaf': np.arange(1, 11, 1),
    'rf__min_samples_split': np.arange(1, 11, 1),
    'rf__bootstrap': [True,False],
    'rf__n_estimators': np.arange(100, 501, 1)
}

# Perform hyperparameter tuning using RandomizedSearchCV
rscv_cs_rf = RandomizedSearchCV(pipeline_rf, param_distributions=param_rscv_rf, n_iter=100, 
                                   scoring='f1', cv=5, verbose=1, random_state=42, n_jobs=16)

# Train the CS-RF model
rscv_cs_rf.fit(X_train, y_train.values.ravel())

Fitting 5 folds for each of 100 candidates, totalling 500 fits


50 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\ASUS\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\ASUS\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ASUS\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\pipeline.py", line 420, in fit
    self._final_estimator.fit(Xt, y, **fit_para

# eXtreme Gradient Boosting (XGB)

In [9]:
# Cost-Sensitive XGBoost (CS-XGB) Pipeline
# Define model pipeline with feature scaling and cost-sensitive XGBoost classifier
pipeline_xgb = Pipeline([
    ('scaler', MinMaxScaler(feature_range=(0,1))),
    ('xgb', XGBClassifier(random_state=42, 
                          scale_pos_weight = (len_indirect/len_direct), 
                          max_delta_step = 10))
])

# Define hyperparameter search space for XGBoost
max_depth = np.arange(3, 30 + 1,1)
subsample = np.arange(0.1, 1.1, 0.1)
colsample_bytree = np.arange(0.1, 1.1, 0.1)
colsample_bylevel = np.arange(0.1, 1.1, 0.1)
n_estimators = np.arange(100, 501, 1)

param_rscv_xgb = {
    'xgb__max_depth': max_depth,
    'xgb__subsample': subsample,
    'xgb__colsample_bytree': colsample_bytree,
    'xgb__colsample_bylevel': colsample_bylevel,
    'xgb__n_estimators': n_estimators
}

# Perform hyperparameter tuning using RandomizedSearchCV
rscv_cs_xgb = RandomizedSearchCV(pipeline_xgb, param_distributions=param_rscv_xgb, n_iter=100, 
                                   scoring='f1', cv=5, verbose=1, random_state=42, n_jobs=16)

# Train the CS-XGB model
rscv_cs_xgb.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


# Save Trained Models

In [10]:
# Export CS-RF model
dump(rscv_cs_rf, './Trained model/CS_RF.joblib') 

# Export CS-XGB model
dump(rscv_cs_xgb, './Trained model/CS_XGB.joblib') 

['./Trained model/CS_XGB.joblib']