# Import the necessary libraries

In [91]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, f1_score, make_scorer

from joblib import dump

# Import train & test data

In [None]:
# Tag of the resampled training set to load: '<method>_<direct>_<indirect>'
# Methods: 'ros', 'smote', 'adasyn'. Examples: 'ros_50_50', 'smote_40_60', 'adasyn_30_70'
# e.g. 'smote_30_70' means SMOTE with a 30:70 direct-to-indirect ratio
rs_type = 'ros_30_70'

# Read the training files that belong to the selected tag
# X_train: feature matrix (input variables)
# y_train: target labels (output variable)
X_train = pd.read_csv(f'./Data/Oversampling/X_train_{rs_type}.csv')
y_train = pd.read_csv(f'./Data/Oversampling/y_train_{rs_type}.csv')


In [93]:
# Preview the first five rows of the training feature matrix
X_train.head(n=5)

Unnamed: 0,H fraction,He fraction,Li fraction,Be fraction,B fraction,C fraction,N fraction,O fraction,F fraction,Ne fraction,...,MEGNetElementData minimum embedding 15,MEGNetElementData maximum embedding 15,MEGNetElementData range embedding 15,MEGNetElementData mean embedding 15,MEGNetElementData std_dev embedding 15,MEGNetElementData minimum embedding 16,MEGNetElementData maximum embedding 16,MEGNetElementData range embedding 16,MEGNetElementData mean embedding 16,MEGNetElementData std_dev embedding 16
0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.6,0.0,0,...,-0.386133,0.192867,0.578999,0.049895,0.299698,-0.347532,0.038902,0.386433,-0.102941,0.233697
1,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,...,-0.241424,0.356478,0.597902,0.12804,0.2542,-0.173449,0.148025,0.321474,0.061715,0.148584
2,0.0,0,0.0,0.0,0.2,0.0,0.0,0.6,0.0,0,...,-0.177121,0.192867,0.369988,0.077839,0.200731,-0.171613,0.552185,0.723798,0.099455,0.321516
3,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,...,-0.502267,0.583951,1.086218,0.203355,0.374606,-0.343382,0.312271,0.655653,-0.113836,0.240703
4,0.0,0,0.0,0.0,0.0,0.0,0.0,0.6,0.0,0,...,-0.42443,0.352671,0.777101,0.137808,0.258369,-0.205435,0.312271,0.517705,0.0634,0.175485


In [94]:
# Preview the first five rows of the training labels
y_train.head(n=5)

Unnamed: 0,is_direct
0,False
1,True
2,False
3,False
4,False


# Calculate data distribution

In [95]:
# Function to calculate dataset distribution between direct and indirect band gap classes
def distribution(dataset):
    """Count direct / indirect band gap samples and print their percentage split.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table with an ``is_direct`` column. Rows equal to ``True`` are counted
        as direct, rows equal to ``False`` as indirect; any other value is
        ignored by both counts.

    Returns
    -------
    tuple of int
        ``(len_direct, len_indirect, ratio_direct_sample, ratio_indirect_sample)``.
        The two ratios are whole percentages that sum to 100. When no row is
        counted (e.g. an empty table) both ratios are 0.
    """
    labels = dataset['is_direct']
    len_direct = int((labels == True).sum())
    len_indirect = int((labels == False).sum())
    total = len_direct + len_indirect

    if total == 0:
        # Nothing to compare: avoid dividing by zero and report an empty split.
        ratio_direct_sample = 0
        ratio_indirect_sample = 0
    else:
        # Multiply before dividing and round to the nearest percent.
        # Truncating 100 * (n / total) misreports exact splits because of
        # float error (57 / 100 -> 56.99999999999999 -> 56).
        ratio_indirect_sample = round(100 * len_indirect / total)
        ratio_direct_sample = 100 - ratio_indirect_sample

    print('n direct: ', len_direct, ', n indirect: ', len_indirect)
    print('ratio: ', ratio_direct_sample, ' : ', ratio_indirect_sample)

    return len_direct, len_indirect, ratio_direct_sample, ratio_indirect_sample

In [96]:
# Class counts and percentage split of the loaded training labels
(
    len_direct,
    len_indirect,
    ratio_direct_sample,
    ratio_indirect_sample,
) = distribution(y_train)

n direct:  1409 , n indirect:  3357
ratio:  30  :  70


# Random Forest (RF)

In [97]:
# Random Forest Pipeline
# Min-max scale the features to [0, 1], then fit a Random Forest that uses
# the hyperparameters found in an earlier tuning run.
# NOTE: the estimator step is keyed 'rf'. Code that loads the saved pipeline
# must address this step as 'rf' (e.g. named_steps['rf']), not 'xgb'.
pipeline_rf = Pipeline([
    ('scaler', MinMaxScaler(feature_range=(0, 1))),
    ('rf', RandomForestClassifier(random_state=42, n_estimators=487,
                                  min_samples_split=2, min_samples_leaf=1,
                                  max_depth=70, bootstrap=False))
])

# Train the RF model; the label frame is flattened to a 1-D array for fit()
pipeline_rf.fit(X_train, y_train.values.ravel())

# eXtreme Gradient Boosting (XGB)

In [98]:
# XGBoost pipeline: min-max scaling to [0, 1] followed by the classifier,
# configured with the hyperparameters found in an earlier tuning run
pipeline_xgb = Pipeline(steps=[
    ('scaler', MinMaxScaler(feature_range=(0, 1))),
    ('xgb', XGBClassifier(
        n_estimators=220,
        max_depth=9,
        subsample=1.0,
        colsample_bytree=0.9,
        colsample_bylevel=0.5,
        random_state=42,
    )),
])

# Fit on the training set; the label frame is flattened to a 1-D array
pipeline_xgb.fit(X_train, y_train.values.ravel())

# Save Trained Models

In [99]:
# Persist the fitted Random Forest pipeline, tagged with the resampling type
dump(pipeline_rf, f'./Trained model/RF_{rs_type}.joblib')

# Persist the fitted XGBoost pipeline (its returned file list is the cell output)
dump(pipeline_xgb, f'./Trained model/XGB_{rs_type}.joblib')

['./Trained model/XGB_ros_30_70.joblib']