# Import the necessary libraries

In [55]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, f1_score, make_scorer

from joblib import dump

# Import train & test data

In [56]:
# Define the undersampling method and ratio (50:50, 40:60, 30:70)
# Options: 'RandomUnderSampler_50_50', 'ClusterCentroids_40_60', 'NearMiss_30_70', etc.
# Example: 'RandomUnderSampler_30_70' applies RandomUnderSampler with a 30:70 direct-to-indirect ratio
rs_type = 'NearMiss_50_50'

# Load training dataset
# X_train: Feature matrix (input variables)
# y_train: Target labels (output variable)
X_train = pd.read_csv('./Data/Undersampling/X_train_'+ rs_type +'.csv')
y_train = pd.read_csv('./Data/Undersampling/y_train_' + rs_type + ".csv")


In [57]:
X_train.head()

Unnamed: 0,H fraction,He fraction,Li fraction,Be fraction,B fraction,C fraction,N fraction,O fraction,F fraction,Ne fraction,...,MEGNetElementData minimum embedding 15,MEGNetElementData maximum embedding 15,MEGNetElementData range embedding 15,MEGNetElementData mean embedding 15,MEGNetElementData std_dev embedding 15,MEGNetElementData minimum embedding 16,MEGNetElementData maximum embedding 16,MEGNetElementData range embedding 16,MEGNetElementData mean embedding 16,MEGNetElementData std_dev embedding 16
0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.6,0.0,0,...,-0.344212,0.352671,0.696883,0.141038,0.244983,-0.017283,0.348429,0.365712,0.118504,0.175728
1,0.0,0,0.0,0.0,0.0,0.0,0.0,0.6,0.0,0,...,-0.344212,0.431196,0.775408,0.164848,0.249134,-0.017283,0.412166,0.429449,0.116456,0.177063
2,0.0,0,0.0,0.0,0.0,0.0,0.0,0.6,0.0,0,...,-0.344212,0.431196,0.775408,0.164339,0.253004,-0.017283,0.312271,0.329554,0.109029,0.155961
3,0.0,0,0.0,0.0,0.0,0.0,0.0,0.6,0.0,0,...,-0.344212,0.431196,0.775408,0.157177,0.259986,-0.017283,0.336615,0.353898,0.111464,0.161437
4,0.0,0,0.0,0.0,0.0,0.0,0.0,0.6,0.0,0,...,-0.344212,0.431196,0.775408,0.155322,0.257751,-0.017283,0.336615,0.353898,0.108901,0.157354


In [58]:
y_train.head()

Unnamed: 0,is_direct
0,False
1,False
2,False
3,False
4,False


# Calculate data distribution

In [59]:
# Function to calculate dataset distribution between direct and indirect band gap classes
def distribution(dataset):
    len_direct = len(dataset[dataset['is_direct'] == True])
    len_indirect = len(dataset[dataset['is_direct'] == False])

    ratio_indirect_sample = int(100*(len_indirect/(len_direct+len_indirect)))
    ratio_direct_sample = 100 - ratio_indirect_sample

    print('n direct: ', len_direct, ', n indirect: ', len_indirect)
    print('ratio: ', ratio_direct_sample, ' : ', ratio_indirect_sample)

    return len_direct, len_indirect, ratio_direct_sample, ratio_indirect_sample

In [60]:
# Compute class distribution for imbalanced dataset
len_direct, len_indirect, ratio_direct_sample, ratio_indirect_sample = distribution(y_train)

n direct:  1095 , n indirect:  1095
ratio:  50  :  50


# Random Forest (RF)

In [61]:
# Random Forest Pipeline
# Define model pipeline with feature scaling and set the hyperparameter that have been tuned before
pipeline_rf = Pipeline([
    ('scaler', MinMaxScaler(feature_range=(0,1))),
    ('xgb', RandomForestClassifier(random_state = 42, n_estimators = 487, 
                                min_samples_split = 2, min_samples_leaf = 1, 
                                max_depth = 70, bootstrap = False))
])

# Train the RF model
pipeline_rf.fit(X_train, y_train.values.ravel())

# eXtreme Gradient Boosting (XGB)

In [62]:
# XGB Pipeline
# Define model pipeline with feature scaling and set the hyperparameter that have been tuned before
pipeline_xgb = Pipeline([
    ('scaler', MinMaxScaler(feature_range=(0,1))),
    ('xgb', XGBClassifier(random_state=42, max_depth = 9, subsample = 1.0,
                    colsample_bytree = 0.9, colsample_bylevel = 0.5,
                    n_estimators = 220))
])

# Train the XGB model
pipeline_xgb.fit(X_train, y_train.values.ravel())

# Save Trained Models

In [63]:
# Export RF model
dump(pipeline_rf, './Trained model/RF_'+rs_type+'.joblib') 

# Export XGB model
dump(pipeline_xgb, './Trained model/XGB_'+rs_type+'.joblib') 

['./Trained model/XGB_NearMiss_50_50.joblib']