In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import tqdm
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbpipeline
from matplotlib import pyplot
from sklearn import preprocessing
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectPercentile, SelectKBest, f_regression, f_classif, chi2, VarianceThreshold
from sklearn.metrics import f1_score, roc_auc_score, roc_curve
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.pipeline import Pipeline

In [2]:
class MyDecorrelator(BaseEstimator, TransformerMixin):
    """Drop features that are strongly correlated with an earlier feature.

    ``fit`` computes the column-by-column correlation matrix with
    ``DataFrame.corr()`` (default settings). A column is flagged when the
    absolute value of its correlation with *any column positioned before it*
    exceeds ``threshold``; the first column of a correlated pair is therefore
    always kept. ``transform`` removes the flagged columns by label.

    Parameters
    ----------
    threshold : float
        Absolute correlation above which the later column of a pair is dropped.

    Attributes
    ----------
    correlated_features : set
        Labels of the columns flagged by the most recent ``fit`` call.
    correlated_columns : set or None
        Same object as ``correlated_features`` once fitted; ``None`` before.
    """

    def __init__(self, threshold):
        self.threshold = threshold
        # Populated by fit(); kept so existing readers of this attribute still work.
        self.correlated_columns = None

    def fit(self, X, y=None):
        """Determine which columns of ``X`` to drop.

        Parameters
        ----------
        X : array-like or DataFrame
            Feature table; wrapped in a ``DataFrame`` before use.
        y : ignored
            Present only for pipeline compatibility.

        Returns
        -------
        self
        """
        frame = pd.DataFrame(X)
        corr_matrix = frame.corr()

        # Entry (i, j) with j < i is True when |corr(col_i, col_j)| > threshold.
        # NaN correlations compare False, i.e. they never flag a column.
        exceeds = np.abs(corr_matrix.to_numpy()) > self.threshold
        flagged = np.tril(exceeds, k=-1).any(axis=1)

        self.correlated_features = set(corr_matrix.columns[flagged])
        self.correlated_columns = self.correlated_features
        return self

    def transform(self, X, y=None, **kwargs):
        """Return ``X`` as a ``DataFrame`` without the columns flagged by ``fit``.

        Raises
        ------
        AttributeError
            If called before ``fit``.
        """
        flagged = getattr(self, "correlated_features", None)
        if flagged is None:
            raise AttributeError(
                "MyDecorrelator has not been fitted; call fit() before transform()."
            )
        return pd.DataFrame(X).drop(labels=list(flagged), axis=1)

In [3]:
# Load the source table; column 0 of the sheet is used as the row index.
Data = pd.read_excel(
    r'database.xlsx',
    index_col=0,
)

In [4]:
# Substrings whose presence in a row's index label excludes that row.
# NOTE(review): this is a plain substring test, so 'O' also matches labels
# containing e.g. 'Os' -- confirm that this is intended.
EXCLUDED_LABEL_TOKENS = ('Cu', 'Fe', 'Ni', 'O')
TC_UPPER_LIMIT = 50       # rows with Tc above this value are discarded
TC_CLASS_THRESHOLD = 15   # Tc >= this value -> class 1, otherwise class 0

# Collect the index labels to discard. Dropping is label-based, so every row
# sharing a collected label is removed.
rows_to_drop = [
    label
    for label, tc in tqdm.tqdm(zip(Data.index, Data['Tc']), total=len(Data))
    if any(token in label for token in EXCLUDED_LABEL_TOKENS) or tc > TC_UPPER_LIMIT
]
data_reduced = Data.drop(rows_to_drop)

# Binary target computed in one vectorised step. The previous element-wise
# `data_reduced['class'].iloc[i] = 1` was chained assignment, which warns and
# does not write through when pandas Copy-on-Write is active.
data_reduced['class'] = (data_reduced['Tc'] >= TC_CLASS_THRESHOLD).astype('int64')

# 'class' stays the last column; later cells select features with iloc[:, :-1].
data = data_reduced.drop(columns='Tc')

train_df, test_df = train_test_split(data, test_size=0.15, random_state=0)

100%|██████████| 16763/16763 [00:00<00:00, 171001.84it/s]


In [9]:
# Pipeline for ETC-vanilla:
# decorrelation -> constant-feature removal -> univariate selection -> ExtraTrees.

etc = ExtraTreesClassifier(random_state=0)

pipe = Pipeline([
    ('decorrelation', MyDecorrelator(0.9)),
    ('threshold', VarianceThreshold(threshold=0)),
    ('feature_selector', SelectPercentile(f_classif)),
    ('etc', etc),
], verbose=1)

param_grid = {
    # Number of trees in the ensemble.
    "etc__n_estimators": [100, 250, 500, 750, 1000],
    # Fraction of features considered at each split. The first entry must be
    # the float 1.0 (all features): an int 1 would mean "exactly one feature".
    "etc__max_features": [1.0, 0.9, 0.8, 0.7, 0.6, 0.5],
    # Percentage of features retained, ranked by their f_classif score.
    "feature_selector__percentile": [50, 75, 100],
}

stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
search = GridSearchCV(pipe, param_grid, n_jobs=-1, verbose=1, cv=stratified_kfold)
# Features are every column but the last; the last column is the class label.
search.fit(train_df.iloc[:, :-1], train_df.iloc[:, -1])

In [None]:
# Pipeline for ETC-SMOTE: same as the vanilla pipeline with SMOTE oversampling
# inserted before the univariate selection step.
# SMOTE is a resampler, so the pipeline class must be imbalanced-learn's
# Pipeline (referred to here as `imbpipeline`), not sklearn's.

etc = ExtraTreesClassifier(random_state=0)

pipe = imbpipeline([
    ('decorrelation', MyDecorrelator(0.9)),
    ('threshold', VarianceThreshold(threshold=0)),
    ('smote', SMOTE(random_state=0)),
    ('feature_selector', SelectPercentile(f_classif)),
    ('etc', etc),
], verbose=1)

param_grid = {
    # Number of trees in the ensemble.
    "etc__n_estimators": [100, 250, 500, 750, 1000],
    # Fraction of features considered at each split. The first entry must be
    # the float 1.0 (all features): an int 1 would mean "exactly one feature".
    "etc__max_features": [1.0, 0.9, 0.8, 0.7, 0.6, 0.5],
    # Percentage of features retained, ranked by their f_classif score.
    "feature_selector__percentile": [50, 75, 100],
}

stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
search = GridSearchCV(pipe, param_grid, n_jobs=-1, verbose=1, cv=stratified_kfold)
# Features are every column but the last; the last column is the class label.
search.fit(train_df.iloc[:, :-1], train_df.iloc[:, -1])

In [None]:
# Pipeline for ETC-vanilla 81: ExtraTrees on the first N_TOP_FEATURES features
# listed in the SHAP table (taken in the order the file stores them).
N_TOP_FEATURES = 81

SHAP = pd.read_excel(r'SHAP_for_ETR_metallic_mean.xlsx', index_col=0)

# .copy() gives an independent frame, so adding the label column below does not
# write into a column-subset of `data` (which triggers SettingWithCopyWarning).
train_data = data[SHAP.index[:N_TOP_FEATURES]].copy()
train_data['class'] = data['class']

# NOTE: this rebinds the notebook-level train_df / test_df used by other cells.
train_df, test_df = train_test_split(train_data, test_size=0.15, random_state=0)

etc = ExtraTreesClassifier(random_state=0)

pipe = Pipeline([
    ('etc', etc),
], verbose=1)

param_grid = {
    # Number of trees in the ensemble.
    "etc__n_estimators": [100, 250, 500, 750, 1000],
    # Fraction of features considered at each split. The first entry must be
    # the float 1.0 (all features): an int 1 would mean "exactly one feature".
    "etc__max_features": [1.0, 0.9, 0.8, 0.7, 0.6, 0.5],
}

stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
search = GridSearchCV(pipe, param_grid, n_jobs=-1, verbose=1, cv=stratified_kfold)
# Features are every column but the last; the last column is the class label.
search.fit(train_df.iloc[:, :-1], train_df.iloc[:, -1])

In [None]:
# Pipeline for ETC-SMOTE 81: SMOTE oversampling followed by ExtraTrees on the
# first N_TOP_FEATURES features listed in the SHAP table (in file order).
# SMOTE is a resampler, so the pipeline class must be imbalanced-learn's
# Pipeline (referred to here as `imbpipeline`), not sklearn's.
N_TOP_FEATURES = 81

SHAP = pd.read_excel(r'SHAP_for_ETR_metallic_mean.xlsx', index_col=0)

# .copy() gives an independent frame, so adding the label column below does not
# write into a column-subset of `data` (which triggers SettingWithCopyWarning).
train_data = data[SHAP.index[:N_TOP_FEATURES]].copy()
train_data['class'] = data['class']

# NOTE: this rebinds the notebook-level train_df / test_df used by other cells.
train_df, test_df = train_test_split(train_data, test_size=0.15, random_state=0)

etc = ExtraTreesClassifier(random_state=0)

pipe = imbpipeline([
    ('smote', SMOTE(random_state=0)),
    ('etc', etc),
], verbose=1)

param_grid = {
    # Number of trees in the ensemble.
    "etc__n_estimators": [100, 250, 500, 750, 1000],
    # Fraction of features considered at each split. The first entry must be
    # the float 1.0 (all features): an int 1 would mean "exactly one feature".
    "etc__max_features": [1.0, 0.9, 0.8, 0.7, 0.6, 0.5],
}

stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
search = GridSearchCV(pipe, param_grid, n_jobs=-1, verbose=1, cv=stratified_kfold)
# Features are every column but the last; the last column is the class label.
search.fit(train_df.iloc[:, :-1], train_df.iloc[:, -1])

In [None]:
# Pipeline for any ETC 2D/3D: ExtraTrees alone, tuning only the number of trees.
#
# Feature sets for the individual models:
#   ETC 2D high:   ['MagpieData range MeltingT', '0-norm']
#   ETC 3D high:   ['MagpieData range MeltingT', '0-norm', 'MagpieData mode NdUnfilled']
#   ETC 2D middle: ['MagpieData mode SpaceGroupNumber', 'MagpieData mode NpValence']
#   ETC 3D middle: ['MagpieData mode SpaceGroupNumber', 'MagpieData mode NpValence', 'MagpieData mean Column']
#   ETC 2D low:    ['MagpieData minimum NfValence', 'MagpieData mode GSmagmom']
#   ETC 3D low:    ['MagpieData minimum NfValence', 'MagpieData mode GSmagmom', 'MagpieData mode NfUnfilled']
#
# NOTE(review): this cell fits on whatever `train_df` currently holds. Presumably
# `train_df` is meant to be rebuilt from one of the feature sets above (plus the
# 'class' column, last) before running -- confirm and make that step explicit.

etc = ExtraTreesClassifier(random_state=0)
pipe = Pipeline([
    ('etc', etc),
], verbose=1)

param_grid = {
    # Number of trees in the ensemble.
    "etc__n_estimators": [100, 250, 500, 750, 1000],
}

stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
search = GridSearchCV(pipe, param_grid, n_jobs=3, verbose=1, cv=stratified_kfold)
# Features are every column but the last; the last column is the class label.
search.fit(train_df.iloc[:, :-1], train_df.iloc[:, -1])