# 1. predicting adsorption for each adsorption data point using Vt&BET only

In [1]:
import pandas as pd
import numpy as np

def pred_dataset(file_names, feature_set ):
    source_path = 'C:/Kai_Zhang/MachineLearning/Unified gas Adsorption/CO2_adsorption/new_data'
    train_df = pd.DataFrame()
    test_df = pd.DataFrame()
    for file_name in file_names:
        temp_data = pd.read_excel(os.path.join(source_path,file_name+'-02-02-2022.xlsx'),skiprows= 1 )
        temp_data = temp_data.dropna(axis=0,how = 'any',subset = feature_set)
        #temp_data = temp_data[temp_data['Pressure']>0.01]
        index = list(set(temp_data['Index'].values))
        print(len(index))
        test_index= np.random.choice(index,int(0.2*len(index)),replace=False)
        train_x = temp_data.loc[~temp_data['Index'].isin( test_index)]
        test_x = temp_data.loc[temp_data['Index'].isin(test_index)]
        
        train_df = pd.concat([train_df,train_x],axis=0)
        test_df = pd.concat([test_df,test_x],axis =0)
    return train_df,test_df

In [2]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor,GradientBoostingRegressor,\
    BaggingRegressor,ExtraTreesRegressor,RandomForestRegressor
from lightgbm import LGBMRegressor  
from sklearn.svm import SVR
  
n_estimators = [50,100,120,150,180,200]

# define different models#,
models = [
    #('SVR',SVR(max_iter=100000)),
    ('DT',DecisionTreeRegressor(random_state=42)),\
     ('ADBR',AdaBoostRegressor(random_state=42)), 
    ("GBR",GradientBoostingRegressor(random_state=42)),\
    ('BG',BaggingRegressor(random_state=42,n_jobs=-1)),
    ('ETR',ExtraTreesRegressor(random_state=42,n_jobs=-1)),\
    ('RF',RandomForestRegressor(n_jobs=-1,random_state=42)),
    ('LGBM',LGBMRegressor(n_jobs = -1,random_state = 42)),\
    ('BGLGBM',BaggingRegressor(LGBMRegressor(n_estimators = 200, n_jobs = -1,random_state = 42), random_state=42,n_jobs=-1)),\
    ('BGETR',BaggingRegressor(ExtraTreesRegressor(n_estimators = 180,random_state=42,n_jobs=6),random_state=42,n_jobs=-1))
    ]

# set search parameters grid for different models
para_grids = { 
    'SVR':{'kernel':['linear','poly','rbf','sigmoid','precomputed']},\
    'DT':{'criterion':['squared_error', 'friedman_mse', 'absolute_error', 'poisson']},\
    'ADBR':{'n_estimators':n_estimators,'learning_rate':[0.1,0.5,1,2],'loss':['linear','square','exponential']},\
    'GBR':{'n_estimators':n_estimators,'learning_rate':[0.1,0.5,1,2]},\
    'BG':{'n_estimators':[10,50,100]},\
    'ETR':{'n_estimators':n_estimators},\
    'RF':{'n_estimators':n_estimators},\
    'LGBM':{'num_leaves':[10,20,30,50],'learning_rate': [0.05,0.1,0.5,1],'n_estimators':n_estimators},\
    'BGLGBM':{'n_estimators':[10,30,50]},\
    'BGETR':{'n_estimators':[10]}
    }

In [3]:
from sklearn.model_selection import GridSearchCV,cross_validate,GroupKFold
from sklearn.ensemble import ExtraTreesRegressor
from  sklearn.metrics import mean_squared_error,r2_score
from sklearn.utils import shuffle

def model_CV(train_x,train_y,groups,model,para_grid):

    out_cv = GroupKFold(n_splits = 5)
    result = GridSearchCV(model,para_grid,cv= out_cv.get_n_splits(groups =groups),
    scoring='neg_mean_squared_error', return_train_score=True,n_jobs=-1)
    result.fit(train_x,train_y)
    
    model_refit =model.set_params(**result.best_params_)
    train_cv = cross_validate(model_refit,train_x,train_y,groups = groups,cv =out_cv,scoring = ('r2', 'neg_mean_squared_error'))
    train_mse_cv = -train_cv['test_neg_mean_squared_error'].mean()
    train_r2_cv = train_cv['test_r2'].mean()
    
    return [train_r2_cv,train_mse_cv],result.best_params_

# model evaluation
def model_eval(model,test_x,test_y):
      
    test_pre = model.predict(test_x)
    test_r2 = r2_score(test_pre,test_y)
    test_mse = mean_squared_error(test_y,test_pre)
    return test_r2,test_mse

# comparing different models
def model_comparison(model_list,para_grids,feature_list,gas_list):
    gas_list = gas_list 
    input_feature = feature_list
    output = ['Adsorp(mmol/g)']
    result_total = []
    for gas in gas_list:
        if gas =='total':
            train_df_com = train_df
            test_df_com = test_df
            train_x = train_df_com[input_feature]
            test_x = test_df_com[input_feature]
            train_y = train_df_com[output].values
            test_y = test_df_com[output].values
            groups = train_df_com['Index'].values
            train_x, train_y, groups = shuffle(train_x, train_y, groups, random_state=42)
            
            for model_name, model in model_list:
                result, best_param = model_CV(train_x,train_y.squeeze(),groups,model,para_grids[model_name])
                model_refit = model.set_params(**best_param)
                model_refit.fit(train_x,train_y.squeeze())
                test_r2_total,test_mse_total = model_eval(model_refit,test_x,test_y.squeeze()) 
                for gases in gas_list[1:]:
                    test_df_com = test_df[test_df['Label']==gases]
                    test_xs = test_df_com[input_feature]
                    test_ys = test_df_com[output].values
                    test_r2,test_mse = model_eval(model_refit,test_xs,test_ys.squeeze()) 
                    result_total.append([gases,model_name+'_total',result[0],result[1],test_r2_total,test_mse_total,test_r2,test_mse,best_param])
                    print('Dataset {}, Algorithm {}, Test_r2 {}, Test_error {}'.format(gas,model_name+'_total',test_r2,test_mse))    
        else:
            train_df_com = train_df[train_df['Label']==gas]
            test_df_com = test_df[test_df['Label']==gas]
            train_x = train_df_com[input_feature]
            test_x = test_df_com[input_feature]
            train_y = train_df_com[output].values
            test_y = test_df_com[output].values
            groups = train_df_com['Index']
            train_x, train_y, groups = shuffle(train_x, train_y, groups, random_state=42)
            for model_name, model in model_list:
                result, best_param = model_CV(train_x,train_y.squeeze(),groups,model,para_grids[model_name])
                model_refit = model.set_params(**best_param)
                model_refit.fit(train_x,train_y.squeeze())
                test_r2,test_mse = model_eval(model_refit,test_x,test_y.squeeze()) 
                result_total.append([gas,model_name+'_separate',result[0],result[1],-1,-1, test_r2,test_mse,best_param])
                
                print('Dataset {}, Algorithm {}, Test_r2 {}, Test_error {}'.format(gas,model_name+'_total',test_r2,test_mse))     
    return result_total

In [5]:
import os
base_feature = ['V','S','L','BET',]
condition_feature = ['Temp(K)','Pressure']
combin_1 = ['Vt']
combin_2 = ["Vmeso"]
combin_3 = ['Vmic']
combin_4 = ['Vt',"Vmeso",]
combin_3 = ['Vt',"Vmic",]
combin_5 = ['Vt',"Vmic",'Vmeso',]
combin_6 = ["Vmic",'Vmeso',]
feature_list = [base_feature+combin_1+condition_feature,base_feature+condition_feature]
columns = ['Gas','Model_name','CV_r2','CV_mse','test_r2_total_model','test_mse_by_total_model','test_r2_separa_model','test_mse_separa_model','best_param']
#file_name = ['Total',"Meso","Micro",'All','Vmic_meso']
file_name = ['BET_only','BET_plut_Vt']
feature_set = ["BET","Vt",]
gas_list = ['CO2','CFCs','Methane','E&E']
for i in range(15):
    train_df,test_df = pred_dataset(['CO2','Methane','Ethane&Ethylene','CFCs'],feature_set)
    for j in range(len(feature_list)):
        results = model_comparison(models,para_grids, feature_list[j],gas_list)
        files_name = 'BG_ETR_Full_Four_gases_with_pred_Vmic_'+file_name[j]+'_result_'+str(i)+'.csv'
        pd.DataFrame(results,columns = columns).to_csv(os.path.join('./1_Predicting_separate_gas_by_two approach',files_name))  

2145
820
388
115
Dataset CO2, Algorithm DT_total, Test_r2 0.8689891385472541, Test_error 1.5933201624849187
Dataset CO2, Algorithm ADBR_total, Test_r2 0.7902283443231705, Test_error 1.9774035697876633
Dataset CO2, Algorithm GBR_total, Test_r2 0.9424667094562494, Test_error 0.6574331338980477
Dataset CO2, Algorithm BG_total, Test_r2 0.927602424314107, Test_error 0.8154805234612376
Dataset CO2, Algorithm ETR_total, Test_r2 0.9393439485291014, Test_error 0.6716337394352319
Dataset CO2, Algorithm RF_total, Test_r2 0.92768937381469, Test_error 0.8163186088531545
Dataset CO2, Algorithm LGBM_total, Test_r2 0.9447431898071584, Test_error 0.6632602216964001
Dataset CO2, Algorithm BGLGBM_total, Test_r2 0.9502354510082839, Test_error 0.5646105287338988
Dataset CO2, Algorithm BGETR_total, Test_r2 0.9392987299187565, Test_error 0.6659675256892447
Dataset CFCs, Algorithm DT_total, Test_r2 0.8454879799893359, Test_error 2.938607532759776
Dataset CFCs, Algorithm ADBR_total, Test_r2 0.6260531417441217,



Dataset Methane, Algorithm BGETR_total, Test_r2 0.8823449062474796, Test_error 1.0967403480596023
Dataset E&E, Algorithm DT_total, Test_r2 0.9122927657660408, Test_error 0.8061402070297033
Dataset E&E, Algorithm ADBR_total, Test_r2 0.569514612232338, Test_error 1.9068947044378968
Dataset E&E, Algorithm GBR_total, Test_r2 0.9558199050632283, Test_error 0.3680404284177259
Dataset E&E, Algorithm BG_total, Test_r2 0.9318667990893358, Test_error 0.5597145748640602
Dataset E&E, Algorithm ETR_total, Test_r2 0.9161103825961296, Test_error 0.6514910767936344
Dataset E&E, Algorithm RF_total, Test_r2 0.9312888240814825, Test_error 0.5633273925263912
Dataset E&E, Algorithm LGBM_total, Test_r2 0.9556839463684498, Test_error 0.37295396714746065
Dataset E&E, Algorithm BGLGBM_total, Test_r2 0.9596458294465648, Test_error 0.32706009921417445
Dataset E&E, Algorithm BGETR_total, Test_r2 0.9126897957729093, Test_error 0.6526919729945198
2145
820
388
115
Dataset CO2, Algorithm DT_total, Test_r2 0.897725884



Dataset CFCs, Algorithm BG_total, Test_r2 0.911575603291623, Test_error 0.7815819302112225
Dataset CFCs, Algorithm ETR_total, Test_r2 0.9407211726259066, Test_error 0.46909013845554554
Dataset CFCs, Algorithm RF_total, Test_r2 0.9118564001742204, Test_error 0.7710643128855504
Dataset CFCs, Algorithm LGBM_total, Test_r2 0.9468791978253963, Test_error 0.48481486109236
Dataset CFCs, Algorithm BGLGBM_total, Test_r2 0.9610104095378301, Test_error 0.3496252394627985
Dataset CFCs, Algorithm BGETR_total, Test_r2 0.9369896307655248, Test_error 0.5006357642201917
Dataset Methane, Algorithm DT_total, Test_r2 0.8194625525315411, Test_error 1.62933134661742
Dataset Methane, Algorithm ADBR_total, Test_r2 0.7224029114736414, Test_error 2.273007125955717
Dataset Methane, Algorithm GBR_total, Test_r2 0.9000936400659296, Test_error 0.9181887699257161
Dataset Methane, Algorithm BG_total, Test_r2 0.8893906742546679, Test_error 0.9485563613728148
Dataset Methane, Algorithm ETR_total, Test_r2 0.880373894428



Dataset Methane, Algorithm BGETR_total, Test_r2 0.8610423093528718, Test_error 1.169569658730744
Dataset E&E, Algorithm DT_total, Test_r2 0.8046295173312702, Test_error 1.5419900234752568
Dataset E&E, Algorithm ADBR_total, Test_r2 0.6563881543983903, Test_error 1.706432180231467
Dataset E&E, Algorithm GBR_total, Test_r2 0.9040492849253302, Test_error 0.7861813241154466
Dataset E&E, Algorithm BG_total, Test_r2 0.8646929807781414, Test_error 0.9692231365244434
Dataset E&E, Algorithm ETR_total, Test_r2 0.8701810101701897, Test_error 0.9334876834543755
Dataset E&E, Algorithm RF_total, Test_r2 0.8649053308800914, Test_error 0.9739332609530345
Dataset E&E, Algorithm LGBM_total, Test_r2 0.9195582433474156, Test_error 0.612734521160724
Dataset E&E, Algorithm BGLGBM_total, Test_r2 0.9198051008188349, Test_error 0.5857258693029914
Dataset E&E, Algorithm BGETR_total, Test_r2 0.8744238559077593, Test_error 0.8770519168093573
Dataset CO2, Algorithm DT_total, Test_r2 0.8668144171884463, Test_error 1



Dataset CFCs, Algorithm ETR_total, Test_r2 0.8968578750967922, Test_error 2.3038948074305923
Dataset CFCs, Algorithm RF_total, Test_r2 0.7973074333495994, Test_error 5.455013769682557
Dataset CFCs, Algorithm LGBM_total, Test_r2 0.7687687763698161, Test_error 6.977660036642151
Dataset CFCs, Algorithm BGLGBM_total, Test_r2 0.8168740541255061, Test_error 4.94197727915517
Dataset CFCs, Algorithm BGETR_total, Test_r2 0.891687512289706, Test_error 2.2864321355818906
Dataset Methane, Algorithm DT_total, Test_r2 0.8470706569326758, Test_error 1.4228157209953076
Dataset Methane, Algorithm ADBR_total, Test_r2 0.757962784500144, Test_error 1.8277920195879862
Dataset Methane, Algorithm GBR_total, Test_r2 0.8887153678788068, Test_error 0.9657810077326947
Dataset Methane, Algorithm BG_total, Test_r2 0.905170971367555, Test_error 0.8128875259215806
Dataset Methane, Algorithm ETR_total, Test_r2 0.9055856942480687, Test_error 0.7980313463987279




Dataset Methane, Algorithm RF_total, Test_r2 0.907698556912689, Test_error 0.7915869345218945
Dataset Methane, Algorithm LGBM_total, Test_r2 0.9158657131820768, Test_error 0.7455667106538894
Dataset Methane, Algorithm BGLGBM_total, Test_r2 0.9202815432388736, Test_error 0.6729397320206041
Dataset Methane, Algorithm BGETR_total, Test_r2 0.9014340583737872, Test_error 0.8202438656324277
Dataset E&E, Algorithm DT_total, Test_r2 0.8783036748624385, Test_error 0.9429901018064772
Dataset E&E, Algorithm ADBR_total, Test_r2 0.576878832532824, Test_error 1.6732149242136358
Dataset E&E, Algorithm GBR_total, Test_r2 0.9094058878680755, Test_error 0.7103393657146025
Dataset E&E, Algorithm BG_total, Test_r2 0.914956877564474, Test_error 0.5951631641816725
Dataset E&E, Algorithm ETR_total, Test_r2 0.9092757227779275, Test_error 0.6266775868392446
Dataset E&E, Algorithm RF_total, Test_r2 0.9154557007776151, Test_error 0.5920700541096997
Dataset E&E, Algorithm LGBM_total, Test_r2 0.9401084149999155, T

# Post result treatments
 

In [6]:
import pandas as pd
file_name = ['BET_only','BET_plut_Vt']
df_list = []
cal_columns= ["CV_r2","CV_mse","test_r2_separa_model","test_mse_separa_model"]
for j in range(1):
    for i in range(15):
    
        files_name = 'BG_ETR_Full_Four_gases_with_pred_Vmic_'+file_name[j]+'_result_'+str(i)+'.csv'
        df_list.append(pd.read_csv(os.path.join('./1_Predicting_separate_gas_by_two approach',files_name))[cal_columns] )
        pd.concat(df_list).groupby(level=0).mean().to_csv(os.path.join('./1_Predicting_separate_gas_by_two approach','mean.csv'))
        pd.concat(df_list).groupby(level=0).std().to_csv(os.path.join('./1_Predicting_separate_gas_by_two approach','std.csv'))


# 2_Predicting adsorption for each data point using the combination Vt, BET, Vmeso, and Vmic
the dataset for each separate gas will be smaller than previous one.

In [None]:
import os
base_feature = ['V','S','L','BET',]
condition_feature = ['Temp(K)','Pressure']
combin_1 = ['Vt']
combin_2 = ["Vmeso"]
combin_3 = ['Vmic']
combin_4 = ['Vt',"Vmeso",]
combin_5 = ['Vt',"Vmic",]
combin_6 = ['Vt',"Vmic",'Vmeso',]
combin_7 = ["Vmic",'Vmeso',]

feature_list = [base_feature+condition_feature,base_feature+combin_1+condition_feature, \
    base_feature+combin_3+condition_feature, base_feature+combin_2+condition_feature,\
    base_feature+combin_4+condition_feature, base_feature+combin_5+condition_feature,\
    base_feature+combin_6+condition_feature, base_feature+combin_7+condition_feature, ]

columns = ['Gas','Model_name','CV_r2','CV_mse','test_r2_total_model','test_mse_by_total_model','test_r2_separa_model','test_mse_separa_model','best_param']


feature_set = ["BET","Vt","Vmeso","Vmic"]
gas_list = ['CO2','CFCs','Methane','E&E']
file_name = ['BET_only','BET_plut_Vt',"BET_Vmic","BET_Vmeso",'BET_Vt_Vmeso','BET_Vt_Vmic',"BET_Vt_Vmic_meso","BET_Vmic_meso"]


for i in range(10,15):
    train_df,test_df = pred_dataset(['CO2','Methane','Ethane&Ethylene','CFCs'],feature_set= feature_set)
    for j in range(len(feature_list)):
        results = model_comparison(models,para_grids, feature_list[j],gas_list)
        files_name = 'Four_gases_with_Vmic_'+file_name[j]+'_result_'+str(i)+'.csv'
        pd.DataFrame(results,columns = columns).to_csv(os.path.join('./2_Predicting_separate_gas_BET_Vt_Vmeso_Vmic',files_name))  
        #pd.DataFrame(results,columns = ['Gas','Algo','Train_erro','Test_error']).to_csv(os.path.join('./',files_name))   

# Post result treatment

In [24]:
import pandas as pd
file_name = ['BET_only','BET_plut_Vt',"BET_Vmic","BET_Vmeso",'BET_Vt_Vmeso','BET_Vt_Vmic',"BET_Vt_Vmic_meso","BET_Vmic_meso"]

cal_columns= ["CV_r2","CV_mse","test_r2_separa_model","test_mse_separa_model"]
for j in range(len(file_name)):
    df_list = []
    for i in range(11):
    
        files_name = 'Four_gases_with_Vmic_'+file_name[j]+'_result_'+str(i)+'.csv'
        df_list.append(pd.read_csv(os.path.join('./2_Predicting_separate_gas_BET_Vt_Vmeso_Vmic',files_name))[cal_columns] )
        pd.concat(df_list).groupby(level=0).mean().to_csv(os.path.join('./2_Predicting_separate_gas_BET_Vt_Vmeso_Vmic',file_name[j]+'_mean_new.csv'))
        pd.concat(df_list).groupby(level=0).std().to_csv(os.path.join('./2_Predicting_separate_gas_BET_Vt_Vmeso_Vmic',file_name[j]+'_std_new.csv'))


# Predicting the fitted parameters of adsorption isotherms

# Using only BET and Vt

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def pred_dataset(file_names, feature_set = feature_set):
    source_path = 'C:/Kai_Zhang/MachineLearning/Unified gas Adsorption/CO2_adsorption/new_data'
    train_df = pd.DataFrame()
    test_df = pd.DataFrame()
    for file_name in file_names:
        temp_data = pd.read_excel(os.path.join(source_path,file_name+'-fitting-02-02-2022.xlsx'),skiprows= 1 )
        temp_data = temp_data.dropna(axis=0,how = 'any',subset = feature_set)
        train_x,test_x = train_test_split(temp_data,test_size = 0.2)
        train_df = pd.concat([train_df,train_x],axis=0)
        test_df = pd.concat([test_df,test_x],axis =0)
    return train_df,test_df

In [None]:
from sklearn.model_selection import GridSearchCV,cross_validate,GroupKFold
from sklearn.ensemble import ExtraTreesRegressor
from  sklearn.metrics import mean_squared_error,r2_score
from sklearn.utils import shuffle

def model_CV(train_x,train_y,model,para_grid):

    
    result = GridSearchCV(model,para_grid,cv= 5,
    scoring='neg_mean_squared_error', return_train_score=True,n_jobs=-1)
    result.fit(train_x,train_y)
    
    model_refit =model.set_params(**result.best_params_)
    train_cv = cross_validate(model_refit,train_x,train_y,cv =5,scoring = ('r2', 'neg_mean_squared_error'))
    train_mse_cv = -train_cv['test_neg_mean_squared_error'].mean()
    train_r2_cv = train_cv['test_r2'].mean()
    
    return [train_r2_cv,train_mse_cv],result.best_params_

# model evaluation
def model_eval(model,test_x,test_y):
      
    test_pre = model.predict(test_x)
    test_r2 = r2_score(test_pre,test_y)
    test_mse = mean_squared_error(test_y,test_pre)
    return test_r2,test_mse

# comparing different models
def model_comparison(model_list,para_grids,feature_list,gas_list):
    gas_list = gas_list 
    input_feature = feature_list
    output = ['Adsorp(mmol/g)']
    result_total = []

    for gas in gas_list:
        
        if gas =='total':

            train_df_com = train_df
            test_df_com = test_df
            train_x = train_df_com[input_feature]
            test_x = test_df_com[input_feature]
            train_y = train_df_com[output].values
            test_y = test_df_com[output].values
            
            train_x, train_y = shuffle(train_x, train_y,random_state=42)
            
            for model_name, model in model_list:

                
                result, best_param = model_CV(train_x,train_y.squeeze(),model,para_grids[model_name])
                model_refit = model.set_params(**best_param)
                model_refit.fit(train_x,train_y.squeeze())
                test_r2_total,test_mse_total = model_eval(model_refit,test_x,test_y.squeeze()) 
                for gases in gas_list[1:]:
                    test_df_com = test_df[test_df['Label']==gases]
                    test_xs = test_df_com[input_feature]
                    test_ys = test_df_com[output].values
                    test_r2,test_mse = model_eval(model_refit,test_xs,test_ys.squeeze()) 
                    result_total.append([gases,model_name+'_total',result[0],result[1],test_r2_total,test_mse_total,test_r2,test_mse,best_param])

                    print('Dataset {}, Algorithm {}, Test_r2 {}, Test_error {}'.format(gas,model_name+'_total',test_r2,test_mse))

            
        else:
            
            train_df_com = train_df[train_df['Label']==gas]
            test_df_com = test_df[test_df['Label']==gas]
            train_x = train_df_com[input_feature]
            test_x = test_df_com[input_feature]
            train_y = train_df_com[output].values
            test_y = test_df_com[output].values
            
            train_x, train_y = shuffle(train_x, train_y, random_state=42)
           
            for model_name, model in model_list:

                result, best_param = model_CV(train_x,train_y.squeeze(),model,para_grids[model_name])
                model_refit = model.set_params(**best_param)
                model_refit.fit(train_x,train_y.squeeze())
                test_r2,test_mse = model_eval(model_refit,test_x,test_y.squeeze()) 
                result_total.append([gas,model_name+'_separate',result[0],result[1],-1,-1, test_r2,test_mse,best_param])
                
                print(print('Dataset {}, Algorithm {}, Test_r2 {}, Test_error {}'.format(gas,model_name+'_total',test_r2,test_mse))
)
                
    return result_total