# 2. Predicting adsorption for each adsorption data point using Vt and BET only focusing on the impact of different training sample size

In [1]:
import pandas as pd
import numpy as np

def pred_dataset(file_names, feature_set ):
    source_path = 'C:/Kai_Zhang/MachineLearning/Unified gas Adsorption/CO2_adsorption/new_data'
    train_df = pd.DataFrame()
    test_df = pd.DataFrame()
    for file_name in file_names:
        temp_data = pd.read_excel(os.path.join(source_path,file_name+'-02-02-2022.xlsx'),skiprows= 1 )
        temp_data = temp_data.dropna(axis=0,how = 'any',subset = feature_set)
        temp_data = temp_data[temp_data['Pressure']>0.01]
        index = list(set(temp_data['Index'].values))
        print(len(index))
        #test_index= np.random.choice(index,int(0.1*len(index)),replace=False)
        test_index= np.random.choice(index,25,replace=False)
        train_x = temp_data.loc[~temp_data['Index'].isin( test_index)]
        test_x = temp_data.loc[temp_data['Index'].isin(test_index)]
        
        train_df = pd.concat([train_df,train_x],axis=0)
        test_df = pd.concat([test_df,test_x],axis =0)
    return train_df,test_df

In [2]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor,GradientBoostingRegressor,\
    BaggingRegressor,ExtraTreesRegressor,RandomForestRegressor
from lightgbm import LGBMRegressor  
from sklearn.svm import SVR
#from xgboost import XGBRegressor

n_estimators = [50,100,120,150,180,200]

# define different models#,
models = [
    #('SVR',SVR(max_iter=100000)),
    #('DT',DecisionTreeRegressor(random_state=42)),\
    # ('ADBR',AdaBoostRegressor(random_state=42)), 
    #("GBR",GradientBoostingRegressor(random_state=42)),\
    #('BG',BaggingRegressor(random_state=42,n_jobs=-1)),
    ('ETR',ExtraTreesRegressor(random_state=42,n_jobs=-1)),\
    #('RF',RandomForestRegressor(n_jobs=-1,random_state=42)),
    ('LGBM',LGBMRegressor(n_jobs = -1,random_state = 42)),\
    ('BGLGBM',BaggingRegressor(LGBMRegressor(n_estimators = 200, n_jobs = -1,random_state = 42), random_state=42,n_jobs=-1)),\
    ('XGBR',XGBRegressor(eta=0.1, subsample=0.7, colsample_bytree=0.8,random_state =42))
    #('BGETR',BaggingRegressor(ExtraTreesRegressor(n_estimators = 180,random_state=42,n_jobs=6),random_state=42,n_jobs=-1))
    ]

# set search parameters grid for different models
para_grids = { 
    'SVR':{'kernel':['linear','poly','rbf','sigmoid','precomputed']},\
    'DT':{'criterion':['squared_error', 'friedman_mse', 'absolute_error', 'poisson']},\
    'ADBR':{'n_estimators':n_estimators,'learning_rate':[0.1,0.5,1,2],'loss':['linear','square','exponential']},\
    'GBR':{'n_estimators':n_estimators,'learning_rate':[0.1,0.5,1,2]},\
    'BG':{'n_estimators':[10,50,100]},\
    'ETR':{'n_estimators':n_estimators},\
    'RF':{'n_estimators':n_estimators},\
    'LGBM':{'num_leaves':[10,20,30,50],'learning_rate': [0.05,0.1,0.5,1],'n_estimators':n_estimators},\
    'BGLGBM':{'n_estimators':[10,30,50]},\
    'BGETR':{'n_estimators':[10]},\
    'XGBR':{'n_estimators':n_estimators, 'max_depth':[2,4,6,8,10],}
      
    }

In [3]:
from sklearn.model_selection import GridSearchCV,cross_validate,GroupKFold
from sklearn.ensemble import ExtraTreesRegressor
from  sklearn.metrics import mean_squared_error,r2_score
from sklearn.utils import shuffle

def model_CV(train_x,train_y,groups,model,para_grid):

    out_cv = GroupKFold(n_splits = 5)
    result = GridSearchCV(model,para_grid,cv= out_cv.get_n_splits(groups =groups),
    scoring='neg_mean_squared_error', return_train_score=True,n_jobs=-1)
    result.fit(train_x,train_y)
    
    model_refit =model.set_params(**result.best_params_)
    train_cv = cross_validate(model_refit,train_x,train_y,groups = groups,cv =out_cv,scoring = ('r2', 'neg_mean_squared_error'))
    train_mse_cv = -train_cv['test_neg_mean_squared_error'].mean()
    train_r2_cv = train_cv['test_r2'].mean()
    
    return [train_r2_cv,train_mse_cv],result.best_params_

# model evaluation
def model_eval(model,test_x,test_y):
      
    test_pre = model.predict(test_x)
    test_r2 = r2_score(test_pre,test_y)
    test_mse = mean_squared_error(test_y,test_pre)
    return test_r2,test_mse

# comparing different models
def model_comparison(model_list,para_grids,feature_list,gas_list):
    gas_list = gas_list 
    input_feature = feature_list
    output = ['Adsorp(mmol/g)']
    result_total = []

    for gas in gas_list:
        
            train_df_com = train_df[train_df['Label']==gas]
            test_df_com = test_df[test_df['Label']==gas]
            train_x = train_df_com[input_feature]
            test_x = test_df_com[input_feature]
            train_y = train_df_com[output].values
            test_y = test_df_com[output].values
            groups = train_df_com['Index']
            train_x, train_y, groups = shuffle(train_x, train_y, groups, random_state=42)
           
            for model_name, model in model_list:

                result, best_param = model_CV(train_x,train_y.squeeze(),groups,model,para_grids[model_name])
                model_refit = model.set_params(**best_param)
                model_refit.fit(train_x,train_y.squeeze())
                test_r2,test_mse = model_eval(model_refit,test_x,test_y.squeeze()) 
                result_total.append([gas,model_name+'_separate',result[0],result[1],-1,-1, test_r2,test_mse,best_param])
                
                print('Dataset {}, Algorithm {}, Test_r2 {}, Test_error {}'.format(gas,model_name+'_separate',test_r2,test_mse))

                
    return result_total

In [4]:
import os
base_feature = ['V','S','L','BET',]
condition_feature = ['Temp(K)','Pressure']
combin_1 = ['Vt']
combin_2 = ["Vmeso"]
combin_3 = ['Vmic']
combin_4 = ['Vt',"Vmeso",]
combin_3 = ['Vt',"Vmic",]
combin_5 = ['Vt',"Vmic",'Vmeso',]
combin_6 = ["Vmic",'Vmeso',]
feature_list = [base_feature+combin_1+condition_feature]
columns = ['Gas','Model_name','CV_r2','CV_mse','test_r2_total_model','test_mse_by_total_model','test_r2_separa_model','test_mse_separa_model','best_param']
#file_name = ['Total',"Meso","Micro",'All','Vmic_meso']
file_name = ['BET_plut_Vt']
feature_set = ["Vt",]
gas_list = ['Methane']
fraction = [1,2,3,4,5,6,7,8,9]
for i in range(10):
    train_dfs,test_df = pred_dataset(gas_list,feature_set)#
    fraction = range(25,len(list(set(train_dfs["Index"].values))),25)
    for k in fraction:
        print(k)
        temp_df = train_dfs
        nums_test = len(list(set(test_df["Index"].values)))
        index = list(set(temp_df['Index'].values))
        #print(len(index))
        train_index= np.random.choice(index,k,replace=False)
        train_df = temp_df.loc[temp_df['Index'].isin(train_index)]
        
        results = model_comparison(models,para_grids, feature_list[0],gas_list)
        files_name = 'The_impact_of_different_training_sample_size_of_'+str(k)+"_"+file_name[0]+'_result_'+str(i)+'.csv'
        pd.DataFrame(results,columns = columns).to_csv(os.path.join('./3_The_impact_of_different_training_sample_size',files_name))  

820
25
Dataset Methane, Algorithm ETR_separate, Test_r2 0.6710063485262785, Test_error 2.296857462190462
Dataset Methane, Algorithm LGBM_separate, Test_r2 0.821623505766514, Test_error 1.5991057213204904
Dataset Methane, Algorithm BGLGBM_separate, Test_r2 0.8237523297309143, Test_error 1.5685836243671645
Dataset Methane, Algorithm XGBR_separate, Test_r2 0.6648345749988451, Test_error 2.240071270917598
50
Dataset Methane, Algorithm ETR_separate, Test_r2 0.7558400900900114, Test_error 2.0762371562118633
Dataset Methane, Algorithm LGBM_separate, Test_r2 0.8671490937468458, Test_error 1.4792001873222735
Dataset Methane, Algorithm BGLGBM_separate, Test_r2 0.8288537652993834, Test_error 1.6531739869135813
Dataset Methane, Algorithm XGBR_separate, Test_r2 0.6758991463321662, Test_error 2.5261031184128915
75
Dataset Methane, Algorithm ETR_separate, Test_r2 0.7341203668657469, Test_error 2.165920833583289
Dataset Methane, Algorithm LGBM_separate, Test_r2 0.7351272471012846, Test_error 3.0994449

# Post result treatments
 

In [132]:
import pandas as pd
file_name = ['BET_plut_Vt','BET_only',]

cal_columns= ["CV_r2","CV_mse","test_r2_separa_model","test_mse_separa_model"]
for j in  range(25,len(list(set(train_dfs["Index"].values))),25):
    df_list = []
    for i in range(10):
    
        files_name = 'The_impact_of_different_training_sample_size_of_'+str(j)+"_"+file_name[0]+'_result_'+str(i)+'.csv'
        df_list.append(pd.read_csv(os.path.join('./3_The_impact_of_different_training_sample_size',files_name))[cal_columns] )
    pd.concat(df_list).groupby(level=0).mean().to_csv(os.path.join('./3_The_impact_of_different_training_sample_size','Training_size_of_'+str(j)+'_mean.csv'))
    pd.concat(df_list).groupby(level=0).std().to_csv(os.path.join('./3_The_impact_of_different_training_sample_size','Training_size_of_'+str(j)+'std.csv'))


In [135]:
total = pd.DataFrame()
for j in  range(25,len(list(set(train_dfs["Index"].values))),25):
    
    files_name = 'Training_size_of_'+str(j)+'std.csv'
    temp = pd.read_csv(os.path.join('./3_The_impact_of_different_training_sample_size',files_name))
    total = pd.concat([total,temp],axis =0)
total.to_csv(os.path.join('./3_The_impact_of_different_training_sample_size','Total'+'_std.csv'))
    

# 4 Using dataset (CO2, Methane, and E&E) with more datapoints to improve the prediction of dataset with less data points (CFCs)

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def pred_dataset(file_names, feature_set):
    source_path = 'C:/Kai_Zhang/MachineLearning/Unified gas Adsorption/CO2_adsorption/new_data'
    data_df = pd.DataFrame()
    
    for file_name in file_names:
        temp_data = pd.read_excel(os.path.join(source_path,file_name+'-02-02-2022.xlsx'),skiprows= 1 )
        temp_data = temp_data.dropna(axis=0,how = 'any',subset = feature_set)
        #train_x,test_x = train_test_split(temp_data,test_size = 0.2)
        data_df = pd.concat([data_df,temp_data],axis=0)
        #test_df = pd.concat([test_df,test_x],axis =0)
    return data_df

In [7]:
data = pred_dataset(['Ethane&Ethylene'],['BET','Vt'])
len(set(data["Index"].values))

388

In [8]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor,GradientBoostingRegressor,\
    BaggingRegressor,ExtraTreesRegressor,RandomForestRegressor
from lightgbm import LGBMRegressor  
from sklearn.svm import SVR
from xgboost import XGBRegressor
  
n_estimators = [50,100,120,150,180,200]

# define different models#,
models = [
    #('SVR',SVR(max_iter=100000)),
    #('DT',DecisionTreeRegressor(random_state=42)),\
    # ('ADBR',AdaBoostRegressor(random_state=42)), 
    #("GBR",GradientBoostingRegressor(random_state=42)),\
    #('BG',BaggingRegressor(random_state=42,n_jobs=-1)),
    ('ETR',ExtraTreesRegressor(random_state=42,n_jobs=-1)),\
    #('RF',RandomForestRegressor(n_jobs=-1,random_state=42)),
    ('LGBM',LGBMRegressor(n_jobs = -1,random_state = 42)),\
    ('BGLGBM',BaggingRegressor(LGBMRegressor(n_estimators = 200, n_jobs = -1,random_state = 42), random_state=42,n_jobs=-1)),\
    ('XGBR',XGBRegressor(eta=0.1, subsample=0.7, colsample_bytree=0.8,random_state =42))
    #('BGETR',BaggingRegressor(ExtraTreesRegressor(n_estimators = 180,random_state=42,n_jobs=6),random_state=42,n_jobs=-1))
    ]

# set search parameters grid for different models
para_grids = { 
    'SVR':{'kernel':['linear','poly','rbf','sigmoid','precomputed']},\
    'DT':{'criterion':['squared_error', 'friedman_mse', 'absolute_error', 'poisson']},\
    'ADBR':{'n_estimators':n_estimators,'learning_rate':[0.1,0.5,1,2],'loss':['linear','square','exponential']},\
    'GBR':{'n_estimators':n_estimators,'learning_rate':[0.1,0.5,1,2]},\
    'BG':{'n_estimators':[10,50,100]},\
    'ETR':{'n_estimators':n_estimators},\
    'RF':{'n_estimators':n_estimators},\
    'LGBM':{'num_leaves':[10,20,30,50],'learning_rate': [0.05,0.1,0.5,1],'n_estimators':n_estimators},\
    'BGLGBM':{'n_estimators':[10,30,50]},\
    'BGETR':{'n_estimators':[10]},\
    'XGBR':{'n_estimators':n_estimators, 'max_depth':[2,4,6,8,10],}
      
    }

In [9]:
from sklearn.model_selection import GridSearchCV,cross_validate,GroupKFold
from sklearn.ensemble import ExtraTreesRegressor
from  sklearn.metrics import mean_squared_error,r2_score
from sklearn.utils import shuffle


def model_CV(train_x,train_y,groups,model,para_grid):

    out_cv = GroupKFold(n_splits = 5)
    result = GridSearchCV(model,para_grid,cv= out_cv.get_n_splits(groups =groups),
    scoring='neg_mean_squared_error', return_train_score=True,n_jobs=-1)
    result.fit(train_x,train_y)
    
    model_refit =model.set_params(**result.best_params_)
    train_cv = cross_validate(model_refit,train_x,train_y,groups = groups,cv =out_cv,scoring = ('r2', 'neg_mean_squared_error'))
    train_mse_cv = -train_cv['test_neg_mean_squared_error'].mean()
    train_r2_cv = train_cv['test_r2'].mean()
    
    return [train_r2_cv,train_mse_cv],result.best_params_

# model evaluation
def model_eval(model,test_x,test_y):
      
    test_pre = model.predict(test_x)
    test_r2 = r2_score(test_pre,test_y)
    test_mse = mean_squared_error(test_y,test_pre)
    return test_r2,test_mse

# comparing different models
def model_comparison(model_list,para_grids,feature_list,gas_list):
    gas_list = gas_list 
    input_feature = feature_list
    output = ['Adsorp(mmol/g)']
    result_total = []

    for gas in gas_list:
        
            #train_df_com = train_df[train_df['Label']==gas]
            train_df_com = train_df
            test_df_com = test_df[test_df['Label']==gas]
            train_x = train_df_com[input_feature]
            test_x = test_df_com[input_feature]
            train_y = train_df_com[output].values
            test_y = test_df_com[output].values
            groups = train_df_com['Index']
            train_x, train_y, groups = shuffle(train_x, train_y, groups, random_state=42)
            print(f'With big set and the total training records is {len(train_df_com)}')
            for model_name, model in model_list:

                result, best_param = model_CV(train_x,train_y.squeeze(),groups,model,para_grids[model_name])
                model_refit = model.set_params(**best_param)
                model_refit.fit(train_x,train_y.squeeze())
                test_r2,test_mse = model_eval(model_refit,test_x,test_y.squeeze()) 
                result_total.append([gas,model_name+'_separate',result[0],result[1],-1,-1, test_r2,test_mse,best_param])
                
                print('Dataset {}, Algorithm {}, Test_r2 {}, Test_error {}'.format(gas,model_name+'_with_big_set',test_r2,test_mse))

            train_df_com = train_df[train_df['Label']==gas]
            test_df_com = test_df[test_df['Label']==gas]
            train_x = train_df_com[input_feature]
            test_x = test_df_com[input_feature]
            train_y = train_df_com[output].values
            test_y = test_df_com[output].values
            groups = train_df_com['Index']
            train_x, train_y, groups = shuffle(train_x, train_y, groups, random_state=42)
            print(f'With no big set and the total training records is {len(train_df_com)}')
            for model_name, model in model_list:

                result, best_param = model_CV(train_x,train_y.squeeze(),groups,model,para_grids[model_name])
                model_refit = model.set_params(**best_param)
                model_refit.fit(train_x,train_y.squeeze())
                test_r2,test_mse = model_eval(model_refit,test_x,test_y.squeeze()) 
                result_total.append([gas,model_name+'_separate',result[0],result[1],-1,-1, test_r2,test_mse,best_param])
                print('Dataset {}, Algorithm {}, Test_r2 {}, Test_error {}'.format(gas,model_name+'_no_big_set',test_r2,test_mse))

                
    return result_total

In [12]:
import random
def group_split(data_df,target,additional):
    #np.random.seed(42)
    temp_data = data_df[data_df["Label"]==target]
    index = list(set(temp_data['Index'].values))
    temp_data,_ = shuffle(temp_data, temp_data['Index'].values, random_state=42)
    #print(len(index))
    test_index= np.random.choice(index,int(0.2*len(index)),replace=False)
    train_df = temp_data.loc[~temp_data['Index'].isin( test_index)]
    test_df = temp_data.loc[temp_data['Index'].isin(test_index)]
    index_len = len(list(set(train_df['Index'].values)))


    for gas in additional:
        temp_data = data_df[data_df["Label"]==gas]
        temp_index = list(set(temp_data['Index'].values))
        if len(temp_index)<index_len*6:
            train_df = pd.concat([train_df,temp_data])
        else:
            #selected_index =  np.random.choice(temp_index,index_len*3,replace=False)
            selected_index =  np.random.choice(temp_index,388,replace=False) # for CO2 and Methane only
            temp_train = temp_data.loc[temp_data['Index'].isin(selected_index)]
            train_df = pd.concat([train_df,temp_train])
    groups = train_df['Index'].values
    train_df, groups = shuffle(train_df, groups, random_state=42)
    
    return train_df,test_df

In [13]:
import os
base_feature = ['V','S','L','BET',]
condition_feature = ['Temp(K)','Pressure']
combin_1 = ['Vt']
combin_2 = ["Vmeso"]
combin_3 = ['Vmic']
combin_4 = ['Vt',"Vmeso",]
combin_5 = ['Vt',"Vmic",]
combin_6 = ['Vt',"Vmic",'Vmeso',]
combin_7 = ["Vmic",'Vmeso',]

feature_list = [base_feature+combin_1+condition_feature]

columns = ['Gas','Model_name','CV_r2','CV_mse','test_r2_total_model','test_mse_by_total_model','test_r2_separa_model','test_mse_separa_model','best_param']
feature_set = ["BET","Vt"]
gas_list = ['CFCs']
file_name = ["BET_Vt_Vmic_meso"]

for i in range(15):
    data_df = pred_dataset(['Methane','CFCs'],feature_set= feature_set)
    for gas in gas_list:
        train_df,test_df = group_split(data_df,gas,['Methane'])
        print(len(test_df))
        for j in range(len(feature_list)):
            results = model_comparison(models,para_grids, feature_list[j],[gas])
            files_name = 'Improving_small_with_big_set'+gas+file_name[j]+'_result_'+str(i)+'.csv'
            pd.DataFrame(results,columns = columns).to_csv(os.path.join('./4_Improving_small_set_with_big_set',files_name))  
            #pd.DataFrame(results,columns = ['Gas','Algo','Train_erro','Test_error']).to_csv(os.path.join('./',files_name))   

272
With big set and the total training records is 5230
Dataset CFCs, Algorithm ETR_with_big_set, Test_r2 0.9575793692807911, Test_error 0.7395101865428176
Dataset CFCs, Algorithm LGBM_with_big_set, Test_r2 0.9396782196984502, Test_error 1.0759032718127106
Dataset CFCs, Algorithm BGLGBM_with_big_set, Test_r2 0.9406840633655214, Test_error 0.9009091139034522
Dataset CFCs, Algorithm XGBR_with_big_set, Test_r2 0.893957020406485, Test_error 1.425373432705784
With no big set and the total training records is 1295
Dataset CFCs, Algorithm ETR_no_big_set, Test_r2 0.9545732602184712, Test_error 0.7630520088115674
Dataset CFCs, Algorithm LGBM_no_big_set, Test_r2 0.9463394003250634, Test_error 0.9105006160081432
Dataset CFCs, Algorithm BGLGBM_no_big_set, Test_r2 0.9404565532997785, Test_error 0.9332125440186539
Dataset CFCs, Algorithm XGBR_no_big_set, Test_r2 0.9384202798311716, Test_error 0.9410676209540155
316
With big set and the total training records is 5130
Dataset CFCs, Algorithm ETR_with_

# Post result treatment

In [14]:
import pandas as pd
#file_name = ['BET_only','BET_plut_Vt',"BET_Vmic","BET_Vmeso",'BET_Vt_Vmeso','BET_Vt_Vmic',"BET_Vt_Vmic_meso","BET_Vmic_meso"]
file_name = ["BET_Vt_Vmic_meso"]
cal_columns= ["CV_r2","CV_mse","test_r2_separa_model","test_mse_separa_model"]
for gas in gas_list:
    df_list = [] 
    for i in range(15):
        for j in range(len(feature_list)):
            files_name  = 'Improving_small_with_big_set'+gas+file_name[j]+'_result_'+str(i)+'.csv'
            df_list.append(pd.read_csv(os.path.join('./4_Improving_small_set_with_big_set',files_name))[cal_columns] )
    pd.concat(df_list).groupby(level=0).mean().to_csv(os.path.join('./4_Improving_small_set_with_big_set',   gas+'-mean.csv'))
    pd.concat(df_list).groupby(level=0).std().to_csv(os.path.join('./4_Improving_small_set_with_big_set',gas+'-std.csv'))


# Predicting the fitted parameters of adsorption isotherms

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def pred_dataset(file_names, feature_set = feature_set):
    source_path = 'C:/Kai_Zhang/MachineLearning/Unified gas Adsorption/CO2_adsorption/new_data'
    train_df = pd.DataFrame()
    test_df = pd.DataFrame()
    for file_name in file_names:
        temp_data = pd.read_excel(os.path.join(source_path,file_name+'-fitting-02-01-2022.xlsx'),skiprows= 1 )
        temp_data = temp_data.dropna(axis=0,how = 'any',subset = feature_set)
        train_x,test_x = train_test_split(temp,test_size = 0.2)
        train_df = pd.concat([train_df,train_x],axis=0)
        test_df = pd.concat([test_df,test_x],axis =0)
    return train_df,test_df

In [None]:
from sklearn.model_selection import GridSearchCV,cross_validate,GroupKFold
from sklearn.ensemble import ExtraTreesRegressor
from  sklearn.metrics import mean_squared_error,r2_score
from sklearn.utils import shuffle

def model_CV(train_x,train_y,model,para_grid):

    
    result = GridSearchCV(model,para_grid,cv= 5,
    scoring='neg_mean_squared_error', return_train_score=True,n_jobs=-1)
    result.fit(train_x,train_y)
    
    model_refit =model.set_params(**result.best_params_)
    train_cv = cross_validate(model_refit,train_x,train_y,cv =5,scoring = ('r2', 'neg_mean_squared_error'))
    train_mse_cv = -train_cv['test_neg_mean_squared_error'].mean()
    train_r2_cv = train_cv['test_r2'].mean()
    
    return [train_r2_cv,train_mse_cv],result.best_params_

# model evaluation
def model_eval(model,test_x,test_y):
      
    test_pre = model.predict(test_x)
    test_r2 = r2_score(test_pre,test_y)
    test_mse = mean_squared_error(test_y,test_pre)
    return test_r2,test_mse

# comparing different models
def model_comparison(model_list,para_grids,feature_list,gas_list):
    gas_list = gas_list 
    input_feature = feature_list
    output = ['Adsorp(mmol/g)']
    result_total = []

    for gas in gas_list:
        
        if gas =='total':

            train_df_com = train_df
            test_df_com = test_df
            train_x = train_df_com[input_feature]
            test_x = test_df_com[input_feature]
            train_y = train_df_com[output].values
            test_y = test_df_com[output].values
            
            train_x, train_y = shuffle(train_x, train_y,random_state=42)
            
            for model_name, model in model_list:

                
                result, best_param = model_CV(train_x,train_y.squeeze(),model,para_grids[model_name])
                model_refit = model.set_params(**best_param)
                model_refit.fit(train_x,train_y.squeeze())
                test_r2_total,test_mse_total = model_eval(model_refit,test_x,test_y.squeeze()) 
                for gases in gas_list[1:]:
                    test_df_com = test_df[test_df['Label']==gases]
                    test_xs = test_df_com[input_feature]
                    test_ys = test_df_com[output].values
                    test_r2,test_mse = model_eval(model_refit,test_xs,test_ys.squeeze()) 
                    result_total.append([gases,model_name+'_total',result[0],result[1],test_r2_total,test_mse_total,test_r2,test_mse,best_param])

                    print('Dataset {}, Algorithm {}, Test_r2 {}, Test_error {}'.format(gas,model_name+'_total',test_r2,test_mse))

            
        else:
            
            train_df_com = train_df[train_df['Label']==gas]
            test_df_com = test_df[test_df['Label']==gas]
            train_x = train_df_com[input_feature]
            test_x = test_df_com[input_feature]
            train_y = train_df_com[output].values
            test_y = test_df_com[output].values
            
            train_x, train_y = shuffle(train_x, train_y, random_state=42)
           
            for model_name, model in model_list:

                result, best_param = model_CV(train_x,train_y.squeeze(),model,para_grids[model_name])
                model_refit = model.set_params(**best_param)
                model_refit.fit(train_x,train_y.squeeze())
                test_r2,test_mse = model_eval(model_refit,test_x,test_y.squeeze()) 
                result_total.append([gas,model_name+'_separate',result[0],result[1],-1,-1, test_r2,test_mse,best_param])
                
                print(print('Dataset {}, Algorithm {}, Test_r2 {}, Test_error {}'.format(gas,model_name+'_total',test_r2,test_mse))
)
                
    return result_total