In [1]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle

def pred_dataset(file_names, feature_set,random_state):
    source_path = 'C:/Kai_Zhang/MachineLearning/Unified gas Adsorption/CO2_adsorption/new_data'
    train_df = pd.DataFrame()
    test_df = pd.DataFrame()
    add_df = pd.DataFrame()
    for file_name in file_names:
        temp_data = pd.read_excel(os.path.join(source_path,file_name+'-02-02-2022.xlsx'),skiprows= 1 )
        temp_data = temp_data.dropna(axis=0,how = 'any',subset = feature_set)
        temp_data = temp_data[temp_data['Pressure']>0.01]
        index = list(set(temp_data['Index'].values))
        index = shuffle(index,random_state=random_state)
        print(len(index))
        test_add_index =  index[:int(len(index)*0.3)]
        #test_index= np.random.choice(index,int(0.1*len(index)),replace=False)
        test_index= test_add_index[:len(test_add_index)//2]
        add_index = test_add_index[len(test_add_index)//2:]
        train_x = temp_data.loc[~temp_data['Index'].isin( test_add_index)]

        test_x = temp_data.loc[temp_data['Index'].isin(test_index)]
        add_x = temp_data.loc[temp_data['Index'].isin(add_index)]
        
        train_df = pd.concat([train_df,train_x],axis=0)
        test_df = pd.concat([test_df,test_x],axis =0)
        add_df = pd.concat([add_df,add_x],axis =0)
        print(train_df.shape,test_df.shape,add_df.shape)
    return train_df,test_df,add_df

In [2]:

from sklearn.ensemble import GradientBoostingRegressor,\
    BaggingRegressor,ExtraTreesRegressor
from lightgbm import LGBMRegressor  

n_estimators = [50,100,120,150,180,200]
# define different models#,
models = [
    ("GBR",GradientBoostingRegressor(random_state=42)),\
    ('ETR',ExtraTreesRegressor(random_state=42,n_jobs=-1)),\
    ('LGBM',LGBMRegressor(n_jobs = -1,random_state = 42)),\
    #('BGLGBM',BaggingRegressor(LGBMRegressor(n_estimators = 200, n_jobs = -1,random_state = 42), random_state=42,n_jobs=-1)),\
    ]

# set search parameters grid for different models
para_grids = { 
    'GBR':{'n_estimators':n_estimators,'learning_rate':[0.1,0.5,1,2]},\
    'ETR':{'n_estimators':n_estimators},\
    'LGBM':{'num_leaves':[10,20,30,50],'learning_rate': [0.05,0.1,0.5,1],'n_estimators':n_estimators},\
    'BGLGBM':{'n_estimators':[10,30,50]},\
    }

In [3]:
from sklearn.model_selection import GridSearchCV,cross_validate,GroupKFold
from sklearn.ensemble import ExtraTreesRegressor
from  sklearn.metrics import mean_squared_error,r2_score
from sklearn.utils import shuffle

def model_CV(train_x,train_y,groups,model,para_grid):

    out_cv = GroupKFold(n_splits = 5)
    result = GridSearchCV(model,para_grid,cv= out_cv.get_n_splits(groups =groups),
    scoring='neg_mean_squared_error', return_train_score=True,n_jobs=-1)
    result.fit(train_x,train_y)
    
    model_refit =model.set_params(**result.best_params_)
    train_cv = cross_validate(model_refit,train_x,train_y,groups = groups,cv =out_cv,scoring = ('r2', 'neg_mean_squared_error'))
    train_mse_cv = -train_cv['test_neg_mean_squared_error'].mean()
    train_r2_cv = train_cv['test_r2'].mean()
    
    return [train_r2_cv,train_mse_cv],result.best_params_

# model evaluation
def model_eval(model,test_x,test_y):
      
    test_pre = model.predict(test_x)
    test_r2 = r2_score(test_pre,test_y)
    test_mse = mean_squared_error(test_y,test_pre)
    return test_r2,test_mse

# comparing different models
def model_comparison(model_list,para_grids,feature_list,gas_list):
    gas_list = gas_list 
    input_feature = feature_list
    output = ['Adsorp(mmol/g)']
    result_total = []

    for gas in gas_list:
        
            train_df_com = train_df[train_df['Label']==gas]
            test_df_com = test_df[test_df['Label']==gas]
            train_x = train_df_com[input_feature]
            test_x = test_df_com[input_feature]
            train_y = train_df_com[output].values
            test_y = test_df_com[output].values
            groups = train_df_com['Index']
            train_x, train_y, groups = shuffle(train_x, train_y, groups, random_state=42)
           
            for model_name, model in model_list:

                result, best_param = model_CV(train_x,train_y.squeeze(),groups,model,para_grids[model_name])
                model_refit = model.set_params(**best_param)
                model_refit.fit(train_x,train_y.squeeze())
                test_r2,test_mse = model_eval(model_refit,test_x,test_y.squeeze()) 
                result_total.append([gas,model_name+'_separate',result[0],result[1],-1,-1, test_r2,test_mse,best_param])
                
                print('Dataset {}, Algorithm {}, Test_r2 {}, Test_error {}'.format(gas,model_name+'_separate',test_r2,test_mse))

                
    return result_total

In [4]:
import numpy.random as nrd
from sklearn.decomposition import KernelPCA,SparsePCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from  collections import Counter

def bgk_pca(train_df,test_df): 
    #np.random.RandomState(42)# using major voting approach to find the outliers in the test dataset
    test_index = list(set(test_df["Index"].values))
    len_test = len(test_index)
    train_index = list(set(train_df["Index"].values))
    #total_feature = ["Index",'V','L','BET','Vt','Temp(K)']
    total_feature = ["Index",'BET','Vt','Temp(K)']
    #pca_feature =  ['V','L','BET','Vt','Temp(K)']
    pca_feature =  ['BET','Vt','Temp(K)']
    num_feature = len(pca_feature)
    removed_index = []
    res = []
    iters = 10
    multi_mse = []
    for i in range(iters):
        mses = []
        train_selected = train_df[train_df["Index"].isin(nrd.choice(train_index,int(len(train_index)*0.75),replace=False))] # modified here change fixed len to a the fraction of the len of the training dataset.
        data = pd.concat([test_df,train_selected])
        sub_data = data[total_feature].drop_duplicates()
        sub_data_scalered = MinMaxScaler().fit_transform(sub_data[pca_feature].values)
        
        """adding lines to determine the number of components to achieve 0.99 threshold"""
        kernel_pca = KernelPCA(kernel='poly',max_iter =100000,n_jobs =-1,gamma=1e-2,fit_inverse_transform=True,random_state=42)
        kpca_transform = kernel_pca.fit_transform(sub_data_scalered.reshape(num_feature,-1))
        explained_variance = np.var(kpca_transform, axis=0)
        explained_variance_ratio = explained_variance / np.sum(explained_variance)
        cumu_variance = np.cumsum(explained_variance_ratio)
        n_components = np.where(cumu_variance>0.99)[0][0]+1
        kernel_pca = KernelPCA(n_components=n_components,kernel='poly',max_iter =100000,n_jobs =-1,gamma=1e-2,fit_inverse_transform=True,random_state=42)
        """ The end of adding new lines"""

        sub_data_transformed = kernel_pca.fit_transform(sub_data_scalered.reshape(num_feature,-1))
        reconstructed = kernel_pca.inverse_transform(sub_data_transformed.reshape(num_feature,-1))
        for i in range(len(sub_data_scalered)):
            mses.append(mean_squared_error(sub_data_scalered[i],reconstructed.reshape(-1,num_feature)[i]))
            df_mse = pd.DataFrame(mses,columns = ['MSE'])
        df_mse['Indexs'] = sub_data["Index"].drop_duplicates().values
        mean_mse = df_mse["MSE"].mean()
        test_mse_df = df_mse[df_mse["Indexs"].isin(test_index)]
        outlier_index = test_mse_df[test_mse_df["MSE"]>3*mean_mse]["Indexs"].values.tolist()
        removed_index.extend(outlier_index)
        multi_mse.append(mean_mse)

    counter = Counter(removed_index)
    thresh = int(0.7*iters)
    for key,values in counter.most_common():
        if values>=thresh:
            res.append(key)
            
        if values<thresh: break

    return np.mean(multi_mse)#res

In [12]:
import os
from tqdm import tqdm
base_feature = ['V','S','L','BET',]
condition_feature = ['Temp(K)','Pressure']
combin_5 = ['Vt',"Vmic",'Vmeso',]
feature_list = [base_feature+combin_5+condition_feature]
columns = ['Gas','Model_name','CV_r2','CV_mse','test_r2_total_model','test_mse_by_total_model','test_r2_separa_model','test_mse_separa_model','best_param','kpg_pca_mse']
#file_name = ['Total',"Meso","Micro",'All','Vmic_meso']
file_name = ['BET_plus_Vt']
feature_set = ["Vt","Vmic",'Vmeso',]
gas_list = ['CO2']
recons_error = []
total_results = []
label = ['Without new adding','With new adding']
for i in tqdm(range(50)):
    train_dfs,test_df,add_df = pred_dataset(gas_list,feature_set,random_state=i)#
    for i in range(2):
        if not i:
            select_list = list(set(add_df['Index'].values))[:len(set(add_df['Index'].values))//2]
            print(len(select_list))
            add_df = add_df[add_df['Index'].isin(select_list)]
            train_df = pd.concat([train_dfs,add_df],axis= 0)
        else:
            train_df = train_dfs

        k_recon_mse = bgk_pca(train_df,test_df)
        results = model_comparison(models,para_grids, feature_list[0],gas_list)
        temp_results  = []
        for ele in results:
            temp_results.append(ele+[k_recon_mse])
        print("Datasetstype: {}, recons_mse: {:.4f}".format(label[i],k_recon_mse))
        total_results.extend(temp_results)
files_name = 'Res_pca_The_impact_of_different_training_sample_size_of_CO2_add_half_'+file_name[0]+'_result_'+str(i)+'.csv'
pd.DataFrame(total_results,columns = columns).to_csv(os.path.join('./3_The_impact_of_different_training_sample_size',files_name)) 

  0%|          | 0/50 [00:00<?, ?it/s]

2028
(15061, 27) (3260, 27) (3260, 27)
152
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9536452508770378, Test_error 0.6294428439161793
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9439560756245216, Test_error 0.7632598455806767
Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9500558755264121, Test_error 0.6510087162556933
Datasetstype: Without new adding, recons_mse: 0.0194
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9441972552178082, Test_error 0.7382289479314933
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9417422199288029, Test_error 0.8039262751125303


  2%|▏         | 1/50 [02:52<2:21:12, 172.92s/it]

Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9414345446055676, Test_error 0.7657783978721063
Datasetstype: With new adding, recons_mse: 0.0199
2028
(15199, 27) (3159, 27) (3223, 27)
152
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9437450622318072, Test_error 0.9264738316984321
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9412930276751457, Test_error 0.9184855012456189
Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9475723944717708, Test_error 0.8187582006434645
Datasetstype: Without new adding, recons_mse: 0.0189
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9363764447953846, Test_error 1.0316716017820198
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9394131096587868, Test_error 0.9337771640765642


  4%|▍         | 2/50 [05:44<2:17:38, 172.05s/it]

Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9361508825871141, Test_error 0.9978610533430297
Datasetstype: With new adding, recons_mse: 0.0196
2028
(15173, 27) (3205, 27) (3203, 27)
152
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9289524518042067, Test_error 1.0968677372247102
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9475045294300524, Test_error 0.7680500928443826
Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9393825653356631, Test_error 0.9035417045713726
Datasetstype: Without new adding, recons_mse: 0.0199
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9343938918427662, Test_error 0.9622191107331594
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9486741404257015, Test_error 0.7585023695137968


  6%|▌         | 3/50 [08:30<2:12:48, 169.53s/it]

Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9470887166757409, Test_error 0.8381394503901206
Datasetstype: With new adding, recons_mse: 0.0209
2028
(15098, 27) (3251, 27) (3232, 27)
152
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9132293116027438, Test_error 1.1245878371920408
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9304758295936602, Test_error 0.9294104237680358
Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9324207703547313, Test_error 0.9087382817437991
Datasetstype: Without new adding, recons_mse: 0.0191
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9242372107451418, Test_error 0.9741058136858405
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9315020787522977, Test_error 0.9280158783442971


  8%|▊         | 4/50 [11:15<2:08:23, 167.48s/it]

Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9311183213544943, Test_error 0.9142049456784467
Datasetstype: With new adding, recons_mse: 0.0200
2028
(15144, 27) (3205, 27) (3232, 27)
152
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9410857922014652, Test_error 0.7170709768499128
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9353202360347161, Test_error 0.7637441778610834
Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9512748001143303, Test_error 0.6082528915503975
Datasetstype: Without new adding, recons_mse: 0.0191
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9445774080531818, Test_error 0.7029021753544243
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9370693441880177, Test_error 0.7418019245625477


 10%|█         | 5/50 [13:58<2:04:31, 166.03s/it]

Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9514385071055612, Test_error 0.6036661881271934
Datasetstype: With new adding, recons_mse: 0.0200
2028
(15145, 27) (3234, 27) (3202, 27)
152
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9466980963310713, Test_error 0.8411470814475904
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9429583235524259, Test_error 0.8844686916344273
Dataset CO2, Algorithm LGBM_separate, Test_r2 0.953136398864998, Test_error 0.7172194025537181
Datasetstype: Without new adding, recons_mse: 0.0189
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9519131314808917, Test_error 0.7551622035971892
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9430248764803999, Test_error 0.8876769961958614


 12%|█▏        | 6/50 [16:47<2:02:27, 166.99s/it]

Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9534339346797531, Test_error 0.7183506517534747
Datasetstype: With new adding, recons_mse: 0.0197
2028
(15168, 27) (3228, 27) (3185, 27)
152
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9415241384953071, Test_error 0.9297296585742848
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9381651728639218, Test_error 0.9534051795832841
Dataset CO2, Algorithm LGBM_separate, Test_r2 0.945196953283295, Test_error 0.8744769501018634
Datasetstype: Without new adding, recons_mse: 0.0191
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9336751099571631, Test_error 1.0137671182858643
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9361146463997035, Test_error 0.9823034147566536


 14%|█▍        | 7/50 [19:38<2:00:32, 168.19s/it]

Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9354787911273247, Test_error 1.0535862251139057
Datasetstype: With new adding, recons_mse: 0.0204
2028
(15061, 27) (3278, 27) (3242, 27)
152
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9257610758153709, Test_error 0.8868736654003724
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9012538224233858, Test_error 1.1208292142437535
Dataset CO2, Algorithm LGBM_separate, Test_r2 0.8931503323363834, Test_error 1.2257729425537043
Datasetstype: Without new adding, recons_mse: 0.0196
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9215195498194277, Test_error 0.9530378744259931
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9006949924566144, Test_error 1.1340831656114632


 16%|█▌        | 8/50 [22:24<1:57:14, 167.48s/it]

Dataset CO2, Algorithm LGBM_separate, Test_r2 0.8983092299862333, Test_error 1.1688418801579066
Datasetstype: With new adding, recons_mse: 0.0201
2028
(15167, 27) (3224, 27) (3190, 27)
152
Dataset CO2, Algorithm GBR_separate, Test_r2 0.924107958734401, Test_error 1.18225456692224
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9409837256649256, Test_error 0.8465163364207994
Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9279230442719668, Test_error 1.1255391370996526
Datasetstype: Without new adding, recons_mse: 0.0201
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9323543935046298, Test_error 0.999185920154581
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9355339070781721, Test_error 0.9217357059760675


 18%|█▊        | 9/50 [25:12<1:54:43, 167.89s/it]

Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9229259319513582, Test_error 1.2420487669646438
Datasetstype: With new adding, recons_mse: 0.0209
2028
(15125, 27) (3230, 27) (3226, 27)
152
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9434805668917288, Test_error 0.7844948763224137
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9490222751613323, Test_error 0.6880013719077036
Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9517167797165617, Test_error 0.6805486167808572
Datasetstype: Without new adding, recons_mse: 0.0188
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9424052084674689, Test_error 0.8110863042467652
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9445594369264529, Test_error 0.7520565974986372


 20%|██        | 10/50 [28:02<1:52:11, 168.28s/it]

Dataset CO2, Algorithm LGBM_separate, Test_r2 0.948962976866685, Test_error 0.7031077969942932
Datasetstype: With new adding, recons_mse: 0.0199
2028
(15115, 27) (3189, 27) (3277, 27)
152
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9225787162259287, Test_error 1.022376696477052
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9321764331335425, Test_error 0.8398157063023618
Dataset CO2, Algorithm LGBM_separate, Test_r2 0.93468167763798, Test_error 0.877526867554955
Datasetstype: Without new adding, recons_mse: 0.0192
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9241508539457831, Test_error 1.029572767784456
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9257112170240597, Test_error 0.9169655648033121


 22%|██▏       | 11/50 [30:51<1:49:31, 168.50s/it]

Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9424471962902936, Test_error 0.7795078730707358
Datasetstype: With new adding, recons_mse: 0.0201
2028
(15053, 27) (3289, 27) (3239, 27)
152
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9521614167616269, Test_error 0.5730262382598891
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9389969538070745, Test_error 0.746704949030926
Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9608719939784676, Test_error 0.4973939115123126
Datasetstype: Without new adding, recons_mse: 0.0197
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9496954709713966, Test_error 0.6223323642236426
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9358270035091875, Test_error 0.7789708532786982


 24%|██▍       | 12/50 [33:39<1:46:44, 168.54s/it]

Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9563758446076941, Test_error 0.5373766381485022
Datasetstype: With new adding, recons_mse: 0.0211
2028
(15005, 27) (3301, 27) (3275, 27)
152
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9526731328138832, Test_error 0.6703616200581148
Dataset CO2, Algorithm ETR_separate, Test_r2 0.946168276992869, Test_error 0.7461669718464469
Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9564431166874989, Test_error 0.6131039530240114
Datasetstype: Without new adding, recons_mse: 0.0191
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9423518564227649, Test_error 0.7960799250301054
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9424380932680623, Test_error 0.7973953935479091


 26%|██▌       | 13/50 [36:25<1:43:23, 167.67s/it]

Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9457268560229657, Test_error 0.7773453005842562
Datasetstype: With new adding, recons_mse: 0.0200
2028
(15090, 27) (3191, 27) (3300, 27)
152
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9471669939253948, Test_error 0.5983484637244502
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9505791800937347, Test_error 0.5854325767909337
Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9555050023514822, Test_error 0.5255641234570425
Datasetstype: Without new adding, recons_mse: 0.0190
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9535774046214899, Test_error 0.5543630206919038
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9494724620623385, Test_error 0.6065093781469684


 28%|██▊       | 14/50 [39:13<1:40:46, 167.95s/it]

Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9652547780218574, Test_error 0.4290151688872411
Datasetstype: With new adding, recons_mse: 0.0198
2028
(15101, 27) (3289, 27) (3191, 27)
152
Dataset CO2, Algorithm GBR_separate, Test_r2 0.937633700341039, Test_error 0.6295870925927591
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9399921562743202, Test_error 0.5574687252987975
Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9493074692339418, Test_error 0.5082400302527516
Datasetstype: Without new adding, recons_mse: 0.0193
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9445525003073947, Test_error 0.5471757544373478
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9398241078280212, Test_error 0.5551192944289535


 30%|███       | 15/50 [42:02<1:38:06, 168.20s/it]

Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9347905134073126, Test_error 0.6496268107073931
Datasetstype: With new adding, recons_mse: 0.0197
2028
(15066, 27) (3250, 27) (3265, 27)
152
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9430381332834749, Test_error 0.855126605193495
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9299559289907355, Test_error 0.9765215369898931
Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9312028079738797, Test_error 1.0274887717666288
Datasetstype: Without new adding, recons_mse: 0.0191
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9189590345957496, Test_error 1.2008527960950435
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9264443712576296, Test_error 1.0060756232420027


 32%|███▏      | 16/50 [44:50<1:35:15, 168.09s/it]

Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9325392225139044, Test_error 0.998750543245757
Datasetstype: With new adding, recons_mse: 0.0204
2028
(15102, 27) (3247, 27) (3232, 27)
152
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9250876816532854, Test_error 0.8672101427973007
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9168112102054696, Test_error 0.944454183804358
Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9348171080495488, Test_error 0.7908864166364299
Datasetstype: Without new adding, recons_mse: 0.0200
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9231175156823461, Test_error 0.9311532870112609
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9154062697126273, Test_error 0.962525048374675


 34%|███▍      | 17/50 [47:38<1:32:23, 167.97s/it]

Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9265251748268355, Test_error 0.863695537829791
Datasetstype: With new adding, recons_mse: 0.0210
2028
(15153, 27) (3214, 27) (3214, 27)
152
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9530946683640552, Test_error 0.6582835186656281
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9506191523838284, Test_error 0.656462925712681
Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9569669096294023, Test_error 0.5590363651440902
Datasetstype: Without new adding, recons_mse: 0.0199
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9498871101357508, Test_error 0.7143502429322206
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9480375852050605, Test_error 0.692643705574122


 36%|███▌      | 18/50 [50:26<1:29:38, 168.08s/it]

Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9559740960788456, Test_error 0.581505914848687
Datasetstype: With new adding, recons_mse: 0.0209
2028
(15147, 27) (3230, 27) (3204, 27)
152
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9506640801781159, Test_error 0.599532827581406
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9532960038259541, Test_error 0.5556801025401621
Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9645823566386322, Test_error 0.4627643715715325
Datasetstype: Without new adding, recons_mse: 0.0191
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9455740139972296, Test_error 0.647657204503992
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9438043183082852, Test_error 0.649720657611367


 38%|███▊      | 19/50 [53:11<1:26:22, 167.18s/it]

Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9544110994959312, Test_error 0.5779599567009582
Datasetstype: With new adding, recons_mse: 0.0198
2028
(15051, 27) (3293, 27) (3237, 27)
152
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9482711957947643, Test_error 0.7900805952957719
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9421193209948946, Test_error 0.8898468074833544
Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9456932530442979, Test_error 0.8560927226447933
Datasetstype: Without new adding, recons_mse: 0.0192
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9456986852124609, Test_error 0.8547304909526492
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9427444593158705, Test_error 0.8895941727364911


 40%|████      | 20/50 [55:59<1:23:42, 167.41s/it]

Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9432463148275276, Test_error 0.916781412330988
Datasetstype: With new adding, recons_mse: 0.0198
2028
(15025, 27) (3304, 27) (3252, 27)
152
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9409235177311461, Test_error 0.9546334079228581
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9398548605823629, Test_error 0.9354417675279012
Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9424062032568521, Test_error 0.9300140218361
Datasetstype: Without new adding, recons_mse: 0.0189
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9294547307800836, Test_error 1.1217824877078926
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9380450767321495, Test_error 0.9593947009587496


 42%|████▏     | 21/50 [58:46<1:20:47, 167.15s/it]

Dataset CO2, Algorithm LGBM_separate, Test_r2 0.939474934041798, Test_error 1.0000697395831772
Datasetstype: With new adding, recons_mse: 0.0196
2028
(15121, 27) (3191, 27) (3269, 27)
152
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9401276454920311, Test_error 0.8368237391939196
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9425398396039372, Test_error 0.829363963710582
Dataset CO2, Algorithm LGBM_separate, Test_r2 0.949804966182122, Test_error 0.7230396069761031
Datasetstype: Without new adding, recons_mse: 0.0193
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9477982018711897, Test_error 0.7907738136928443
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9370465080994146, Test_error 0.9217987841979597


 44%|████▍     | 22/50 [1:01:33<1:17:58, 167.08s/it]

Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9524330556792401, Test_error 0.7277946519969383
Datasetstype: With new adding, recons_mse: 0.0202
2028
(15172, 27) (3230, 27) (3179, 27)
152
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9428273245161576, Test_error 0.7372317237818184
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9373840648621562, Test_error 0.8111227744169817
Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9386701867577665, Test_error 0.8310901886109912
Datasetstype: Without new adding, recons_mse: 0.0191
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9420748630052652, Test_error 0.7239777627183773
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9364624337522888, Test_error 0.8215181861891319


 46%|████▌     | 23/50 [1:04:19<1:15:05, 166.87s/it]

Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9399956139357576, Test_error 0.7725602115705169
Datasetstype: With new adding, recons_mse: 0.0199
2028
(15109, 27) (3218, 27) (3254, 27)
152
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9077235692361918, Test_error 1.4000692016869416
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9261466783406014, Test_error 1.152113625195376
Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9119334870708831, Test_error 1.342980822361289
Datasetstype: Without new adding, recons_mse: 0.0200
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9038265024122693, Test_error 1.4817246890952938
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9267663221096227, Test_error 1.1415405270464303


 48%|████▊     | 24/50 [1:07:07<1:12:28, 167.25s/it]

Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9162735218467815, Test_error 1.3427517570562426
Datasetstype: With new adding, recons_mse: 0.0212
2028
(15148, 27) (3190, 27) (3243, 27)
152
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9500717117348123, Test_error 0.6863529206144503
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9444280027584145, Test_error 0.7730889814951435
Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9548350659226724, Test_error 0.6198152316761472
Datasetstype: Without new adding, recons_mse: 0.0193
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9499911877850737, Test_error 0.6814090985474536
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9422854434880198, Test_error 0.796287303228306


 50%|█████     | 25/50 [1:09:54<1:09:39, 167.18s/it]

Dataset CO2, Algorithm LGBM_separate, Test_r2 0.942723601611064, Test_error 0.7912603254775935
Datasetstype: With new adding, recons_mse: 0.0200
2028
(15116, 27) (3189, 27) (3276, 27)
152
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9449437468694812, Test_error 0.776524811010113
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9426392004945118, Test_error 0.7819769802311319
Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9341263456873184, Test_error 0.8834797193496029
Datasetstype: Without new adding, recons_mse: 0.0194
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9442064416404929, Test_error 0.7692899645501511
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9429116922031665, Test_error 0.7829789515527714


 52%|█████▏    | 26/50 [1:12:43<1:07:02, 167.61s/it]

Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9364876100265128, Test_error 0.8409018606710204
Datasetstype: With new adding, recons_mse: 0.0202
2028
(15084, 27) (3239, 27) (3258, 27)
152
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9331549082632493, Test_error 0.8836083407335484
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9388098531432453, Test_error 0.7754752754795684
Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9469674760985644, Test_error 0.7123598611650042
Datasetstype: Without new adding, recons_mse: 0.0192
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9400769004162871, Test_error 0.7725885875994988
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9371530339975961, Test_error 0.7992951540366112


 54%|█████▍    | 27/50 [1:15:31<1:04:16, 167.67s/it]

Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9379724486942217, Test_error 0.8207773549888552
Datasetstype: With new adding, recons_mse: 0.0201
2028
(15128, 27) (3229, 27) (3224, 27)
152
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9415850357995627, Test_error 0.8660055876168411
Dataset CO2, Algorithm ETR_separate, Test_r2 0.951163511490027, Test_error 0.7472694833563134
Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9360019697984417, Test_error 0.9813650967889855
Datasetstype: Without new adding, recons_mse: 0.0194
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9411689315528617, Test_error 0.8663008077703311
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9492722940050242, Test_error 0.7745753437272146


 56%|█████▌    | 28/50 [1:18:19<1:01:34, 167.94s/it]

Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9288645988454401, Test_error 1.058748824595927
Datasetstype: With new adding, recons_mse: 0.0197
2028
(15172, 27) (3190, 27) (3219, 27)
152
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9393825575360158, Test_error 0.7024802651964389
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9303752264377586, Test_error 0.7272204268556018
Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9361471685070237, Test_error 0.7296565232980058
Datasetstype: Without new adding, recons_mse: 0.0190
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9357267504549419, Test_error 0.7568538832081554
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9300533521809492, Test_error 0.7299204787620542


 58%|█████▊    | 29/50 [1:21:08<58:51, 168.18s/it]  

Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9377569939356338, Test_error 0.7330028750757254
Datasetstype: With new adding, recons_mse: 0.0196
2028
(15150, 27) (3231, 27) (3200, 27)
152
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9211854160582799, Test_error 1.0162059668121937
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9339704718914904, Test_error 0.8544840367796498
Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9400644324768551, Test_error 0.7866308696339284
Datasetstype: Without new adding, recons_mse: 0.0199
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9136394547771868, Test_error 1.1265169993046074
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9286793790631133, Test_error 0.9164277406936446


 60%|██████    | 30/50 [1:24:01<56:33, 169.65s/it]

Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9371261936766535, Test_error 0.8419962613495501
Datasetstype: With new adding, recons_mse: 0.0208
2028
(15116, 27) (3270, 27) (3195, 27)
152
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9358021205890233, Test_error 0.8842660563274476
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9302789766129901, Test_error 0.8972947196541136
Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9347480756453642, Test_error 0.8622826199774125
Datasetstype: Without new adding, recons_mse: 0.0190
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9302535013969124, Test_error 0.9537133504952625
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9275667601190348, Test_error 0.9255921759175726


 62%|██████▏   | 31/50 [1:26:48<53:30, 168.99s/it]

Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9361672271386574, Test_error 0.8835562155885489
Datasetstype: With new adding, recons_mse: 0.0198
2028
(15058, 27) (3290, 27) (3233, 27)
152
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9366786979129164, Test_error 0.7970154851276612
Dataset CO2, Algorithm ETR_separate, Test_r2 0.935579507986579, Test_error 0.8054074557119884
Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9382338937138668, Test_error 0.7508899908181949
Datasetstype: Without new adding, recons_mse: 0.0190
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9323489869989253, Test_error 0.86902165353862
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9360473028773059, Test_error 0.8059575848572131


 64%|██████▍   | 32/50 [1:29:42<51:06, 170.34s/it]

Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9459668745255868, Test_error 0.7252964850165184
Datasetstype: With new adding, recons_mse: 0.0197
2028
(15219, 27) (3165, 27) (3197, 27)
152
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9582454493864591, Test_error 0.606504378427596
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9496345965276014, Test_error 0.6646313523697928
Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9489161602863027, Test_error 0.7016838744803681
Datasetstype: Without new adding, recons_mse: 0.0194
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9581681870615661, Test_error 0.594974459636153
Dataset CO2, Algorithm ETR_separate, Test_r2 0.94791525564733, Test_error 0.6892596107951462


 66%|██████▌   | 33/50 [1:32:33<48:19, 170.56s/it]

Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9534146562613455, Test_error 0.6493072621224152
Datasetstype: With new adding, recons_mse: 0.0198
2028
(15083, 27) (3281, 27) (3217, 27)
152
Dataset CO2, Algorithm GBR_separate, Test_r2 0.940755011586351, Test_error 0.7708784985784581
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9353465896677848, Test_error 0.8327114911059061
Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9448213212184076, Test_error 0.7688494212357093
Datasetstype: Without new adding, recons_mse: 0.0199
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9379677170272106, Test_error 0.8121691469164029
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9325651134680077, Test_error 0.8581539985298134


 68%|██████▊   | 34/50 [1:35:22<45:19, 169.99s/it]

Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9479960381083283, Test_error 0.7080278708629288
Datasetstype: With new adding, recons_mse: 0.0206
2028
(15085, 27) (3228, 27) (3268, 27)
152
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9516439616244681, Test_error 0.6113711292982713
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9466657061045507, Test_error 0.654913419386825
Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9568204479963224, Test_error 0.5887761065819291
Datasetstype: Without new adding, recons_mse: 0.0195
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9505513596269191, Test_error 0.6568458301610223
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9417750442510712, Test_error 0.718062718773463


 70%|███████   | 35/50 [1:38:11<42:25, 169.72s/it]

Dataset CO2, Algorithm LGBM_separate, Test_r2 0.952382883742539, Test_error 0.6206151027745227
Datasetstype: With new adding, recons_mse: 0.0199
2028
(15017, 27) (3258, 27) (3306, 27)
152
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9501648265707247, Test_error 0.9861567012087229
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9511835981606545, Test_error 0.8554382473168277
Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9330415495788589, Test_error 1.2736901398987843
Datasetstype: Without new adding, recons_mse: 0.0191
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9536845404078146, Test_error 0.8366223569618659
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9492449502078932, Test_error 0.899151314383227


 72%|███████▏  | 36/50 [1:40:58<39:26, 169.04s/it]

Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9358680114440868, Test_error 1.1652400007019015
Datasetstype: With new adding, recons_mse: 0.0192
2028
(15119, 27) (3160, 27) (3302, 27)
152
Dataset CO2, Algorithm GBR_separate, Test_r2 0.949519025999646, Test_error 1.055781297505349
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9518885842244322, Test_error 0.9374672159422781
Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9437714351437454, Test_error 1.1432023758628518
Datasetstype: Without new adding, recons_mse: 0.0191
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9375921589467486, Test_error 1.3646250199298153
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9493062726440713, Test_error 0.9874879021362392


 74%|███████▍  | 37/50 [1:43:45<36:29, 168.39s/it]

Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9360333893435981, Test_error 1.3100204716169936
Datasetstype: With new adding, recons_mse: 0.0199
2028
(15043, 27) (3280, 27) (3258, 27)
152
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9411285460156075, Test_error 0.7758671216172144
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9460849437107154, Test_error 0.7168342976065807
Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9386391891478538, Test_error 0.8378251420676965
Datasetstype: Without new adding, recons_mse: 0.0191
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9404458243470151, Test_error 0.7982744682295916
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9450350977471295, Test_error 0.7364545672401104


 76%|███████▌  | 38/50 [1:46:34<33:41, 168.47s/it]

Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9390043731261316, Test_error 0.863765155794736
Datasetstype: With new adding, recons_mse: 0.0199
2028
(15125, 27) (3219, 27) (3237, 27)
152
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9422574234970915, Test_error 0.9588778105199228
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9491363187630332, Test_error 0.8001414271037418
Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9529307382214903, Test_error 0.7856462732449899
Datasetstype: Without new adding, recons_mse: 0.0200
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9471816658135463, Test_error 0.8712738507599046
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9497764283494146, Test_error 0.7903733413207099


 78%|███████▊  | 39/50 [1:49:22<30:53, 168.54s/it]

Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9432330487360491, Test_error 0.8806768594977564
Datasetstype: With new adding, recons_mse: 0.0208
2028
(15218, 27) (3184, 27) (3179, 27)
152
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9444871278296811, Test_error 0.7191049038602733
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9414789810851177, Test_error 0.7581209191423668
Dataset CO2, Algorithm LGBM_separate, Test_r2 0.948042121427541, Test_error 0.7063874218442536
Datasetstype: Without new adding, recons_mse: 0.0200
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9500996352704293, Test_error 0.6287516572652392
Dataset CO2, Algorithm ETR_separate, Test_r2 0.935359836871029, Test_error 0.8214693672051948


 80%|████████  | 40/50 [1:52:12<28:08, 168.81s/it]

Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9402688972743313, Test_error 0.7607325575804265
Datasetstype: With new adding, recons_mse: 0.0209
2028
(15144, 27) (3155, 27) (3282, 27)
152
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9493277841242851, Test_error 0.793421479347881
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9526854794485785, Test_error 0.7287244751946614
Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9531368140263847, Test_error 0.759794506389594
Datasetstype: Without new adding, recons_mse: 0.0192
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9422879697270029, Test_error 0.8567956331047585
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9543508985063666, Test_error 0.7043277817923301


 82%|████████▏ | 41/50 [1:55:01<25:19, 168.87s/it]

Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9539715588508149, Test_error 0.69630598663672
Datasetstype: With new adding, recons_mse: 0.0198
2028
(15141, 27) (3188, 27) (3252, 27)
152
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9276940941283928, Test_error 0.946182174381528
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9285558244996217, Test_error 0.9085228904437229
Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9259617278367481, Test_error 0.931419489620109
Datasetstype: Without new adding, recons_mse: 0.0193
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9269791315552197, Test_error 0.9312397605541585
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9218252379129854, Test_error 0.9917570626860542


 84%|████████▍ | 42/50 [1:57:50<22:32, 169.02s/it]

Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9221594323830921, Test_error 0.9481311759732479
Datasetstype: With new adding, recons_mse: 0.0203
2028
(15095, 27) (3214, 27) (3272, 27)
152
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9502522186437011, Test_error 0.7693638611461985
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9499339104258147, Test_error 0.7934035219195134
Dataset CO2, Algorithm LGBM_separate, Test_r2 0.953168244581956, Test_error 0.7648539769620825
Datasetstype: Without new adding, recons_mse: 0.0189
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9477157626442361, Test_error 0.8132266002227637
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9472127835633309, Test_error 0.827274469887425


 86%|████████▌ | 43/50 [2:00:34<19:31, 167.43s/it]

Dataset CO2, Algorithm LGBM_separate, Test_r2 0.954998047766982, Test_error 0.6982190257442952
Datasetstype: With new adding, recons_mse: 0.0196
2028
(15149, 27) (3215, 27) (3217, 27)
152
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9431366272776649, Test_error 0.8075663242165625
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9261250802068145, Test_error 0.9865925156437148
Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9433955031608314, Test_error 0.8210502390921524
Datasetstype: Without new adding, recons_mse: 0.0192
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9402100804614201, Test_error 0.8549311906588277
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9252260485844209, Test_error 1.0038624751659442


 88%|████████▊ | 44/50 [2:03:19<16:40, 166.67s/it]

Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9382022725807737, Test_error 0.892147943619626
Datasetstype: With new adding, recons_mse: 0.0200
2028
(15074, 27) (3257, 27) (3250, 27)
152
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9258582354339151, Test_error 1.1898502839581016
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9247758958084764, Test_error 1.2105870639793537
Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9280309549659127, Test_error 1.2024245394339859
Datasetstype: Without new adding, recons_mse: 0.0190
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9317804533095154, Test_error 1.1073816458974723
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9243782872242677, Test_error 1.2234106656546022


 90%|█████████ | 45/50 [2:06:03<13:48, 165.78s/it]

Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9367881853062049, Test_error 1.0830698197503943
Datasetstype: With new adding, recons_mse: 0.0200
2028
(15032, 27) (3320, 27) (3229, 27)
152
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9501182760323239, Test_error 0.6338953036534071
Dataset CO2, Algorithm ETR_separate, Test_r2 0.94109810063011, Test_error 0.7206582098398512
Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9447104124641381, Test_error 0.687469128303473
Datasetstype: Without new adding, recons_mse: 0.0198
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9356956751021664, Test_error 0.7793666496236765
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9413094520598445, Test_error 0.7188938315525293


 92%|█████████▏| 46/50 [2:08:47<11:01, 165.30s/it]

Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9414220150046374, Test_error 0.7370377006585096
Datasetstype: With new adding, recons_mse: 0.0207
2028
(15025, 27) (3231, 27) (3325, 27)
152
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9400874053946363, Test_error 0.8680805475841017
Dataset CO2, Algorithm ETR_separate, Test_r2 0.943681816977387, Test_error 0.8088875894389412
Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9364479111171384, Test_error 0.965947280295475
Datasetstype: Without new adding, recons_mse: 0.0192
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9334351774216912, Test_error 0.99534248000536
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9398544952979992, Test_error 0.8554246984140951


 94%|█████████▍| 47/50 [2:11:35<08:18, 166.05s/it]

Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9303373731397266, Test_error 1.0779963613534382
Datasetstype: With new adding, recons_mse: 0.0203
2028
(15105, 27) (3236, 27) (3240, 27)
152
Dataset CO2, Algorithm GBR_separate, Test_r2 0.946863941369036, Test_error 0.7547595874013476
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9526354309838716, Test_error 0.6592387472272109
Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9544816281800202, Test_error 0.6709023986638247
Datasetstype: Without new adding, recons_mse: 0.0188
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9510230070141483, Test_error 0.6924513798875592
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9517307401680156, Test_error 0.6711533541679263


 96%|█████████▌| 48/50 [2:14:24<05:34, 167.03s/it]

Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9531509094769183, Test_error 0.6721588438507464
Datasetstype: With new adding, recons_mse: 0.0195
2028
(15093, 27) (3200, 27) (3288, 27)
152
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9461588679112173, Test_error 0.5299149258434634
Dataset CO2, Algorithm ETR_separate, Test_r2 0.935077407793816, Test_error 0.5724320972970232
Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9463217387443258, Test_error 0.48593756491648477
Datasetstype: Without new adding, recons_mse: 0.0193
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9354806300269168, Test_error 0.5887911699087254
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9302429525442878, Test_error 0.6116402541194842


 98%|█████████▊| 49/50 [2:17:10<02:46, 166.71s/it]

Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9418687015597208, Test_error 0.5244502463082326
Datasetstype: With new adding, recons_mse: 0.0208
2028
(15216, 27) (3177, 27) (3188, 27)
152
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9372199821261243, Test_error 0.6826981490443373
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9359804904718263, Test_error 0.7163215730787867
Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9360416819819383, Test_error 0.6705506621149924
Datasetstype: Without new adding, recons_mse: 0.0189
Dataset CO2, Algorithm GBR_separate, Test_r2 0.9335432993506995, Test_error 0.742206644269408
Dataset CO2, Algorithm ETR_separate, Test_r2 0.9344379151973488, Test_error 0.7440589570543991


100%|██████████| 50/50 [2:19:59<00:00, 167.99s/it]

Dataset CO2, Algorithm LGBM_separate, Test_r2 0.9426239926719948, Test_error 0.6531438542172406
Datasetstype: With new adding, recons_mse: 0.0200





In [14]:
add_df.head(5)

Unnamed: 0.1,Unnamed: 0,Literature,Unnamed: 1,GAC,GAS,E,S,A,B,V,...,%N,Temp(K),Pressure,Adsorp(mmol/g),Index,logP,logQ,logD,Label,Vmic_index
255,255,,255,UC-700-4,CarbonDiox,0,0.28,0.05,0.1,0.2809,...,0.29,298.0,0.050968,0.261194,14,-2.976567,-1.342492,1.634075,CO2,1
256,256,,256,,CarbonDiox,0,0.28,0.05,0.1,0.2809,...,0.29,298.0,0.086415,0.41791,14,-2.448591,-0.872488,1.576103,CO2,1
257,257,,257,,CarbonDiox,0,0.28,0.05,0.1,0.2809,...,0.29,298.0,0.130724,0.609453,14,-2.03467,-0.495194,1.539476,CO2,1
258,258,,258,,CarbonDiox,0,0.28,0.05,0.1,0.2809,...,0.29,298.0,0.183898,0.853234,14,-1.693374,-0.158722,1.534652,CO2,1
259,259,,259,,CarbonDiox,0,0.28,0.05,0.1,0.2809,...,0.29,298.0,0.22819,0.992537,14,-1.477578,-0.007491,1.470087,CO2,1


In [6]:
import pandas as pd
#file_name = ['BET_only','BET_plut_Vt',"BET_Vmic","BET_Vmeso",'BET_Vt_Vmeso','BET_Vt_Vmic',"BET_Vt_Vmic_meso","BET_Vmic_meso"]
file_name = ['BET_plus_Vt']
cal_columns= ["CV_r2","CV_mse","test_r2_separa_model","test_mse_separa_model",'kpg_pca_mse']
tpd = []
gas_list = ['CO2']
for gas in gas_list:
    
    #fraction = range(50,len(list(set(train_dfs["Index"].values))),interval)
    fraction = range(50,2125,50)
    for k in fraction: 
        df_list = []
        for i in range(45):
            #for j in range(len(feature_list)):
                files_name = 'Res_pca_The_impact_of_different_training_sample_size_of_'+str(k)+"_"+file_name[0]+'_result_'+str(i)+'.csv'
                df_list.append(pd.read_csv(os.path.join('./3_The_impact_of_different_training_sample_size',files_name))[cal_columns] )
        pd.concat(df_list).groupby(level=0).mean().to_csv(os.path.join('./3_The_impact_of_different_training_sample_size',   str(k)+'-mean.csv'))
        pd.concat(df_list).groupby(level=0).std().to_csv(os.path.join('./3_The_impact_of_different_training_sample_size',str(k)+'-std.csv'))
        tpd = df_list

Post-post treatment

In [8]:
file_path = "./3_The_impact_of_different_training_sample_size" #/Methane-25(test)-775(max training)-with-reconstruction-mse
mean_df = pd.DataFrame()
std_df = pd.DataFrame()
for i in range(50,2125,50):#range(50,len(list(set(train_dfs["Index"].values))),interval):
    mean_temp = pd.read_csv(os.path.join(file_path,str(i)+'-mean.csv'))
    std_temp = pd.read_csv(os.path.join(file_path,str(i)+'-std.csv'))
    mean_df = pd.concat([mean_df,mean_temp],axis = 0)
    std_df = pd.concat([std_df,std_temp],axis = 0)
mean_df.to_csv(os.path.join(file_path,"Total_mean.csv"))
std_df.to_csv(os.path.join(file_path,"Total_std.csv"))


Only calculating the reconstruction error

In [77]:
import os
base_feature = ['V','S','L','BET',]
condition_feature = ['Temp(K)','Pressure']
combin_1 = ['Vt']
combin_2 = ["Vmeso"]
combin_3 = ['Vmic']
combin_4 = ['Vt',"Vmeso",]
combin_3 = ['Vt',"Vmic",] 
combin_5 = ['Vt',"Vmic",'Vmeso',]
combin_6 = ["Vmic",'Vmeso',]
feature_list = [base_feature+combin_1+condition_feature]
columns = ['Gas','Model_name','CV_r2','CV_mse','test_r2_total_model','test_mse_by_total_model','test_r2_separa_model','test_mse_separa_model','best_param','kpg_pca_mse']
#file_name = ['Total',"Meso","Micro",'All','Vmic_meso']
file_name = ['BET_plus_Vt']
feature_set = ["Vt",]
gas_list = ['CO2']

for i in range(15):
    recons_error = []
    train_index = []
    train_dfs,test_df = pred_dataset(gas_list,feature_set)#
    nums_test = len(list(set(test_df["Index"].values)))
    
    index = list(set(train_dfs['Index'].values))
    fraction = range(100,len(list(set(train_dfs["Index"].values))),50)
    for k in range(len(fraction)):
        print(k)
        
        #k = 50
        selected_index = np.random.choice(index,50,replace=False)
        
        for ele in selected_index:
            index.remove(ele)
        train_index.extend(selected_index) 
        print(k, len(train_index))
        train_df = temp_df.loc[temp_df['Index'].isin(train_index)]
        k_recon_mse = bgk_pca(train_df,test_df)
        recons_error.append(k_recon_mse)
        files_name = 'Res_pca_The_impact_of_different_training_sample_size_of_'+str(50*k)+"_"+file_name[0]+'_result_'+str(i)+'.csv'
        pd.DataFrame(recons_error).to_csv(os.path.join('./3_The_impact_of_different_training_sample_size',files_name)) 

2145
0
0 50
1
1 100
2
2 150
3
3 200
4
4 250
5
5 300
6
6 350
7
7 400
8
8 450
9
9 500
10
10 550
11
11 600
12
12 650
13
13 700
14
14 750
15
15 800
16
16 850
17
17 900
18
18 950
19
19 1000
20
20 1050
21
21 1100
22
22 1150
23
23 1200
24
24 1250
25
25 1300
26
26 1350
27
27 1400
28
28 1450
29
29 1500
30
30 1550
31
31 1600
32
32 1650
33
33 1700
34
34 1750
35
35 1800
36
36 1850
37
37 1900
38
38 1950
39
39 2000
2145
0
0 50
1
1 100
2
2 150
3
3 200
4
4 250
5
5 300
6
6 350
7
7 400
8
8 450
9
9 500
10
10 550
11
11 600
12
12 650
13
13 700
14
14 750
15
15 800
16
16 850
17
17 900
18
18 950
19
19 1000
20
20 1050
21
21 1100
22
22 1150
23
23 1200
24
24 1250
25
25 1300
26
26 1350
27
27 1400
28
28 1450
29
29 1500
30
30 1550
31
31 1600
32
32 1650
33
33 1700
34
34 1750
35
35 1800
36
36 1850
37
37 1900
38
38 1950
39
39 2000
2145
0
0 50
1
1 100
2
2 150
3
3 200
4
4 250
5
5 300
6
6 350
7
7 400
8
8 450
9
9 500
10
10 550
11
11 600
12
12 650
13
13 700
14
14 750
15
15 800
16
16 850
17
17 900
18
18 950
19
19 1000
20
20

In [73]:
len(range(2))

2