In [1]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle

def pred_dataset(file_names, feature_set ,i):
    source_path = './CO2_adsorption/new_data'
    train_df = pd.DataFrame()
    test_df = pd.DataFrame()
    for file_name in file_names:
        temp_data = pd.read_excel(os.path.join(source_path,file_name+'-02-02-2022.xlsx'),skiprows= 1 )
        temp_data = temp_data.dropna(axis=0,how = 'any',subset = feature_set)
        temp_data = temp_data[temp_data['Pressure']>0.01]
        index = list(set(temp_data['Index'].values))
        
        test_index = shuffle(sorted(index),random_state=i)[:26]
        #test_index= np.random.choice(index,55,replace=False)
        train_x = temp_data.loc[~temp_data['Index'].isin(test_index)]#[1,2,3]
        test_x = temp_data.loc[temp_data['Index'].isin(test_index)]
        
        train_df = pd.concat([train_df,train_x],axis=0)
        test_df = pd.concat([test_df,test_x],axis =0)
    return train_df,test_df

In [2]:

from sklearn.ensemble import GradientBoostingRegressor,\
    BaggingRegressor,ExtraTreesRegressor
from lightgbm import LGBMRegressor  

n_estimators = [50,100,120,150,180,200]
# define different models#,
models = [
    #("GBR",GradientBoostingRegressor(random_state=42)),\
    #('ETR',ExtraTreesRegressor(random_state=42,n_jobs=-1)),\
    ('LGBM',LGBMRegressor(n_jobs = -1,random_state = 42)),\
    #('BGLGBM',BaggingRegressor(LGBMRegressor(n_estimators = 200, n_jobs = -1,random_state = 42), random_state=42,n_jobs=-1)),\
    ]

# set search parameters grid for different models
para_grids = { 
    'GBR':{'n_estimators':n_estimators,'learning_rate':[0.1,0.5,1,2]},\
    'ETR':{'n_estimators':n_estimators},\
    'LGBM':{'num_leaves':[10,20,30,50],'learning_rate': [0.05,0.1,0.5,1],'n_estimators':n_estimators},\
    'BGLGBM':{'n_estimators':[10,30,50]},\
    }

In [28]:
from sklearn.model_selection import GridSearchCV,cross_validate,GroupKFold
from sklearn.ensemble import ExtraTreesRegressor
from  sklearn.metrics import mean_squared_error,r2_score
from sklearn.utils import shuffle
import os
def model_CV(train_x,train_y,groups,model,para_grid):

    out_cv = GroupKFold(n_splits = 5)
    result = GridSearchCV(model,para_grid,cv= out_cv.get_n_splits(groups =groups),
    scoring='neg_mean_squared_error', return_train_score=True,n_jobs=-1)
    result.fit(train_x,train_y)
    
    model_refit =model.set_params(**result.best_params_)
    train_cv = cross_validate(model_refit,train_x,train_y,groups = groups,cv =out_cv,scoring = ('r2', 'neg_mean_squared_error')) 
    train_mse_cv = -train_cv['test_neg_mean_squared_error'].mean()
    train_r2_cv = train_cv['test_r2'].mean()
    
    return [train_r2_cv,train_mse_cv],result.best_params_

# model evaluation
def model_eval(model,test_x,test_y):
      
    test_pre = model.predict(test_x)
    test_r2 = r2_score(test_pre,test_y)
    test_mse = mean_squared_error(test_y,test_pre)
    return test_r2,test_mse

# comparing different models
def model_comparison(train_df,test_df,model_list,para_grids,feature_list,gas_list,selected_index = None):
    gas_list = gas_list 
    input_feature = feature_list
    output = ['Adsorp(mmol/g)']
    result_total = []

    for gas in gas_list:
        
            train_df_com = train_df[train_df['Label']==gas]
            test_df_com = test_df[test_df['Label']==gas]
            train_x = train_df_com[input_feature]
            test_x = test_df_com[input_feature]
            train_y = train_df_com[output].values
            test_y = test_df_com[output].values
            groups = train_df_com['Index']
            train_x, train_y, groups = shuffle(train_x, train_y, groups, random_state=42)
            
            for model_name, model in model_list:


                gas_checkpoint = gas+'_checkpoint'
                if not os.path.exists(gas_checkpoint):
                    os.makedirs(gas_checkpoint)

                result, best_param = model_CV(train_x,train_y.squeeze(),groups,model,para_grids[model_name])
                model_refit = model.set_params(**best_param)
                model_refit.fit(train_x,train_y.squeeze())
                if os.path.exists(gas_checkpoint) and gas+'_checkpoint.txt' not in os.listdir(gas_checkpoint) and len(train_df)>10*len(test_df):
                    model_refit.booster_.save_model(os.path.join(gas_checkpoint,gas+'_checkpoint.txt'))     
                test_r2,test_mse = model_eval(model_refit,test_x,test_y.squeeze()) 
                if os.path.isfile(os.path.join(gas_checkpoint,gas+'_checkpoint.txt')) and len(train_df)>10*len(test_df):
                    
                    train_df_incre = train_df.loc[train_df['Index'].isin(selected_index),:]
                    train_x_incr = train_df_incre[input_feature]
                    train_y_incr = train_df_incre[output].values
                    model_incread = LGBMRegressor(n_jobs = -1,random_state = 42).fit(train_x_incr,train_y_incr.squeeze(),init_model = os.path.join(gas_checkpoint,gas+'_checkpoint.txt'))
                    test_r2_increa,test_mse_increa = model_eval(model_incread,test_x,test_y.squeeze())
                    model_incread.booster_.save_model(os.path.join(gas_checkpoint,gas+'_checkpoint.txt'))
                if not os.path.isfile(os.path.join(gas_checkpoint,gas+'_checkpoint.txt')):
                    test_r2_increa,test_mse_increa = -1,-1
                result_total.append([gas,model_name+'_separate',result[0],result[1],-1,-1, test_r2,test_mse,test_r2_increa,test_mse_increa,best_param])
                
                print('Dataset {}, Algorithm {}, Test_r2 {:.4f}, Test_error {:.4f}, Test_r2_incre {:.4f}, Test_error_incre {:.4f},'.format(gas,model_name+'_separate',test_r2,test_mse,test_r2_increa,test_mse_increa))           
    return result_total

In [29]:
import numpy.random as nrd
from sklearn.decomposition import KernelPCA,SparsePCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from  collections import Counter

def bgk_pca(train_df,test_df): 
    #np.random.RandomState(42)# using major voting approach to find the outliers in the test dataset
    test_index = list(set(test_df["Index"].values))
    len_test = len(test_index)
    train_index = list(set(train_df["Index"].values))
    #total_feature = ["Index",'V','L','BET','Vt','Temp(K)']
    total_feature = ["Index",'BET','Vt','Temp(K)']
    #pca_feature =  ['V','L','BET','Vt','Temp(K)']
    pca_feature =  ['BET','Vt','Temp(K)']
    num_feature = len(pca_feature)
    removed_index = []
    res = []
    iters = 10
    multi_mse = []
    for i in range(iters):
        mses = []
        train_selected = train_df[train_df["Index"].isin(nrd.choice(train_index,int(len(train_index)*0.75),replace=False))] # modified here change fixed len to a the fraction of the len of the training dataset.
        data = pd.concat([test_df,train_selected])
        sub_data = data[total_feature].drop_duplicates()
        sub_data_scalered = MinMaxScaler().fit_transform(sub_data[pca_feature].values)
        
        """adding lines to determine the number of components to achieve 0.99 threshold"""
        kernel_pca = KernelPCA(kernel='poly',max_iter =100000,n_jobs =-1,gamma=1e-2,fit_inverse_transform=True,random_state=42)
        kpca_transform = kernel_pca.fit_transform(sub_data_scalered.reshape(num_feature,-1))
        explained_variance = np.var(kpca_transform, axis=0)
        explained_variance_ratio = explained_variance / np.sum(explained_variance)
        cumu_variance = np.cumsum(explained_variance_ratio)
        n_components = np.where(cumu_variance>0.99)[0][0]+1
        kernel_pca = KernelPCA(n_components=n_components,kernel='poly',max_iter =100000,n_jobs =-1,gamma=1e-2,fit_inverse_transform=True,random_state=42)
        """ The end of adding new lines"""

        sub_data_transformed = kernel_pca.fit_transform(sub_data_scalered.reshape(num_feature,-1))
        reconstructed = kernel_pca.inverse_transform(sub_data_transformed.reshape(num_feature,-1))
        for i in range(len(sub_data_scalered)):
            mses.append(mean_squared_error(sub_data_scalered[i],reconstructed.reshape(-1,num_feature)[i]))
            df_mse = pd.DataFrame(mses,columns = ['MSE'])
        df_mse['Indexs'] = sub_data["Index"].drop_duplicates().values
        mean_mse = df_mse["MSE"].mean()
        test_mse_df = df_mse[df_mse["Indexs"].isin(test_index)]
        outlier_index = test_mse_df[test_mse_df["MSE"]>3*mean_mse]["Indexs"].values.tolist()
        removed_index.extend(outlier_index)
        multi_mse.append(mean_mse)

    counter = Counter(removed_index)
    thresh = int(0.7*iters)
    for key,values in counter.most_common():
        if values>=thresh:
            res.append(key)
            
        if values<thresh: break

    return np.mean(multi_mse)#res

In [31]:
import os
from sklearn.utils import shuffle
from tqdm import tqdm
base_feature = ['V','S','L','BET',]
condition_feature = ['Temp(K)','Pressure']
combin_1 = ['Vt']
combin_2 = ["Vmeso"]
combin_3 = ['Vmic']
combin_4 = ['Vt',"Vmeso",]
combin_3 = ['Vt',"Vmic",] 
combin_5 = ['Vt',"Vmic",'Vmeso',]
combin_6 = ["Vmic",'Vmeso',]
feature_list = [base_feature+combin_1+condition_feature]
columns = ['Gas','Model_name','CV_r2','CV_mse','test_r2_total_model','test_mse_by_total_model','test_r2_separa_model','test_mse_separa_model','best_param','kpg_pca_mse']
#file_name = ['Total',"Meso","Micro",'All','Vmic_meso']
file_name = ['BET_plus_Vt']
feature_set = ["Vt",]
gas_list = ['Methane']
'''
for i in range(15):
    train_dfs,test_df = pred_dataset(gas_list,feature_set)#
    fraction = range(100,len(list(set(train_dfs["Index"].values))),50)
    for k in fraction:
        print(k)
        temp_df = train_dfs
        nums_test = len(list(set(test_df["Index"].values)))
        index = list(set(temp_df['Index'].values))
        #print(len(index))
        train_index= np.random.choice(index,k,replace=False)
        train_df = temp_df.loc[temp_df['Index'].isin(train_index)]
        k_recon_mse = bgk_pca(train_df,test_df)
        
        results = model_comparison(models,para_grids, feature_list[0],gas_list)
        temp_results  = []
        for ele in results:
            temp_results.append(ele+[k_recon_mse])
        print("recons_mse: {:.4f}".format(k_recon_mse))
        files_name = 'Res_pca_The_impact_of_different_training_sample_size_of_'+str(k)+"_"+file_name[0]+'_result_'+str(i)+'.csv'
        pd.DataFrame(temp_results,columns = columns).to_csv(os.path.join('./3_The_impact_of_different_training_sample_size',files_name)) 
'''

for i in range(50):
    recons_error = []
    train_index = []
    train_dfs,test_df = pred_dataset(gas_list,feature_set,i)#
    nums_test = len(list(set(test_df["Index"].values)))
    index = list(set(train_dfs['Index'].values))
    interval = 25

    fraction = range(50,len(list(set(train_dfs["Index"].values))),interval)
    big_results = []
    for k in range(len(fraction)):
        if k==0:
            selected_index = np.random.choice(index,50,replace=False)
        else:
            selected_index = np.random.choice(index,interval,replace=False)

        for ele in selected_index:
            index.remove(ele)
        #print(len(index),print(train_index))
        train_index.extend(selected_index) 
        print(k, nums_test,len(train_index))
        train_df = train_dfs.loc[train_dfs['Index'].isin(train_index)]
        k_recon_mse = bgk_pca(train_df,test_df)
        results = model_comparison(train_df,test_df,models,para_grids, feature_list[0],gas_list,selected_index=selected_index)
        temp_results  = []
        for ele in results:
            temp_results.append(ele+[k_recon_mse])
        print("recons_mse: {:.4f}".format(k_recon_mse))
        big_results.append(temp_results)
    #files_name = 'Res_pca_The_impact_of_different_training_sample_size_of_'+str((k+1)*interval)+"_"+file_name[0]+'_result_'+str(i)+'.csv'
    #pd.DataFrame(temp_results,columns = columns).to_csv(os.path.join('./3_The_impact_of_different_training_sample_size',files_name)) 

0 26 50
Dataset Methane, Algorithm LGBM_separate, Test_r2 0.6397, Test_error 3.4537, Test_r2_incre -1.0000, Test_error_incre -1.0000,
recons_mse: 0.0357
1 26 75
Dataset Methane, Algorithm LGBM_separate, Test_r2 0.7074, Test_error 3.2236, Test_r2_incre -1.0000, Test_error_incre -1.0000,
recons_mse: 0.0374
2 26 100
Dataset Methane, Algorithm LGBM_separate, Test_r2 0.6495, Test_error 3.6376, Test_r2_incre -1.0000, Test_error_incre -1.0000,
recons_mse: 0.0355
3 26 125
Dataset Methane, Algorithm LGBM_separate, Test_r2 0.8021, Test_error 2.1913, Test_r2_incre -1.0000, Test_error_incre -1.0000,
recons_mse: 0.0355
4 26 150
Dataset Methane, Algorithm LGBM_separate, Test_r2 0.8511, Test_error 1.7389, Test_r2_incre -1.0000, Test_error_incre -1.0000,
recons_mse: 0.0314
5 26 175
Dataset Methane, Algorithm LGBM_separate, Test_r2 0.8409, Test_error 1.8463, Test_r2_incre -1.0000, Test_error_incre -1.0000,
recons_mse: 0.0336
6 26 200
Dataset Methane, Algorithm LGBM_separate, Test_r2 0.8406, Test_error 

UnboundLocalError: local variable 'test_r2_increa' referenced before assignment

In [8]:
import pandas as pd
#file_name = ['BET_only','BET_plut_Vt',"BET_Vmic","BET_Vmeso",'BET_Vt_Vmeso','BET_Vt_Vmic',"BET_Vt_Vmic_meso","BET_Vmic_meso"]
file_name = ['BET_plus_Vt']
cal_columns= ["CV_r2","CV_mse","test_r2_separa_model","test_mse_separa_model",'kpg_pca_mse']
tpd = []
for gas in gas_list:
    
    fraction = range(25,len(list(set(train_dfs["Index"].values)))-50,interval)
    #fraction = range(50,2125,interval)
    for k in fraction: 
        df_list = []
        for i in range(50):
            for j in range(len(feature_list)):
                files_name = 'Res_pca_The_impact_of_different_training_sample_size_of_'+str(k)+"_"+file_name[0]+'_result_'+str(i)+'.csv'
                df_list.append(pd.read_csv(os.path.join('./3_The_impact_of_different_training_sample_size',files_name))[cal_columns] )
        pd.concat(df_list).groupby(level=0).mean().to_csv(os.path.join('./3_The_impact_of_different_training_sample_size',   str(k)+'-mean.csv'))
        pd.concat(df_list).groupby(level=0).std().to_csv(os.path.join('./3_The_impact_of_different_training_sample_size',str(k)+'-std.csv'))
        tpd = df_list

In [18]:
tpd[0]

Unnamed: 0,CV_r2,CV_mse,test_r2_separa_model,test_mse_separa_model,kpg_pca_mse
0,0.939815,0.792283,0.951128,0.576892,0.018231
1,0.944506,0.732007,0.93862,0.740196,0.018231


Post-post treatment

In [9]:
file_path = "./3_The_impact_of_different_training_sample_size" #/Methane-25(test)-775(max training)-with-reconstruction-mse
mean_df = pd.DataFrame()
std_df = pd.DataFrame()
for i in  range(25,len(list(set(train_dfs["Index"].values)))-50,interval):
    mean_temp = pd.read_csv(os.path.join(file_path,str(i)+'-mean.csv'))
    std_temp = pd.read_csv(os.path.join(file_path,str(i)+'-std.csv'))
    mean_df = pd.concat([mean_df,mean_temp],axis = 0)
    std_df = pd.concat([std_df,std_temp],axis = 0)
mean_df.to_csv(os.path.join(file_path,"Total_mean.csv"))
std_df.to_csv(os.path.join(file_path,"Total_std.csv"))


Only calculating the reconstruction error

In [77]:
import os
base_feature = ['V','S','L','BET',]
condition_feature = ['Temp(K)','Pressure']
combin_1 = ['Vt']
combin_2 = ["Vmeso"]
combin_3 = ['Vmic']
combin_4 = ['Vt',"Vmeso",]
combin_3 = ['Vt',"Vmic",] 
combin_5 = ['Vt',"Vmic",'Vmeso',]
combin_6 = ["Vmic",'Vmeso',]
feature_list = [base_feature+combin_1+condition_feature]
columns = ['Gas','Model_name','CV_r2','CV_mse','test_r2_total_model','test_mse_by_total_model','test_r2_separa_model','test_mse_separa_model','best_param','kpg_pca_mse']
#file_name = ['Total',"Meso","Micro",'All','Vmic_meso']
file_name = ['BET_plus_Vt']
feature_set = ["Vt",]
gas_list = ['CO2']

for i in range(15):
    recons_error = []
    train_index = []
    train_dfs,test_df = pred_dataset(gas_list,feature_set)#
    nums_test = len(list(set(test_df["Index"].values)))
    
    index = list(set(train_dfs['Index'].values))
    fraction = range(100,len(list(set(train_dfs["Index"].values))),50)
    for k in range(len(fraction)):
        print(k)
        
        #k = 50
        selected_index = np.random.choice(index,50,replace=False)
        
        for ele in selected_index:
            index.remove(ele)
        train_index.extend(selected_index) 
        print(k, len(train_index))
        train_df = temp_df.loc[temp_df['Index'].isin(train_index)]
        k_recon_mse = bgk_pca(train_df,test_df)
        recons_error.append(k_recon_mse)
        files_name = 'Res_pca_The_impact_of_different_training_sample_size_of_'+str(50*k)+"_"+file_name[0]+'_result_'+str(i)+'.csv'
        pd.DataFrame(recons_error).to_csv(os.path.join('./3_The_impact_of_different_training_sample_size',files_name)) 

2145
0
0 50
1
1 100
2
2 150
3
3 200
4
4 250
5
5 300
6
6 350
7
7 400
8
8 450
9
9 500
10
10 550
11
11 600
12
12 650
13
13 700
14
14 750
15
15 800
16
16 850
17
17 900
18
18 950
19
19 1000
20
20 1050
21
21 1100
22
22 1150
23
23 1200
24
24 1250
25
25 1300
26
26 1350
27
27 1400
28
28 1450
29
29 1500
30
30 1550
31
31 1600
32
32 1650
33
33 1700
34
34 1750
35
35 1800
36
36 1850
37
37 1900
38
38 1950
39
39 2000
2145
0
0 50
1
1 100
2
2 150
3
3 200
4
4 250
5
5 300
6
6 350
7
7 400
8
8 450
9
9 500
10
10 550
11
11 600
12
12 650
13
13 700
14
14 750
15
15 800
16
16 850
17
17 900
18
18 950
19
19 1000
20
20 1050
21
21 1100
22
22 1150
23
23 1200
24
24 1250
25
25 1300
26
26 1350
27
27 1400
28
28 1450
29
29 1500
30
30 1550
31
31 1600
32
32 1650
33
33 1700
34
34 1750
35
35 1800
36
36 1850
37
37 1900
38
38 1950
39
39 2000
2145
0
0 50
1
1 100
2
2 150
3
3 200
4
4 250
5
5 300
6
6 350
7
7 400
8
8 450
9
9 500
10
10 550
11
11 600
12
12 650
13
13 700
14
14 750
15
15 800
16
16 850
17
17 900
18
18 950
19
19 1000
20
20

In [73]:
len(range(2))

2