In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import os

def pred_dataset(file_names, feature_set):
    source_path = 'C:/Kai_Zhang/MachineLearning/Unified gas Adsorption/CO2_adsorption/new_data'
    data_df = pd.DataFrame()
    
    for file_name in file_names:
        temp_data = pd.read_excel(os.path.join(source_path,file_name+'-02-02-2022.xlsx'),skiprows= 1 )
        temp_data = temp_data.dropna(axis=0,how = 'any',subset = feature_set)
        #train_x,test_x = train_test_split(temp_data,test_size = 0.2)
        data_df = pd.concat([data_df,temp_data],axis=0)
        #test_df = pd.concat([test_df,test_x],axis =0)
    return data_df

In [2]:

from sklearn.ensemble import BaggingRegressor,ExtraTreesRegressor
from lightgbm import LGBMRegressor  

n_estimators = [50,100,120,150,180,200]

# define different models#,
models = [
    ('ETR',ExtraTreesRegressor(random_state=42,n_jobs=-1)),\
    ('LGBM',LGBMRegressor(n_jobs = -1,random_state = 42)),\
    ('BGLGBM',BaggingRegressor(LGBMRegressor(n_estimators = 200, n_jobs = -1,random_state = 42), random_state=42,n_jobs=-1)),\
    ]
# set search parameters grid for different models
para_grids = { 
    'ETR':{'n_estimators':n_estimators},\
    'LGBM':{'num_leaves':[10,20,30,50],'learning_rate': [0.05,0.1,0.5,1],'n_estimators':n_estimators},\
    'BGLGBM':{'n_estimators':[10,30,50]},
    }

In [3]:
from sklearn.model_selection import GridSearchCV,cross_validate,GroupKFold
from sklearn.ensemble import ExtraTreesRegressor
from  sklearn.metrics import mean_squared_error,r2_score
from sklearn.utils import shuffle

def model_CV(train_x,train_y,groups,model,para_grid):

    out_cv = GroupKFold(n_splits = 5)
    result = GridSearchCV(model,para_grid,cv= out_cv.get_n_splits(groups =groups),
    scoring='neg_mean_squared_error', return_train_score=True,n_jobs=-1)
    result.fit(train_x,train_y)
    model_refit =model.set_params(**result.best_params_)
    train_cv = cross_validate(model_refit,train_x,train_y,groups = groups,cv =out_cv,scoring = ('r2', 'neg_mean_squared_error'))
    train_mse_cv = -train_cv['test_neg_mean_squared_error'].mean()
    train_r2_cv = train_cv['test_r2'].mean()
    return [train_r2_cv,train_mse_cv],result.best_params_

# model evaluation
def model_eval(model,test_x,test_y):
      
    test_pre = model.predict(test_x)
    test_r2 = r2_score(test_pre,test_y)
    test_mse = mean_squared_error(test_y,test_pre)
    return test_r2,test_mse

# comparing different models
def model_comparison(model_list,para_grids,feature_list,gas_list):
    gas_list = gas_list 
    input_feature = feature_list
    output = ['Adsorp(mmol/g)']
    result_total = []

    for gas in gas_list:
            #train_df_com = train_df[train_df['Label']==gas]
            train_df_com = train_df
            test_df_com = test_df[test_df['Label']==gas]
            train_x = train_df_com[input_feature]
            test_x = test_df_com[input_feature]
            train_y = train_df_com[output].values
            test_y = test_df_com[output].values
            groups = train_df_com['Index']
            train_x, train_y, groups = shuffle(train_x, train_y, groups, random_state=42)
            print(f'With big set and the total training records is {len(train_df_com)}')
            for model_name, model in model_list:

                result, best_param = model_CV(train_x,train_y.squeeze(),groups,model,para_grids[model_name])
                model_refit = model.set_params(**best_param)
                model_refit.fit(train_x,train_y.squeeze())
                test_r2,test_mse = model_eval(model_refit,test_x,test_y.squeeze()) 
                result_total.append([gas,model_name+'_separate',result[0],result[1],-1,-1, test_r2,test_mse,best_param])
                
                print('Dataset: {:s}, Algorithm: {:s}, Test_r2: {:.2f}, Test_error: {:.2f}'.format(gas,model_name+'_with_big_set',test_r2,test_mse))

            train_df_com = train_df[train_df['Label']==gas]
            test_df_com = test_df[test_df['Label']==gas]
            train_x = train_df_com[input_feature]
            test_x = test_df_com[input_feature]
            train_y = train_df_com[output].values
            test_y = test_df_com[output].values
            groups = train_df_com['Index']
            train_x, train_y, groups = shuffle(train_x, train_y, groups, random_state=42)
            print(f'With no big set and the total training records is {len(train_df_com)}')
            for model_name, model in model_list:

                result, best_param = model_CV(train_x,train_y.squeeze(),groups,model,para_grids[model_name])
                model_refit = model.set_params(**best_param)
                model_refit.fit(train_x,train_y.squeeze())
                test_r2,test_mse = model_eval(model_refit,test_x,test_y.squeeze()) 
                result_total.append([gas,model_name+'_separate',result[0],result[1],-1,-1, test_r2,test_mse,best_param])
                print('Dataset: {:s}, Algorithm: {:s}, Test_r2: {:.2f}, Test_error: {:.2f}'.format(gas,model_name+'_no_big_set',test_r2,test_mse))
          
    return result_total

In [4]:
def group_split(data_df,target,additional):
    #np.random.seed(42)
    temp_data = data_df[data_df["Label"]==target]
    index = list(set(temp_data['Index'].values))
    temp_data,_ = shuffle(temp_data, temp_data['Index'].values, random_state=42)
    #print(len(index))
    test_index= np.random.choice(index,int(0.2*len(index)),replace=False)
    train_df = temp_data.loc[~temp_data['Index'].isin( test_index)]
    test_df = temp_data.loc[temp_data['Index'].isin(test_index)]
    index_len = len(list(set(train_df['Index'].values)))

    for gas in additional:
        temp_data = data_df[data_df["Label"]==gas]
        temp_index = list(set(temp_data['Index'].values))
        selected_index =  np.random.choice(temp_index,100,replace=False) # for CO2 and Methane only
        temp_train = temp_data.loc[temp_data['Index'].isin(selected_index)]
        train_df = pd.concat([train_df,temp_train])
        """
        if len(temp_index)<index_len*6:
            train_df = pd.concat([train_df,temp_data])
        else:
            selected_index =  np.random.choice(temp_index,388,replace=False) # for CO2 and Methane only
            temp_train = temp_data.loc[temp_data['Index'].isin(selected_index)]
            train_df = pd.concat([train_df,temp_train])
        """
    groups = train_df['Index'].values
    train_df, groups = shuffle(train_df, groups, random_state=42)
    
    return train_df,test_df

In [6]:
import numpy.random as nrd
from sklearn.decomposition import KernelPCA,SparsePCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from  collections import Counter

def bgk_pca(train_df,test_df): 
    #np.random.RandomState(42)# using major voting approach to find the outliers in the test dataset
    test_index = list(set(test_df["Index"].values))
    #len_test = len(test_index)
    train_index = list(set(train_df["Index"].values))
    #total_feature = ["Index",'V','L','BET','Vt','Temp(K)']
    total_feature = ["Index",'V','L','BET','Vt','Temp(K)']
    #pca_feature =  ['V','L','BET','Vt','Temp(K)']
    pca_feature =  ['V','L','BET','Vt','Temp(K)']
    num_feature = len(pca_feature)
    removed_index = []
    res = []
    iters = 10
    multi_mse = []
    for i in range(iters):
        mses = []
        train_selected = train_df[train_df["Index"].isin(nrd.choice(train_index,int(len(train_index)*0.75),replace=False))] # modified here change fixed len to a the fraction of the len of the training dataset.
        data = pd.concat([test_df,train_selected])
        sub_data = data[total_feature].drop_duplicates()
        sub_data_scalered = MinMaxScaler().fit_transform(sub_data[pca_feature].values)
        
        """adding lines to determine the number of components to achieve 0.99 threshold"""
        kernel_pca = KernelPCA(kernel='poly',max_iter =100000,n_jobs =-1,gamma=1e-2,fit_inverse_transform=True,random_state=42)
        kpca_transform = kernel_pca.fit_transform(sub_data_scalered.reshape(num_feature,-1))
        explained_variance = np.var(kpca_transform, axis=0)
        explained_variance_ratio = explained_variance / np.sum(explained_variance)
        cumu_variance = np.cumsum(explained_variance_ratio)
        n_components = np.where(cumu_variance>0.99)[0][0]+1
        kernel_pca = KernelPCA(n_components=n_components,kernel='poly',max_iter =100000,n_jobs =-1,gamma=1e-2,fit_inverse_transform=True,random_state=42)
        """ The end of adding new lines"""

        #kernel_pca = KernelPCA(kernel='poly',max_iter =100000,n_jobs =-1,gamma=1e-2,fit_inverse_transform=True,random_state=42)
        sub_data_transformed = kernel_pca.fit_transform(sub_data_scalered.reshape(num_feature,-1))
        reconstructed = kernel_pca.inverse_transform(sub_data_transformed.reshape(num_feature,-1))
        for i in range(len(sub_data_scalered)):
            mses.append(mean_squared_error(sub_data_scalered[i],reconstructed.reshape(-1,num_feature)[i]))
            df_mse = pd.DataFrame(mses,columns = ['MSE'])
        df_mse['Indexs'] = sub_data["Index"].drop_duplicates().values
        mean_mse = df_mse["MSE"].mean()
        std_mse = df_mse['MSE'].std()
        test_mse_df = df_mse[df_mse["Indexs"].isin(test_index)]
        outlier_index = test_mse_df[test_mse_df["MSE"]>3*std_mse+mean_mse]["Indexs"].values.tolist()
        removed_index.extend(outlier_index)
        multi_mse.append(mean_mse)

    counter = Counter(removed_index)
    thresh = int(0.7*iters)
    for key,values in counter.most_common():
        if values>=thresh:
            res.append(key)
            
        if values<thresh: break

    return np.mean(multi_mse)

In [10]:
# with reconstruction errors
import os
base_feature = ['V','S','L','BET',]
condition_feature = ['Temp(K)','Pressure']
combin_1 = ['Vt']
combin_2 = ["Vmeso"]
combin_3 = ['Vmic']
combin_4 = ['Vt',"Vmeso",]
combin_5 = ['Vt',"Vmic",]
combin_6 = ['Vt',"Vmic",'Vmeso',]
combin_7 = ["Vmic",'Vmeso',]

feature_list = [base_feature+combin_1+condition_feature]

columns = ['Gas','Model_name','CV_r2','CV_mse','test_r2_total_model','test_mse_by_total_model','test_r2_separa_model','test_mse_separa_model','best_param','kpg_pca_mse','kpg_pca_mse_added_other']
feature_set = ["BET","Vt"]
add_gas_list = ["E&E","Methane",'CO2']
target_gas_list = ['CFCs'] 
file_name = ["BET_Vt"]

for i in range(15):
    data_df = pred_dataset(['Ethane&Ethylene',"CO2",'Methane','CFCs'],feature_set= feature_set)
    for add_gas in add_gas_list:
        for gas in target_gas_list:
            train_df,test_df = group_split(data_df,gas,[add_gas])
            print('The length of test datasets: {}'.format(len(test_df)))
            for j in range(len(feature_list)):
                results = model_comparison(models,para_grids, feature_list[j],[gas])
                k_recon_mse = bgk_pca(train_df[train_df["Label"]==gas],test_df)
                k_recon_mse_added_other = bgk_pca(train_df,test_df)
                temp_results  = []
                for ele in results:
                    temp_results.append(ele+[k_recon_mse,k_recon_mse_added_other])
                print("recons_mse: {:.4f}, recons_mse_added_other: {:.4f}".format(k_recon_mse,k_recon_mse_added_other))
                files_name = 'Improving_small_with_big_set'+"_"+gas+"_"+add_gas+"_"+file_name[j]+'_result_'+str(i)+'.csv'
                pd.DataFrame(temp_results,columns = columns).to_csv(os.path.join('./6_Using_kernel_pca_to determine_combinable_datasets',files_name))  
                #pd.DataFrame(results,columns = ['Gas','Algo','Train_erro','Test_error']).to_csv(os.path.join('./',files_name))  

The length of test datasets: 310
With big set and the total training records is 2429
Dataset: CFCs, Algorithm: ETR_with_big_set, Test_r2: 0.9565, Test_error: 0.7840
Dataset: CFCs, Algorithm: LGBM_with_big_set, Test_r2: 0.9682, Test_error: 0.6181
Dataset: CFCs, Algorithm: BGLGBM_with_big_set, Test_r2: 0.9689, Test_error: 0.5882
With no big set and the total training records is 1257
Dataset: CFCs, Algorithm: ETR_no_big_set, Test_r2: 0.9644, Test_error: 0.6620
Dataset: CFCs, Algorithm: LGBM_no_big_set, Test_r2: 0.9581, Test_error: 0.8445
Dataset: CFCs, Algorithm: BGLGBM_no_big_set, Test_r2: 0.9704, Test_error: 0.5830
recons_mse: 0.0728, recons_mse_added_other: 0.0471
The length of test datasets: 301
With big set and the total training records is 2246
Dataset: CFCs, Algorithm: ETR_with_big_set, Test_r2: 0.9496, Test_error: 0.6273
Dataset: CFCs, Algorithm: LGBM_with_big_set, Test_r2: 0.9525, Test_error: 0.6379
Dataset: CFCs, Algorithm: BGLGBM_with_big_set, Test_r2: 0.9602, Test_error: 0.524

KeyboardInterrupt: 

# Post treatment

In [10]:
import pandas as pd
#file_name = ['BET_only','BET_plut_Vt',"BET_Vmic","BET_Vmeso",'BET_Vt_Vmeso','BET_Vt_Vmic',"BET_Vt_Vmic_meso","BET_Vmic_meso"]
file_name = ["BET_Vt"]
cal_columns= ["CV_r2","CV_mse","test_r2_separa_model","test_mse_separa_model",'kpg_pca_mse','kpg_pca_mse_added_other']
for add_gas in add_gas_list:
    for gas in target_gas_list:
        df_list = [] 
        for i in range(15):
            for j in range(len(feature_list)):
                files_name = 'Improving_small_with_big_set'+'_'+gas+"_"+add_gas+"_"+file_name[j]+'_result_'+str(i)+'.csv'
                df_list.append(pd.read_csv(os.path.join('./6_Using_kernel_pca_to determine_combinable_datasets',files_name))[cal_columns])
        pd.concat(df_list).groupby(level=0).mean().to_csv(os.path.join('./6_Using_kernel_pca_to determine_combinable_datasets',   add_gas+gas+'-mean.csv'))
        pd.concat(df_list).groupby(level=0).std().to_csv(os.path.join('./6_Using_kernel_pca_to determine_combinable_datasets',add_gas+gas+'-std.csv'))