In [1]:
import pandas as pd
import numpy as np
import os

def pred_dataset(file_names):
    source_path = 'C:/Kai_Zhang/MachineLearning/Unified gas Adsorption/CO2_adsorption/new_data'
    data_df = pd.DataFrame()
    
    for file_name in file_names:
        temp_data = pd.read_excel(os.path.join(source_path,file_name+'-02-01-2022.xlsx'),skiprows= 1 )
        
        temp_data = temp_data.dropna(axis=0,how = 'any',subset = ["BET",'Vt'])
        temp_data = temp_data[temp_data['Pressure']>0.01]
        #temp_data = temp_data[temp_data['Vmic']<2]
        index = list(set(temp_data['Index'].values))
        #print(len(index))
        #test_index= np.random.choice(index,int(0.2*len(index)),replace=False)
        #train_x = temp_data.loc[~temp_data['Index'].isin( test_index)]
        #test_x = temp_data.loc[temp_data['Index'].isin(test_index)]
        
        #train_df = pd.concat([train_df,train_x],axis=0)
        data_df = pd.concat([data_df,temp_data],axis =0)
    return data_df

In [2]:
data = pred_dataset(['CO2']) #,'Methane','Ethane&Ethylene','CFCs'

In [24]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor,GradientBoostingRegressor,\
    BaggingRegressor,ExtraTreesRegressor,RandomForestRegressor
from lightgbm import LGBMRegressor  
from sklearn.svm import SVR
from xgboost import XGBRegressor
  
n_estimators = [50,100,120,150,180,200]

# define different models#,
models = [
    #('SVR',SVR(max_iter=100000)),
    #('DT',DecisionTreeRegressor(random_state=42)),\
    # ('ADBR',AdaBoostRegressor(random_state=42)), 
    #("GBR",GradientBoostingRegressor(random_state=42)),\
    #('BG',BaggingRegressor(random_state=42,n_jobs=-1)),
    ('ETR',ExtraTreesRegressor(random_state=42,n_jobs=-1)),\
    #('RF',RandomForestRegressor(n_jobs=-1,random_state=42)),
    ('LGBM',LGBMRegressor(n_jobs = -1,random_state = 42)),\
    ('BGLGBM',BaggingRegressor(LGBMRegressor(n_estimators = 200, n_jobs = -1,random_state = 42), random_state=42,n_jobs=-1)),\
    #('XGBR',XGBRegressor(eta=0.1, subsample=0.7, colsample_bytree=0.8,random_state =42))
    #('BGETR',BaggingRegressor(ExtraTreesRegressor(n_estimators = 180,random_state=42,n_jobs=6),random_state=42,n_jobs=-1))
    ]

# set search parameters grid for different models
para_grids = { 
    'SVR':{'kernel':['linear','poly','rbf','sigmoid','precomputed']},\
    'DT':{'criterion':['squared_error', 'friedman_mse', 'absolute_error', 'poisson']},\
    'ADBR':{'n_estimators':n_estimators,'learning_rate':[0.1,0.5,1,2],'loss':['linear','square','exponential']},\
    'GBR':{'n_estimators':n_estimators,'learning_rate':[0.1,0.5,1,2]},\
    'BG':{'n_estimators':[10,50,100]},\
    'ETR':{'n_estimators':n_estimators},\
    'RF':{'n_estimators':n_estimators},\
    'LGBM':{'num_leaves':[10,20,30,50],'learning_rate': [0.05,0.1,0.5,1],'n_estimators':n_estimators},\
    'BGLGBM':{'n_estimators':[10,30,50]},\
    'BGETR':{'n_estimators':[10]},\
    'XGBR':{'n_estimators':n_estimators, 'max_depth':[2,4,6,8,10],}
      
    }

In [25]:
from sklearn.model_selection import GridSearchCV,cross_validate,GroupKFold
from  sklearn.metrics import mean_squared_error,r2_score
from sklearn.utils import shuffle

def model_CV(train_x,train_y,groups,model,para_grid):

    out_cv = GroupKFold(n_splits = 5)
    result = GridSearchCV(model,para_grid,cv= out_cv.get_n_splits(groups =groups),
    scoring='neg_mean_squared_error', return_train_score=True,n_jobs=-1)
    result.fit(train_x,train_y)
    
    model_refit =model.set_params(**result.best_params_)
    train_cv = cross_validate(model_refit,train_x,train_y,groups = groups,cv =out_cv,scoring = ('r2', 'neg_mean_squared_error'))
    train_mse_cv = -train_cv['test_neg_mean_squared_error'].mean()
    train_r2_cv = train_cv['test_r2'].mean()
    
    return [train_r2_cv,train_mse_cv],result.best_params_

In [26]:
input_feature =  ['S','V','L','BET','Vt','Temp(K)','Pressure']
output = ['Adsorp(mmol/g)']


In [27]:
from sklearn.inspection import permutation_importance
features_importances = []

for i in range(100):

    mse_cv = 10
    forest_importance = []

    indexes = list(set(data['Index'].values))
    #selected = np.random.choice(indexes,len(indexes)//5,replace = False)
    selected = np.random.choice(indexes,int(len(indexes)*0.6),replace = False) # only for CFCs
    random_data = data.loc[data['Index'].isin(selected)]
    train_df_com = random_data
    train_x = train_df_com[input_feature]
    train_y = train_df_com[output].values            
    groups = train_df_com['Index'].values
    train_x, train_y, groups = shuffle(train_x, train_y, groups, random_state=42)

    for model_name, model in models:          
        result, best_param = model_CV(train_x,train_y.squeeze(),groups,model,para_grids[model_name])
        model_refit = model.set_params(**best_param)
        model_refit.fit(train_x,train_y.squeeze())
        results = permutation_importance(model_refit,train_x,train_y, n_repeats=10, random_state=42, n_jobs=2)
        feature_importances = pd.Series(results.importances_mean)
        if result[0]<mse_cv:
            mse_cv = result[0]
            forest_importance = feature_importances.values
    print(forest_importance)
    features_importances.append(forest_importance)
        

[0.         0.         0.         0.35494891 0.08534778 0.20620754
 1.73315781]
[0.         0.         0.         0.33018374 0.10890361 0.23978394
 1.80909177]
[0.         0.         0.         0.26234863 0.07037808 0.19865288
 1.65956452]
[0.         0.         0.         0.30411658 0.10685978 0.20950279
 1.69538987]
[0.         0.         0.         0.28179072 0.11590965 0.23098287
 1.72160102]
[0.         0.         0.         0.38461082 0.0868433  0.20226855
 1.72930421]
[0.         0.         0.         0.33090731 0.10408106 0.21189315
 1.7172012 ]
[0.         0.         0.         0.37570282 0.09078991 0.22710308
 1.74897784]
[0.         0.         0.         0.29353193 0.0953408  0.20288947
 1.71844607]
[0.         0.         0.         0.27740702 0.06180991 0.25316961
 1.74067286]
[0.         0.         0.         0.29037316 0.09698608 0.23938326
 1.77563912]
[0.         0.         0.         0.31257966 0.10708858 0.20872462
 1.75361868]
[0.         0.         0.         0.2781

In [28]:
weight_matrix = np.diag(np.mean(features_importances,axis=0)[0:-1])
weight_matrix


array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.31950523, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.09215328,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.20999685]])

In [150]:
# extracting the first line of each isotherms

first_row = pd.DataFrame()
total_index = set(data["Index"].values)
for index in total_index:
    temp_pd = data[data["Index"]==index]
    first_row  = pd.concat([first_row,temp_pd.iloc[0:1,:]],axis=0)

In [151]:
from sklearn.preprocessing import StandardScaler
input_feature = first_row[['S','V','L','BET','Vt','Temp(K)']]
input_feature_scale = StandardScaler().fit_transform(input_feature)
input_feature_weighted = np.dot(input_feature_scale,weight_matrix)

In [152]:
from sklearn.cluster import KMeans
#kmeans = KMeans(n_clusters=10, random_state=0).fit(input_feature_weighted)
kmeans = KMeans(n_clusters=5, random_state=0).fit(input_feature_weighted) # only for cfcs
group = kmeans.labels_

In [153]:
groups = group.tolist()
cgroup_index = []
total_index = list(total_index)
for i in range(len(total_index)):
    ls = len(data[data["Index"]==total_index[i]])
    temp = [groups[i] for j in range(ls)]
    cgroup_index= cgroup_index+temp

In [154]:
data['cgroup'] = cgroup_index

In [108]:
def split_double(data_df):
    first_group = list(set(data_df['cgroup'].values))
    train_df = pd.DataFrame()
    test_df = pd.DataFrame()
    for ele in first_group:
        temp_data = data[data['cgroup']==ele]
        index = list(set(temp_data['Index'].values))
        
        test_index= np.random.choice(index,int(0.2*len(index)),replace=False)
        train_x = temp_data.loc[~temp_data['Index'].isin( test_index)]
        test_x = temp_data.loc[temp_data['Index'].isin(test_index)]
        train_df = pd.concat([train_df,train_x],axis=0)
        test_df = pd.concat([test_df,test_x],axis=0)
    return train_df,test_df


In [159]:
train_df,test_df = split_double(data)

In [160]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor,GradientBoostingRegressor,\
    BaggingRegressor,ExtraTreesRegressor,RandomForestRegressor
from lightgbm import LGBMRegressor  
  
n_estimators = [50,100,120,150,180,200]

# define different models#('SVR',SVR(max_iter=10000)),
models = [
    #('DT',DecisionTreeRegressor(random_state=42)),\
     #('ADBR',AdaBoostRegressor(random_state=42)), 
    #("GBR",GradientBoostingRegressor(random_state=42)),\
    #('BG',BaggingRegressor(random_state=42,n_jobs=-1)),
    #('ETR',ExtraTreesRegressor(random_state=42,n_jobs=-1)),\
    ('RF',RandomForestRegressor(n_jobs=-1,random_state=42)),
   ('LGBM',LGBMRegressor(n_jobs = -1,random_state = 42)),\
    ('BGLGBM',BaggingRegressor(LGBMRegressor(n_estimators = 200, n_jobs = -1,random_state = 42), random_state=42,n_jobs=-1)),\
    
    ]

# set search parameters grid for different models
para_grids = { #'SVR':{'kernel':['linear','poly','rbf','sigmoid','precomputed']},\
    'DT':{'criterion':['squared_error', 'friedman_mse', 'absolute_error', 'poisson']},\
    'ADBR':{'n_estimators':n_estimators,'learning_rate':[0.1,0.5,1,2],'loss':['linear','square','exponential']},\
    'GBR':{'n_estimators':n_estimators,'learning_rate':[0.1,0.5,1,2]},\
    'BG':{'n_estimators':[10,50,100]},\
    'ETR':{'n_estimators':n_estimators},\
    'RF':{'n_estimators':n_estimators},\
    'LGBM':{'num_leaves':[10,20,30,50],'learning_rate': [0.05,0.1,0.5,1],
    'n_estimators':n_estimators},\
    'BGLGBM':{'n_estimators':[10,30,50]}
    
    }

from sklearn.model_selection import GridSearchCV,cross_validate,GroupKFold
from  sklearn.metrics import mean_squared_error,r2_score
from sklearn.utils import shuffle

def model_CVs(train_x,train_y,groups,model,para_grid):

    out_cv = GroupKFold(n_splits = 3)
    result = GridSearchCV(model,para_grid,cv= out_cv.get_n_splits(groups =groups),
    scoring='neg_mean_squared_error', return_train_score=True,n_jobs=-1)
    result.fit(train_x,train_y)
    
    model_refit =model.set_params(**result.best_params_)
    train_cv = cross_validate(model_refit,train_x,train_y,groups = groups,cv =out_cv,scoring = ('r2', 'neg_mean_squared_error'))
    train_mse_cv = -train_cv['test_neg_mean_squared_error'].mean()
    train_r2_cv = train_cv['test_r2'].mean()
    
    return [train_r2_cv,train_mse_cv],result.best_params_
def model_eval(model,test_x,test_y):
      
    test_pre = model.predict(test_x)
    test_r2 = r2_score(test_pre,test_y)
    test_mse = mean_squared_error(test_y,test_pre)
    return test_r2,test_mse

In [156]:
input_feature =  ['S','V','L','BET','Vt','Temp(K)','Pressure']
output = ['Adsorp(mmol/g)']
result_total = []
for j in range(10):
    train_df,test_df = split_double(data)
    train_df_com = train_df
    test_df_com = test_df
    train_x = train_df_com[input_feature]
    test_x = test_df_com[input_feature]
    train_y = train_df_com[output].values
    test_y = test_df_com[output].values
    groups = train_df_com['Index'].values
    train_x, train_y, groups = shuffle(train_x, train_y, groups, random_state=42)
                
    for model_name, model in models:

                    
        result, best_param = model_CVs(train_x,train_y.squeeze(),groups,model,para_grids[model_name])
        print(result)
        model_refit = model.set_params(**best_param)
        model_refit.fit(train_x,train_y.squeeze())
        test_r2_total,test_mse_total = model_eval(model_refit,test_x,test_y.squeeze()) 
        result_total.append([model_name+'_total',result[0],result[1],test_r2_total,test_mse_total,best_param])
        print('Algorithm {}, Test_r2 {}, Test_error {}'.format(model_name+'_total',test_r2_total,test_mse_total))

[0.9077878144761683, 1.650067242438382]
Algorithm RF_total, Test_r2 0.9038864525085525, Test_error 1.840939980372061
[0.9162729084050772, 1.6296901929732142]
Algorithm LGBM_total, Test_r2 0.9444434587374289, Test_error 1.0197588930955372
[0.9181454854313141, 1.621798344501478]
Algorithm BGLGBM_total, Test_r2 0.9249593418059658, Test_error 1.3649713922675208
[0.9118772528907672, 1.6163044423903665]
Algorithm RF_total, Test_r2 0.9441246177307865, Test_error 0.9372094190428824
[0.9398513966745773, 1.1285209294683456]
Algorithm LGBM_total, Test_r2 0.9566550799943051, Test_error 0.734709925900244
[0.9445085904325947, 1.0582600926674575]
Algorithm BGLGBM_total, Test_r2 0.9693836261326565, Test_error 0.5483708605276558
[0.8954611046857197, 1.904084852490956]
Algorithm RF_total, Test_r2 0.9380005549806172, Test_error 1.3097959331973312
[0.9183684599965902, 1.4922941026482475]
Algorithm LGBM_total, Test_r2 0.9554116365354444, Test_error 0.8595976436225283
[0.9153578138847145, 1.5426102279147926

In [158]:
pd.DataFrame(result_total).to_csv('KKN__CFCs_total.csv')