In [None]:
class MTLnet(nn.Module):
    def __init__(self):
        super(MTLnet, self).__init__()

        self.sharedlayer = nn.Sequential(
            nn.Linear(feature_size, shared_layer_size),
            nn.ReLU(),
            nn.Dropout()
        )
        self.tower1 = nn.Sequential(
            nn.Linear(shared_layer_size, tower_h1),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(tower_h1, tower_h2),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(tower_h2, output_size)
        )
        self.tower2 = nn.Sequential(
            nn.Linear(shared_layer_size, tower_h1),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(tower_h1, tower_h2),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(tower_h2, output_size)
        )        

    def forward(self, x):
        h_shared = self.sharedlayer(x)
        out1 = self.tower1(h_shared)
        out2 = self.tower2(h_shared)
        return out1, out2

In [None]:
import bartpy
from bartpy.sklearnmodel import SklearnModel

# 交替训练 Alternative Trainning
# and joint learning


# Shared last layers for inversed multi-task learning

In [2]:
import torch
import torch.nn as nn

In [132]:
class Net_full(nn.Module):
    def __init__(self,input_dim):
        super(Net_full,self).__init__()
        self.partA = nn.Sequential(nn.Linear(input_dim,30),nn.ReLU(),
        nn.Dropout(0.4),nn.Linear(30,30),nn.ReLU(),
        nn.Dropout(0.4),nn.Linear(30,20),nn.ReLU(),nn.Dropout(0.4))
        self.partC = nn.Sequential(nn.Linear(20,10),nn.ReLU(),nn.Linear(10,1))
    def forward(self,x):
        out = self.partA(x)
        out = self.partC(out)
        return out
class Net_half(nn.Module):
    def __init__(self,input_dim,shared_mol):
        super(Net_half,self).__init__()
        self.partA = nn.Sequential(nn.Linear(input_dim,30),nn.ReLU(),
        nn.Dropout(0.4),nn.Linear(30,20),nn.ReLU(),nn.Dropout(0.4))
        self.partB = shared_mol
    def forward(self,x):
        out = self.partA(x)
        out = self.partB(out)
        return out

In [310]:
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np
import os

def group_split(data:pd.DataFrame,index: list):
    ls =len(index)
    test_index= np.random.choice(index,int(0.2*ls),replace=False)
    train_data = data.loc[~data['Index'].isin( test_index)]
    test_data = data.loc[data['Index'].isin( test_index)]
    return train_data,test_data
def pred_dataset(file_names):
    source_path = '/Users/kai/Documents/Desktop/CO2_adsorption/new_data'
    train_df_full = pd.DataFrame()
    test_df_full = pd.DataFrame()
    train_df_half = pd.DataFrame()
    test_df_half = pd.DataFrame()
    for file_name in file_names:
        temp_data = pd.read_excel(os.path.join(source_path,file_name+'-01-10-2022.xlsx'),skiprows= 1 )
        temp_data = temp_data.dropna(axis=0,how = 'any',subset = ["BET","Vt"])
        temp_data = temp_data[temp_data['Pressure']>0.01]
        data_half = temp_data[temp_data["Vmic"].isna()]
        data_full = temp_data.dropna(axis=0,how = 'any',subset = ["Vmic","Vmeso"])
        train_full,test_full = group_split(data_full,list(set(data_full['Index'].values)))
        train_half,test_half = group_split(data_half,list(set(data_half['Index'].values)))
        
        train_df_full = pd.concat([train_df_full,train_full],axis=0)
        test_df_full = pd.concat([test_df_full,test_full],axis =0)
        train_df_half = pd.concat([train_df_half,train_half],axis=0)
        test_df_half = pd.concat([test_df_half,test_half],axis =0)
    return train_df_full,test_df_full,train_df_half,test_df_half

In [311]:
train_df_full,test_df_full,train_df_half,test_df_half = pred_dataset(['CO2',"Ethane&Ethylene",'CFCs','Methane'])#,"Ethane&Ethylene",'CFCs']) #'Hydrogen',,'Methane','Ethane&Ethylene','CFCs'

In [312]:
train_df_full.shape

(21565, 25)

In [313]:
train_df_half.shape

(8601, 25)

In [314]:
from torch.utils.data import DataLoader
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset
import torch

class CustomDataset(Dataset):
    def __init__(self, inputs, outputs):
        self.inputs = inputs
        self.outputs = outputs
        
    def __len__(self):
        return len(self.inputs)

    def __getitem__(self,idx):   
        return self.inputs[idx], self.outputs[idx]

def data_pred(data_train:pd.DataFrame,data_test:pd.DataFrame,input_feature:list):
    scaler = StandardScaler()
    output = ['Adsorp(mmol/g)']
    train_x = scaler.fit_transform(data_train[input_feature])
    test_x = scaler.transform(data_test[input_feature])
    train_x = torch.Tensor(train_x).float()
    test_x = torch.Tensor(test_x).float()
    train_y = torch.Tensor(data_train[output].values.reshape(-1,1)).float()
    test_y = torch.Tensor(data_test[output].values.reshape(-1,1)).float()
    train_loader = DataLoader(CustomDataset(train_x,train_y),batch_size=500)
    test_loader = DataLoader(CustomDataset(test_x,test_y),batch_size= 500)

    return train_loader,test_loader


In [315]:
full_feature = ['S','V','L','BET',"Vt",'Vmeso','Temp(K)','Pressure']
half_feature = ['S','V','L','BET',"Vt",'Temp(K)','Pressure']

train_loader_full,test_loader_full = data_pred(train_df_full,test_df_full,full_feature)
train_loader_half,test_loader_half = data_pred(train_df_half,test_df_half,half_feature)

In [316]:
def train_eval_mode(model,optimizer,loss_fn, train_loader,test_loader,i):
    for train_x,train_y in train_loader:
        pred = model(train_x)
        loss = loss_fn(pred,train_y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    if i%10==0:
        y_real= torch.Tensor(())
        y_pred = torch.Tensor(())
        with torch.no_grad():
            for val_x,val_y in test_loader:
                    y_pred = model(val_x) 
                    
                    y_real=torch.cat((y_real,val_y),0)
                    y_pred = torch.cat((y_pred,y_pred),0)
            val_loss = loss_fn(y_real,y_pred)
    print('Epcoh: {}, Train_loss {:.4f}, Val_loss {:.4f}'.format(i,loss,val_loss))

In [317]:
import torch.optim as optim
input_dim_full= len(full_feature)
input_dim_half= len(half_feature)
loss_fn = nn.MSELoss()
model_full  = Net_full(input_dim_full)
model_half = Net_half(input_dim_half,shared_mol=model_full.partC)
optimizer_full = optim.Adam(model_full.parameters(),lr = 1e-3)
optimizer_half = optim.Adam(model_half.parameters(),lr = 1e-3)
epochs =1000

        

In [318]:
def model_eval(model,data_loader,loss_fn):
    y_real= torch.Tensor(())
    y_preds = torch.Tensor(())
    with torch.no_grad():
        for test_x,test_y in data_loader:
            y_pred = model(test_x) 
            y_real=torch.cat((y_real,test_y),0)        
            y_preds = torch.cat((y_preds,y_pred),0)            
            val_loss = loss_fn(y_real,y_preds)            
    return val_loss
                
                

In [None]:
for i in range(1,epochs+1):

    if np.random.rand()>0.4:
        for train_x,train_y in train_loader_full:
            pred = model_full(train_x)
            loss = loss_fn(pred,train_y)
            optimizer_full.zero_grad()
            loss.backward()
            optimizer_full.step()
        if i%50==0:
            full_loss = model_eval(model_full,test_loader_full,loss_fn)
            half_loss = model_eval(model_half,test_loader_half,loss_fn)
            
            print('Epcoh: {}, Train_loss {:.4f}, Val_loss_full {:.4f},Val_loss_half {:.4f}'.format(i,loss,full_loss,half_loss))
    else:
        for train_x,train_y in train_loader_half:
            pred = model_half(train_x)
            loss = loss_fn(pred,train_y)
            optimizer_half.zero_grad()
            loss.backward()
            optimizer_half.step()

        if i%50==0:
            half_loss = model_eval(model_half,test_loader_half,loss_fn)
            full_loss = model_eval(model_full,test_loader_full,loss_fn)
            print('Epcoh: {}, Train_loss {:.4f}, Val_loss_half {:.4f},Val_loss_full {:.4f}'.format(i,loss,half_loss,full_loss))

In [135]:
y_real= torch.Tensor(())
y_preds = torch.Tensor(())
with torch.no_grad():
    for test_x,test_y in test_loader_half:
        y_pred = model_half(test_x) 
                        
        y_real=torch.cat((y_real,test_y),0)
        y_preds = torch.cat((y_preds,y_pred),0)
                #print(y_real.shape,y_pred.shape)
        val_loss = loss_fn(y_real,y_preds)
    print('Epcoh: {}, Train_loss {:.4f}, Val_loss {:.4f}'.format(i,loss,val_loss))

Epcoh: 1000, Train_loss 0.6031, Val_loss 3.1595


In [131]:
y_real= torch.Tensor(())
y_preds = torch.Tensor(())
with torch.no_grad():
    for test_x,test_y in test_loader_full:
        y_pred = model_full(test_x) 
                        
        y_real=torch.cat((y_real,test_y),0)
        y_preds = torch.cat((y_preds,y_pred),0)
        val_loss = loss_fn(y_real,y_preds)
        
    print('Epcoh: {}, Train_loss {:.4f}, Val_loss {:.4f}'.format(i,loss,val_loss))

Epcoh: 1000, Train_loss 0.3639, Val_loss 2.0406


array([0.31839933, 0.28382548, 0.72745791, 0.51053495, 0.17380269])

# train individual models

In [320]:
class Net_one(nn.Module):
    def __init__(self,input_dim):
        super(Net_one,self).__init__()
        self.partA = nn.Sequential(nn.Linear(input_dim,30),nn.ReLU(),
        nn.Dropout(0.4),nn.Linear(30,30),nn.ReLU(),
        nn.Dropout(0.4),nn.Linear(30,20),nn.ReLU(),nn.Dropout(0.2))
        self.partC = nn.Sequential(nn.Linear(20,10),nn.ReLU(),nn.Linear(10,1))
    def forward(self,x):
        out = self.partA(x)
        out = self.partC(out)
        return out
class Net_two(nn.Module):
    def __init__(self,input_dim):
        super(Net_two,self).__init__()
        self.partA = nn.Sequential(nn.Linear(input_dim,30),nn.ReLU(),
        nn.Dropout(0.4),nn.Linear(30,20),nn.ReLU(),nn.Dropout(0.4))
        self.partB = nn.Sequential(nn.Linear(20,10),nn.ReLU(),nn.Linear(10,1))
    def forward(self,x):
        out = self.partA(x)
        out = self.partB(out)
        return out

In [321]:
import torch.optim as optim
input_dim_full= len(full_feature)
input_dim_half= len(half_feature)
loss_fn = nn.MSELoss()
model_one  = Net_one(input_dim_full)
model_two = Net_two(input_dim_half)
optimizer_one = optim.Adam(model_one.parameters(),lr = 1e-3)
optimizer_two = optim.Adam(model_two.parameters(),lr = 8e-4)
epochs =1000


In [None]:
# using dataset with full features
for i in range(1,epochs+1):

    
    for train_x,train_y in train_loader_full:
        pred = model_one(train_x)
        loss = loss_fn(pred,train_y)
        optimizer_one.zero_grad()
        loss.backward() 
        optimizer_one.step()
    if i%50==0:
        full_loss = model_eval(model_one,test_loader_full,loss_fn)
            
        print('Epcoh: {}, Train_loss {:.4f}, Val_loss {:.4f}'.format(i,loss,full_loss))
            

In [96]:
# using dataset with half features
for i in range(1,epochs+1):

    
    for train_x,train_y in train_loader_half:
        pred = model_two(train_x)
        loss = loss_fn(pred,train_y)
        optimizer_two.zero_grad()
        loss.backward() 
        optimizer_two.step()
    if i%50==0:
        full_loss = model_eval(model_two,test_loader_half,loss_fn)
            
        print('Epcoh: {}, Train_loss {:.4f}, Val_loss {:.4f}'.format(i,loss,full_loss))

Epcoh: 50, Train_loss 1.3831, Val_loss 4.0738
Epcoh: 100, Train_loss 1.0408, Val_loss 3.4842
Epcoh: 150, Train_loss 1.1138, Val_loss 3.0293
Epcoh: 200, Train_loss 1.2847, Val_loss 2.8968
Epcoh: 250, Train_loss 1.3913, Val_loss 2.7893
Epcoh: 300, Train_loss 0.9832, Val_loss 2.5984
Epcoh: 350, Train_loss 0.8759, Val_loss 2.3934
Epcoh: 400, Train_loss 1.2157, Val_loss 2.3051
Epcoh: 450, Train_loss 1.0590, Val_loss 2.1051
Epcoh: 500, Train_loss 1.5149, Val_loss 2.2021
Epcoh: 550, Train_loss 1.0128, Val_loss 2.1186
Epcoh: 600, Train_loss 1.1325, Val_loss 2.0566
Epcoh: 650, Train_loss 1.0520, Val_loss 2.0398
Epcoh: 700, Train_loss 1.1044, Val_loss 2.0430
Epcoh: 750, Train_loss 1.1022, Val_loss 1.8783
Epcoh: 800, Train_loss 1.0462, Val_loss 2.0092
Epcoh: 850, Train_loss 0.8422, Val_loss 1.8942
Epcoh: 900, Train_loss 0.8608, Val_loss 1.9157
Epcoh: 950, Train_loss 1.1202, Val_loss 1.8950
Epcoh: 1000, Train_loss 1.0000, Val_loss 1.7805


# Test with simple algorithm

In [323]:
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,\
    GradientBoostingRegressor,ExtraTreesRegressor, BaggingRegressor,StackingRegressor,\
    VotingRegressor,HistGradientBoostingRegressor
model_list = [('RF',RandomForestRegressor(n_estimators=20)),('ADB',AdaBoostRegressor(n_estimators=30)),
('GBR',GradientBoostingRegressor(n_estimators=30)),('ETR',ExtraTreesRegressor(n_estimators=30)),
('BRR',BaggingRegressor(base_estimator=RandomForestRegressor(n_estimators=5),n_estimators=20))]
model_svm = [('LSVR',SVR(kernel='linear')),('PSVR',SVR(kernel='poly')),('RSVR',SVR(kernel='rbf'))]
    
ensemble_list = [('SR',StackingRegressor(model_list)),('VR',VotingRegressor(model_list)),
('HGBT',HistGradientBoostingRegressor(model_list))]
  
#train_x = scaler.fit_transform(train_df.iloc[:,:-1])
#test_x = scaler.transform(test_df.iloc[:,:-1])
#input_feature = ['E','V','L','BET',"Vt",'Vmic','Temp(K)','Pressure'] #with Vmic
input_feature = ['S','V','L','BET',"Vmeso",'Vt','Vmic','Temp(K)','Pressure'] # without Vmic
#input_feature = ['E','V','L','BET',"Vmeso",'Vmic','Temp(K)','Pressure'] # break Vt
output = ['Adsorp(mmol/g)']
train_x= train_df_full[input_feature]
test_x = test_df_full[input_feature]
train_y = train_df_full[output]
test_y = test_df_full[output]

In [324]:
input_feature = ['S','V','L','BET',"Vt",'Temp(K)','Pressure'] # without Vmic
#input_feature = ['E','V','L','BET',"Vmeso",'Vmic','Temp(K)','Pressure'] # break Vt
output = ['Adsorp(mmol/g)']
train_x= train_df_half[input_feature]
test_x = test_df_half[input_feature]
train_y = train_df_half[output]
test_y = test_df_half[output]

In [325]:
for model_name, model in model_list:

    model.fit(train_x,train_y)
    test_pre = model.predict(test_x)
    train_error = mean_squared_error(train_y,model.predict(train_x))
    test_error = mean_squared_error(test_y,test_pre)
    print('Algorithm {}, Train_error {}, Test_error {}'.format(model_name,train_error,test_error))
    #plt.scatter(test_y,test_pre)

Algorithm RF, Train_error 0.04963738328307915, Test_error 0.959339947983029
Algorithm ADB, Train_error 3.0812959162456224, Test_error 3.469649624062843
Algorithm GBR, Train_error 1.8455888863275807, Test_error 2.1607537428963677
Algorithm ETR, Train_error 3.8399706909490366e-30, Test_error 0.8326973443796771
Algorithm BRR, Train_error 0.12314842319647636, Test_error 0.895806456801427


# Regression chain models

In [30]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd


def pick_six(data, index_list):
    df = pd.DataFrame()
    for index in index_list:
        temp_data  = data[data['Index']==index]
        if len(temp_data)>=6:
            new_index = np.random.choice(list(temp_data['Index'].values),6,replace =False)
            new_index = sorted(new_index)
            df = pd.concat([df,temp_data.loc[temp_data['Index'].isin(new_index)]],axis=0)
    return df
    
def pred_dataset(file_names):
    source_path = '/Users/kai/Documents/Desktop/CO2_adsorption/new_data'
    train_df = pd.DataFrame()
    test_df = pd.DataFrame()
    for file_name in file_names:
        temp_data = pd.read_excel(os.path.join(source_path,file_name+'-01-10-2022.xlsx'),skiprows= 1 )
        
        temp_data = temp_data.dropna(axis=0,how = 'any',subset = ["BET","Vt",'Vmic','Vmeso'])
        temp_data = temp_data[temp_data['Pressure']>0.1]
        #temp_data = temp_data[temp_data['Pressure']<1]
        index = set(temp_data['Index'].values)
        test_index= np.random.choice(list(index),int(0.2*len(index)),replace=False)
        train_index = [x for x in index if x not in test_index]
        #train_x = temp_data.loc[~temp_data['Index'].isin( test_index)]
        #test_x = temp_data.loc[temp_data['Index'].isin(test_index)]
        train_x = pick_six(temp_data, train_index)
        test_x = pick_six(temp_data,test_index)
        train_df = pd.concat([train_df,train_x],axis=0)
        test_df = pd.concat([test_df,test_x],axis =0)
    return train_df,test_df


In [31]:
train_df,test_df = pred_dataset(['CO2','Methane','Ethane&Ethylene','CFCs',])
#,,'CFCs']) #'Hydrogen','CO2','Methane','Ethane&Ethylene','CFCs','Hydrogen'

In [32]:
train_df.head()

Unnamed: 0,Literature,Unnamed: 1,GAC,GAS,E,S,A,B,V,L,...,%O,%N,Temp(K),Pressure,Adsorp(mmol/g),Index,logP,logQ,logD,Label
547,,2,,CarbonDiox,0.0,0.28,0.05,0.1,0.2809,0.058,...,,,298.0,0.103286,0.722063,31,-2.27025,-0.325643,1.944607,CO2
548,,3,,CarbonDiox,0.0,0.28,0.05,0.1,0.2809,0.058,...,,,298.0,0.187793,1.111748,31,-1.672413,0.105933,1.778346,CO2
549,,4,,CarbonDiox,0.0,0.28,0.05,0.1,0.2809,0.058,...,,,298.0,0.28169,1.455587,31,-1.266948,0.37541,1.642357,CO2
550,,5,,CarbonDiox,0.0,0.28,0.05,0.1,0.2809,0.058,...,,,298.0,0.370892,1.799427,31,-0.991844,0.587468,1.579313,CO2
551,,6,,CarbonDiox,0.0,0.28,0.05,0.1,0.2809,0.058,...,,,298.0,0.502347,2.212034,31,-0.688463,0.793913,1.482376,CO2


# Previous  simple model for comparison

In [33]:
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,\
    GradientBoostingRegressor,ExtraTreesRegressor, BaggingRegressor,StackingRegressor,\
    VotingRegressor,HistGradientBoostingRegressor
model_list = [('RF',RandomForestRegressor(n_estimators=20)),('ADB',AdaBoostRegressor(n_estimators=30)),
('GBR',GradientBoostingRegressor(n_estimators=30)),('ETR',ExtraTreesRegressor(n_estimators=30)),
('BRR',BaggingRegressor(base_estimator=RandomForestRegressor(n_estimators=5),n_estimators=20))]
  
input_feature = ['S','V','L','BET',"Vmeso",'Vt','Vmic','Temp(K)','logP'] # without Vmic
output = ['logQ']
train_x= train_df[input_feature]
test_x = test_df[input_feature]
train_y = train_df[output]
test_y = test_df[output]

In [34]:
for model_name, model in model_list:

    model.fit(train_x,train_y)
    test_pre = model.predict(test_x)
    train_error = mean_squared_error(train_y,model.predict(train_x))
    test_error = mean_squared_error(test_y,test_pre)
    print('Algorithm {}, Train_error {}, Test_error {}'.format(model_name,train_error,test_error))
    #plt.scatter(test_y,test_pre)

Algorithm RF, Train_error 0.00865836445335093, Test_error 0.08623348376664657
Algorithm ADB, Train_error 0.24728317117334753, Test_error 0.22979300723222706
Algorithm GBR, Train_error 0.14639884313405788, Test_error 0.13822676171691764
Algorithm ETR, Train_error 7.779557638441807e-05, Test_error 0.08339469191538657
Algorithm BRR, Train_error 0.020304587657683344, Test_error 0.08033489408699486


# Regression chain models

 Data transformation

In [35]:
def pick_six(data, index_list):
    df = pd.DataFrame()
    for index in index_list:
        temp_data  = data[data['Index']==index]
        if len(temp_data)>=7:
            new_index = np.random.choice(list(range(1,len(temp_data)-1)),5,replace =False)
            
            temp_index = np.append(new_index,[0,len(temp_data)-1]) # cover the begin and end points
            
            #new_index = new_index+[len(temp_data)-1] # cover the begin and end points
            
            new_index = sorted(temp_index)
            df = pd.concat([df,temp_data.iloc[new_index,:]],axis=0)
    return df

In [36]:
from sklearn.preprocessing import MinMaxScaler

def pred_dataset(file_names):
    source_path = '/Users/kai/Documents/Desktop/CO2_adsorption/new_data'
    train_df = pd.DataFrame()
    test_df = pd.DataFrame()
   
    for file_name in file_names:
        temp_data = pd.read_excel(os.path.join(source_path,file_name+'-01-10-2022.xlsx'),skiprows= 1 )
        temp_data = temp_data.dropna(axis=0,how = 'any',subset = ["BET","Vt",'Vmic','Vmeso'])
        temp_data = temp_data[temp_data['Pressure']>0.1]
        temp_data = temp_data[temp_data['Vmic']<2]
        #temp_data = temp_data[temp_data['Pressure']<1]
        index = set(temp_data['Index'].values)
        test_index= np.random.choice(list(index),int(0.2*len(index)),replace=False)
        train_index = [x for x in index if x not in test_index]
        #train_x = temp_data.loc[~temp_data['Index'].isin( test_index)]
        #test_x = temp_data.loc[temp_data['Index'].isin(test_index)]
        train_x = pick_six(temp_data, train_index)
        test_x = pick_six(temp_data,test_index)
        train_df = pd.concat([train_df,train_x],axis=0)
        test_df = pd.concat([test_df,test_x],axis =0)
       
    return train_df,test_df

In [37]:
train_df,test_df= pred_dataset(['CO2','Methane','Ethane&Ethylene','CFCs','Hydrogen'])

In [38]:
train_df.shape

(16065, 25)

In [39]:
# with labels
def chain_trans(df):
    Labels = set(df['Label'].values)

    input_feature = ['S','V','L','BET',"Vmeso",'Vt','Vmic','Temp(K)']
    data_x = []
    data_y = []
    for label in Labels:
        temp = df[df['Label']==label]
        index_list = list(set(temp['Index'].values))

        for index in index_list:
            temp_df = temp[temp['Index']==index]
            temp_x_l = temp_df[input_feature].iloc[0,:].values
            temp_x_r = temp_df['logP'].values
            temp_y = temp_df[temp_df['Index']==index]['logQ'].values
            temp_x = temp_x_l.tolist()+temp_x_r.tolist()
            data_x.append(temp_x)
            data_y.append(temp_y.squeeze().tolist())
    return data_x,data_y
    

In [40]:
train_x_ch,train_y_ch = chain_trans(train_df)
test_x_ch,test_y_ch = chain_trans(test_df)

In [42]:
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,\
    GradientBoostingRegressor,ExtraTreesRegressor, BaggingRegressor,StackingRegressor,\
    VotingRegressor,HistGradientBoostingRegressor
model_list = [('RF',RandomForestRegressor(n_estimators=20)),('ADB',AdaBoostRegressor(n_estimators=30)),
('GBR',GradientBoostingRegressor(n_estimators=30)),('ETR',ExtraTreesRegressor(n_estimators=30)),
('BRR',BaggingRegressor(base_estimator=RandomForestRegressor(n_estimators=5),n_estimators=20))]
  
input_feature = ['S','V','L','BET',"Vmeso",'Vt','Vmic','Temp(K)','logP'] # without Vmic
output = ['logQ']
train_x= train_df[input_feature]
test_x = test_df[input_feature]
train_y = train_df[output]
test_y = test_df[output]


In [43]:
for model_name, model in model_list:

    model.fit(train_x,train_y)
    test_pre = model.predict(test_x)
    train_error = mean_squared_error(train_y,model.predict(train_x))
    test_error = mean_squared_error(test_y,test_pre)
    print('Algorithm {}, Train_error {}, Test_error {}'.format(model_name,train_error,test_error))
    

Algorithm RF, Train_error 0.009818809179768151, Test_error 0.09217768244209822
Algorithm ADB, Train_error 0.31923368937008334, Test_error 0.33582729690144936
Algorithm GBR, Train_error 0.21212323643411535, Test_error 0.21358801730153412
Algorithm ETR, Train_error 6.0491903210810845e-05, Test_error 0.08799835553724875
Algorithm BRR, Train_error 0.023547283860087722, Test_error 0.0879656475324935


In [44]:
train_x.shape

(16065, 9)

In [45]:
from sklearn.multioutput import RegressorChain
from lightgbm import LGBMRegressor
from sklearn.preprocessing import MinMaxScaler



model_list = [('RF',RandomForestRegressor(n_estimators=50)),('ADB',AdaBoostRegressor(n_estimators=150)),
('GBR',GradientBoostingRegressor(n_estimators=50)),('ETR',ExtraTreesRegressor(n_estimators=50)),
('BRR',BaggingRegressor(base_estimator=LGBMRegressor(objective = 'regression',num_leaves = 20,learning_rate =0.1)))
,('LGBM',LGBMRegressor(objective = 'regression',num_leaves = 150,learning_rate =0.1))]
scaler = MinMaxScaler()
train_x_ch = scaler.fit_transform(train_x_ch)
test_x_ch = scaler.transform(test_x_ch)
for name,model in model_list:
    chain = RegressorChain(base_estimator=model, order=[0,1,2,3,4,5,6]).fit(train_x_ch, train_y_ch)
    test_pred = chain.predict(test_x_ch)
    train_pred = chain.predict(train_x_ch)
    train_error = mean_squared_error(np.array(train_y_ch).reshape(-1,1),np.array(train_pred).reshape(-1,1))
    test_error = mean_squared_error(np.array(test_y_ch).reshape(-1,1),np.array(test_pred).reshape(-1,1))
    test_r2 = r2_score(np.array(test_y_ch).reshape(-1,1),np.array(test_pred).reshape(-1,1))
    print('Algorithm {}, Train_error {}, Test_error {}, Test_r2 {}'.format(name,train_error,test_error,test_r2))

Algorithm RF, Train_error 0.0346483260999338, Test_error 0.11747459149800388, Test_r2 0.8945796179012067
Algorithm ADB, Train_error 0.29258130379573527, Test_error 0.32258055150183323, Test_r2 0.7105198276212725
Algorithm GBR, Train_error 0.1384540540401468, Test_error 0.16551874220842816, Test_r2 0.8514653353919458
Algorithm ETR, Train_error 2.518429859931069e-23, Test_error 0.10305438531821852, Test_r2 0.9075201493474656
Algorithm BRR, Train_error 0.05340126485236442, Test_error 0.09402956046745255, Test_r2 0.9156189260447108
Algorithm LGBM, Train_error 0.027562126401032734, Test_error 0.10544715194011646, Test_r2 0.905372907392102


In [46]:
from lightgbm import LGBMRegressor
from sklearn.multioutput import MultiOutputRegressor


model_list = [('RF',RandomForestRegressor(n_estimators=50)),('ADB',AdaBoostRegressor(n_estimators=50)),
('GBR',GradientBoostingRegressor(n_estimators=50)),('ETR',ExtraTreesRegressor(n_estimators=10)),
('LGBM',LGBMRegressor(objective = 'regression',num_leaves = 50,learning_rate =0.2))]


for name,model in model_list:
    chain = MultiOutputRegressor(estimator=model).fit(train_x_ch, train_y_ch)
    test_pred = chain.predict(test_x_ch)
    train_pred = chain.predict(train_x_ch)
    train_error = mean_squared_error(np.array(train_y_ch).reshape(-1,1),np.array(train_pred).reshape(-1,1))
    test_error = mean_squared_error(np.array(test_y_ch).reshape(-1,1),np.array(test_pred).reshape(-1,1))
    test_r2 = r2_score(np.array(test_y_ch).reshape(-1,1),np.array(test_pred).reshape(-1,1))
    print('Algorithm {}, Train_error {}, Test_error {}, Test_r2 {}'.format(name,train_error,test_error,test_r2))


Algorithm RF, Train_error 0.013965660541622045, Test_error 0.09470580886987506, Test_r2 0.9150120683057904
Algorithm ADB, Train_error 0.2678611451250686, Test_error 0.29202102273027813, Test_r2 0.7379436063190783
Algorithm GBR, Train_error 0.11626489418253619, Test_error 0.13253517108580828, Test_r2 0.8810644225340228
Algorithm ETR, Train_error 4.3888398762114285e-32, Test_error 0.09092177126640633, Test_r2 0.9184078212507212
Algorithm LGBM, Train_error 0.004353859177792087, Test_error 0.08526946985762889, Test_r2 0.9234801332005013
