In [1]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import preprocessing
import pandascharm as pc

from sklearn.model_selection import train_test_split

import xgboost as xgb

from sklearn.preprocessing import OneHotEncoder
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import blosum as bl
from Bio.SubsMat.MatrixInfo import blosum62
from Bio.SubsMat.MatrixInfo import blosum45
from Bio import AlignIO
from Bio import SeqIO
from Bio.Align import AlignInfo

from sklearn.preprocessing import scale 
from sklearn import model_selection

from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso


from sklearn.utils import shuffle
from sklearn.model_selection import cross_validate, GroupKFold
import scipy.stats as stats




In [2]:
def removeoutlier_col(df,cols):
    Q1 = df[cols].quantile(0.25)
    Q3 = df[cols].quantile(0.75)
    IQR = Q3 - Q1
    df_out = df[~((df[[cols]] < (Q1 - 1.5 * IQR)) |(df[[cols]] > (Q3 + 1.5 * IQR))).any(axis=1)]
    return df_out

In [3]:
blosum62.update(((b,a),val) for (a,b),val in list(blosum62.items()))
blosum45.update(((b,a),val) for (a,b),val in list(blosum45.items()))

def score_pairwise(seq1, seq2, matrix, gap_s, gap_e, gap = True):
    for A,B in zip(seq1, seq2):
        diag = ('-'==A) or ('-'==B)
        yield (gap_e if gap else gap_s) if diag else matrix[(A,B)]
        gap = diag


In [4]:
#Sequence Representation Methods with Temperature

def encode_temp(encoding, output, df_clean, aln, esm1b , temperature):
    
    df_clean = df_clean.set_index('Index')
    
    ClustalAlign = AlignIO.read(aln, 'clustal')
    summary_align = AlignInfo.SummaryInfo(ClustalAlign )
    dframe = pc.from_bioalignment(ClustalAlign).transpose()
    sequences = dframe.loc[df_clean.index]
    
    y = df_clean['Log'+output]

    scaler = preprocessing.StandardScaler()
  
    
    if encoding == 'One-Hot-Encoder':

        one_hot = OneHotEncoder()
        encoded = one_hot.fit(sequences)
        sequences_encoded = encoded.transform(sequences).toarray()
        X = np.concatenate((sequences_encoded,df_clean[[temperature]]), axis =1)
        

        
           
    if encoding == 'Bag-of-Words':

        X = pd.DataFrame([ProteinAnalysis(i).count_amino_acids() for i in df_clean['Sequence']])
        X = np.concatenate((X,df_clean[[temperature]]), axis =1)

        
    if encoding == 'bigram':
        
        X = df_clean['Sequence']

        example = df_clean['Sequence'][0]
        lst = ['E','G','L','Y','T','H','R','A','C','D','P','I','F','N','K','S','V','M','W','Q']
        all_dct = {}
        key = []
        for i in lst:
            for j in lst:
                st = i+j
                all_dct[st] = []

        for example, id in zip(X,range(len(X))):

            temp = list(example)
            temp_dct = dict.fromkeys(all_dct.keys(),0)
            for k in range(len(temp)-1):
                try:
                    check = temp[k] + temp[k+1]
                    temp_dct[check] += 1
                except:
                    pass
            for key, value in temp_dct.items():
                all_dct[key].append(value)
                
                
        X = pd.DataFrame.from_dict(all_dct).set_index(df_clean.index)
        X = np.concatenate((X,df_clean[[temperature]]), axis =1)

    
    if encoding == 'trigram':
        
        X = df_clean['Sequence']

        example = df_clean['Sequence'][0]
        lst = ['E','G','L','Y','T','H','R','A','C','D','P','I','F','N','K','S','V','M','W','Q']
        all_dct = {}
        key = []
        for i in lst:
            for j in lst:
                for k in lst:
                    st = i+j+k
                    all_dct[st] = []

        for example, id in zip(X,range(len(X))):

            temp = list(example)
            temp_dct = dict.fromkeys(all_dct.keys(),0)
            for k in range(len(temp)-2):
                try:
                    check = temp[k] + temp[k+1]+temp[k+2]
                    temp_dct[check] += 1
                except:
                    pass
            for key, value in temp_dct.items():
                all_dct[key].append(value)
                
        X = pd.DataFrame.from_dict(all_dct).set_index(df_clean.index)
        X = np.concatenate((X,df_clean[[temperature]]), axis =1)

        
        
    if encoding == 'quadrogram':
        
        X = df_clean['Sequence']

        example = df_clean['Sequence'][0]
        lst = ['E','G','L','Y','T','H','R','A','C','D','P','I','F','N','K','S','V','M','W','Q']
        all_dct = {}
        key = []
        for i in lst:
            for j in lst:
                for k in lst:
                    for l in lst:
                        st = i+j+k+l
                        all_dct[st] = []

        for example, id in zip(X,range(len(X))):

            temp = list(example)
            temp_dct = dict.fromkeys(all_dct.keys(),0)
            for k in range(len(temp)-3):
                try:
                    check = temp[k] + temp[k+1]+temp[k+2]+temp[k+3]
                    temp_dct[check] += 1
                except:
                    pass
            for key, value in temp_dct.items():
                all_dct[key].append(value)
                
        X = pd.DataFrame.from_dict(all_dct).set_index(df_clean.index)
        
        X = np.concatenate((X,df_clean[[temperature]]), axis =1)

    if encoding == 'BLOSUM62':

        n = len(sequences)
        enc_seq = np.zeros((n,n))

        i = 0

        for a in list(sequences.index):
            j = 0
            for b in list(sequences.index):
                enc_seq[i,j] = sum(score_pairwise(sequences.loc[a], sequences.loc[b], blosum62, -5, -1))
                j += 1
            i += 1

        X = np.concatenate((enc_seq,df_clean[[temperature]]), axis =1)

        
        
        
    if encoding == 'BLOSUM45':
        
        n = len(sequences)
        enc_seq = np.zeros((n,n))

        i = 0

        for a in list(sequences.index):
            j = 0
            for b in list(sequences.index):
                enc_seq[i,j] = sum(score_pairwise(sequences.loc[a], sequences.loc[b], blosum45, -5, -1))
                j += 1
            i += 1

        X = np.concatenate((enc_seq,df_clean[[temperature]]), axis =1)

        
    if encoding == 'ESM1b':
        

        encoded = esm1b.loc[df_clean.index]
        X = np.concatenate((encoded,df_clean[[temperature]]), axis =1)

    scaler.fit(X)
    X_scaled = scaler.transform(X)
    
    return X, y, X_scaled

In [5]:
def ml_process(encoding, output, df, aln, esm1b, temp=False):
    
    df_clean = removeoutlier_col(df,'Log' + output).copy()
    df_clean = df_clean.reset_index()

    # Create a mapping of unique sequences to unique codes
    sequence_to_code = {seq: f"ENZYME_{i+1}" for i, seq in enumerate(df_clean['Sequence'].unique())}

    # Map these codes to a new column in the DataFrame using .loc
    df_clean.loc[:, 'Sequence Code'] = df_clean['Sequence'].map(sequence_to_code)
    
    
    group_kfold = GroupKFold(n_splits=5)
    

    X, y, X_scaled = encode_temp(encoding, output, df_clean, aln, esm1b , 'Reaction Temperature')
    
    y_scaled = y

    X_scaled, y_scaled, group_names_scaled = shuffle(X_scaled, y_scaled,df_clean['Sequence Code'], random_state=101)
    
    X, y , group_names = shuffle(X, y, df_clean['Sequence Code'], random_state=101)
    

    lm = LinearRegression()   
    scores_lm = cross_validate(lm, X_scaled, y_scaled, cv=group_kfold, groups=group_names_scaled,scoring=('r2', 'neg_root_mean_squared_error', 'neg_mean_absolute_error'), return_train_score=True)
    scores_lm.update({"Algorithm": "Linear regression", 'Sequence Representation Method': encoding})
                               
    lasso_reg=Lasso()
    scores_lasso = cross_validate(lasso_reg,  X_scaled,y_scaled, cv=group_kfold,groups=group_names_scaled,scoring=('r2', 'neg_root_mean_squared_error', 'neg_mean_absolute_error'), return_train_score=True)
    scores_lasso.update({"Algorithm": "LASSO", 'Sequence Representation Method': encoding})
                               
    rf = RandomForestRegressor(random_state=42)
    scores_rf = cross_validate(rf,  X, y, cv=group_kfold,groups=group_names,scoring=('r2', 'neg_root_mean_squared_error', 'neg_mean_absolute_error'), return_train_score=True)
    scores_rf.update({"Algorithm": "Random Forest", 'Sequence Representation Method': encoding})    
    
    tree_reg=DecisionTreeRegressor()
    scores_tree_reg = cross_validate(tree_reg,  X, y, cv=group_kfold,groups= group_names,scoring=('r2', 'neg_root_mean_squared_error', 'neg_mean_absolute_error'), return_train_score=True)
    scores_tree_reg.update({"Algorithm": "Decision Tree", 'Sequence Representation Method': encoding})    
    
    svr_reg = SVR()
    scores_svr_reg = cross_validate(svr_reg,  X_scaled, y_scaled, cv=group_kfold,groups=group_names_scaled,scoring=('r2', 'neg_root_mean_squared_error', 'neg_mean_absolute_error'), return_train_score=True)
    scores_svr_reg.update({"Algorithm": "SVR", 'Sequence Representation Method': encoding})    
        
    
    mlp_reg = MLPRegressor(random_state=101, max_iter=100)
    scores_mlp_reg = cross_validate(mlp_reg,  X_scaled, y_scaled, cv=group_kfold,groups=group_names_scaled,scoring=('r2', 'neg_root_mean_squared_error', 'neg_mean_absolute_error'), return_train_score=True)
    scores_mlp_reg.update({"Algorithm": "Neural Network", 'Sequence Representation Method': encoding})    
    
    en = ElasticNet()
    scores_en = cross_validate(en, X_scaled,y_scaled, cv=group_kfold,groups=group_names_scaled,scoring=('r2', 'neg_root_mean_squared_error', 'neg_mean_absolute_error'), return_train_score=True)
    scores_tree_reg.update({"Algorithm": "Elastic Network", 'Sequence Representation Method': encoding})    
    
    xgb_reg = xgb.XGBRegressor()
    scores_xgb_reg = cross_validate(xgb_reg,  X, y, cv=group_kfold,groups=group_names,scoring=('r2', 'neg_root_mean_squared_error', 'neg_mean_absolute_error'), return_train_score=True)
    scores_xgb_reg.update({"Algorithm": "XGBoost", 'Sequence Representation Method': encoding})    
    

   

    dfResults = pd.concat([pd.DataFrame(scores_lm), pd.DataFrame(scores_lasso), pd.DataFrame(scores_rf), pd.DataFrame(scores_tree_reg), pd.DataFrame(scores_svr_reg), pd.DataFrame(scores_mlp_reg), pd.DataFrame(scores_tree_reg), pd.DataFrame(scores_xgb_reg)])
        
    
    return dfResults


In [6]:
enzyme = 'betaGlucosidasewithMutants'

df = pd.read_excel('betaGlucosidasewithMutantsOptimumTemperature.xlsx')

output = 'pNP-Glc kcat/Km (1/smM)'

methods = ['One-Hot-Encoder', 'Bag-of-Words', 'bigram', 'trigram', 'quadrogram', 'BLOSUM45', 'BLOSUM62', 'ESM1b']

aln = enzyme +'.aln'

x = datetime.datetime.now()
date = str(x.year)+str(x.month)+str(x.day)

df['Log' + output] = np.log10(df[output])


In [7]:
esm1b = pd.read_excel(enzyme+'ESM1b_embeddings.xlsx', index_col = 0)

In [8]:
summary=[]

for method in methods:
    dfR = ml_process(method, output , df, aln, esm1b, temp = False)
    print(dfR)
    summary.append(dfR)



     fit_time  score_time       test_r2  train_r2  \
0    1.219476    0.009766 -1.196104e+24  0.850747   
1    0.907027    0.015658 -1.277445e+25  0.850074   
2    1.007823    0.014537 -6.682194e+24  0.818981   
3    0.970747    0.010620 -3.322959e+24  0.801798   
4    1.051828    0.010001 -1.969130e+25  0.406966   
0    0.291666    0.009997 -1.902104e-02  0.000000   
1    0.271788    0.011488 -4.193754e-02  0.000000   
2    0.281553    0.009999 -1.207697e-01  0.000000   
3    0.274066    0.010002 -2.797670e-02  0.000000   
4    0.275151    0.011002 -3.599947e-02  0.000000   
0  116.759645    0.028699  3.227810e-01  0.981208   
1  133.206422    0.029068  3.627374e-01  0.981397   
2  168.758353    0.028474  3.774094e-01  0.981058   
3  138.357164    0.038734  3.135051e-01  0.981757   
4  146.796932    0.040748  4.110667e-01  0.975690   
0    2.865153    0.016003  2.686313e-01  1.000000   
1    2.549248    0.009535 -7.245546e-01  1.000000   
2    2.169460    0.008854  1.888156e-01  1.000



   fit_time  score_time   test_r2  train_r2  test_neg_root_mean_squared_error  \
0  0.003560    0.002992  0.186682  0.469574                         -1.203989   
1  0.002013    0.003000  0.189463  0.479266                         -1.149232   
2  0.002552    0.003713 -0.067882  0.474475                         -1.192053   
3  0.002520    0.004001  0.317997  0.442638                         -1.067499   
4  0.001542    0.003016  0.194690  0.483050                         -1.238978   
0  0.002555    0.002034 -0.019021  0.000000                         -1.347671   
1  0.002237    0.002087 -0.041938  0.000000                         -1.302993   
2  0.001005    0.002042 -0.120770  0.000000                         -1.221216   
3  0.002516    0.001999 -0.027977  0.000000                         -1.310588   
4  0.002190    0.001842 -0.035999  0.000000                         -1.405277   
0  1.783766    0.036012  0.311783  0.980950                         -1.107529   
1  1.885386    0.013844  0.0

    fit_time  score_time       test_r2  train_r2  \
0   0.039274    0.003240 -1.673704e+25  0.949028   
1   0.037699    0.004009 -7.087534e+23  0.953753   
2   0.037698    0.003254 -5.756945e+24  0.948988   
3   0.035608    0.004041 -2.067124e+23  0.953618   
4   0.034379    0.002998 -1.360831e+24  0.950306   
0   0.008750    0.004070 -1.902104e-02  0.000000   
1   0.007856    0.003511 -4.193754e-02  0.000000   
2   0.009009    0.003216 -1.207697e-01  0.000000   
3   0.009019    0.003847 -2.797670e-02  0.000000   
4   0.008241    0.003011 -3.599947e-02  0.000000   
0  14.707594    0.035451  4.575219e-02  0.981574   
1  15.978702    0.023034  1.438009e-01  0.982674   
2  15.461866    0.014829  2.371966e-01  0.982084   
3  14.861256    0.026276  4.071386e-01  0.983705   
4  15.159040    0.010954  3.352048e-01  0.979990   
0   0.149305    0.002013 -1.080943e+00  1.000000   
1   0.163905    0.002011  8.980557e-02  1.000000   
2   0.146183    0.003000 -2.071066e+00  1.000000   
3   0.150038

     fit_time  score_time       test_r2  train_r2  \
0    0.753189    0.005506 -2.885294e+25  0.887064   
1    0.787697    0.008051 -2.890121e+25  0.936224   
2    0.828870    0.010237 -4.704523e+25  0.923189   
3    0.554354    0.007066 -5.053959e+25  0.904220   
4    0.560504    0.007006 -8.508803e+25  0.922930   
0    0.142655    0.007001 -1.902104e-02  0.000000   
1    0.153600    0.008316 -4.193754e-02  0.000000   
2    0.227527    0.008002 -1.207697e-01  0.000000   
3    0.210687    0.010431 -2.797670e-02  0.000000   
4    0.202130    0.013144 -3.599947e-02  0.000000   
0  108.287102    0.043046  3.239412e-01  0.981206   
1  110.236134    0.037503 -2.601872e-02  0.980993   
2  106.681657    0.027590  3.726312e-01  0.982275   
3   94.790374    0.021387  4.594955e-01  0.983009   
4   97.669521    0.032148  1.550034e-01  0.978333   
0    1.512337    0.007939 -1.683445e-01  1.000000   
1    1.731652    0.009853 -8.744971e-01  1.000000   
2    1.355867    0.011981  2.016767e-01  1.000

      fit_time  score_time       test_r2  train_r2  \
0    19.747863    0.063868 -3.100881e+25 -2.331994   
1    17.218759    0.082350 -2.822381e+25 -8.965434   
2    16.341386    0.071433 -5.837626e+25 -5.074791   
3    16.242499    0.063771 -1.982057e+25 -2.287407   
4    17.187457    0.080607 -8.507872e+25 -8.521895   
0     2.981990    0.076728 -1.902104e-02  0.000000   
1     2.711012    0.058995 -4.193754e-02  0.000000   
2     3.193089    0.068202 -1.207697e-01  0.000000   
3     2.576385    0.064702 -2.797670e-02  0.000000   
4     2.800755    0.066564 -3.599947e-02  0.000000   
0   985.542831    0.122097  8.038266e-02  0.981138   
1  1047.534686    0.143501  3.489174e-01  0.982935   
2   845.864872    0.175442  1.923289e-01  0.981620   
3   776.874300    0.147515  4.389266e-01  0.983025   
4   831.812127    0.124299  3.573020e-01  0.979098   
0    13.761905    0.120254 -3.304551e-01  1.000000   
1    14.237372    0.115144  4.370944e-02  1.000000   
2    15.183994    0.110467 -

    fit_time  score_time       test_r2  train_r2  \
0   0.083304    0.003229 -1.391960e+19  0.949101   
1   0.064533    0.001531 -6.569080e+21  0.951056   
2   0.073484    0.002162 -1.203977e+18  0.949125   
3   0.070956    0.003001 -1.766048e+21  0.953131   
4   0.068830    0.003006 -9.178490e+20  0.946690   
0   0.010829    0.001521 -1.902104e-02  0.000000   
1   0.010758    0.002239 -4.193754e-02  0.000000   
2   0.014987    0.003534 -1.207697e-01  0.000000   
3   0.016539    0.003995 -2.797670e-02  0.000000   
4   0.013960    0.002008 -3.599947e-02  0.000000   
0  62.563951    0.011766  9.236655e-02  0.984213   
1  65.177163    0.011821  4.305424e-01  0.985344   
2  68.922789    0.017430  4.634578e-01  0.984234   
3  70.323695    0.011755  3.872698e-01  0.985581   
4  68.735588    0.011908  4.652166e-01  0.981425   
0   0.726726    0.003274 -7.380346e-02  1.000000   
1   0.716744    0.003662  1.076831e-01  1.000000   
2   0.656593    0.002587 -1.212857e+00  1.000000   
3   0.687891

    fit_time  score_time       test_r2  train_r2  \
0   0.085265    0.002998 -1.990798e+20  0.948953   
1   0.070805    0.001999 -9.859266e+20  0.952966   
2   0.069633    0.002137 -1.557252e+19  0.949121   
3   0.066850    0.002529 -6.839687e+21  0.944168   
4   0.070194    0.003199 -8.283067e+21  0.947708   
0   0.011712    0.002206 -1.902104e-02  0.000000   
1   0.010772    0.001997 -4.193754e-02  0.000000   
2   0.010696    0.002010 -1.207697e-01  0.000000   
3   0.012945    0.002519 -2.797670e-02  0.000000   
4   0.012130    0.003540 -3.599947e-02  0.000000   
0  56.597956    0.011255  2.282215e-01  0.983410   
1  59.365644    0.014382  3.776074e-01  0.984874   
2  60.202394    0.010793  5.367120e-01  0.983019   
3  56.877161    0.011773  3.893988e-01  0.984943   
4  60.269083    0.011222  4.699177e-01  0.981007   
0   0.719492    0.004901 -2.415066e-01  1.000000   
1   0.739031    0.005001 -1.151779e-01  1.000000   
2   0.821171    0.003291  2.130917e-01  1.000000   
3   0.592363

     fit_time  score_time       test_r2  train_r2  \
0    0.125349    0.002010 -8.547858e+08  0.956013   
1    0.099447    0.001999 -2.796423e+09  0.963792   
2    0.093581    0.002998 -4.797364e+21  0.957934   
3    0.095606    0.003165 -1.833537e+09  0.961202   
4    0.094219    0.002609 -1.965039e+09  0.958930   
0    0.020237    0.003042 -1.902104e-02  0.000000   
1    0.019895    0.002517 -4.193754e-02  0.000000   
2    0.017513    0.003238 -1.207697e-01  0.000000   
3    0.020489    0.002006 -2.797670e-02  0.000000   
4    0.019411    0.002995 -3.599947e-02  0.000000   
0  102.383012    0.012542  4.125494e-01  0.983702   
1   98.655777    0.011827  7.284346e-03  0.984865   
2   98.626826    0.018127  2.447714e-01  0.983640   
3   97.180071    0.021847  2.174205e-01  0.985325   
4   99.233421    0.014117  5.539554e-01  0.981556   
0    1.188466    0.004285 -1.731600e-01  1.000000   
1    1.156952    0.003003 -7.660798e-01  1.000000   
2    1.224949    0.003159 -4.766126e-01  1.000

In [9]:
result=pd.DataFrame()
for item in range(8):
    result=result.append(summary[item])
result.to_excel(date + 'Single Layer' + enzyme +'5 CV.xlsx')

  result=result.append(summary[item])
  result=result.append(summary[item])
  result=result.append(summary[item])
  result=result.append(summary[item])
  result=result.append(summary[item])
  result=result.append(summary[item])
  result=result.append(summary[item])
  result=result.append(summary[item])


In [10]:
df_res=result

In [11]:
df_res['test_root_mean_squared_error']=df_res['test_neg_root_mean_squared_error'].abs()
df_res['test_mean_absolute_error']=df_res['test_neg_mean_absolute_error'].abs()

df_res['train_root_mean_squared_error']=df_res['train_neg_root_mean_squared_error'].abs()
df_res['train_mean_absolute_error']=df_res['train_neg_mean_absolute_error'].abs()

In [12]:
test_res=df_res.groupby(['Algorithm', 'Sequence Representation Method'], as_index=False).agg({'test_r2':['mean','std'],
                                                                                             'test_root_mean_squared_error':['mean','std'],
                                                                                             'test_mean_absolute_error':['mean','std'],
                                                                                             'train_r2':['mean','std'],
                                                                                             'train_root_mean_squared_error':['mean','std'],
                                                                                             'train_mean_absolute_error':['mean','std']})

In [13]:
test_res.to_excel(date + 'Single Layer' + enzyme +'5 CV mean and std.xlsx')