In [7]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupShuffleSplit 
from sklearn.preprocessing import OneHotEncoder

from Bio import pairwise2
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import ElasticNet
from sklearn.metrics import r2_score

from sklearn import preprocessing
from Bio.SeqUtils.ProtParam import ProteinAnalysis


from Bio import AlignIO
from Bio import SeqIO
from Bio.Align.Applications import MuscleCommandline
from Bio.Align import AlignInfo
import pandascharm as pc

from Bio.SubsMat.MatrixInfo import blosum62
from Bio.SubsMat.MatrixInfo import blosum45

In [8]:
def removeoutlier_col(df,cols):
    Q1 = df[cols].quantile(0.25)
    Q3 = df[cols].quantile(0.75)
    IQR = Q3 - Q1
    df_out = df[~((df[[cols]] < (Q1 - 1.5 * IQR)) |(df[[cols]] > (Q3 + 1.5 * IQR))).any(axis=1)]
    return df_out


def RF(X_train, y_train, X_val):
    
    rf = RandomForestRegressor()
    rf.fit(X_train,np.ravel(y_train))
    RF_pred = rf.predict(X_val)

    return RF_pred

def XGBR(X_train, y_train, X_val):
    
    model = xgb.XGBRegressor()
    model.fit(X_train, y_train)
    XGB_pred = model.predict(X_val)
    
    return XGB_pred

def EN(X_train, y_train, X_val):
    
    model = ElasticNet()
    model.fit(X_train, y_train)
    EN_pred = model.predict(X_val)
    
    return EN_pred

In [9]:
blosum62.update(((b,a),val) for (a,b),val in list(blosum62.items()))
blosum45.update(((b,a),val) for (a,b),val in list(blosum45.items()))

def score_pairwise(seq1, seq2, matrix, gap_s, gap_e, gap = True):
    for A,B in zip(seq1, seq2):
        diag = ('-'==A) or ('-'==B)
        yield (gap_e if gap else gap_s) if diag else matrix[(A,B)]
        gap = diag

In [10]:
#Sequence Representation Methods with Temperature

def encode_temp(encoding, output, df_clean, aln, esm1b , temperature):

    ClustalAlign = AlignIO.read(aln, 'clustal')
    summary_align = AlignInfo.SummaryInfo(ClustalAlign )
    dframe = pc.from_bioalignment(ClustalAlign).transpose()
    sequences = dframe.loc[df_clean.index]
    
    y = df_clean['Percentage Activity Depending on Optimum Temp']

    scaler = preprocessing.StandardScaler()
  
    
    if encoding == 'One-Hot-Encoder':

        one_hot = OneHotEncoder()
        encoded = one_hot.fit(sequences)
        sequences_encoded = encoded.transform(sequences).toarray()
        X = np.concatenate((sequences_encoded,df_clean[[temperature]]), axis =1)
        

        
           
    if encoding == 'Bag-of-Words':

        X = pd.DataFrame([ProteinAnalysis(i).count_amino_acids() for i in df_clean['Sequence']])
        X = np.concatenate((X,df_clean[[temperature]]), axis =1)

        
    if encoding == 'bigram':
        
        X = df_clean['Sequence']

        example = df_clean['Sequence'][0]
        lst = ['E','G','L','Y','T','H','R','A','C','D','P','I','F','N','K','S','V','M','W','Q']
        all_dct = {}
        key = []
        for i in lst:
            for j in lst:
                st = i+j
                all_dct[st] = []

        for example, id in zip(X,range(len(X))):

            temp = list(example)
            temp_dct = dict.fromkeys(all_dct.keys(),0)
            for k in range(len(temp)-1):
                try:
                    check = temp[k] + temp[k+1]
                    temp_dct[check] += 1
                except:
                    pass
            for key, value in temp_dct.items():
                all_dct[key].append(value)
                
                
        X = pd.DataFrame.from_dict(all_dct).set_index(df_clean.index)
        X = np.concatenate((X,df_clean[[temperature]]), axis =1)

    
    if encoding == 'trigram':
        
        X = df_clean['Sequence']

        example = df_clean['Sequence'][0]
        lst = ['E','G','L','Y','T','H','R','A','C','D','P','I','F','N','K','S','V','M','W','Q']
        all_dct = {}
        key = []
        for i in lst:
            for j in lst:
                for k in lst:
                    st = i+j+k
                    all_dct[st] = []

        for example, id in zip(X,range(len(X))):

            temp = list(example)
            temp_dct = dict.fromkeys(all_dct.keys(),0)
            for k in range(len(temp)-2):
                try:
                    check = temp[k] + temp[k+1]+temp[k+2]
                    temp_dct[check] += 1
                except:
                    pass
            for key, value in temp_dct.items():
                all_dct[key].append(value)
                
        X = pd.DataFrame.from_dict(all_dct).set_index(df_clean.index)
        X = np.concatenate((X,df_clean[[temperature]]), axis =1)

        
        
    if encoding == 'quadrogram':
        
        X = df_clean['Sequence']

        example = df_clean['Sequence'][0]
        lst = ['E','G','L','Y','T','H','R','A','C','D','P','I','F','N','K','S','V','M','W','Q']
        all_dct = {}
        key = []
        for i in lst:
            for j in lst:
                for k in lst:
                    for l in lst:
                        st = i+j+k+l
                        all_dct[st] = []

        for example, id in zip(X,range(len(X))):

            temp = list(example)
            temp_dct = dict.fromkeys(all_dct.keys(),0)
            for k in range(len(temp)-3):
                try:
                    check = temp[k] + temp[k+1]+temp[k+2]+temp[k+3]
                    temp_dct[check] += 1
                except:
                    pass
            for key, value in temp_dct.items():
                all_dct[key].append(value)
                
        X = pd.DataFrame.from_dict(all_dct).set_index(df_clean.index)
        
        X = np.concatenate((X,df_clean[[temperature]]), axis =1)

    if encoding == 'BLOSUM62':

        n = len(sequences)
        enc_seq = np.zeros((n,n))

        i = 0

        for a in list(sequences.index):
            j = 0
            for b in list(sequences.index):
                enc_seq[i,j] = sum(score_pairwise(sequences.loc[a], sequences.loc[b], blosum62, -5, -1))
                j += 1
            i += 1

        X = np.concatenate((enc_seq,df_clean[[temperature]]), axis =1)

        
        
        
    if encoding == 'BLOSUM45':
        
        n = len(sequences)
        enc_seq = np.zeros((n,n))

        i = 0

        for a in list(sequences.index):
            j = 0
            for b in list(sequences.index):
                enc_seq[i,j] = sum(score_pairwise(sequences.loc[a], sequences.loc[b], blosum45, -5, -1))
                j += 1
            i += 1

        X = np.concatenate((enc_seq,df_clean[[temperature]]), axis =1)

        
    if encoding == 'ESM1b':
        

        encoded = esm1b.loc[df_clean.index]
        X = np.concatenate((encoded,df_clean[[temperature]]), axis =1)

    scaler.fit(X)
    X_scaled = scaler.transform(X)
    
    return X, y, X_scaled

In [17]:
def evaluate_performance_identity_based(enzyme , df, output, aln, esm1b, methods, state):

    summary=pd.DataFrame()
    
    df['Relative Temperature'] = df['Reaction Temperature'] - df["Temperature Optimum"]
    df['Log' + output]=np.log10(df[output])
    
    df_clean = removeoutlier_col(df,'Log' + output).copy()
    
    # Create a mapping of unique sequences to unique codes
    sequence_to_code = {seq: f"ENZYME_{i+1}" for i, seq in enumerate(df_clean['Sequence'].unique())}

    # Map these codes to a new column in the DataFrame using .loc
    df_clean.loc[:, 'Sequence Code'] = df_clean['Sequence'].map(sequence_to_code)
    
    df_clean = df_clean.set_index('Index')

        
    for method in methods:
        X, y, X_scaled = encode_temp(method, output, df_clean, aln, esm1b , 'Relative Temperature')
         
        
        splitter = GroupShuffleSplit(test_size=.20, n_splits=10, random_state = state)

        split = splitter.split(df_clean, groups=df_clean['Sequence Code'])
        
        train_inds, val_inds = next(split)
        
        X_train = X[train_inds]
        y_train = y[train_inds]
        
        X_val = X[val_inds]
        y_val = y[val_inds]
        

        y_predicted_EN = EN(X_train, y_train, X_val)

        summary['EN with ' + method] = y_predicted_EN


    X_train_seq = df_clean.loc[y_train.index]['Sequence']
    X_val_seq = df_clean.loc[y_val.index]['Sequence']

    summary['Percentage Activity Depending on Optimum Temp'] = y_val.values

    list_R2 = []

    for model in summary.columns[:-1]:
        r2=r2_score(summary['Percentage Activity Depending on Optimum Temp'], summary[model])
        list_R2.append(r2)


    x = np.reshape(list_R2, (len(matrix), len(summary.columns[:-1])))

    number_of_data = [len(a) for a in matrix]

    Results=pd.DataFrame(x, columns=summary.columns[:-1])

    return summary, list_R2, Results, number_of_data



IndentationError: unexpected indent (4024462765.py, line 48)

In [18]:
enzyme = 'betaGlucosidasewithMutants'

df = pd.read_excel('betaGlucosidasewithMutantsOptimumTemperature.xlsx')

output = 'pNP-Glc kcat/Km (1/smM)'
aln = enzyme + '.aln'

x = datetime.datetime.now()
date = str(x.year)+str(x.month)+str(x.day)



methods = ['ESM1b', 'bigram', 'trigram', 'quadrogram']


esm1b = pd.read_excel(enzyme+'ESM1b_embeddings.xlsx', index_col = 0)

In [19]:
list_summary = []
list_list_R2 = []
list_Results = []
random_state = [202 , 1, 42, 101, 2022,5 , 10, 22, 1995, 0]
list_number_of_data = []

for state in random_state:
    summary, list_R2, Results, number_of_data = evaluate_performance_identity_based(enzyme , 
                                                                                  df , output, aln, esm1b,
                                                                                  methods, state)
    print( "Run is completed")
    print(Results)
    list_summary.append(summary)
    list_list_R2.append(list_R2)
    list_Results.append(Results)
    list_number_of_data.append(number_of_data)

  y_train = y[train_inds]
  y_val = y[val_inds]
  example = df_clean['Sequence'][0]
  y_train = y[train_inds]
  y_val = y[val_inds]
  example = df_clean['Sequence'][0]
  y_train = y[train_inds]
  y_val = y[val_inds]
  example = df_clean['Sequence'][0]
  y_train = y[train_inds]
  y_val = y[val_inds]


NameError: name 'list_R2' is not defined

In [None]:
df_average = pd.concat([list_Results[0],list_Results[1],list_Results[2],list_Results[3], list_Results[4],
                       list_Results[5],list_Results[6],list_Results[7],list_Results[8], list_Results[9]])

average = df_average.groupby(level=0, sort=False).mean()
std = df_average.groupby(level=0, sort=False).std()
list_Results.append(average)
list_Results.append(std)



In [None]:
df_number_of_data = pd.DataFrame(data = list_number_of_data, columns = ['<100%', '80-99.99%', '0-79.99%', '80-99%', '>99%',
                                                              '<100% wild type','80-99.99% wild type', '0-79.99% wild type', 
                                                              '<100% mutant type','80-99.99% mutant type', '>99% mutant type'])
list_Results.append(df_number_of_data)

In [None]:
names = ['Trial 1','Trial 2','Trial 3','Trial 4','Trial 5','Trial 6','Trial 7','Trial 8','Trial 9','Trial 10']

writer=pd.ExcelWriter(r"C:\Users\memre\Desktop\Research\Predicting Enzyme Properties Based on Various Organisms\Code\Temperature Profile Prediction\20250224 Predicting Temperature Profiles betaGlucosidase - y_val values.xlsx")
_ = [A.to_excel(writer,sheet_name="{0}".format(names[i])) for i, A in enumerate(list_summary)]
writer.close()

In [None]:
names = ['Trial 1','Trial 2','Trial 3','Trial 4','Trial 5','Trial 6','Trial 7','Trial 8','Trial 9','Trial 10', "Average", "STD", 'Number of Data Points']
writer=pd.ExcelWriter(r"C:\Users\memre\Desktop\Research\Predicting Enzyme Properties Based on Various Organisms\Code\Temperature Profile Prediction\20250224 Predicting Temperature Profiles betaGlucosidase - Summary based on Identity.xlsx")
_ = [A.to_excel(writer,sheet_name="{0}".format(names[i])) for i, A in enumerate(list_Results)]
writer.close()